xref: /titanic_41/usr/src/uts/common/xen/io/xdf.c (revision 90685d2c52744c6540828f16cdd2db815d467e37)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include "xdf.h"
37 
/*
 * Cache-flush helpers.
 *
 * Two mutually exclusive ways of flushing are distinguished, both of which
 * require xdf_feature_barrier to be set on the device:
 *   - "write barrier":    xdf_flush_supported is clear
 *   - "flush diskcache":  xdf_flush_supported is set
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/* barrier feature present, explicit flush not supported */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* barrier feature present and explicit flush supported */
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A non-read buf whose data address is the device's xdf_cache_flush_block
 * buffer, on a device that uses write barriers.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A zero-length non-read buf on a device that uses explicit cache flushes.
 * NOTE(review): the expansion references `vdp', which is NOT a macro
 * parameter; a variable of that name must be in scope wherever this macro
 * is used.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
50 
/* soft state anchor; set up in _init(), one xdf_t per instance */
static void *vbd_ss;
/* kmem caches for v_req_t and ge_slot_t objects, created in _init() */
static kmem_cache_t *xdf_vreq_cache;
static kmem_cache_t *xdf_gs_cache;
/* driver transfer size limit (presumably consumed by xdfmin -- confirm) */
static int xdf_maxphys = XB_MAXPHYS;
/* debug mask; overwritten from the "xdfdebug" property in xdf_attach() */
int xdfdebug = 0;
/* defined elsewhere; when set, xdf_strategy() drains I/O synchronously */
extern int do_polled_io;
/* disk block written by DKIOCFLUSHWRITECACHE when flush is not supported */
diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
/* non-zero: DKIOCFLUSHWRITECACHE returns ENOTTY instead of a barrier write */
int	xdf_barrier_flush_disable = 0;
59 
60 /*
61  * dev_ops and cb_ops entrypoints
62  */
63 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
64 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
65 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
66 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
67 static int xdf_open(dev_t *, int, int, cred_t *);
68 static int xdf_close(dev_t, int, int, struct cred *);
69 static int xdf_strategy(struct buf *);
70 static int xdf_read(dev_t, struct uio *, cred_t *);
71 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
72 static int xdf_write(dev_t, struct uio *, cred_t *);
73 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
74 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
75 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
76 static uint_t xdf_intr(caddr_t);
77 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
78     caddr_t, int *);
79 
80 /*
81  * misc private functions
82  */
83 static int xdf_suspend(dev_info_t *);
84 static int xdf_resume(dev_info_t *);
85 static int xdf_start_connect(xdf_t *);
86 static int xdf_start_disconnect(xdf_t *);
87 static int xdf_post_connect(xdf_t *);
88 static void xdf_post_disconnect(xdf_t *);
89 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
90 static void xdf_iostart(xdf_t *);
91 static void xdf_iofini(xdf_t *, uint64_t, int);
92 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
93 static int xdf_drain_io(xdf_t *);
94 static boolean_t xdf_isopen(xdf_t *, int);
95 static int xdf_check_state_transition(xdf_t *, XenbusState);
96 static int xdf_connect(xdf_t *, boolean_t);
97 static int xdf_dmacallback(caddr_t);
98 static void xdf_timeout_handler(void *);
99 static uint_t xdf_iorestart(caddr_t);
100 static v_req_t *vreq_get(xdf_t *, buf_t *);
101 static void vreq_free(xdf_t *, v_req_t *);
102 static int vreq_setup(xdf_t *, v_req_t *);
103 static ge_slot_t *gs_get(xdf_t *, int);
104 static void gs_free(xdf_t *, ge_slot_t *);
105 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
106 static void unexpectedie(xdf_t *);
107 static void xdfmin(struct buf *);
108 
/*
 * Character/block entry points for the driver.  The initializer is
 * positional; the field names below follow the cb_ops(9S) layout.
 */
static 	struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str: not a STREAMS driver */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
129 
/*
 * Device operations vector; hooks up the autoconfiguration entry points
 * and xdf_cbops.  Referenced by the modldrv structure of this module.
 */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	xdf_reset,		/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	(struct bus_ops *)NULL	/* devo_bus_ops */
};
142 
/*
 * Loadable-driver linkage: ties the driver's dev_ops to the module
 * framework.  "%I%" in the description is an SCCS keyword.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver %I%",	/* short description */
	&xdf_devops		/* driver specific ops */
};
148 
/*
 * Module linkage handed to mod_install()/mod_remove()/mod_info() by
 * _init()/_fini()/_info(); contains the single modldrv above.
 */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
152 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * Segments are limited to one page (PAGEOFFSET) and the scatter/gather
 * list length to BLKIF_MAX_SEGMENTS_PER_REQUEST entries.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
171 
/*
 * Device access attributes: no byte swapping, strictly ordered accesses.
 * NOTE(review): its consumer is not visible in this part of the file.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};
177 
/* callbacks from common label */
179 
180 static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
181 	void *);
182 static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
183 
/*
 * Target-driver callbacks handed to cmlb_attach() in xdf_attach(): the
 * common label code uses them to read/write the disk and query it.
 */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
189 
190 int
191 _init(void)
192 {
193 	int rc;
194 
195 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
196 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
197 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
198 		ASSERT(xdf_vreq_cache != NULL);
199 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
200 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
201 		ASSERT(xdf_gs_cache != NULL);
202 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
203 			kmem_cache_destroy(xdf_vreq_cache);
204 			kmem_cache_destroy(xdf_gs_cache);
205 			ddi_soft_state_fini(&vbd_ss);
206 		}
207 	}
208 
209 	return (rc);
210 }
211 
212 int
213 _fini(void)
214 {
215 	int err;
216 
217 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
218 		return (err);
219 
220 	kmem_cache_destroy(xdf_vreq_cache);
221 	kmem_cache_destroy(xdf_gs_cache);
222 	ddi_soft_state_fini(&vbd_ss);
223 
224 	return (0);
225 }
226 
227 int
228 _info(struct modinfo *modinfop)
229 {
230 	return (mod_info(&xdf_modlinkage, modinfop));
231 }
232 
233 /*ARGSUSED*/
234 static int
235 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
236 {
237 	int instance;
238 	xdf_t *vbdp;
239 
240 	instance = XDF_INST(getminor((dev_t)arg));
241 
242 	switch (cmd) {
243 	case DDI_INFO_DEVT2DEVINFO:
244 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
245 			*rp = NULL;
246 			return (DDI_FAILURE);
247 		}
248 		*rp = vbdp->xdf_dip;
249 		return (DDI_SUCCESS);
250 
251 	case DDI_INFO_DEVT2INSTANCE:
252 		*rp = (void *)(uintptr_t)instance;
253 		return (DDI_SUCCESS);
254 
255 	default:
256 		return (DDI_FAILURE);
257 	}
258 }
259 
260 static int
261 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
262 	char *name, caddr_t valuep, int *lengthp)
263 {
264 	int instance = ddi_get_instance(dip);
265 	xdf_t *vdp;
266 	diskaddr_t p_blkcnt;
267 
268 	/*
269 	 * xdf dynamic properties are device specific and size oriented.
270 	 * Requests issued under conditions where size is valid are passed
271 	 * to ddi_prop_op_nblocks with the size information, otherwise the
272 	 * request is passed to ddi_prop_op.
273 	 */
274 	vdp = ddi_get_soft_state(vbd_ss, instance);
275 
276 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
277 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
278 		    name, valuep, lengthp));
279 
280 	/* do cv_wait until connected or failed */
281 	mutex_enter(&vdp->xdf_dev_lk);
282 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
283 		mutex_exit(&vdp->xdf_dev_lk);
284 		goto out;
285 	}
286 	mutex_exit(&vdp->xdf_dev_lk);
287 
288 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
289 	    NULL, NULL, NULL, NULL) == 0)
290 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
291 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
292 
293 out:
294 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
295 	    lengthp));
296 }
297 
/*
 * attach(9E) entry point.
 *
 * DDI_RESUME is delegated to xdf_resume(); any command other than
 * DDI_ATTACH fails.  For DDI_ATTACH the routine allocates the per-instance
 * soft state, initialises its locks and condition variable, registers the
 * soft interrupt used for I/O restart, creates the iostat kstat and the
 * DDI_KERNEL_IOCTL property, attaches a default cmlb label, registers for
 * backend state changes and starts the connection to the backend.
 *
 * On failure the errout labels undo, in reverse order, the steps that had
 * completed before the failing one.  Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	xdf_t *vdp;
	ddi_iblock_cookie_t ibc;
	ddi_iblock_cookie_t softibc;
	int instance;

	/* note: the global debug mask is re-read on every attach/resume */
	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
	    "xdfdebug", 0);

	switch (cmd) {
		case DDI_ATTACH:
			break;

		case DDI_RESUME:
			return (xdf_resume(devi));

		default:
			return (DDI_FAILURE);
	}

	instance = ddi_get_instance(devi);
	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
	vdp = ddi_get_soft_state(vbd_ss, instance);
	vdp->xdf_dip = devi;
	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
		    ddi_get_name_addr(devi));
		goto errout1;
	}

	/* both locks are initialised with the hardware interrupt cookie */
	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
	ddi_set_driver_private(devi, vdp);

	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
	    != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
		    ddi_get_name_addr(devi));
		goto errout2;
	}
	/* low-priority soft interrupt that runs xdf_iorestart() */
	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
		    ddi_get_name_addr(devi));
		goto errout2;
	}

	/*
	 * create kstat for iostat(1M)
	 */
	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
		/* the kstat is protected by the per-device lock */
		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
		kstat_install(vdp->xdf_xdev_iostat);
	} else {
		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
		    ddi_get_name_addr(devi));
		goto errout3;
	}

	/*
	 * driver handles kernel-issued IOCTLs
	 */
	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
		    ddi_get_name_addr(devi));
		goto errout4;
	}

	/*
	 * create default device minor nodes: non-removable disk
	 * we will adjust minor nodes after we are connected w/ backend
	 */
	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
		    ddi_get_name_addr(devi));
		goto errout5;
	}

	/*
	 * We ship with cache-enabled disks
	 */
	vdp->xdf_wce = 1;

	/* xdf_cb_lk is held across handler registration and connect start */
	mutex_enter(&vdp->xdf_cb_lk);

	/* Watch backend XenbusState change */
	if (xvdi_add_event_handler(devi, XS_OE_STATE,
	    xdf_oe_change) != DDI_SUCCESS) {
		mutex_exit(&vdp->xdf_cb_lk);
		goto errout6;
	}

	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: start connection failed",
		    ddi_get_name_addr(devi));
		(void) xdf_start_disconnect(vdp);
		mutex_exit(&vdp->xdf_cb_lk);
		goto errout7;
	}

	mutex_exit(&vdp->xdf_cb_lk);

	/*
	 * NOTE(review): the active-request lists are only created here,
	 * after the connection has been started and xdf_cb_lk dropped.
	 * Confirm nothing can touch them before this point.  The error
	 * paths below never need list_destroy() because of this ordering.
	 */
	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
	    offsetof(v_req_t, v_link));
	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
	    offsetof(ge_slot_t, link));

	ddi_report_dev(devi);
	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));

	return (DDI_SUCCESS);

	/* unwind ladder: each label undoes the step preceding its goto */
errout7:
	xvdi_remove_event_handler(devi, XS_OE_STATE);
errout6:
	cmlb_detach(vdp->xdf_vd_lbl, NULL);
errout5:
	cmlb_free_handle(&vdp->xdf_vd_lbl);
	ddi_prop_remove_all(devi);
errout4:
	kstat_delete(vdp->xdf_xdev_iostat);
errout3:
	ddi_remove_softintr(vdp->xdf_softintr_id);
errout2:
	ddi_set_driver_private(devi, NULL);
	cv_destroy(&vdp->xdf_dev_cv);
	mutex_destroy(&vdp->xdf_cb_lk);
	mutex_destroy(&vdp->xdf_dev_lk);
errout1:
	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
	ddi_soft_state_free(vbd_ss, instance);
	return (DDI_FAILURE);
}
441 
442 static int
443 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
444 {
445 	xdf_t *vdp;
446 	int instance;
447 
448 	switch (cmd) {
449 
450 	case DDI_PM_SUSPEND:
451 		break;
452 
453 	case DDI_SUSPEND:
454 		return (xdf_suspend(devi));
455 
456 	case DDI_DETACH:
457 		break;
458 
459 	default:
460 		return (DDI_FAILURE);
461 	}
462 
463 	instance = ddi_get_instance(devi);
464 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
465 	vdp = ddi_get_soft_state(vbd_ss, instance);
466 
467 	if (vdp == NULL)
468 		return (DDI_FAILURE);
469 
470 	mutex_enter(&vdp->xdf_dev_lk);
471 	if (xdf_isopen(vdp, -1)) {
472 		mutex_exit(&vdp->xdf_dev_lk);
473 		return (DDI_FAILURE);
474 	}
475 
476 	if (vdp->xdf_status != XD_CLOSED) {
477 		mutex_exit(&vdp->xdf_dev_lk);
478 		return (DDI_FAILURE);
479 	}
480 
481 	ASSERT(!ISDMACBON(vdp));
482 	mutex_exit(&vdp->xdf_dev_lk);
483 
484 	if (vdp->xdf_timeout_id != 0)
485 		(void) untimeout(vdp->xdf_timeout_id);
486 
487 	xvdi_remove_event_handler(devi, XS_OE_STATE);
488 
489 	/* we'll support backend running in domU later */
490 #ifdef	DOMU_BACKEND
491 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
492 #endif
493 
494 	list_destroy(&vdp->xdf_vreq_act);
495 	list_destroy(&vdp->xdf_gs_act);
496 	ddi_prop_remove_all(devi);
497 	kstat_delete(vdp->xdf_xdev_iostat);
498 	ddi_remove_softintr(vdp->xdf_softintr_id);
499 	ddi_set_driver_private(devi, NULL);
500 	cv_destroy(&vdp->xdf_dev_cv);
501 	mutex_destroy(&vdp->xdf_cb_lk);
502 	mutex_destroy(&vdp->xdf_dev_lk);
503 	if (vdp->xdf_cache_flush_block != NULL)
504 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
505 	ddi_soft_state_free(vbd_ss, instance);
506 	return (DDI_SUCCESS);
507 }
508 
509 static int
510 xdf_suspend(dev_info_t *devi)
511 {
512 	xdf_t *vdp;
513 	int instance;
514 
515 	instance = ddi_get_instance(devi);
516 
517 	if (xdfdebug & SUSRES_DBG)
518 		xen_printf("xdf_suspend: xdf#%d\n", instance);
519 
520 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
521 		return (DDI_FAILURE);
522 
523 	xvdi_suspend(devi);
524 
525 	/* stop further I/O requests */
526 	mutex_enter(&vdp->xdf_cb_lk);
527 	mutex_enter(&vdp->xdf_dev_lk);
528 	vdp->xdf_status = XD_SUSPEND;
529 	mutex_exit(&vdp->xdf_dev_lk);
530 	mutex_exit(&vdp->xdf_cb_lk);
531 
532 	/* make sure no more I/O responses left in the ring buffer */
533 	(void) ddi_remove_intr(devi, 0, NULL);
534 	(void) xdf_drain_io(vdp);
535 
536 	if (xdfdebug & SUSRES_DBG)
537 		xen_printf("xdf_suspend: SUCCESS\n");
538 
539 	return (DDI_SUCCESS);
540 }
541 
542 /*ARGSUSED*/
543 static int
544 xdf_resume(dev_info_t *devi)
545 {
546 	xdf_t *vdp;
547 	int instance;
548 
549 	instance = ddi_get_instance(devi);
550 	if (xdfdebug & SUSRES_DBG)
551 		xen_printf("xdf_resume: xdf%d\n", instance);
552 
553 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
554 		return (DDI_FAILURE);
555 
556 	mutex_enter(&vdp->xdf_cb_lk);
557 
558 	if (xvdi_resume(devi) != DDI_SUCCESS) {
559 		mutex_exit(&vdp->xdf_cb_lk);
560 		return (DDI_FAILURE);
561 	}
562 
563 	mutex_enter(&vdp->xdf_dev_lk);
564 	ASSERT(vdp->xdf_status == XD_SUSPEND);
565 	vdp->xdf_status = XD_UNKNOWN;
566 	mutex_exit(&vdp->xdf_dev_lk);
567 
568 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
569 		mutex_exit(&vdp->xdf_cb_lk);
570 		return (DDI_FAILURE);
571 	}
572 
573 	mutex_exit(&vdp->xdf_cb_lk);
574 
575 	if (xdfdebug & SUSRES_DBG)
576 		xen_printf("xdf_resume: done\n");
577 	return (DDI_SUCCESS);
578 }
579 
580 /*ARGSUSED*/
581 static int
582 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
583 {
584 	xdf_t *vdp;
585 	int instance;
586 
587 	instance = ddi_get_instance(devi);
588 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
589 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
590 		return (DDI_FAILURE);
591 
592 	/*
593 	 * wait for any outstanding I/O to complete
594 	 */
595 	(void) xdf_drain_io(vdp);
596 
597 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
598 	return (DDI_SUCCESS);
599 }
600 
601 static int
602 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
603 {
604 	minor_t	minor;
605 	xdf_t	*vdp;
606 	int part;
607 	ulong_t parbit;
608 	diskaddr_t p_blkct = 0;
609 	boolean_t firstopen;
610 
611 	minor = getminor(*devp);
612 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
613 		return (ENXIO);
614 
615 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
616 
617 	/* do cv_wait until connected or failed */
618 	mutex_enter(&vdp->xdf_dev_lk);
619 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
620 		mutex_exit(&vdp->xdf_dev_lk);
621 		return (ENXIO);
622 	}
623 
624 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
625 		mutex_exit(&vdp->xdf_dev_lk);
626 		return (EROFS);
627 	}
628 
629 	part = XDF_PART(minor);
630 	parbit = 1 << part;
631 	if (vdp->xdf_vd_exclopen & parbit) {
632 		mutex_exit(&vdp->xdf_dev_lk);
633 		return (EBUSY);
634 	}
635 
636 	/* are we the first one to open this node? */
637 	firstopen = !xdf_isopen(vdp, -1);
638 
639 	if ((flag & FEXCL) && !firstopen) {
640 		mutex_exit(&vdp->xdf_dev_lk);
641 		return (EBUSY);
642 	}
643 
644 	if (otyp == OTYP_LYR)
645 		vdp->xdf_vd_lyropen[part]++;
646 
647 	vdp->xdf_vd_open[otyp] |= parbit;
648 
649 	if (flag & FEXCL)
650 		vdp->xdf_vd_exclopen |= parbit;
651 
652 	mutex_exit(&vdp->xdf_dev_lk);
653 
654 	/* force a re-validation */
655 	if (firstopen)
656 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
657 
658 	/*
659 	 * check size
660 	 * ignore CD/DVD which contains a zero-sized s0
661 	 */
662 	if (!(flag & (FNDELAY | FNONBLOCK)) && !XD_IS_CD(vdp) &&
663 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
664 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
665 		(void) xdf_close(*devp, flag, otyp, credp);
666 		return (ENXIO);
667 	}
668 
669 	return (0);
670 }
671 
672 /*ARGSUSED*/
673 static int
674 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
675 {
676 	minor_t	minor;
677 	xdf_t	*vdp;
678 	int part;
679 	ulong_t parbit;
680 
681 	minor = getminor(dev);
682 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
683 		return (ENXIO);
684 
685 	mutex_enter(&vdp->xdf_dev_lk);
686 	part = XDF_PART(minor);
687 	if (!xdf_isopen(vdp, part)) {
688 		mutex_exit(&vdp->xdf_dev_lk);
689 		return (ENXIO);
690 	}
691 	parbit = 1 << part;
692 
693 	if (otyp == OTYP_LYR) {
694 		if (vdp->xdf_vd_lyropen[part] != 0)
695 			vdp->xdf_vd_lyropen[part]--;
696 		if (vdp->xdf_vd_lyropen[part] == 0)
697 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
698 	} else {
699 		vdp->xdf_vd_open[otyp] &= ~parbit;
700 	}
701 	vdp->xdf_vd_exclopen &= ~parbit;
702 
703 	mutex_exit(&vdp->xdf_dev_lk);
704 	return (0);
705 }
706 
707 static int
708 xdf_strategy(struct buf *bp)
709 {
710 	xdf_t	*vdp;
711 	minor_t minor;
712 	diskaddr_t p_blkct, p_blkst;
713 	ulong_t nblks;
714 	int part;
715 
716 	minor = getminor(bp->b_edev);
717 	part = XDF_PART(minor);
718 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
719 	    !xdf_isopen(vdp, part) ||
720 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
721 	    &p_blkst, NULL, NULL, NULL)) {
722 		bioerror(bp, ENXIO);
723 		bp->b_resid = bp->b_bcount;
724 		biodone(bp);
725 		return (0);
726 	}
727 
728 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
729 		bioerror(bp, EROFS);
730 		bp->b_resid = bp->b_bcount;
731 		biodone(bp);
732 		return (0);
733 	}
734 
735 	/*
736 	 * starting beyond partition
737 	 */
738 	if (bp->b_blkno > p_blkct) {
739 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
740 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
741 		bioerror(bp, EINVAL);
742 		bp->b_resid = bp->b_bcount;
743 		biodone(bp);
744 		return (0);
745 	}
746 
747 	/* Legacy: don't set error flag at this case */
748 	if (bp->b_blkno == p_blkct) {
749 		bp->b_resid = bp->b_bcount;
750 		biodone(bp);
751 		return (0);
752 	}
753 
754 	/*
755 	 * adjust for partial transfer
756 	 */
757 	nblks = bp->b_bcount >> XB_BSHIFT;
758 	if ((bp->b_blkno + nblks) > p_blkct) {
759 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
760 		bp->b_bcount -= bp->b_resid;
761 	}
762 
763 
764 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
765 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
766 
767 	mutex_enter(&vdp->xdf_dev_lk);
768 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
769 	if (vdp->xdf_f_act == NULL) {
770 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
771 	} else {
772 		vdp->xdf_l_act->av_forw = bp;
773 		vdp->xdf_l_act = bp;
774 	}
775 	bp->av_forw = NULL;
776 	bp->av_back = NULL; /* not tagged with a v_req */
777 	bp->b_private = (void *)(uintptr_t)p_blkst;
778 	mutex_exit(&vdp->xdf_dev_lk);
779 	xdf_iostart(vdp);
780 	if (do_polled_io)
781 		(void) xdf_drain_io(vdp);
782 	return (0);
783 }
784 
785 /*ARGSUSED*/
786 static int
787 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
788 {
789 
790 	xdf_t	*vdp;
791 	minor_t minor;
792 	diskaddr_t p_blkcnt;
793 	int part;
794 
795 	minor = getminor(dev);
796 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
797 		return (ENXIO);
798 
799 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
800 	    (int64_t)uiop->uio_offset));
801 
802 	part = XDF_PART(minor);
803 	if (!xdf_isopen(vdp, part))
804 		return (ENXIO);
805 
806 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
807 	    NULL, NULL, NULL, NULL))
808 		return (ENXIO);
809 
810 	if (U_INVAL(uiop))
811 		return (EINVAL);
812 
813 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
814 }
815 
816 /*ARGSUSED*/
817 static int
818 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
819 {
820 	xdf_t *vdp;
821 	minor_t minor;
822 	diskaddr_t p_blkcnt;
823 	int part;
824 
825 	minor = getminor(dev);
826 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
827 		return (ENXIO);
828 
829 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
830 	    (int64_t)uiop->uio_offset));
831 
832 	part = XDF_PART(minor);
833 	if (!xdf_isopen(vdp, part))
834 		return (ENXIO);
835 
836 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
837 	    NULL, NULL, NULL, NULL))
838 		return (ENXIO);
839 
840 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
841 		return (ENOSPC);
842 
843 	if (U_INVAL(uiop))
844 		return (EINVAL);
845 
846 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
847 }
848 
849 /*ARGSUSED*/
850 static int
851 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
852 {
853 	xdf_t	*vdp;
854 	minor_t minor;
855 	struct uio *uiop = aiop->aio_uio;
856 	diskaddr_t p_blkcnt;
857 	int part;
858 
859 	minor = getminor(dev);
860 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
861 		return (ENXIO);
862 
863 	part = XDF_PART(minor);
864 	if (!xdf_isopen(vdp, part))
865 		return (ENXIO);
866 
867 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
868 	    NULL, NULL, NULL, NULL))
869 		return (ENXIO);
870 
871 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
872 		return (ENOSPC);
873 
874 	if (U_INVAL(uiop))
875 		return (EINVAL);
876 
877 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
878 }
879 
880 /*ARGSUSED*/
881 static int
882 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
883 {
884 	xdf_t *vdp;
885 	minor_t minor;
886 	struct uio *uiop = aiop->aio_uio;
887 	diskaddr_t p_blkcnt;
888 	int part;
889 
890 	minor = getminor(dev);
891 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
892 		return (ENXIO);
893 
894 	part = XDF_PART(minor);
895 	if (!xdf_isopen(vdp, part))
896 		return (ENXIO);
897 
898 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
899 	    NULL, NULL, NULL, NULL))
900 		return (ENXIO);
901 
902 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
903 		return (ENOSPC);
904 
905 	if (U_INVAL(uiop))
906 		return (EINVAL);
907 
908 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
909 }
910 
911 static int
912 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
913 {
914 	struct buf dumpbuf, *dbp;
915 	xdf_t	*vdp;
916 	minor_t minor;
917 	int err = 0;
918 	int part;
919 	diskaddr_t p_blkcnt, p_blkst;
920 
921 	minor = getminor(dev);
922 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
923 		return (ENXIO);
924 
925 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
926 	    addr, blkno, nblk));
927 
928 	part = XDF_PART(minor);
929 	if (!xdf_isopen(vdp, part))
930 		return (ENXIO);
931 
932 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
933 	    NULL, NULL, NULL))
934 		return (ENXIO);
935 
936 	if ((blkno + nblk) > p_blkcnt) {
937 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
938 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
939 		return (EINVAL);
940 	}
941 
942 	dbp = &dumpbuf;
943 	bioinit(dbp);
944 	dbp->b_flags = B_BUSY;
945 	dbp->b_un.b_addr = addr;
946 	dbp->b_bcount	= nblk << DEV_BSHIFT;
947 	dbp->b_resid = 0;
948 	dbp->b_blkno = blkno;
949 	dbp->b_edev = dev;
950 	dbp->b_private = (void *)(uintptr_t)p_blkst;
951 
952 	mutex_enter(&vdp->xdf_dev_lk);
953 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
954 	if (vdp->xdf_f_act == NULL) {
955 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
956 	} else {
957 		vdp->xdf_l_act->av_forw = dbp;
958 		vdp->xdf_l_act = dbp;
959 	}
960 	dbp->av_forw = NULL;
961 	dbp->av_back = NULL;
962 	mutex_exit(&vdp->xdf_dev_lk);
963 	xdf_iostart(vdp);
964 	err = xdf_drain_io(vdp);
965 	biofini(dbp);
966 	return (err);
967 }
968 
969 /*ARGSUSED*/
970 static int
971 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
972     int *rvalp)
973 {
974 	int instance;
975 	xdf_t	*vdp;
976 	minor_t minor;
977 	int part;
978 
979 	minor = getminor(dev);
980 	instance = XDF_INST(minor);
981 
982 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
983 		return (ENXIO);
984 
985 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
986 	    instance, cmd, cmd));
987 
988 	part = XDF_PART(minor);
989 	if (!xdf_isopen(vdp, part))
990 		return (ENXIO);
991 
992 	switch (cmd) {
993 	case DKIOCGMEDIAINFO: {
994 		struct dk_minfo	media_info;
995 
996 		media_info.dki_lbsize = DEV_BSIZE;
997 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
998 		media_info.dki_media_type = DK_FIXED_DISK;
999 
1000 		if (ddi_copyout(&media_info, (void *)arg,
1001 		    sizeof (struct dk_minfo), mode)) {
1002 			return (EFAULT);
1003 		} else {
1004 			return (0);
1005 		}
1006 	}
1007 
1008 	case DKIOCINFO: {
1009 		struct dk_cinfo info;
1010 
1011 		/* controller information */
1012 		if (XD_IS_CD(vdp))
1013 			info.dki_ctype = DKC_CDROM;
1014 		else
1015 			info.dki_ctype = DKC_VBD;
1016 
1017 		info.dki_cnum = 0;
1018 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1019 
1020 		/* unit information */
1021 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1022 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1023 		info.dki_flags = DKI_FMTVOL;
1024 		info.dki_partition = part;
1025 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1026 		info.dki_addr = 0;
1027 		info.dki_space = 0;
1028 		info.dki_prio = 0;
1029 		info.dki_vec = 0;
1030 
1031 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1032 			return (EFAULT);
1033 		else
1034 			return (0);
1035 	}
1036 
1037 	case DKIOCSTATE: {
1038 		enum dkio_state	dkstate = DKIO_INSERTED;
1039 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1040 		    mode) != 0)
1041 			return (EFAULT);
1042 		return (0);
1043 	}
1044 
1045 	/*
1046 	 * is media removable?
1047 	 */
1048 	case DKIOCREMOVABLE: {
1049 		int i = XD_IS_RM(vdp) ? 1 : 0;
1050 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1051 			return (EFAULT);
1052 		return (0);
1053 	}
1054 
1055 	case DKIOCG_PHYGEOM:
1056 	case DKIOCG_VIRTGEOM:
1057 	case DKIOCGGEOM:
1058 	case DKIOCSGEOM:
1059 	case DKIOCGAPART:
1060 	case DKIOCGVTOC:
1061 	case DKIOCSVTOC:
1062 	case DKIOCPARTINFO:
1063 	case DKIOCGETEFI:
1064 	case DKIOCSETEFI:
1065 	case DKIOCPARTITION: {
1066 		int rc;
1067 
1068 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1069 		    rvalp, NULL);
1070 		return (rc);
1071 	}
1072 
1073 	case DKIOCGETWCE:
1074 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1075 		    sizeof (vdp->xdf_wce), mode))
1076 			return (EFAULT);
1077 		return (0);
1078 	case DKIOCSETWCE:
1079 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1080 		    sizeof (vdp->xdf_wce), mode))
1081 			return (EFAULT);
1082 		return (0);
1083 	case DKIOCFLUSHWRITECACHE: {
1084 		int rc;
1085 		struct dk_callback *dkc = (struct dk_callback *)arg;
1086 
1087 		if (vdp->xdf_flush_supported) {
1088 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1089 			    NULL, 0, 0, (void *)dev);
1090 		} else {
1091 			if (xdf_barrier_flush_disable)
1092 				return (ENOTTY);
1093 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1094 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1095 			    DEV_BSIZE, (void *)dev);
1096 		}
1097 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1098 		    (dkc->dkc_callback != NULL)) {
1099 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1100 			/* need to return 0 after calling callback */
1101 			rc = 0;
1102 		}
1103 		return (rc);
1104 	}
1105 
1106 	default:
1107 		return (ENOTTY);
1108 	}
1109 }
1110 
1111 /*
1112  * xdf interrupt handler
1113  */
1114 static uint_t
1115 xdf_intr(caddr_t arg)
1116 {
1117 	xdf_t *vdp = (xdf_t *)arg;
1118 	xendev_ring_t *xbr;
1119 	blkif_response_t *resp;
1120 	int bioerr = 0;
1121 	uint64_t id;
1122 	extern int do_polled_io;
1123 	uint8_t op;
1124 	uint16_t status;
1125 	ddi_acc_handle_t acchdl;
1126 
1127 	mutex_enter(&vdp->xdf_dev_lk);
1128 
1129 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1130 		mutex_exit(&vdp->xdf_dev_lk);
1131 		return (DDI_INTR_UNCLAIMED);
1132 	}
1133 
1134 	acchdl = vdp->xdf_xb_ring_hdl;
1135 
1136 	/*
1137 	 * complete all requests which have a response
1138 	 */
1139 	while (resp = xvdi_ring_get_response(xbr)) {
1140 		id = ddi_get64(acchdl, &resp->id);
1141 		op = ddi_get8(acchdl, &resp->operation);
1142 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1143 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1144 		    op, id, status));
1145 
1146 		/*
1147 		 * XXPV - close connection to the backend and restart
1148 		 */
1149 		if (status != BLKIF_RSP_OKAY) {
1150 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1151 			    ddi_get_name_addr(vdp->xdf_dip),
1152 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1153 			bioerr = EIO;
1154 		}
1155 
1156 		xdf_iofini(vdp, id, bioerr);
1157 	}
1158 
1159 	mutex_exit(&vdp->xdf_dev_lk);
1160 
1161 	if (!do_polled_io)
1162 		xdf_iostart(vdp);
1163 
1164 	return (DDI_INTR_CLAIMED);
1165 }
1166 
1167 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1168 
1169 /*
1170  * Snarf new data if our flush block was re-written
1171  */
1172 static void
1173 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1174 {
1175 	int nblks;
1176 	boolean_t mapin;
1177 
1178 	if (IS_WRITE_BARRIER(vdp, bp))
1179 		return; /* write was a flush write */
1180 
1181 	mapin = B_FALSE;
1182 	nblks = bp->b_bcount >> DEV_BSHIFT;
1183 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1184 		xdf_fbrewrites++;
1185 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1186 			mapin = B_TRUE;
1187 			bp_mapin(bp);
1188 		}
1189 		bcopy(bp->b_un.b_addr +
1190 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1191 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1192 		if (mapin)
1193 			bp_mapout(bp);
1194 	}
1195 }
1196 
1197 static void
1198 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1199 {
1200 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1201 	v_req_t *vreq = gs->vreq;
1202 	buf_t *bp = vreq->v_buf;
1203 
1204 	gs_free(vdp, gs);
1205 	if (bioerr)
1206 		bioerror(bp, bioerr);
1207 	vreq->v_nslots--;
1208 	if (vreq->v_nslots != 0)
1209 		return;
1210 
1211 	XDF_UPDATE_IO_STAT(vdp, bp);
1212 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1213 
1214 	if (IS_ERROR(bp))
1215 		bp->b_resid = bp->b_bcount;
1216 
1217 	vreq_free(vdp, vreq);
1218 	biodone(bp);
1219 }
1220 
1221 /*
1222  * return value of xdf_prepare_rreq()
1223  * used in xdf_iostart()
1224  */
1225 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1226 #define	XF_COMP		1 /* no more I/O left in buf */
1227 
1228 static void
1229 xdf_iostart(xdf_t *vdp)
1230 {
1231 	xendev_ring_t *xbr;
1232 	struct buf *bp;
1233 	blkif_request_t *rreq;
1234 	int retval;
1235 	int rreqready = 0;
1236 
1237 	xbr = vdp->xdf_xb_ring;
1238 
1239 	/*
1240 	 * populate the ring request(s)
1241 	 *
1242 	 * loop until there is no buf to transfer or no free slot
1243 	 * available in I/O ring
1244 	 */
1245 	for (;;) {
1246 		mutex_enter(&vdp->xdf_dev_lk);
1247 
1248 		if (vdp->xdf_status != XD_READY)
1249 			break;
1250 
1251 		/* active buf queue empty? */
1252 		if ((bp = vdp->xdf_f_act) == NULL)
1253 			break;
1254 
1255 		/* try to grab a vreq for this bp */
1256 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1257 				break;
1258 		/* alloc DMA/GTE resources */
1259 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1260 			break;
1261 
1262 		/* get next blkif_request in the ring */
1263 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1264 			break;
1265 		bzero(rreq, sizeof (blkif_request_t));
1266 
1267 		/* populate blkif_request with this buf */
1268 		rreqready++;
1269 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1270 		if (retval == XF_COMP) {
1271 			/* finish this bp, switch to next one */
1272 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1273 			vdp->xdf_f_act = bp->av_forw;
1274 			bp->av_forw = NULL;
1275 		}
1276 
1277 		mutex_exit(&vdp->xdf_dev_lk);
1278 	}
1279 
1280 	/*
1281 	 * Send the request(s) to the backend
1282 	 */
1283 	if (rreqready) {
1284 		if (xvdi_ring_push_request(xbr)) {
1285 			DPRINTF(IO_DBG, ("xdf_iostart: "
1286 			    "sent request(s) to backend\n"));
1287 			xvdi_notify_oe(vdp->xdf_dip);
1288 		}
1289 	}
1290 
1291 	mutex_exit(&vdp->xdf_dev_lk);
1292 }
1293 
1294 /*
1295  * populate a single blkif_request_t w/ a buf
1296  */
1297 static int
1298 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1299 {
1300 	int		rval;
1301 	grant_ref_t	gr;
1302 	uint8_t		fsect, lsect;
1303 	size_t		bcnt;
1304 	paddr_t		dma_addr;
1305 	off_t		blk_off;
1306 	dev_info_t	*dip = vdp->xdf_dip;
1307 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1308 	v_req_t		*vreq = BP2VREQ(bp);
1309 	uint64_t	blkno = vreq->v_blkno;
1310 	uint_t		ndmacs = vreq->v_ndmacs;
1311 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1312 	int		seg = 0;
1313 	int		isread = IS_READ(bp);
1314 
1315 	if (isread)
1316 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1317 	else {
1318 		switch (vreq->v_flush_diskcache) {
1319 		case FLUSH_DISKCACHE:
1320 			ddi_put8(acchdl, &rreq->operation,
1321 			    BLKIF_OP_FLUSH_DISKCACHE);
1322 			ddi_put16(acchdl, &rreq->handle, vdev);
1323 			ddi_put64(acchdl, &rreq->id,
1324 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1325 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1326 			return (XF_COMP);
1327 		case WRITE_BARRIER:
1328 			ddi_put8(acchdl, &rreq->operation,
1329 			    BLKIF_OP_WRITE_BARRIER);
1330 			break;
1331 		default:
1332 			if (!vdp->xdf_wce)
1333 				ddi_put8(acchdl, &rreq->operation,
1334 				    BLKIF_OP_WRITE_BARRIER);
1335 			else
1336 				ddi_put8(acchdl, &rreq->operation,
1337 				    BLKIF_OP_WRITE);
1338 			break;
1339 		}
1340 	}
1341 
1342 	ddi_put16(acchdl, &rreq->handle, vdev);
1343 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1344 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1345 
1346 	/*
1347 	 * loop until all segments are populated or no more dma cookie in buf
1348 	 */
1349 	for (;;) {
1350 	/*
1351 	 * Each segment of a blkif request can transfer up to
1352 	 * one 4K page of data.
1353 	 */
1354 		bcnt = vreq->v_dmac.dmac_size;
1355 		ASSERT(bcnt <= PAGESIZE);
1356 		ASSERT((bcnt % XB_BSIZE) == 0);
1357 		dma_addr = vreq->v_dmac.dmac_laddress;
1358 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1359 		ASSERT((blk_off & XB_BMASK) == 0);
1360 		fsect = blk_off >> XB_BSHIFT;
1361 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1362 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1363 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1364 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1365 		    seg, vreq->v_dmac.dmac_size, blk_off));
1366 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1367 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1368 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1369 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1370 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1371 		    "\n", seg, fsect, lsect, gr, dma_addr));
1372 
1373 		blkno += (bcnt >> XB_BSHIFT);
1374 		seg++;
1375 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1376 		if (--ndmacs) {
1377 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1378 			continue;
1379 		}
1380 
1381 		vreq->v_status = VREQ_DMAWIN_DONE;
1382 		vreq->v_blkno = blkno;
1383 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1384 			/* last win */
1385 			rval = XF_COMP;
1386 		else
1387 			rval = XF_PARTIAL;
1388 		break;
1389 	}
1390 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1391 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1392 	    rreq->id));
1393 
1394 	return (rval);
1395 }
1396 
1397 #define	XDF_QSEC	50000	/* .005 second */
1398 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1399 
1400 static int
1401 xdf_drain_io(xdf_t *vdp)
1402 {
1403 	int pollc, rval;
1404 	xendev_ring_t *xbr;
1405 
1406 	if (xdfdebug & SUSRES_DBG)
1407 		xen_printf("xdf_drain_io: start\n");
1408 
1409 	mutex_enter(&vdp->xdf_dev_lk);
1410 
1411 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1412 		goto out;
1413 
1414 	rval = 0;
1415 	xbr = vdp->xdf_xb_ring;
1416 	ASSERT(xbr != NULL);
1417 
1418 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1419 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1420 			mutex_exit(&vdp->xdf_dev_lk);
1421 			(void) xdf_intr((caddr_t)vdp);
1422 			mutex_enter(&vdp->xdf_dev_lk);
1423 		}
1424 		if (!xvdi_ring_has_incomp_request(xbr))
1425 			goto out;
1426 
1427 		(void) HYPERVISOR_yield();
1428 		/*
1429 		 * file-backed devices can be slow
1430 		 */
1431 		drv_usecwait(XDF_QSEC << pollc);
1432 	}
1433 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1434 	rval = EIO;
1435 out:
1436 	mutex_exit(&vdp->xdf_dev_lk);
1437 	if (xdfdebug & SUSRES_DBG)
1438 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1439 	return (rval);
1440 }
1441 
1442 /* ARGSUSED5 */
1443 static int
1444 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1445     diskaddr_t start, size_t reqlen, void *tg_cookie)
1446 {
1447 	xdf_t *vdp;
1448 	struct buf *bp;
1449 	int err = 0;
1450 
1451 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1452 	if (vdp == NULL)
1453 		return (ENXIO);
1454 
1455 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1456 		return (EINVAL);
1457 
1458 	bp = getrbuf(KM_SLEEP);
1459 	if (cmd == TG_READ)
1460 		bp->b_flags = B_BUSY | B_READ;
1461 	else
1462 		bp->b_flags = B_BUSY | B_WRITE;
1463 	bp->b_un.b_addr = bufp;
1464 	bp->b_bcount = reqlen;
1465 	bp->b_resid = 0;
1466 	bp->b_blkno = start;
1467 	bp->av_forw = NULL;
1468 	bp->av_back = NULL;
1469 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1470 
1471 	mutex_enter(&vdp->xdf_dev_lk);
1472 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1473 	if (vdp->xdf_f_act == NULL) {
1474 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1475 	} else {
1476 		vdp->xdf_l_act->av_forw = bp;
1477 		vdp->xdf_l_act = bp;
1478 	}
1479 	mutex_exit(&vdp->xdf_dev_lk);
1480 	xdf_iostart(vdp);
1481 	err = biowait(bp);
1482 
1483 	ASSERT(bp->b_flags & B_DONE);
1484 
1485 	freerbuf(bp);
1486 	return (err);
1487 }
1488 
1489 /*
1490  * synthetic geometry
1491  */
1492 #define	XDF_NSECTS	256
1493 #define	XDF_NHEADS	16
1494 
1495 static int
1496 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1497 {
1498 	xdf_t *vdp;
1499 
1500 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1501 
1502 	if (vdp == NULL)
1503 		return (ENXIO);
1504 
1505 	mutex_enter(&vdp->xdf_dev_lk);
1506 	*capp = vdp->xdf_xdev_nblocks;
1507 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1508 	mutex_exit(&vdp->xdf_dev_lk);
1509 	return (0);
1510 }
1511 
1512 static int
1513 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1514 {
1515 	xdf_t *vdp;
1516 	uint_t ncyl;
1517 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1518 
1519 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1520 
1521 	if (vdp == NULL)
1522 		return (ENXIO);
1523 
1524 	ncyl = vdp->xdf_xdev_nblocks / spc;
1525 
1526 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1527 	geomp->g_acyl = 0;
1528 	geomp->g_nhead = XDF_NHEADS;
1529 	geomp->g_secsize = XB_BSIZE;
1530 	geomp->g_nsect = XDF_NSECTS;
1531 	geomp->g_intrlv = 0;
1532 	geomp->g_rpm = 7200;
1533 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1534 	return (0);
1535 }
1536 
/*
 * No real HBA, no geometry available from it
 *
 * Neither argument is used; the call always fails with EINVAL.
 */
/*ARGSUSED*/
static int
xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
{
	return (EINVAL);
}
1546 
1547 static int
1548 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1549 {
1550 	xdf_t *vdp;
1551 
1552 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1553 		return (ENXIO);
1554 
1555 	if (XD_IS_RO(vdp))
1556 		tgattributep->media_is_writable = 0;
1557 	else
1558 		tgattributep->media_is_writable = 1;
1559 	return (0);
1560 }
1561 
1562 /* ARGSUSED3 */
1563 static int
1564 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1565 {
1566 	switch (cmd) {
1567 	case TG_GETPHYGEOM:
1568 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1569 	case TG_GETVIRTGEOM:
1570 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1571 	case TG_GETCAPACITY:
1572 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1573 	case TG_GETBLOCKSIZE:
1574 		*(uint32_t *)arg = XB_BSIZE;
1575 		return (0);
1576 	case TG_GETATTR:
1577 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1578 	default:
1579 		return (ENOTTY);
1580 	}
1581 }
1582 
1583 /*
1584  * Kick-off connect process
1585  * Status should be XD_UNKNOWN or XD_CLOSED
1586  * On success, status will be changed to XD_INIT
1587  * On error, status won't be changed
1588  */
1589 static int
1590 xdf_start_connect(xdf_t *vdp)
1591 {
1592 	char *xsnode;
1593 	grant_ref_t gref;
1594 	xenbus_transaction_t xbt;
1595 	int rv;
1596 	dev_info_t *dip = vdp->xdf_dip;
1597 
1598 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1599 		goto errout;
1600 
1601 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1602 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1603 		    ddi_get_name_addr(dip));
1604 		goto errout;
1605 	}
1606 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1607 	    DDI_SUCCESS) {
1608 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1609 		    "failed to add intr handler", ddi_get_name_addr(dip));
1610 		goto errout1;
1611 	}
1612 
1613 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1614 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1615 	    DDI_SUCCESS) {
1616 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1617 		    ddi_get_name_addr(dip));
1618 		goto errout2;
1619 	}
1620 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1621 
1622 	/*
1623 	 * Write into xenstore the info needed by backend
1624 	 */
1625 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1626 		cmn_err(CE_WARN, "xdf@%s: "
1627 		    "failed to get xenstore node path",
1628 		    ddi_get_name_addr(dip));
1629 		goto fail_trans;
1630 	}
1631 trans_retry:
1632 	if (xenbus_transaction_start(&xbt)) {
1633 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1634 		    ddi_get_name_addr(dip));
1635 		xvdi_fatal_error(dip, EIO, "transaction start");
1636 		goto fail_trans;
1637 	}
1638 
1639 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1640 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1641 		    ddi_get_name_addr(dip));
1642 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1643 		goto abort_trans;
1644 	}
1645 
1646 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1647 	    xvdi_get_evtchn(dip))) {
1648 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1649 		    ddi_get_name_addr(dip));
1650 		xvdi_fatal_error(dip, rv, "writing event-channel");
1651 		goto abort_trans;
1652 	}
1653 
1654 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1655 		cmn_err(CE_WARN, "xdf@%s: "
1656 		    "failed to switch state to XenbusStateInitialised",
1657 		    ddi_get_name_addr(dip));
1658 		xvdi_fatal_error(dip, rv, "writing state");
1659 		goto abort_trans;
1660 	}
1661 
1662 	/* kick-off connect process */
1663 	if (rv = xenbus_transaction_end(xbt, 0)) {
1664 		if (rv == EAGAIN)
1665 			goto trans_retry;
1666 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1667 		    ddi_get_name_addr(dip));
1668 		xvdi_fatal_error(dip, rv, "completing transaction");
1669 		goto fail_trans;
1670 	}
1671 
1672 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1673 	mutex_enter(&vdp->xdf_dev_lk);
1674 	vdp->xdf_status = XD_INIT;
1675 	mutex_exit(&vdp->xdf_dev_lk);
1676 
1677 	return (DDI_SUCCESS);
1678 
1679 abort_trans:
1680 	(void) xenbus_transaction_end(xbt, 1);
1681 fail_trans:
1682 	xvdi_free_ring(vdp->xdf_xb_ring);
1683 errout2:
1684 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1685 errout1:
1686 	xvdi_free_evtchn(dip);
1687 errout:
1688 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1689 	    ddi_get_name_addr(dip));
1690 	return (DDI_FAILURE);
1691 }
1692 
1693 /*
1694  * Kick-off disconnect process
1695  * Status won't be changed
1696  */
1697 static int
1698 xdf_start_disconnect(xdf_t *vdp)
1699 {
1700 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1701 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1702 		    ddi_get_name_addr(vdp->xdf_dip));
1703 		return (DDI_FAILURE);
1704 	}
1705 
1706 	return (DDI_SUCCESS);
1707 }
1708 
1709 int
1710 xdf_get_flush_block(xdf_t *vdp)
1711 {
1712 	/*
1713 	 * Get a DEV_BSIZE aligned bufer
1714 	 */
1715 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1716 	vdp->xdf_cache_flush_block =
1717 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1718 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1719 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1720 		return (DDI_FAILURE);
1721 	return (DDI_SUCCESS);
1722 }
1723 
1724 /*
1725  * Finish other initialization after we've connected to backend
1726  * Status should be XD_INIT before calling this routine
1727  * On success, status should be changed to XD_READY
1728  * On error, status should stay XD_INIT
1729  */
1730 static int
1731 xdf_post_connect(xdf_t *vdp)
1732 {
1733 	int rv;
1734 	uint_t len;
1735 	char *type;
1736 	char *barrier;
1737 	dev_info_t *devi = vdp->xdf_dip;
1738 
1739 	/*
1740 	 * Determine if feature barrier is supported by backend
1741 	 */
1742 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1743 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1744 		vdp->xdf_feature_barrier = 1;
1745 		kmem_free(barrier, len);
1746 	} else {
1747 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1748 		    ddi_get_name_addr(vdp->xdf_dip));
1749 		vdp->xdf_feature_barrier = 0;
1750 	}
1751 
1752 	/* probe backend */
1753 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1754 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1755 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1756 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1757 		    "cannot read backend info", ddi_get_name_addr(devi));
1758 		xvdi_fatal_error(devi, rv, "reading backend info");
1759 		return (DDI_FAILURE);
1760 	}
1761 
1762 	/* fix disk type */
1763 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1764 	    (void **)&type, &len) != 0) {
1765 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1766 		    "cannot read device-type", ddi_get_name_addr(devi));
1767 		xvdi_fatal_error(devi, rv, "reading device-type");
1768 		return (DDI_FAILURE);
1769 	}
1770 	if (strcmp(type, "cdrom") == 0)
1771 		vdp->xdf_xdev_info |= VDISK_CDROM;
1772 	kmem_free(type, len);
1773 
1774 	/*
1775 	 * We've created all the minor nodes via cmlb_attach() using default
1776 	 * value in xdf_attach() to make it possbile to block in xdf_open(),
1777 	 * in case there's anyone (say, booting thread) ever trying to open
1778 	 * it before connected to backend. We will refresh all those minor
1779 	 * nodes w/ latest info we've got now when we are almost connected.
1780 	 *
1781 	 * Don't do this when xdf is already opened by someone (could happen
1782 	 * during resume), for that cmlb_attach() will invalid the label info
1783 	 * and confuse those who has already opened the node, which is bad.
1784 	 */
1785 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1786 		/* re-init cmlb w/ latest info we got from backend */
1787 		if (cmlb_attach(devi, &xdf_lb_ops,
1788 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1789 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1790 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1791 		    vdp->xdf_vd_lbl, NULL) != 0) {
1792 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1793 			    ddi_get_name_addr(devi));
1794 			return (DDI_FAILURE);
1795 		}
1796 	}
1797 
1798 	/* mark vbd is ready for I/O */
1799 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1800 	mutex_enter(&vdp->xdf_dev_lk);
1801 	vdp->xdf_status = XD_READY;
1802 	mutex_exit(&vdp->xdf_dev_lk);
1803 	/*
1804 	 * If backend has feature-barrier, see if it supports disk
1805 	 * cache flush op.
1806 	 */
1807 	vdp->xdf_flush_supported = 0;
1808 	if (vdp->xdf_feature_barrier) {
1809 		/*
1810 		 * Pretend we already know flush is supported so probe
1811 		 * will attempt the correct op.
1812 		 */
1813 		vdp->xdf_flush_supported = 1;
1814 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1815 			vdp->xdf_flush_supported = 1;
1816 		} else {
1817 			vdp->xdf_flush_supported = 0;
1818 			/*
1819 			 * If the other end does not support the cache flush op
1820 			 * then we must use a barrier-write to force disk
1821 			 * cache flushing.  Barrier writes require that a data
1822 			 * block actually be written.
1823 			 * Cache a block to barrier-write when we are
1824 			 * asked to perform a flush.
1825 			 * XXX - would it be better to just copy 1 block
1826 			 * (512 bytes) from whatever write we did last
1827 			 * and rewrite that block?
1828 			 */
1829 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1830 				return (DDI_FAILURE);
1831 		}
1832 	}
1833 
1834 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1835 	    (uint64_t)vdp->xdf_xdev_nblocks);
1836 
1837 	return (DDI_SUCCESS);
1838 }
1839 
1840 /*
1841  * Finish other uninitialization after we've disconnected from backend
1842  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1843  */
1844 static void
1845 xdf_post_disconnect(xdf_t *vdp)
1846 {
1847 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1848 	xvdi_free_evtchn(vdp->xdf_dip);
1849 	xvdi_free_ring(vdp->xdf_xb_ring);
1850 	vdp->xdf_xb_ring = NULL;
1851 	vdp->xdf_xb_ring_hdl = NULL;
1852 	vdp->xdf_peer = (domid_t)-1;
1853 
1854 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1855 	mutex_enter(&vdp->xdf_dev_lk);
1856 	vdp->xdf_status = XD_CLOSED;
1857 	mutex_exit(&vdp->xdf_dev_lk);
1858 }
1859 
/*
 * Callback run when the other end (the backend) changes its xenbus state.
 *
 * impl_data points at the new XenbusState; the soft state is obtained
 * from the dip's driver-private data.  The whole state handling runs
 * under xdf_cb_lk; xdf_dev_lk is taken only around the individual
 * status updates and the final cv_broadcast().
 *
 * Transitions rejected by xdf_check_state_transition() are ignored.
 * After the lock is dropped, I/O is (re)started if the device ended up
 * XD_READY, and the cmlb handle is detached and freed if it ended up
 * XD_CLOSED without the backend having closed while the device was open.
 */
/*ARGSUSED*/
static void
xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
{
	XenbusState new_state = *(XenbusState *)impl_data;
	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
	boolean_t unexpect_die = B_FALSE;
	int status;

	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
	    ddi_get_name_addr(dip), new_state));

	mutex_enter(&vdp->xdf_cb_lk);

	/* nothing to do for no-op or unexpected transitions */
	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
		mutex_exit(&vdp->xdf_cb_lk);
		return;
	}

	switch (new_state) {
	case XenbusStateInitialising:
		ASSERT(vdp->xdf_status == XD_CLOSED);
		/*
		 * backend recovered from a previous failure,
		 * kick-off connect process again
		 */
		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
			cmn_err(CE_WARN, "xdf@%s:"
			    " failed to start reconnecting to backend",
			    ddi_get_name_addr(dip));
		}
		break;
	case XenbusStateConnected:
		ASSERT(vdp->xdf_status == XD_INIT);
		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
		/* finish final init after connect */
		if (xdf_post_connect(vdp) != DDI_SUCCESS)
			(void) xdf_start_disconnect(vdp);
		break;
	case XenbusStateClosing:
		if (vdp->xdf_status == XD_READY) {
			mutex_enter(&vdp->xdf_dev_lk);
			/* refuse to go away while any partition is open */
			if (xdf_isopen(vdp, -1)) {
				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
				    "still in use", ddi_get_name_addr(dip));
				mutex_exit(&vdp->xdf_dev_lk);
				break;
			} else {
				vdp->xdf_status = XD_CLOSING;
			}
			mutex_exit(&vdp->xdf_dev_lk);
		}
		(void) xdf_start_disconnect(vdp);
		break;
	case XenbusStateClosed:
		/* first check if BE closed unexpectedly */
		mutex_enter(&vdp->xdf_dev_lk);
		if (xdf_isopen(vdp, -1)) {
			unexpect_die = B_TRUE;
			unexpectedie(vdp);
			cmn_err(CE_WARN, "xdf@%s: backend closed, "
			    "reconnecting...", ddi_get_name_addr(dip));
		}
		mutex_exit(&vdp->xdf_dev_lk);

		if (vdp->xdf_status == XD_READY) {
			mutex_enter(&vdp->xdf_dev_lk);
			vdp->xdf_status = XD_CLOSING;
			mutex_exit(&vdp->xdf_dev_lk);

#ifdef	DOMU_BACKEND
			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
#endif

			xdf_post_disconnect(vdp);
			(void) xvdi_switch_state(dip, XBT_NULL,
			    XenbusStateClosed);
		} else if ((vdp->xdf_status == XD_INIT) ||
		    (vdp->xdf_status == XD_CLOSING)) {
			xdf_post_disconnect(vdp);
		} else {
			/* nothing was set up; just record the closed state */
			mutex_enter(&vdp->xdf_dev_lk);
			vdp->xdf_status = XD_CLOSED;
			mutex_exit(&vdp->xdf_dev_lk);
		}
	}

	/* notify anybody waiting for oe state change */
	mutex_enter(&vdp->xdf_dev_lk);
	cv_broadcast(&vdp->xdf_dev_cv);
	mutex_exit(&vdp->xdf_dev_lk);

	/* sample the status before xdf_cb_lk is released */
	status = vdp->xdf_status;
	mutex_exit(&vdp->xdf_cb_lk);

	if (status == XD_READY) {
		xdf_iostart(vdp);
	} else if ((status == XD_CLOSED) && !unexpect_die) {
		/* interface is closed successfully, remove all minor nodes */
		cmlb_detach(vdp->xdf_vd_lbl, NULL);
		cmlb_free_handle(&vdp->xdf_vd_lbl);
	}
}
1963 
1964 /* check if partition is open, -1 - check all partitions on the disk */
1965 static boolean_t
1966 xdf_isopen(xdf_t *vdp, int partition)
1967 {
1968 	int i;
1969 	ulong_t parbit;
1970 	boolean_t rval = B_FALSE;
1971 
1972 	if (partition == -1)
1973 		parbit = (ulong_t)-1;
1974 	else
1975 		parbit = 1 << partition;
1976 
1977 	for (i = 0; i < OTYPCNT; i++) {
1978 		if (vdp->xdf_vd_open[i] & parbit)
1979 			rval = B_TRUE;
1980 	}
1981 
1982 	return (rval);
1983 }
1984 
1985 /*
1986  * Xdf_check_state_transition will check the XenbusState change to see
1987  * if the change is a valid transition or not.
1988  * The new state is written by backend domain, or by running xenstore-write
1989  * to change it manually in dom0
1990  */
1991 static int
1992 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
1993 {
1994 	int status;
1995 	int stcheck;
1996 #define	STOK	0 /* need further process */
1997 #define	STNOP	1 /* no action need taking */
1998 #define	STBUG	2 /* unexpected state change, could be a bug */
1999 
2000 	status = vdp->xdf_status;
2001 	stcheck = STOK;
2002 
2003 	switch (status) {
2004 	case XD_UNKNOWN:
2005 		if ((oestate == XenbusStateUnknown)		||
2006 		    (oestate == XenbusStateConnected))
2007 			stcheck = STBUG;
2008 		else if ((oestate == XenbusStateInitialising)	||
2009 		    (oestate == XenbusStateInitWait)		||
2010 		    (oestate == XenbusStateInitialised))
2011 			stcheck = STNOP;
2012 		break;
2013 	case XD_INIT:
2014 		if (oestate == XenbusStateUnknown)
2015 			stcheck = STBUG;
2016 		else if ((oestate == XenbusStateInitialising)	||
2017 		    (oestate == XenbusStateInitWait)		||
2018 		    (oestate == XenbusStateInitialised))
2019 			stcheck = STNOP;
2020 		break;
2021 	case XD_READY:
2022 		if ((oestate == XenbusStateUnknown)		||
2023 		    (oestate == XenbusStateInitialising)	||
2024 		    (oestate == XenbusStateInitWait)		||
2025 		    (oestate == XenbusStateInitialised))
2026 			stcheck = STBUG;
2027 		else if (oestate == XenbusStateConnected)
2028 			stcheck = STNOP;
2029 		break;
2030 	case XD_CLOSING:
2031 		if ((oestate == XenbusStateUnknown)		||
2032 		    (oestate == XenbusStateInitialising)	||
2033 		    (oestate == XenbusStateInitWait)		||
2034 		    (oestate == XenbusStateInitialised)		||
2035 		    (oestate == XenbusStateConnected))
2036 			stcheck = STBUG;
2037 		else if (oestate == XenbusStateClosing)
2038 			stcheck = STNOP;
2039 		break;
2040 	case XD_CLOSED:
2041 		if ((oestate == XenbusStateUnknown)		||
2042 		    (oestate == XenbusStateConnected))
2043 			stcheck = STBUG;
2044 		else if ((oestate == XenbusStateInitWait)	||
2045 		    (oestate == XenbusStateInitialised)		||
2046 		    (oestate == XenbusStateClosing)		||
2047 		    (oestate == XenbusStateClosed))
2048 			stcheck = STNOP;
2049 		break;
2050 	case XD_SUSPEND:
2051 	default:
2052 			stcheck = STBUG;
2053 	}
2054 
2055 	if (stcheck == STOK)
2056 		return (DDI_SUCCESS);
2057 
2058 	if (stcheck == STBUG)
2059 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2060 		    "state change to %d!, when status is %d",
2061 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2062 
2063 	return (DDI_FAILURE);
2064 }
2065 
2066 static int
2067 xdf_connect(xdf_t *vdp, boolean_t wait)
2068 {
2069 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2070 	while (vdp->xdf_status != XD_READY) {
2071 		if (!wait || (vdp->xdf_status > XD_READY))
2072 			break;
2073 
2074 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2075 			break;
2076 	}
2077 
2078 	return (vdp->xdf_status);
2079 }
2080 
2081 /*
2082  * callback func when DMA/GTE resources is available
2083  *
2084  * Note: we only register one callback function to grant table subsystem
2085  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2086  */
2087 static int
2088 xdf_dmacallback(caddr_t arg)
2089 {
2090 	xdf_t *vdp = (xdf_t *)arg;
2091 	ASSERT(vdp != NULL);
2092 
2093 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2094 	    ddi_get_name_addr(vdp->xdf_dip)));
2095 
2096 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2097 	return (DDI_DMA_CALLBACK_DONE);
2098 }
2099 
2100 static uint_t
2101 xdf_iorestart(caddr_t arg)
2102 {
2103 	xdf_t *vdp = (xdf_t *)arg;
2104 
2105 	ASSERT(vdp != NULL);
2106 
2107 	mutex_enter(&vdp->xdf_dev_lk);
2108 	ASSERT(ISDMACBON(vdp));
2109 	SETDMACBOFF(vdp);
2110 	mutex_exit(&vdp->xdf_dev_lk);
2111 
2112 	xdf_iostart(vdp);
2113 
2114 	return (DDI_INTR_CLAIMED);
2115 }
2116 
2117 static void
2118 xdf_timeout_handler(void *arg)
2119 {
2120 	xdf_t *vdp = arg;
2121 
2122 	mutex_enter(&vdp->xdf_dev_lk);
2123 	vdp->xdf_timeout_id = 0;
2124 	mutex_exit(&vdp->xdf_dev_lk);
2125 
2126 	/* new timeout thread could be re-scheduled */
2127 	xdf_iostart(vdp);
2128 }
2129 
2130 /*
2131  * Alloc a vreq for this bp
2132  * bp->av_back contains the pointer to the vreq upon return
2133  */
2134 static v_req_t *
2135 vreq_get(xdf_t *vdp, buf_t *bp)
2136 {
2137 	v_req_t *vreq = NULL;
2138 
2139 	ASSERT(BP2VREQ(bp) == NULL);
2140 
2141 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2142 	if (vreq == NULL) {
2143 		if (vdp->xdf_timeout_id == 0)
2144 			/* restart I/O after one second */
2145 			vdp->xdf_timeout_id =
2146 			    timeout(xdf_timeout_handler, vdp, hz);
2147 		return (NULL);
2148 	}
2149 	bzero(vreq, sizeof (v_req_t));
2150 
2151 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2152 	bp->av_back = (buf_t *)vreq;
2153 	vreq->v_buf = bp;
2154 	vreq->v_status = VREQ_INIT;
2155 	/* init of other fields in vreq is up to the caller */
2156 
2157 	return (vreq);
2158 }
2159 
2160 static void
2161 vreq_free(xdf_t *vdp, v_req_t *vreq)
2162 {
2163 	buf_t *bp = vreq->v_buf;
2164 
2165 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2166 
2167 	switch (vreq->v_status) {
2168 	case VREQ_DMAWIN_DONE:
2169 	case VREQ_GS_ALLOCED:
2170 	case VREQ_DMABUF_BOUND:
2171 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2172 		/*FALLTHRU*/
2173 	case VREQ_DMAMEM_ALLOCED:
2174 		if (!ALIGNED_XFER(bp)) {
2175 			ASSERT(vreq->v_abuf != NULL);
2176 			if (!IS_ERROR(bp) && IS_READ(bp))
2177 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2178 				    bp->b_bcount);
2179 			ddi_dma_mem_free(&vreq->v_align);
2180 		}
2181 		/*FALLTHRU*/
2182 	case VREQ_MEMDMAHDL_ALLOCED:
2183 		if (!ALIGNED_XFER(bp))
2184 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2185 		/*FALLTHRU*/
2186 	case VREQ_DMAHDL_ALLOCED:
2187 		ddi_dma_free_handle(&vreq->v_dmahdl);
2188 		break;
2189 	default:
2190 		break;
2191 	}
2192 	vreq->v_buf->av_back = NULL;
2193 	kmem_cache_free(xdf_vreq_cache, vreq);
2194 }
2195 
2196 /*
2197  * Initalize the DMA and grant table resources for the buf
2198  */
2199 static int
2200 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2201 {
2202 	int rc;
2203 	ddi_dma_attr_t dmaattr;
2204 	uint_t ndcs, ndws;
2205 	ddi_dma_handle_t dh;
2206 	ddi_dma_handle_t mdh;
2207 	ddi_dma_cookie_t dc;
2208 	ddi_acc_handle_t abh;
2209 	caddr_t	aba;
2210 	ge_slot_t *gs;
2211 	size_t bufsz;
2212 	off_t off;
2213 	size_t sz;
2214 	buf_t *bp = vreq->v_buf;
2215 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2216 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2217 
2218 	switch (vreq->v_status) {
2219 	case VREQ_INIT:
2220 		if (IS_FLUSH_DISKCACHE(bp)) {
2221 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2222 				DPRINTF(DMA_DBG, (
2223 				    "xdf@%s: get ge_slotfailed\n",
2224 				    ddi_get_name_addr(vdp->xdf_dip)));
2225 				return (DDI_FAILURE);
2226 			}
2227 			vreq->v_blkno = 0;
2228 			vreq->v_nslots = 1;
2229 			vreq->v_gs = gs;
2230 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2231 			gs->vreq = vreq;
2232 			return (DDI_SUCCESS);
2233 		}
2234 
2235 		if (IS_WRITE_BARRIER(vdp, bp))
2236 			vreq->v_flush_diskcache = WRITE_BARRIER;
2237 		vreq->v_blkno = bp->b_blkno +
2238 		    (diskaddr_t)(uintptr_t)bp->b_private;
2239 		bp->b_private = NULL;
2240 		/* See if we wrote new data to our flush block */
2241 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2242 			check_fbwrite(vdp, bp, vreq->v_blkno);
2243 		vreq->v_status = VREQ_INIT_DONE;
2244 		/*FALLTHRU*/
2245 
2246 	case VREQ_INIT_DONE:
2247 		/*
2248 		 * alloc DMA handle
2249 		 */
2250 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2251 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2252 		if (rc != DDI_SUCCESS) {
2253 			SETDMACBON(vdp);
2254 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2255 			    ddi_get_name_addr(vdp->xdf_dip)));
2256 			return (DDI_FAILURE);
2257 		}
2258 
2259 		vreq->v_dmahdl = dh;
2260 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2261 		/*FALLTHRU*/
2262 
2263 	case VREQ_DMAHDL_ALLOCED:
2264 		/*
2265 		 * alloc dma handle for 512-byte aligned buf
2266 		 */
2267 		if (!ALIGNED_XFER(bp)) {
2268 			/*
2269 			 * XXPV: we need to temporarily enlarge the seg
2270 			 * boundary and s/g length to work round CR6381968
2271 			 */
2272 			dmaattr = xb_dma_attr;
2273 			dmaattr.dma_attr_seg = (uint64_t)-1;
2274 			dmaattr.dma_attr_sgllen = INT_MAX;
2275 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2276 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2277 			if (rc != DDI_SUCCESS) {
2278 				SETDMACBON(vdp);
2279 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2280 				    "handle alloc failed\n",
2281 				    ddi_get_name_addr(vdp->xdf_dip)));
2282 				return (DDI_FAILURE);
2283 			}
2284 			vreq->v_memdmahdl = mdh;
2285 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2286 		}
2287 		/*FALLTHRU*/
2288 
2289 	case VREQ_MEMDMAHDL_ALLOCED:
2290 		/*
2291 		 * alloc 512-byte aligned buf
2292 		 */
2293 		if (!ALIGNED_XFER(bp)) {
2294 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2295 				bp_mapin(bp);
2296 
2297 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2298 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2299 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2300 			    &aba, &bufsz, &abh);
2301 			if (rc != DDI_SUCCESS) {
2302 				SETDMACBON(vdp);
2303 				DPRINTF(DMA_DBG, (
2304 				    "xdf@%s: DMA mem allocation failed\n",
2305 				    ddi_get_name_addr(vdp->xdf_dip)));
2306 				return (DDI_FAILURE);
2307 			}
2308 
2309 			vreq->v_abuf = aba;
2310 			vreq->v_align = abh;
2311 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2312 
2313 			ASSERT(bufsz >= bp->b_bcount);
2314 			if (!IS_READ(bp))
2315 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2316 				    bp->b_bcount);
2317 		}
2318 		/*FALLTHRU*/
2319 
2320 	case VREQ_DMAMEM_ALLOCED:
2321 		/*
2322 		 * dma bind
2323 		 */
2324 		if (ALIGNED_XFER(bp)) {
2325 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2326 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2327 			    &dc, &ndcs);
2328 		} else {
2329 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2330 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2331 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2332 		}
2333 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2334 			/* get num of dma windows */
2335 			if (rc == DDI_DMA_PARTIAL_MAP) {
2336 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2337 				ASSERT(rc == DDI_SUCCESS);
2338 			} else {
2339 				ndws = 1;
2340 			}
2341 		} else {
2342 			SETDMACBON(vdp);
2343 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2344 			    ddi_get_name_addr(vdp->xdf_dip)));
2345 			return (DDI_FAILURE);
2346 		}
2347 
2348 		vreq->v_dmac = dc;
2349 		vreq->v_dmaw = 0;
2350 		vreq->v_ndmacs = ndcs;
2351 		vreq->v_ndmaws = ndws;
2352 		vreq->v_nslots = ndws;
2353 		vreq->v_status = VREQ_DMABUF_BOUND;
2354 		/*FALLTHRU*/
2355 
2356 	case VREQ_DMABUF_BOUND:
2357 		/*
2358 		 * get ge_slot, callback is set upon failure from gs_get(),
2359 		 * if not set previously
2360 		 */
2361 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2362 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2363 			    ddi_get_name_addr(vdp->xdf_dip)));
2364 			return (DDI_FAILURE);
2365 		}
2366 
2367 		vreq->v_gs = gs;
2368 		gs->vreq = vreq;
2369 		vreq->v_status = VREQ_GS_ALLOCED;
2370 		break;
2371 
2372 	case VREQ_GS_ALLOCED:
2373 		/* nothing need to be done */
2374 		break;
2375 
2376 	case VREQ_DMAWIN_DONE:
2377 		/*
2378 		 * move to the next dma window
2379 		 */
2380 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2381 
2382 		/* get a ge_slot for this DMA window */
2383 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2384 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2385 			    ddi_get_name_addr(vdp->xdf_dip)));
2386 			return (DDI_FAILURE);
2387 		}
2388 
2389 		vreq->v_gs = gs;
2390 		gs->vreq = vreq;
2391 		vreq->v_dmaw++;
2392 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2393 		    &vreq->v_dmac, &vreq->v_ndmacs);
2394 		ASSERT(rc == DDI_SUCCESS);
2395 		vreq->v_status = VREQ_GS_ALLOCED;
2396 		break;
2397 
2398 	default:
2399 		return (DDI_FAILURE);
2400 	}
2401 
2402 	return (DDI_SUCCESS);
2403 }
2404 
2405 static ge_slot_t *
2406 gs_get(xdf_t *vdp, int isread)
2407 {
2408 	grant_ref_t gh;
2409 	ge_slot_t *gs;
2410 
2411 	/* try to alloc GTEs needed in this slot, first */
2412 	if (gnttab_alloc_grant_references(
2413 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2414 		if (vdp->xdf_gnt_callback.next == NULL) {
2415 			SETDMACBON(vdp);
2416 			gnttab_request_free_callback(
2417 			    &vdp->xdf_gnt_callback,
2418 			    (void (*)(void *))xdf_dmacallback,
2419 			    (void *)vdp,
2420 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2421 		}
2422 		return (NULL);
2423 	}
2424 
2425 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2426 	if (gs == NULL) {
2427 		gnttab_free_grant_references(gh);
2428 		if (vdp->xdf_timeout_id == 0)
2429 			/* restart I/O after one second */
2430 			vdp->xdf_timeout_id =
2431 			    timeout(xdf_timeout_handler, vdp, hz);
2432 		return (NULL);
2433 	}
2434 
2435 	/* init gs_slot */
2436 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2437 	gs->oeid = vdp->xdf_peer;
2438 	gs->isread = isread;
2439 	gs->ghead = gh;
2440 	gs->ngrefs = 0;
2441 
2442 	return (gs);
2443 }
2444 
2445 static void
2446 gs_free(xdf_t *vdp, ge_slot_t *gs)
2447 {
2448 	int i;
2449 	grant_ref_t *gp = gs->ge;
2450 	int ngrefs = gs->ngrefs;
2451 	boolean_t isread = gs->isread;
2452 
2453 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2454 
2455 	/* release all grant table entry resources used in this slot */
2456 	for (i = 0; i < ngrefs; i++, gp++)
2457 		gnttab_end_foreign_access(*gp, !isread, 0);
2458 	gnttab_free_grant_references(gs->ghead);
2459 
2460 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2461 }
2462 
2463 static grant_ref_t
2464 gs_grant(ge_slot_t *gs, mfn_t mfn)
2465 {
2466 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2467 
2468 	ASSERT(gr != -1);
2469 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2470 	gs->ge[gs->ngrefs++] = gr;
2471 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2472 
2473 	return (gr);
2474 }
2475 
2476 static void
2477 unexpectedie(xdf_t *vdp)
2478 {
2479 	/* clean up I/Os in ring that have responses */
2480 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2481 		mutex_exit(&vdp->xdf_dev_lk);
2482 		(void) xdf_intr((caddr_t)vdp);
2483 		mutex_enter(&vdp->xdf_dev_lk);
2484 	}
2485 
2486 	/* free up all grant table entries */
2487 	while (!list_is_empty(&vdp->xdf_gs_act))
2488 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2489 
2490 	/*
2491 	 * move bp back to active list orderly
2492 	 * vreq_busy is updated in vreq_free()
2493 	 */
2494 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2495 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2496 		buf_t *bp = vreq->v_buf;
2497 
2498 		bp->av_back = NULL;
2499 		bp->b_resid = bp->b_bcount;
2500 		if (vdp->xdf_f_act == NULL) {
2501 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2502 		} else {
2503 			/* move to the head of list */
2504 			bp->av_forw = vdp->xdf_f_act;
2505 			vdp->xdf_f_act = bp;
2506 		}
2507 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2508 		vreq_free(vdp, vreq);
2509 	}
2510 }
2511 
2512 static void
2513 xdfmin(struct buf *bp)
2514 {
2515 	if (bp->b_bcount > xdf_maxphys)
2516 		bp->b_bcount = xdf_maxphys;
2517 }
2518