xref: /titanic_41/usr/src/uts/common/xen/io/xdf.c (revision cd37da7426f0c49c14ad9a8a07638ca971477566)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include "xdf.h"
37 
/*
 * Cache-flush support.
 *
 * A device is in "write barrier" mode when the barrier feature is set
 * but flush is not supported, and in "flush diskcache" mode when both
 * the barrier feature and flush support are set.
 * NOTE(review): FLUSH_DISKCACHE/WRITE_BARRIER look like flag values for
 * code outside this part of the file -- confirm at their point of use.
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A buf is a barrier write when it is not a read, the device is in
 * write barrier mode and its data address is the device's private
 * cache flush block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A buf is a cache flush request when it is not a read, carries no data
 * (b_bcount == 0) and the device is in flush diskcache mode.
 * NOTE: this macro takes only `bp' but expands `vdp'; it can only be
 * used where a variable named vdp is in scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/* soft state anchor; one xdf_t per instance (see _init) */
static void *vbd_ss;
/* kmem caches for v_req_t and ge_slot_t objects, created in _init */
static kmem_cache_t *xdf_vreq_cache;
static kmem_cache_t *xdf_gs_cache;
/* NOTE(review): presumably the per-request size cap used by xdfmin -- confirm */
static int xdf_maxphys = XB_MAXPHYS;
/* debug mask; tested against the *_DBG bits (e.g. SUSRES_DBG) */
int xdfdebug = 0;
extern int do_polled_io;
/* block number rewritten by DKIOCFLUSHWRITECACHE in write barrier mode */
diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
/* when non-zero, DKIOCFLUSHWRITECACHE does not fall back to a barrier write */
int	xdf_barrier_flush_disable = 0;
59 
60 /*
61  * dev_ops and cb_ops entrypoints
62  */
63 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
64 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
65 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
66 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
67 static int xdf_open(dev_t *, int, int, cred_t *);
68 static int xdf_close(dev_t, int, int, struct cred *);
69 static int xdf_strategy(struct buf *);
70 static int xdf_read(dev_t, struct uio *, cred_t *);
71 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
72 static int xdf_write(dev_t, struct uio *, cred_t *);
73 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
74 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
75 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
76 static uint_t xdf_intr(caddr_t);
77 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
78     caddr_t, int *);
79 
80 /*
81  * misc private functions
82  */
83 static int xdf_suspend(dev_info_t *);
84 static int xdf_resume(dev_info_t *);
85 static int xdf_start_connect(xdf_t *);
86 static int xdf_start_disconnect(xdf_t *);
87 static int xdf_post_connect(xdf_t *);
88 static void xdf_post_disconnect(xdf_t *);
89 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
90 static void xdf_iostart(xdf_t *);
91 static void xdf_iofini(xdf_t *, uint64_t, int);
92 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
93 static int xdf_drain_io(xdf_t *);
94 static boolean_t xdf_isopen(xdf_t *, int);
95 static int xdf_check_state_transition(xdf_t *, XenbusState);
96 static int xdf_connect(xdf_t *, boolean_t);
97 static int xdf_dmacallback(caddr_t);
98 static void xdf_timeout_handler(void *);
99 static uint_t xdf_iorestart(caddr_t);
100 static v_req_t *vreq_get(xdf_t *, buf_t *);
101 static void vreq_free(xdf_t *, v_req_t *);
102 static int vreq_setup(xdf_t *, v_req_t *);
103 static ge_slot_t *gs_get(xdf_t *, int);
104 static void gs_free(xdf_t *, ge_slot_t *);
105 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
106 static void unexpectedie(xdf_t *);
107 static void xdfmin(struct buf *);
108 
109 static 	struct cb_ops xdf_cbops = {
110 	xdf_open,
111 	xdf_close,
112 	xdf_strategy,
113 	nodev,
114 	xdf_dump,
115 	xdf_read,
116 	xdf_write,
117 	xdf_ioctl,
118 	nodev,
119 	nodev,
120 	nodev,
121 	nochpoll,
122 	xdf_prop_op,
123 	NULL,
124 	D_MP | D_NEW | D_64BIT,
125 	CB_REV,
126 	xdf_aread,
127 	xdf_awrite
128 };
129 
130 struct dev_ops xdf_devops = {
131 	DEVO_REV,		/* devo_rev */
132 	0,			/* devo_refcnt */
133 	xdf_getinfo,		/* devo_getinfo */
134 	nulldev,		/* devo_identify */
135 	nulldev,		/* devo_probe */
136 	xdf_attach,		/* devo_attach */
137 	xdf_detach,		/* devo_detach */
138 	xdf_reset,		/* devo_reset */
139 	&xdf_cbops,		/* devo_cb_ops */
140 	(struct bus_ops *)NULL	/* devo_bus_ops */
141 };
142 
143 static struct modldrv modldrv = {
144 	&mod_driverops,		/* Type of module.  This one is a driver */
145 	"virtual block driver %I%",	/* short description */
146 	&xdf_devops		/* driver specific ops */
147 };
148 
149 static struct modlinkage xdf_modlinkage = {
150 	MODREV_1, (void *)&modldrv, NULL
151 };
152 
153 /*
154  * I/O buffer DMA attributes
155  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
156  */
157 static ddi_dma_attr_t xb_dma_attr = {
158 	DMA_ATTR_V0,
159 	(uint64_t)0,			/* lowest address */
160 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
161 	(uint64_t)0xffffff,		/* DMA counter limit max */
162 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
163 	XB_BSIZE - 1,			/* bitmap of burst sizes */
164 	XB_BSIZE,			/* min transfer */
165 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
166 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
167 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
168 	XB_BSIZE,			/* granularity */
169 	0,				/* flags (reserved) */
170 };
171 
172 static ddi_device_acc_attr_t xc_acc_attr = {
173 	DDI_DEVICE_ATTR_V0,
174 	DDI_NEVERSWAP_ACC,
175 	DDI_STRICTORDER_ACC
176 };
177 
/* callbacks from common label */
179 
180 static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
181 	void *);
182 static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
183 
184 static cmlb_tg_ops_t xdf_lb_ops = {
185 	TG_DK_OPS_VERSION_1,
186 	xdf_lb_rdwr,
187 	xdf_lb_getinfo
188 };
189 
190 int
191 _init(void)
192 {
193 	int rc;
194 
195 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
196 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
197 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
198 		ASSERT(xdf_vreq_cache != NULL);
199 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
200 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
201 		ASSERT(xdf_gs_cache != NULL);
202 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
203 			kmem_cache_destroy(xdf_vreq_cache);
204 			kmem_cache_destroy(xdf_gs_cache);
205 			ddi_soft_state_fini(&vbd_ss);
206 		}
207 	}
208 
209 	return (rc);
210 }
211 
212 int
213 _fini(void)
214 {
215 	int err;
216 
217 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
218 		return (err);
219 
220 	kmem_cache_destroy(xdf_vreq_cache);
221 	kmem_cache_destroy(xdf_gs_cache);
222 	ddi_soft_state_fini(&vbd_ss);
223 
224 	return (0);
225 }
226 
227 int
228 _info(struct modinfo *modinfop)
229 {
230 	return (mod_info(&xdf_modlinkage, modinfop));
231 }
232 
233 /*ARGSUSED*/
234 static int
235 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
236 {
237 	int instance;
238 	xdf_t *vbdp;
239 
240 	instance = XDF_INST(getminor((dev_t)arg));
241 
242 	switch (cmd) {
243 	case DDI_INFO_DEVT2DEVINFO:
244 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
245 			*rp = NULL;
246 			return (DDI_FAILURE);
247 		}
248 		*rp = vbdp->xdf_dip;
249 		return (DDI_SUCCESS);
250 
251 	case DDI_INFO_DEVT2INSTANCE:
252 		*rp = (void *)(uintptr_t)instance;
253 		return (DDI_SUCCESS);
254 
255 	default:
256 		return (DDI_FAILURE);
257 	}
258 }
259 
260 static int
261 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
262 	char *name, caddr_t valuep, int *lengthp)
263 {
264 	int instance = ddi_get_instance(dip);
265 	xdf_t *vdp;
266 	diskaddr_t p_blkcnt;
267 
268 	/*
269 	 * xdf dynamic properties are device specific and size oriented.
270 	 * Requests issued under conditions where size is valid are passed
271 	 * to ddi_prop_op_nblocks with the size information, otherwise the
272 	 * request is passed to ddi_prop_op.
273 	 */
274 	vdp = ddi_get_soft_state(vbd_ss, instance);
275 
276 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
277 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
278 		    name, valuep, lengthp));
279 
280 	/* do cv_wait until connected or failed */
281 	mutex_enter(&vdp->xdf_dev_lk);
282 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
283 		mutex_exit(&vdp->xdf_dev_lk);
284 		goto out;
285 	}
286 	mutex_exit(&vdp->xdf_dev_lk);
287 
288 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
289 	    NULL, NULL, NULL, NULL) == 0)
290 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
291 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
292 
293 out:
294 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
295 	    lengthp));
296 }
297 
298 static int
299 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
300 {
301 	xdf_t *vdp;
302 	ddi_iblock_cookie_t ibc;
303 	ddi_iblock_cookie_t softibc;
304 	int instance;
305 
306 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
307 	    "xdfdebug", 0);
308 
309 	switch (cmd) {
310 		case DDI_ATTACH:
311 			break;
312 
313 		case DDI_RESUME:
314 			return (xdf_resume(devi));
315 
316 		default:
317 			return (DDI_FAILURE);
318 	}
319 
320 	instance = ddi_get_instance(devi);
321 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
322 		return (DDI_FAILURE);
323 
324 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
325 	vdp = ddi_get_soft_state(vbd_ss, instance);
326 	vdp->xdf_dip = devi;
327 	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
328 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
329 		    ddi_get_name_addr(devi));
330 		goto errout1;
331 	}
332 
333 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
334 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
335 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
336 	ddi_set_driver_private(devi, vdp);
337 
338 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
339 	    != DDI_SUCCESS) {
340 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
341 		    ddi_get_name_addr(devi));
342 		goto errout2;
343 	}
344 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
345 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
346 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
347 		    ddi_get_name_addr(devi));
348 		goto errout2;
349 	}
350 
351 	/*
352 	 * create kstat for iostat(1M)
353 	 */
354 	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
355 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
356 		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
357 		kstat_install(vdp->xdf_xdev_iostat);
358 	} else {
359 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
360 		    ddi_get_name_addr(devi));
361 		goto errout3;
362 	}
363 
364 	/*
365 	 * driver handles kernel-issued IOCTLs
366 	 */
367 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
368 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
369 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
370 		    ddi_get_name_addr(devi));
371 		goto errout4;
372 	}
373 
374 	/*
375 	 * create default device minor nodes: non-removable disk
376 	 * we will adjust minor nodes after we are connected w/ backend
377 	 */
378 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
379 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
380 	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
381 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
382 		    ddi_get_name_addr(devi));
383 		goto errout5;
384 	}
385 
386 	/*
387 	 * We ship with cache-enabled disks
388 	 */
389 	vdp->xdf_wce = 1;
390 
391 	mutex_enter(&vdp->xdf_cb_lk);
392 
393 	/* Watch backend XenbusState change */
394 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
395 	    xdf_oe_change) != DDI_SUCCESS) {
396 		mutex_exit(&vdp->xdf_cb_lk);
397 		goto errout6;
398 	}
399 
400 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
401 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
402 		    ddi_get_name_addr(devi));
403 		(void) xdf_start_disconnect(vdp);
404 		mutex_exit(&vdp->xdf_cb_lk);
405 		goto errout7;
406 	}
407 
408 	mutex_exit(&vdp->xdf_cb_lk);
409 
410 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
411 	    offsetof(v_req_t, v_link));
412 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
413 	    offsetof(ge_slot_t, link));
414 
415 	ddi_report_dev(devi);
416 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
417 
418 	return (DDI_SUCCESS);
419 
420 errout7:
421 	xvdi_remove_event_handler(devi, XS_OE_STATE);
422 errout6:
423 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
424 errout5:
425 	cmlb_free_handle(&vdp->xdf_vd_lbl);
426 	ddi_prop_remove_all(devi);
427 errout4:
428 	kstat_delete(vdp->xdf_xdev_iostat);
429 errout3:
430 	ddi_remove_softintr(vdp->xdf_softintr_id);
431 errout2:
432 	ddi_set_driver_private(devi, NULL);
433 	cv_destroy(&vdp->xdf_dev_cv);
434 	mutex_destroy(&vdp->xdf_cb_lk);
435 	mutex_destroy(&vdp->xdf_dev_lk);
436 errout1:
437 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
438 	ddi_soft_state_free(vbd_ss, instance);
439 	return (DDI_FAILURE);
440 }
441 
442 static int
443 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
444 {
445 	xdf_t *vdp;
446 	int instance;
447 
448 	switch (cmd) {
449 
450 	case DDI_PM_SUSPEND:
451 		break;
452 
453 	case DDI_SUSPEND:
454 		return (xdf_suspend(devi));
455 
456 	case DDI_DETACH:
457 		break;
458 
459 	default:
460 		return (DDI_FAILURE);
461 	}
462 
463 	instance = ddi_get_instance(devi);
464 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
465 	vdp = ddi_get_soft_state(vbd_ss, instance);
466 
467 	if (vdp == NULL)
468 		return (DDI_FAILURE);
469 
470 	mutex_enter(&vdp->xdf_dev_lk);
471 	if (xdf_isopen(vdp, -1)) {
472 		mutex_exit(&vdp->xdf_dev_lk);
473 		return (DDI_FAILURE);
474 	}
475 
476 	if (vdp->xdf_status != XD_CLOSED) {
477 		mutex_exit(&vdp->xdf_dev_lk);
478 		return (DDI_FAILURE);
479 	}
480 
481 	ASSERT(!ISDMACBON(vdp));
482 	mutex_exit(&vdp->xdf_dev_lk);
483 
484 	if (vdp->xdf_timeout_id != 0)
485 		(void) untimeout(vdp->xdf_timeout_id);
486 
487 	xvdi_remove_event_handler(devi, XS_OE_STATE);
488 
489 	/* we'll support backend running in domU later */
490 #ifdef	DOMU_BACKEND
491 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
492 #endif
493 
494 	list_destroy(&vdp->xdf_vreq_act);
495 	list_destroy(&vdp->xdf_gs_act);
496 	ddi_prop_remove_all(devi);
497 	kstat_delete(vdp->xdf_xdev_iostat);
498 	ddi_remove_softintr(vdp->xdf_softintr_id);
499 	ddi_set_driver_private(devi, NULL);
500 	cv_destroy(&vdp->xdf_dev_cv);
501 	mutex_destroy(&vdp->xdf_cb_lk);
502 	mutex_destroy(&vdp->xdf_dev_lk);
503 	if (vdp->xdf_cache_flush_block != NULL)
504 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
505 	ddi_soft_state_free(vbd_ss, instance);
506 	return (DDI_SUCCESS);
507 }
508 
509 static int
510 xdf_suspend(dev_info_t *devi)
511 {
512 	xdf_t *vdp;
513 	int instance;
514 	enum xdf_state st;
515 
516 	instance = ddi_get_instance(devi);
517 
518 	if (xdfdebug & SUSRES_DBG)
519 		xen_printf("xdf_suspend: xdf#%d\n", instance);
520 
521 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
522 		return (DDI_FAILURE);
523 
524 	xvdi_suspend(devi);
525 
526 	mutex_enter(&vdp->xdf_cb_lk);
527 	mutex_enter(&vdp->xdf_dev_lk);
528 	st = vdp->xdf_status;
529 	/* change status to stop further I/O requests */
530 	if (st == XD_READY)
531 		vdp->xdf_status = XD_SUSPEND;
532 	mutex_exit(&vdp->xdf_dev_lk);
533 	mutex_exit(&vdp->xdf_cb_lk);
534 
535 	/* make sure no more I/O responses left in the ring buffer */
536 	if ((st == XD_INIT) || (st == XD_READY)) {
537 		(void) ddi_remove_intr(devi, 0, NULL);
538 		(void) xdf_drain_io(vdp);
539 		/*
540 		 * no need to teardown the ring buffer here
541 		 * it will be simply re-init'ed during resume when
542 		 * we call xvdi_alloc_ring
543 		 */
544 	}
545 
546 	if (xdfdebug & SUSRES_DBG)
547 		xen_printf("xdf_suspend: SUCCESS\n");
548 
549 	return (DDI_SUCCESS);
550 }
551 
552 /*ARGSUSED*/
553 static int
554 xdf_resume(dev_info_t *devi)
555 {
556 	xdf_t *vdp;
557 	int instance;
558 
559 	instance = ddi_get_instance(devi);
560 	if (xdfdebug & SUSRES_DBG)
561 		xen_printf("xdf_resume: xdf%d\n", instance);
562 
563 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
564 		return (DDI_FAILURE);
565 
566 	mutex_enter(&vdp->xdf_cb_lk);
567 
568 	if (xvdi_resume(devi) != DDI_SUCCESS) {
569 		mutex_exit(&vdp->xdf_cb_lk);
570 		return (DDI_FAILURE);
571 	}
572 
573 	mutex_enter(&vdp->xdf_dev_lk);
574 	ASSERT(vdp->xdf_status != XD_READY);
575 	vdp->xdf_status = XD_UNKNOWN;
576 	mutex_exit(&vdp->xdf_dev_lk);
577 
578 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
579 		mutex_exit(&vdp->xdf_cb_lk);
580 		return (DDI_FAILURE);
581 	}
582 
583 	mutex_exit(&vdp->xdf_cb_lk);
584 
585 	if (xdfdebug & SUSRES_DBG)
586 		xen_printf("xdf_resume: done\n");
587 	return (DDI_SUCCESS);
588 }
589 
590 /*ARGSUSED*/
591 static int
592 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
593 {
594 	xdf_t *vdp;
595 	int instance;
596 
597 	instance = ddi_get_instance(devi);
598 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
599 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
600 		return (DDI_FAILURE);
601 
602 	/*
603 	 * wait for any outstanding I/O to complete
604 	 */
605 	(void) xdf_drain_io(vdp);
606 
607 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
608 	return (DDI_SUCCESS);
609 }
610 
611 static int
612 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
613 {
614 	minor_t	minor;
615 	xdf_t	*vdp;
616 	int part;
617 	ulong_t parbit;
618 	diskaddr_t p_blkct = 0;
619 	boolean_t firstopen;
620 	boolean_t nodelay;
621 
622 	nodelay = (flag & (FNDELAY | FNONBLOCK));
623 	minor = getminor(*devp);
624 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
625 		return (ENXIO);
626 
627 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
628 
629 	/* do cv_wait until connected or failed */
630 	mutex_enter(&vdp->xdf_dev_lk);
631 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
632 		mutex_exit(&vdp->xdf_dev_lk);
633 		return (ENXIO);
634 	}
635 
636 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
637 		mutex_exit(&vdp->xdf_dev_lk);
638 		return (EROFS);
639 	}
640 
641 	part = XDF_PART(minor);
642 	parbit = 1 << part;
643 	if (vdp->xdf_vd_exclopen & parbit) {
644 		mutex_exit(&vdp->xdf_dev_lk);
645 		return (EBUSY);
646 	}
647 
648 	/* are we the first one to open this node? */
649 	firstopen = !xdf_isopen(vdp, -1);
650 
651 	if ((flag & FEXCL) && !firstopen) {
652 		mutex_exit(&vdp->xdf_dev_lk);
653 		return (EBUSY);
654 	}
655 
656 	if (otyp == OTYP_LYR)
657 		vdp->xdf_vd_lyropen[part]++;
658 
659 	vdp->xdf_vd_open[otyp] |= parbit;
660 
661 	if (flag & FEXCL)
662 		vdp->xdf_vd_exclopen |= parbit;
663 
664 	mutex_exit(&vdp->xdf_dev_lk);
665 
666 	/* force a re-validation */
667 	if (firstopen)
668 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
669 
670 	/*
671 	 * check size
672 	 * ignore CD/DVD which contains a zero-sized s0
673 	 */
674 	if (!nodelay && !XD_IS_CD(vdp) &&
675 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
676 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
677 		(void) xdf_close(*devp, flag, otyp, credp);
678 		return (ENXIO);
679 	}
680 
681 	return (0);
682 }
683 
684 /*ARGSUSED*/
685 static int
686 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
687 {
688 	minor_t	minor;
689 	xdf_t	*vdp;
690 	int part;
691 	ulong_t parbit;
692 
693 	minor = getminor(dev);
694 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
695 		return (ENXIO);
696 
697 	mutex_enter(&vdp->xdf_dev_lk);
698 	part = XDF_PART(minor);
699 	if (!xdf_isopen(vdp, part)) {
700 		mutex_exit(&vdp->xdf_dev_lk);
701 		return (ENXIO);
702 	}
703 	parbit = 1 << part;
704 
705 	if (otyp == OTYP_LYR) {
706 		if (vdp->xdf_vd_lyropen[part] != 0)
707 			vdp->xdf_vd_lyropen[part]--;
708 		if (vdp->xdf_vd_lyropen[part] == 0)
709 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
710 	} else {
711 		vdp->xdf_vd_open[otyp] &= ~parbit;
712 	}
713 	vdp->xdf_vd_exclopen &= ~parbit;
714 
715 	mutex_exit(&vdp->xdf_dev_lk);
716 	return (0);
717 }
718 
719 static int
720 xdf_strategy(struct buf *bp)
721 {
722 	xdf_t	*vdp;
723 	minor_t minor;
724 	diskaddr_t p_blkct, p_blkst;
725 	ulong_t nblks;
726 	int part;
727 
728 	minor = getminor(bp->b_edev);
729 	part = XDF_PART(minor);
730 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
731 	    !xdf_isopen(vdp, part) ||
732 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
733 	    &p_blkst, NULL, NULL, NULL)) {
734 		bioerror(bp, ENXIO);
735 		bp->b_resid = bp->b_bcount;
736 		biodone(bp);
737 		return (0);
738 	}
739 
740 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
741 		bioerror(bp, EROFS);
742 		bp->b_resid = bp->b_bcount;
743 		biodone(bp);
744 		return (0);
745 	}
746 
747 	/*
748 	 * starting beyond partition
749 	 */
750 	if (bp->b_blkno > p_blkct) {
751 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
752 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
753 		bioerror(bp, EINVAL);
754 		bp->b_resid = bp->b_bcount;
755 		biodone(bp);
756 		return (0);
757 	}
758 
759 	/* Legacy: don't set error flag at this case */
760 	if (bp->b_blkno == p_blkct) {
761 		bp->b_resid = bp->b_bcount;
762 		biodone(bp);
763 		return (0);
764 	}
765 
766 	/*
767 	 * adjust for partial transfer
768 	 */
769 	nblks = bp->b_bcount >> XB_BSHIFT;
770 	if ((bp->b_blkno + nblks) > p_blkct) {
771 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
772 		bp->b_bcount -= bp->b_resid;
773 	}
774 
775 
776 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
777 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
778 
779 	mutex_enter(&vdp->xdf_dev_lk);
780 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
781 	if (vdp->xdf_f_act == NULL) {
782 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
783 	} else {
784 		vdp->xdf_l_act->av_forw = bp;
785 		vdp->xdf_l_act = bp;
786 	}
787 	bp->av_forw = NULL;
788 	bp->av_back = NULL; /* not tagged with a v_req */
789 	bp->b_private = (void *)(uintptr_t)p_blkst;
790 	mutex_exit(&vdp->xdf_dev_lk);
791 	xdf_iostart(vdp);
792 	if (do_polled_io)
793 		(void) xdf_drain_io(vdp);
794 	return (0);
795 }
796 
797 /*ARGSUSED*/
798 static int
799 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
800 {
801 
802 	xdf_t	*vdp;
803 	minor_t minor;
804 	diskaddr_t p_blkcnt;
805 	int part;
806 
807 	minor = getminor(dev);
808 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
809 		return (ENXIO);
810 
811 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
812 	    (int64_t)uiop->uio_offset));
813 
814 	part = XDF_PART(minor);
815 	if (!xdf_isopen(vdp, part))
816 		return (ENXIO);
817 
818 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
819 	    NULL, NULL, NULL, NULL))
820 		return (ENXIO);
821 
822 	if (U_INVAL(uiop))
823 		return (EINVAL);
824 
825 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
826 }
827 
828 /*ARGSUSED*/
829 static int
830 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
831 {
832 	xdf_t *vdp;
833 	minor_t minor;
834 	diskaddr_t p_blkcnt;
835 	int part;
836 
837 	minor = getminor(dev);
838 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
839 		return (ENXIO);
840 
841 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
842 	    (int64_t)uiop->uio_offset));
843 
844 	part = XDF_PART(minor);
845 	if (!xdf_isopen(vdp, part))
846 		return (ENXIO);
847 
848 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
849 	    NULL, NULL, NULL, NULL))
850 		return (ENXIO);
851 
852 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
853 		return (ENOSPC);
854 
855 	if (U_INVAL(uiop))
856 		return (EINVAL);
857 
858 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
859 }
860 
861 /*ARGSUSED*/
862 static int
863 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
864 {
865 	xdf_t	*vdp;
866 	minor_t minor;
867 	struct uio *uiop = aiop->aio_uio;
868 	diskaddr_t p_blkcnt;
869 	int part;
870 
871 	minor = getminor(dev);
872 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
873 		return (ENXIO);
874 
875 	part = XDF_PART(minor);
876 	if (!xdf_isopen(vdp, part))
877 		return (ENXIO);
878 
879 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
880 	    NULL, NULL, NULL, NULL))
881 		return (ENXIO);
882 
883 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
884 		return (ENOSPC);
885 
886 	if (U_INVAL(uiop))
887 		return (EINVAL);
888 
889 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
890 }
891 
892 /*ARGSUSED*/
893 static int
894 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
895 {
896 	xdf_t *vdp;
897 	minor_t minor;
898 	struct uio *uiop = aiop->aio_uio;
899 	diskaddr_t p_blkcnt;
900 	int part;
901 
902 	minor = getminor(dev);
903 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
904 		return (ENXIO);
905 
906 	part = XDF_PART(minor);
907 	if (!xdf_isopen(vdp, part))
908 		return (ENXIO);
909 
910 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
911 	    NULL, NULL, NULL, NULL))
912 		return (ENXIO);
913 
914 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
915 		return (ENOSPC);
916 
917 	if (U_INVAL(uiop))
918 		return (EINVAL);
919 
920 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
921 }
922 
923 static int
924 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
925 {
926 	struct buf dumpbuf, *dbp;
927 	xdf_t	*vdp;
928 	minor_t minor;
929 	int err = 0;
930 	int part;
931 	diskaddr_t p_blkcnt, p_blkst;
932 
933 	minor = getminor(dev);
934 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
935 		return (ENXIO);
936 
937 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
938 	    addr, blkno, nblk));
939 
940 	part = XDF_PART(minor);
941 	if (!xdf_isopen(vdp, part))
942 		return (ENXIO);
943 
944 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
945 	    NULL, NULL, NULL))
946 		return (ENXIO);
947 
948 	if ((blkno + nblk) > p_blkcnt) {
949 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
950 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
951 		return (EINVAL);
952 	}
953 
954 	dbp = &dumpbuf;
955 	bioinit(dbp);
956 	dbp->b_flags = B_BUSY;
957 	dbp->b_un.b_addr = addr;
958 	dbp->b_bcount	= nblk << DEV_BSHIFT;
959 	dbp->b_resid = 0;
960 	dbp->b_blkno = blkno;
961 	dbp->b_edev = dev;
962 	dbp->b_private = (void *)(uintptr_t)p_blkst;
963 
964 	mutex_enter(&vdp->xdf_dev_lk);
965 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
966 	if (vdp->xdf_f_act == NULL) {
967 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
968 	} else {
969 		vdp->xdf_l_act->av_forw = dbp;
970 		vdp->xdf_l_act = dbp;
971 	}
972 	dbp->av_forw = NULL;
973 	dbp->av_back = NULL;
974 	mutex_exit(&vdp->xdf_dev_lk);
975 	xdf_iostart(vdp);
976 	err = xdf_drain_io(vdp);
977 	biofini(dbp);
978 	return (err);
979 }
980 
981 /*ARGSUSED*/
982 static int
983 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
984     int *rvalp)
985 {
986 	int instance;
987 	xdf_t	*vdp;
988 	minor_t minor;
989 	int part;
990 
991 	minor = getminor(dev);
992 	instance = XDF_INST(minor);
993 
994 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
995 		return (ENXIO);
996 
997 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
998 	    instance, cmd, cmd));
999 
1000 	part = XDF_PART(minor);
1001 	if (!xdf_isopen(vdp, part))
1002 		return (ENXIO);
1003 
1004 	switch (cmd) {
1005 	case DKIOCGMEDIAINFO: {
1006 		struct dk_minfo	media_info;
1007 
1008 		media_info.dki_lbsize = DEV_BSIZE;
1009 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
1010 		media_info.dki_media_type = DK_FIXED_DISK;
1011 
1012 		if (ddi_copyout(&media_info, (void *)arg,
1013 		    sizeof (struct dk_minfo), mode)) {
1014 			return (EFAULT);
1015 		} else {
1016 			return (0);
1017 		}
1018 	}
1019 
1020 	case DKIOCINFO: {
1021 		struct dk_cinfo info;
1022 
1023 		/* controller information */
1024 		if (XD_IS_CD(vdp))
1025 			info.dki_ctype = DKC_CDROM;
1026 		else
1027 			info.dki_ctype = DKC_VBD;
1028 
1029 		info.dki_cnum = 0;
1030 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1031 
1032 		/* unit information */
1033 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1034 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1035 		info.dki_flags = DKI_FMTVOL;
1036 		info.dki_partition = part;
1037 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1038 		info.dki_addr = 0;
1039 		info.dki_space = 0;
1040 		info.dki_prio = 0;
1041 		info.dki_vec = 0;
1042 
1043 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1044 			return (EFAULT);
1045 		else
1046 			return (0);
1047 	}
1048 
1049 	case DKIOCSTATE: {
1050 		enum dkio_state	dkstate = DKIO_INSERTED;
1051 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1052 		    mode) != 0)
1053 			return (EFAULT);
1054 		return (0);
1055 	}
1056 
1057 	/*
1058 	 * is media removable?
1059 	 */
1060 	case DKIOCREMOVABLE: {
1061 		int i = XD_IS_RM(vdp) ? 1 : 0;
1062 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1063 			return (EFAULT);
1064 		return (0);
1065 	}
1066 
1067 	case DKIOCG_PHYGEOM:
1068 	case DKIOCG_VIRTGEOM:
1069 	case DKIOCGGEOM:
1070 	case DKIOCSGEOM:
1071 	case DKIOCGAPART:
1072 	case DKIOCGVTOC:
1073 	case DKIOCSVTOC:
1074 	case DKIOCPARTINFO:
1075 	case DKIOCGETEFI:
1076 	case DKIOCSETEFI:
1077 	case DKIOCPARTITION: {
1078 		int rc;
1079 
1080 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1081 		    rvalp, NULL);
1082 		return (rc);
1083 	}
1084 
1085 	case DKIOCGETWCE:
1086 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1087 		    sizeof (vdp->xdf_wce), mode))
1088 			return (EFAULT);
1089 		return (0);
1090 	case DKIOCSETWCE:
1091 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1092 		    sizeof (vdp->xdf_wce), mode))
1093 			return (EFAULT);
1094 		return (0);
1095 	case DKIOCFLUSHWRITECACHE: {
1096 		int rc;
1097 		struct dk_callback *dkc = (struct dk_callback *)arg;
1098 
1099 		if (vdp->xdf_flush_supported) {
1100 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1101 			    NULL, 0, 0, (void *)dev);
1102 		} else if (vdp->xdf_feature_barrier &&
1103 		    !xdf_barrier_flush_disable) {
1104 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1105 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1106 			    DEV_BSIZE, (void *)dev);
1107 		} else {
1108 			return (ENOTTY);
1109 		}
1110 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1111 		    (dkc->dkc_callback != NULL)) {
1112 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1113 			/* need to return 0 after calling callback */
1114 			rc = 0;
1115 		}
1116 		return (rc);
1117 	}
1118 
1119 	default:
1120 		return (ENOTTY);
1121 	}
1122 }
1123 
1124 /*
1125  * xdf interrupt handler
1126  */
1127 static uint_t
1128 xdf_intr(caddr_t arg)
1129 {
1130 	xdf_t *vdp = (xdf_t *)arg;
1131 	xendev_ring_t *xbr;
1132 	blkif_response_t *resp;
1133 	int bioerr;
1134 	uint64_t id;
1135 	extern int do_polled_io;
1136 	uint8_t op;
1137 	uint16_t status;
1138 	ddi_acc_handle_t acchdl;
1139 
1140 	mutex_enter(&vdp->xdf_dev_lk);
1141 
1142 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1143 		mutex_exit(&vdp->xdf_dev_lk);
1144 		return (DDI_INTR_UNCLAIMED);
1145 	}
1146 
1147 	acchdl = vdp->xdf_xb_ring_hdl;
1148 
1149 	/*
1150 	 * complete all requests which have a response
1151 	 */
1152 	while (resp = xvdi_ring_get_response(xbr)) {
1153 		id = ddi_get64(acchdl, &resp->id);
1154 		op = ddi_get8(acchdl, &resp->operation);
1155 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1156 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1157 		    op, id, status));
1158 
1159 		/*
1160 		 * XXPV - close connection to the backend and restart
1161 		 */
1162 		if (status != BLKIF_RSP_OKAY) {
1163 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1164 			    ddi_get_name_addr(vdp->xdf_dip),
1165 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1166 			bioerr = EIO;
1167 		} else {
1168 			bioerr = 0;
1169 		}
1170 
1171 		xdf_iofini(vdp, id, bioerr);
1172 	}
1173 
1174 	mutex_exit(&vdp->xdf_dev_lk);
1175 
1176 	if (!do_polled_io)
1177 		xdf_iostart(vdp);
1178 
1179 	return (DDI_INTR_CLAIMED);
1180 }
1181 
1182 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1183 
1184 /*
1185  * Snarf new data if our flush block was re-written
1186  */
1187 static void
1188 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1189 {
1190 	int nblks;
1191 	boolean_t mapin;
1192 
1193 	if (IS_WRITE_BARRIER(vdp, bp))
1194 		return; /* write was a flush write */
1195 
1196 	mapin = B_FALSE;
1197 	nblks = bp->b_bcount >> DEV_BSHIFT;
1198 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1199 		xdf_fbrewrites++;
1200 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1201 			mapin = B_TRUE;
1202 			bp_mapin(bp);
1203 		}
1204 		bcopy(bp->b_un.b_addr +
1205 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1206 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1207 		if (mapin)
1208 			bp_mapout(bp);
1209 	}
1210 }
1211 
1212 static void
1213 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1214 {
1215 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1216 	v_req_t *vreq = gs->vreq;
1217 	buf_t *bp = vreq->v_buf;
1218 
1219 	gs_free(vdp, gs);
1220 	if (bioerr)
1221 		bioerror(bp, bioerr);
1222 	vreq->v_nslots--;
1223 	if (vreq->v_nslots != 0)
1224 		return;
1225 
1226 	XDF_UPDATE_IO_STAT(vdp, bp);
1227 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1228 
1229 	if (IS_ERROR(bp))
1230 		bp->b_resid = bp->b_bcount;
1231 
1232 	vreq_free(vdp, vreq);
1233 	biodone(bp);
1234 }
1235 
1236 /*
1237  * return value of xdf_prepare_rreq()
1238  * used in xdf_iostart()
1239  */
1240 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1241 #define	XF_COMP		1 /* no more I/O left in buf */
1242 
1243 static void
1244 xdf_iostart(xdf_t *vdp)
1245 {
1246 	xendev_ring_t *xbr;
1247 	struct buf *bp;
1248 	blkif_request_t *rreq;
1249 	int retval;
1250 	int rreqready = 0;
1251 
1252 	xbr = vdp->xdf_xb_ring;
1253 
1254 	/*
1255 	 * populate the ring request(s)
1256 	 *
1257 	 * loop until there is no buf to transfer or no free slot
1258 	 * available in I/O ring
1259 	 */
1260 	mutex_enter(&vdp->xdf_dev_lk);
1261 
1262 	for (;;) {
1263 		if (vdp->xdf_status != XD_READY)
1264 			break;
1265 
1266 		/* active buf queue empty? */
1267 		if ((bp = vdp->xdf_f_act) == NULL)
1268 			break;
1269 
1270 		/* try to grab a vreq for this bp */
1271 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1272 				break;
1273 		/* alloc DMA/GTE resources */
1274 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1275 			break;
1276 
1277 		/* get next blkif_request in the ring */
1278 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1279 			break;
1280 		bzero(rreq, sizeof (blkif_request_t));
1281 
1282 		/* populate blkif_request with this buf */
1283 		rreqready++;
1284 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1285 		if (retval == XF_COMP) {
1286 			/* finish this bp, switch to next one */
1287 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1288 			vdp->xdf_f_act = bp->av_forw;
1289 			bp->av_forw = NULL;
1290 		}
1291 	}
1292 
1293 	/*
1294 	 * Send the request(s) to the backend
1295 	 */
1296 	if (rreqready) {
1297 		if (xvdi_ring_push_request(xbr)) {
1298 			DPRINTF(IO_DBG, ("xdf_iostart: "
1299 			    "sent request(s) to backend\n"));
1300 			xvdi_notify_oe(vdp->xdf_dip);
1301 		}
1302 	}
1303 
1304 	mutex_exit(&vdp->xdf_dev_lk);
1305 }
1306 
1307 /*
1308  * populate a single blkif_request_t w/ a buf
1309  */
1310 static int
1311 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1312 {
1313 	int		rval;
1314 	grant_ref_t	gr;
1315 	uint8_t		fsect, lsect;
1316 	size_t		bcnt;
1317 	paddr_t		dma_addr;
1318 	off_t		blk_off;
1319 	dev_info_t	*dip = vdp->xdf_dip;
1320 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1321 	v_req_t		*vreq = BP2VREQ(bp);
1322 	uint64_t	blkno = vreq->v_blkno;
1323 	uint_t		ndmacs = vreq->v_ndmacs;
1324 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1325 	int		seg = 0;
1326 	int		isread = IS_READ(bp);
1327 
1328 	if (isread)
1329 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1330 	else {
1331 		switch (vreq->v_flush_diskcache) {
1332 		case FLUSH_DISKCACHE:
1333 			ddi_put8(acchdl, &rreq->operation,
1334 			    BLKIF_OP_FLUSH_DISKCACHE);
1335 			ddi_put16(acchdl, &rreq->handle, vdev);
1336 			ddi_put64(acchdl, &rreq->id,
1337 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1338 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1339 			return (XF_COMP);
1340 		case WRITE_BARRIER:
1341 			ddi_put8(acchdl, &rreq->operation,
1342 			    BLKIF_OP_WRITE_BARRIER);
1343 			break;
1344 		default:
1345 			if (!vdp->xdf_wce)
1346 				ddi_put8(acchdl, &rreq->operation,
1347 				    BLKIF_OP_WRITE_BARRIER);
1348 			else
1349 				ddi_put8(acchdl, &rreq->operation,
1350 				    BLKIF_OP_WRITE);
1351 			break;
1352 		}
1353 	}
1354 
1355 	ddi_put16(acchdl, &rreq->handle, vdev);
1356 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1357 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1358 
1359 	/*
1360 	 * loop until all segments are populated or no more dma cookie in buf
1361 	 */
1362 	for (;;) {
1363 	/*
1364 	 * Each segment of a blkif request can transfer up to
1365 	 * one 4K page of data.
1366 	 */
1367 		bcnt = vreq->v_dmac.dmac_size;
1368 		ASSERT(bcnt <= PAGESIZE);
1369 		ASSERT((bcnt % XB_BSIZE) == 0);
1370 		dma_addr = vreq->v_dmac.dmac_laddress;
1371 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1372 		ASSERT((blk_off & XB_BMASK) == 0);
1373 		fsect = blk_off >> XB_BSHIFT;
1374 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1375 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1376 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1377 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1378 		    seg, vreq->v_dmac.dmac_size, blk_off));
1379 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1380 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1381 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1382 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1383 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1384 		    "\n", seg, fsect, lsect, gr, dma_addr));
1385 
1386 		blkno += (bcnt >> XB_BSHIFT);
1387 		seg++;
1388 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1389 		if (--ndmacs) {
1390 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1391 			continue;
1392 		}
1393 
1394 		vreq->v_status = VREQ_DMAWIN_DONE;
1395 		vreq->v_blkno = blkno;
1396 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1397 			/* last win */
1398 			rval = XF_COMP;
1399 		else
1400 			rval = XF_PARTIAL;
1401 		break;
1402 	}
1403 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1404 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1405 	    rreq->id));
1406 
1407 	return (rval);
1408 }
1409 
1410 #define	XDF_QSEC	50000	/* .005 second */
1411 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1412 
1413 static int
1414 xdf_drain_io(xdf_t *vdp)
1415 {
1416 	int pollc, rval;
1417 	xendev_ring_t *xbr;
1418 
1419 	if (xdfdebug & SUSRES_DBG)
1420 		xen_printf("xdf_drain_io: start\n");
1421 
1422 	mutex_enter(&vdp->xdf_dev_lk);
1423 
1424 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1425 		goto out;
1426 
1427 	rval = 0;
1428 	xbr = vdp->xdf_xb_ring;
1429 	ASSERT(xbr != NULL);
1430 
1431 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1432 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1433 			mutex_exit(&vdp->xdf_dev_lk);
1434 			(void) xdf_intr((caddr_t)vdp);
1435 			mutex_enter(&vdp->xdf_dev_lk);
1436 		}
1437 		if (!xvdi_ring_has_incomp_request(xbr))
1438 			goto out;
1439 
1440 		(void) HYPERVISOR_yield();
1441 		/*
1442 		 * file-backed devices can be slow
1443 		 */
1444 		drv_usecwait(XDF_QSEC << pollc);
1445 	}
1446 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1447 	rval = EIO;
1448 out:
1449 	mutex_exit(&vdp->xdf_dev_lk);
1450 	if (xdfdebug & SUSRES_DBG)
1451 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1452 	return (rval);
1453 }
1454 
1455 /* ARGSUSED5 */
1456 static int
1457 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1458     diskaddr_t start, size_t reqlen, void *tg_cookie)
1459 {
1460 	xdf_t *vdp;
1461 	struct buf *bp;
1462 	int err = 0;
1463 
1464 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1465 	if (vdp == NULL)
1466 		return (ENXIO);
1467 
1468 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1469 		return (EINVAL);
1470 
1471 	bp = getrbuf(KM_SLEEP);
1472 	if (cmd == TG_READ)
1473 		bp->b_flags = B_BUSY | B_READ;
1474 	else
1475 		bp->b_flags = B_BUSY | B_WRITE;
1476 	bp->b_un.b_addr = bufp;
1477 	bp->b_bcount = reqlen;
1478 	bp->b_resid = 0;
1479 	bp->b_blkno = start;
1480 	bp->av_forw = NULL;
1481 	bp->av_back = NULL;
1482 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1483 
1484 	mutex_enter(&vdp->xdf_dev_lk);
1485 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1486 	if (vdp->xdf_f_act == NULL) {
1487 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1488 	} else {
1489 		vdp->xdf_l_act->av_forw = bp;
1490 		vdp->xdf_l_act = bp;
1491 	}
1492 	mutex_exit(&vdp->xdf_dev_lk);
1493 	xdf_iostart(vdp);
1494 	err = biowait(bp);
1495 
1496 	ASSERT(bp->b_flags & B_DONE);
1497 
1498 	freerbuf(bp);
1499 	return (err);
1500 }
1501 
1502 /*
1503  * synthetic geometry
1504  */
1505 #define	XDF_NSECTS	256
1506 #define	XDF_NHEADS	16
1507 
1508 static int
1509 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1510 {
1511 	xdf_t *vdp;
1512 
1513 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1514 
1515 	if (vdp == NULL)
1516 		return (ENXIO);
1517 
1518 	mutex_enter(&vdp->xdf_dev_lk);
1519 	*capp = vdp->xdf_xdev_nblocks;
1520 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1521 	mutex_exit(&vdp->xdf_dev_lk);
1522 	return (0);
1523 }
1524 
1525 static int
1526 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1527 {
1528 	xdf_t *vdp;
1529 	uint_t ncyl;
1530 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1531 
1532 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1533 
1534 	if (vdp == NULL)
1535 		return (ENXIO);
1536 
1537 	ncyl = vdp->xdf_xdev_nblocks / spc;
1538 
1539 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1540 	geomp->g_acyl = 0;
1541 	geomp->g_nhead = XDF_NHEADS;
1542 	geomp->g_secsize = XB_BSIZE;
1543 	geomp->g_nsect = XDF_NSECTS;
1544 	geomp->g_intrlv = 0;
1545 	geomp->g_rpm = 7200;
1546 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1547 	return (0);
1548 }
1549 
1550 /*
1551  * No real HBA, no geometry available from it
1552  */
1553 /*ARGSUSED*/
1554 static int
1555 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1556 {
1557 	return (EINVAL);
1558 }
1559 
1560 static int
1561 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1562 {
1563 	xdf_t *vdp;
1564 
1565 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1566 		return (ENXIO);
1567 
1568 	if (XD_IS_RO(vdp))
1569 		tgattributep->media_is_writable = 0;
1570 	else
1571 		tgattributep->media_is_writable = 1;
1572 	return (0);
1573 }
1574 
1575 /* ARGSUSED3 */
1576 static int
1577 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1578 {
1579 	switch (cmd) {
1580 	case TG_GETPHYGEOM:
1581 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1582 	case TG_GETVIRTGEOM:
1583 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1584 	case TG_GETCAPACITY:
1585 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1586 	case TG_GETBLOCKSIZE:
1587 		*(uint32_t *)arg = XB_BSIZE;
1588 		return (0);
1589 	case TG_GETATTR:
1590 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1591 	default:
1592 		return (ENOTTY);
1593 	}
1594 }
1595 
1596 /*
1597  * Kick-off connect process
1598  * Status should be XD_UNKNOWN or XD_CLOSED
1599  * On success, status will be changed to XD_INIT
1600  * On error, status won't be changed
1601  */
1602 static int
1603 xdf_start_connect(xdf_t *vdp)
1604 {
1605 	char *xsnode;
1606 	grant_ref_t gref;
1607 	xenbus_transaction_t xbt;
1608 	int rv;
1609 	dev_info_t *dip = vdp->xdf_dip;
1610 
1611 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1612 		goto errout;
1613 
1614 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1615 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1616 		    ddi_get_name_addr(dip));
1617 		goto errout;
1618 	}
1619 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1620 	    DDI_SUCCESS) {
1621 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1622 		    "failed to add intr handler", ddi_get_name_addr(dip));
1623 		goto errout1;
1624 	}
1625 
1626 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1627 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1628 	    DDI_SUCCESS) {
1629 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1630 		    ddi_get_name_addr(dip));
1631 		goto errout2;
1632 	}
1633 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1634 
1635 	/*
1636 	 * Write into xenstore the info needed by backend
1637 	 */
1638 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1639 		cmn_err(CE_WARN, "xdf@%s: "
1640 		    "failed to get xenstore node path",
1641 		    ddi_get_name_addr(dip));
1642 		goto fail_trans;
1643 	}
1644 trans_retry:
1645 	if (xenbus_transaction_start(&xbt)) {
1646 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1647 		    ddi_get_name_addr(dip));
1648 		xvdi_fatal_error(dip, EIO, "transaction start");
1649 		goto fail_trans;
1650 	}
1651 
1652 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1653 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1654 		    ddi_get_name_addr(dip));
1655 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1656 		goto abort_trans;
1657 	}
1658 
1659 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1660 	    xvdi_get_evtchn(dip))) {
1661 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1662 		    ddi_get_name_addr(dip));
1663 		xvdi_fatal_error(dip, rv, "writing event-channel");
1664 		goto abort_trans;
1665 	}
1666 
1667 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1668 		cmn_err(CE_WARN, "xdf@%s: "
1669 		    "failed to switch state to XenbusStateInitialised",
1670 		    ddi_get_name_addr(dip));
1671 		xvdi_fatal_error(dip, rv, "writing state");
1672 		goto abort_trans;
1673 	}
1674 
1675 	/* kick-off connect process */
1676 	if (rv = xenbus_transaction_end(xbt, 0)) {
1677 		if (rv == EAGAIN)
1678 			goto trans_retry;
1679 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1680 		    ddi_get_name_addr(dip));
1681 		xvdi_fatal_error(dip, rv, "completing transaction");
1682 		goto fail_trans;
1683 	}
1684 
1685 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1686 	mutex_enter(&vdp->xdf_dev_lk);
1687 	vdp->xdf_status = XD_INIT;
1688 	mutex_exit(&vdp->xdf_dev_lk);
1689 
1690 	return (DDI_SUCCESS);
1691 
1692 abort_trans:
1693 	(void) xenbus_transaction_end(xbt, 1);
1694 fail_trans:
1695 	xvdi_free_ring(vdp->xdf_xb_ring);
1696 errout2:
1697 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1698 errout1:
1699 	xvdi_free_evtchn(dip);
1700 errout:
1701 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1702 	    ddi_get_name_addr(dip));
1703 	return (DDI_FAILURE);
1704 }
1705 
1706 /*
1707  * Kick-off disconnect process
1708  * Status won't be changed
1709  */
1710 static int
1711 xdf_start_disconnect(xdf_t *vdp)
1712 {
1713 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1714 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1715 		    ddi_get_name_addr(vdp->xdf_dip));
1716 		return (DDI_FAILURE);
1717 	}
1718 
1719 	return (DDI_SUCCESS);
1720 }
1721 
1722 int
1723 xdf_get_flush_block(xdf_t *vdp)
1724 {
1725 	/*
1726 	 * Get a DEV_BSIZE aligned bufer
1727 	 */
1728 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1729 	vdp->xdf_cache_flush_block =
1730 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1731 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1732 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1733 		return (DDI_FAILURE);
1734 	return (DDI_SUCCESS);
1735 }
1736 
1737 /*
1738  * Finish other initialization after we've connected to backend
1739  * Status should be XD_INIT before calling this routine
1740  * On success, status should be changed to XD_READY
1741  * On error, status should stay XD_INIT
1742  */
1743 static int
1744 xdf_post_connect(xdf_t *vdp)
1745 {
1746 	int rv;
1747 	uint_t len;
1748 	char *type;
1749 	char *barrier;
1750 	dev_info_t *devi = vdp->xdf_dip;
1751 
1752 	/*
1753 	 * Determine if feature barrier is supported by backend
1754 	 */
1755 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1756 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1757 		vdp->xdf_feature_barrier = 1;
1758 		kmem_free(barrier, len);
1759 	} else {
1760 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1761 		    ddi_get_name_addr(vdp->xdf_dip));
1762 		vdp->xdf_feature_barrier = 0;
1763 	}
1764 
1765 	/* probe backend */
1766 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1767 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1768 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1769 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1770 		    "cannot read backend info", ddi_get_name_addr(devi));
1771 		xvdi_fatal_error(devi, rv, "reading backend info");
1772 		return (DDI_FAILURE);
1773 	}
1774 
1775 	/* fix disk type */
1776 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1777 	    (void **)&type, &len) != 0) {
1778 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1779 		    "cannot read device-type", ddi_get_name_addr(devi));
1780 		xvdi_fatal_error(devi, rv, "reading device-type");
1781 		return (DDI_FAILURE);
1782 	}
1783 	if (strcmp(type, "cdrom") == 0)
1784 		vdp->xdf_xdev_info |= VDISK_CDROM;
1785 	kmem_free(type, len);
1786 
1787 	/*
1788 	 * We've created all the minor nodes via cmlb_attach() using default
1789 	 * value in xdf_attach() to make it possbile to block in xdf_open(),
1790 	 * in case there's anyone (say, booting thread) ever trying to open
1791 	 * it before connected to backend. We will refresh all those minor
1792 	 * nodes w/ latest info we've got now when we are almost connected.
1793 	 *
1794 	 * Don't do this when xdf is already opened by someone (could happen
1795 	 * during resume), for that cmlb_attach() will invalid the label info
1796 	 * and confuse those who has already opened the node, which is bad.
1797 	 */
1798 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1799 		/* re-init cmlb w/ latest info we got from backend */
1800 		if (cmlb_attach(devi, &xdf_lb_ops,
1801 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1802 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1803 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1804 		    vdp->xdf_vd_lbl, NULL) != 0) {
1805 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1806 			    ddi_get_name_addr(devi));
1807 			return (DDI_FAILURE);
1808 		}
1809 	}
1810 
1811 	/* mark vbd is ready for I/O */
1812 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1813 	mutex_enter(&vdp->xdf_dev_lk);
1814 	vdp->xdf_status = XD_READY;
1815 	mutex_exit(&vdp->xdf_dev_lk);
1816 	/*
1817 	 * If backend has feature-barrier, see if it supports disk
1818 	 * cache flush op.
1819 	 */
1820 	vdp->xdf_flush_supported = 0;
1821 	if (vdp->xdf_feature_barrier) {
1822 		/*
1823 		 * Pretend we already know flush is supported so probe
1824 		 * will attempt the correct op.
1825 		 */
1826 		vdp->xdf_flush_supported = 1;
1827 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1828 			vdp->xdf_flush_supported = 1;
1829 		} else {
1830 			vdp->xdf_flush_supported = 0;
1831 			/*
1832 			 * If the other end does not support the cache flush op
1833 			 * then we must use a barrier-write to force disk
1834 			 * cache flushing.  Barrier writes require that a data
1835 			 * block actually be written.
1836 			 * Cache a block to barrier-write when we are
1837 			 * asked to perform a flush.
1838 			 * XXX - would it be better to just copy 1 block
1839 			 * (512 bytes) from whatever write we did last
1840 			 * and rewrite that block?
1841 			 */
1842 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1843 				return (DDI_FAILURE);
1844 		}
1845 	}
1846 
1847 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1848 	    (uint64_t)vdp->xdf_xdev_nblocks);
1849 
1850 	return (DDI_SUCCESS);
1851 }
1852 
1853 /*
1854  * Finish other uninitialization after we've disconnected from backend
1855  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1856  */
1857 static void
1858 xdf_post_disconnect(xdf_t *vdp)
1859 {
1860 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1861 	xvdi_free_evtchn(vdp->xdf_dip);
1862 	xvdi_free_ring(vdp->xdf_xb_ring);
1863 	vdp->xdf_xb_ring = NULL;
1864 	vdp->xdf_xb_ring_hdl = NULL;
1865 	vdp->xdf_peer = (domid_t)-1;
1866 
1867 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1868 	mutex_enter(&vdp->xdf_dev_lk);
1869 	vdp->xdf_status = XD_CLOSED;
1870 	mutex_exit(&vdp->xdf_dev_lk);
1871 }
1872 
1873 /*ARGSUSED*/
1874 static void
1875 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1876 {
1877 	XenbusState new_state = *(XenbusState *)impl_data;
1878 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1879 	boolean_t unexpect_die = B_FALSE;
1880 	int status;
1881 
1882 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1883 	    ddi_get_name_addr(dip), new_state));
1884 
1885 	mutex_enter(&vdp->xdf_cb_lk);
1886 
1887 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1888 		mutex_exit(&vdp->xdf_cb_lk);
1889 		return;
1890 	}
1891 
1892 	switch (new_state) {
1893 	case XenbusStateInitialising:
1894 		ASSERT(vdp->xdf_status == XD_CLOSED);
1895 		/*
1896 		 * backend recovered from a previous failure,
1897 		 * kick-off connect process again
1898 		 */
1899 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
1900 			cmn_err(CE_WARN, "xdf@%s:"
1901 			    " failed to start reconnecting to backend",
1902 			    ddi_get_name_addr(dip));
1903 		}
1904 		break;
1905 	case XenbusStateConnected:
1906 		ASSERT(vdp->xdf_status == XD_INIT);
1907 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1908 		/* finish final init after connect */
1909 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
1910 			(void) xdf_start_disconnect(vdp);
1911 		break;
1912 	case XenbusStateClosing:
1913 		if (vdp->xdf_status == XD_READY) {
1914 			mutex_enter(&vdp->xdf_dev_lk);
1915 			if (xdf_isopen(vdp, -1)) {
1916 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
1917 				    "still in use", ddi_get_name_addr(dip));
1918 				mutex_exit(&vdp->xdf_dev_lk);
1919 				break;
1920 			} else {
1921 				vdp->xdf_status = XD_CLOSING;
1922 			}
1923 			mutex_exit(&vdp->xdf_dev_lk);
1924 		}
1925 		(void) xdf_start_disconnect(vdp);
1926 		break;
1927 	case XenbusStateClosed:
1928 		/* first check if BE closed unexpectedly */
1929 		mutex_enter(&vdp->xdf_dev_lk);
1930 		if (xdf_isopen(vdp, -1)) {
1931 			unexpect_die = B_TRUE;
1932 			unexpectedie(vdp);
1933 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
1934 			    "reconnecting...", ddi_get_name_addr(dip));
1935 		}
1936 		mutex_exit(&vdp->xdf_dev_lk);
1937 
1938 		if (vdp->xdf_status == XD_READY) {
1939 			mutex_enter(&vdp->xdf_dev_lk);
1940 			vdp->xdf_status = XD_CLOSING;
1941 			mutex_exit(&vdp->xdf_dev_lk);
1942 
1943 #ifdef	DOMU_BACKEND
1944 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1945 #endif
1946 
1947 			xdf_post_disconnect(vdp);
1948 			(void) xvdi_switch_state(dip, XBT_NULL,
1949 			    XenbusStateClosed);
1950 		} else if ((vdp->xdf_status == XD_INIT) ||
1951 		    (vdp->xdf_status == XD_CLOSING)) {
1952 			xdf_post_disconnect(vdp);
1953 		} else {
1954 			mutex_enter(&vdp->xdf_dev_lk);
1955 			vdp->xdf_status = XD_CLOSED;
1956 			mutex_exit(&vdp->xdf_dev_lk);
1957 		}
1958 	}
1959 
1960 	/* notify anybody waiting for oe state change */
1961 	mutex_enter(&vdp->xdf_dev_lk);
1962 	cv_broadcast(&vdp->xdf_dev_cv);
1963 	mutex_exit(&vdp->xdf_dev_lk);
1964 
1965 	status = vdp->xdf_status;
1966 	mutex_exit(&vdp->xdf_cb_lk);
1967 
1968 	if (status == XD_READY) {
1969 		xdf_iostart(vdp);
1970 	} else if ((status == XD_CLOSED) && !unexpect_die) {
1971 		/* interface is closed successfully, remove all minor nodes */
1972 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
1973 		cmlb_free_handle(&vdp->xdf_vd_lbl);
1974 	}
1975 }
1976 
1977 /* check if partition is open, -1 - check all partitions on the disk */
1978 static boolean_t
1979 xdf_isopen(xdf_t *vdp, int partition)
1980 {
1981 	int i;
1982 	ulong_t parbit;
1983 	boolean_t rval = B_FALSE;
1984 
1985 	if (partition == -1)
1986 		parbit = (ulong_t)-1;
1987 	else
1988 		parbit = 1 << partition;
1989 
1990 	for (i = 0; i < OTYPCNT; i++) {
1991 		if (vdp->xdf_vd_open[i] & parbit)
1992 			rval = B_TRUE;
1993 	}
1994 
1995 	return (rval);
1996 }
1997 
1998 /*
1999  * Xdf_check_state_transition will check the XenbusState change to see
2000  * if the change is a valid transition or not.
2001  * The new state is written by backend domain, or by running xenstore-write
2002  * to change it manually in dom0
2003  */
2004 static int
2005 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2006 {
2007 	int status;
2008 	int stcheck;
2009 #define	STOK	0 /* need further process */
2010 #define	STNOP	1 /* no action need taking */
2011 #define	STBUG	2 /* unexpected state change, could be a bug */
2012 
2013 	status = vdp->xdf_status;
2014 	stcheck = STOK;
2015 
2016 	switch (status) {
2017 	case XD_UNKNOWN:
2018 		if ((oestate == XenbusStateUnknown)		||
2019 		    (oestate == XenbusStateConnected))
2020 			stcheck = STBUG;
2021 		else if ((oestate == XenbusStateInitialising)	||
2022 		    (oestate == XenbusStateInitWait)		||
2023 		    (oestate == XenbusStateInitialised))
2024 			stcheck = STNOP;
2025 		break;
2026 	case XD_INIT:
2027 		if (oestate == XenbusStateUnknown)
2028 			stcheck = STBUG;
2029 		else if ((oestate == XenbusStateInitialising)	||
2030 		    (oestate == XenbusStateInitWait)		||
2031 		    (oestate == XenbusStateInitialised))
2032 			stcheck = STNOP;
2033 		break;
2034 	case XD_READY:
2035 		if ((oestate == XenbusStateUnknown)		||
2036 		    (oestate == XenbusStateInitialising)	||
2037 		    (oestate == XenbusStateInitWait)		||
2038 		    (oestate == XenbusStateInitialised))
2039 			stcheck = STBUG;
2040 		else if (oestate == XenbusStateConnected)
2041 			stcheck = STNOP;
2042 		break;
2043 	case XD_CLOSING:
2044 		if ((oestate == XenbusStateUnknown)		||
2045 		    (oestate == XenbusStateInitialising)	||
2046 		    (oestate == XenbusStateInitWait)		||
2047 		    (oestate == XenbusStateInitialised)		||
2048 		    (oestate == XenbusStateConnected))
2049 			stcheck = STBUG;
2050 		else if (oestate == XenbusStateClosing)
2051 			stcheck = STNOP;
2052 		break;
2053 	case XD_CLOSED:
2054 		if ((oestate == XenbusStateUnknown)		||
2055 		    (oestate == XenbusStateConnected))
2056 			stcheck = STBUG;
2057 		else if ((oestate == XenbusStateInitWait)	||
2058 		    (oestate == XenbusStateInitialised)		||
2059 		    (oestate == XenbusStateClosing)		||
2060 		    (oestate == XenbusStateClosed))
2061 			stcheck = STNOP;
2062 		break;
2063 	case XD_SUSPEND:
2064 	default:
2065 			stcheck = STBUG;
2066 	}
2067 
2068 	if (stcheck == STOK)
2069 		return (DDI_SUCCESS);
2070 
2071 	if (stcheck == STBUG)
2072 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2073 		    "state change to %d!, when status is %d",
2074 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2075 
2076 	return (DDI_FAILURE);
2077 }
2078 
2079 static int
2080 xdf_connect(xdf_t *vdp, boolean_t wait)
2081 {
2082 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2083 	while (vdp->xdf_status != XD_READY) {
2084 		if (!wait || (vdp->xdf_status > XD_READY))
2085 			break;
2086 
2087 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2088 			break;
2089 	}
2090 
2091 	return (vdp->xdf_status);
2092 }
2093 
2094 /*
2095  * callback func when DMA/GTE resources is available
2096  *
2097  * Note: we only register one callback function to grant table subsystem
2098  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2099  */
2100 static int
2101 xdf_dmacallback(caddr_t arg)
2102 {
2103 	xdf_t *vdp = (xdf_t *)arg;
2104 	ASSERT(vdp != NULL);
2105 
2106 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2107 	    ddi_get_name_addr(vdp->xdf_dip)));
2108 
2109 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2110 	return (DDI_DMA_CALLBACK_DONE);
2111 }
2112 
2113 static uint_t
2114 xdf_iorestart(caddr_t arg)
2115 {
2116 	xdf_t *vdp = (xdf_t *)arg;
2117 
2118 	ASSERT(vdp != NULL);
2119 
2120 	mutex_enter(&vdp->xdf_dev_lk);
2121 	ASSERT(ISDMACBON(vdp));
2122 	SETDMACBOFF(vdp);
2123 	mutex_exit(&vdp->xdf_dev_lk);
2124 
2125 	xdf_iostart(vdp);
2126 
2127 	return (DDI_INTR_CLAIMED);
2128 }
2129 
2130 static void
2131 xdf_timeout_handler(void *arg)
2132 {
2133 	xdf_t *vdp = arg;
2134 
2135 	mutex_enter(&vdp->xdf_dev_lk);
2136 	vdp->xdf_timeout_id = 0;
2137 	mutex_exit(&vdp->xdf_dev_lk);
2138 
2139 	/* new timeout thread could be re-scheduled */
2140 	xdf_iostart(vdp);
2141 }
2142 
2143 /*
2144  * Alloc a vreq for this bp
2145  * bp->av_back contains the pointer to the vreq upon return
2146  */
2147 static v_req_t *
2148 vreq_get(xdf_t *vdp, buf_t *bp)
2149 {
2150 	v_req_t *vreq = NULL;
2151 
2152 	ASSERT(BP2VREQ(bp) == NULL);
2153 
2154 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2155 	if (vreq == NULL) {
2156 		if (vdp->xdf_timeout_id == 0)
2157 			/* restart I/O after one second */
2158 			vdp->xdf_timeout_id =
2159 			    timeout(xdf_timeout_handler, vdp, hz);
2160 		return (NULL);
2161 	}
2162 	bzero(vreq, sizeof (v_req_t));
2163 
2164 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2165 	bp->av_back = (buf_t *)vreq;
2166 	vreq->v_buf = bp;
2167 	vreq->v_status = VREQ_INIT;
2168 	/* init of other fields in vreq is up to the caller */
2169 
2170 	return (vreq);
2171 }
2172 
2173 static void
2174 vreq_free(xdf_t *vdp, v_req_t *vreq)
2175 {
2176 	buf_t *bp = vreq->v_buf;
2177 
2178 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2179 
2180 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2181 		goto done;
2182 
2183 	switch (vreq->v_status) {
2184 	case VREQ_DMAWIN_DONE:
2185 	case VREQ_GS_ALLOCED:
2186 	case VREQ_DMABUF_BOUND:
2187 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2188 		/*FALLTHRU*/
2189 	case VREQ_DMAMEM_ALLOCED:
2190 		if (!ALIGNED_XFER(bp)) {
2191 			ASSERT(vreq->v_abuf != NULL);
2192 			if (!IS_ERROR(bp) && IS_READ(bp))
2193 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2194 				    bp->b_bcount);
2195 			ddi_dma_mem_free(&vreq->v_align);
2196 		}
2197 		/*FALLTHRU*/
2198 	case VREQ_MEMDMAHDL_ALLOCED:
2199 		if (!ALIGNED_XFER(bp))
2200 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2201 		/*FALLTHRU*/
2202 	case VREQ_DMAHDL_ALLOCED:
2203 		ddi_dma_free_handle(&vreq->v_dmahdl);
2204 		break;
2205 	default:
2206 		break;
2207 	}
2208 done:
2209 	vreq->v_buf->av_back = NULL;
2210 	kmem_cache_free(xdf_vreq_cache, vreq);
2211 }
2212 
2213 /*
2214  * Initalize the DMA and grant table resources for the buf
2215  */
2216 static int
2217 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2218 {
2219 	int rc;
2220 	ddi_dma_attr_t dmaattr;
2221 	uint_t ndcs, ndws;
2222 	ddi_dma_handle_t dh;
2223 	ddi_dma_handle_t mdh;
2224 	ddi_dma_cookie_t dc;
2225 	ddi_acc_handle_t abh;
2226 	caddr_t	aba;
2227 	ge_slot_t *gs;
2228 	size_t bufsz;
2229 	off_t off;
2230 	size_t sz;
2231 	buf_t *bp = vreq->v_buf;
2232 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2233 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2234 
2235 	switch (vreq->v_status) {
2236 	case VREQ_INIT:
2237 		if (IS_FLUSH_DISKCACHE(bp)) {
2238 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2239 				DPRINTF(DMA_DBG, (
2240 				    "xdf@%s: get ge_slotfailed\n",
2241 				    ddi_get_name_addr(vdp->xdf_dip)));
2242 				return (DDI_FAILURE);
2243 			}
2244 			vreq->v_blkno = 0;
2245 			vreq->v_nslots = 1;
2246 			vreq->v_gs = gs;
2247 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2248 			vreq->v_status = VREQ_GS_ALLOCED;
2249 			gs->vreq = vreq;
2250 			return (DDI_SUCCESS);
2251 		}
2252 
2253 		if (IS_WRITE_BARRIER(vdp, bp))
2254 			vreq->v_flush_diskcache = WRITE_BARRIER;
2255 		vreq->v_blkno = bp->b_blkno +
2256 		    (diskaddr_t)(uintptr_t)bp->b_private;
2257 		bp->b_private = NULL;
2258 		/* See if we wrote new data to our flush block */
2259 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2260 			check_fbwrite(vdp, bp, vreq->v_blkno);
2261 		vreq->v_status = VREQ_INIT_DONE;
2262 		/*FALLTHRU*/
2263 
2264 	case VREQ_INIT_DONE:
2265 		/*
2266 		 * alloc DMA handle
2267 		 */
2268 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2269 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2270 		if (rc != DDI_SUCCESS) {
2271 			SETDMACBON(vdp);
2272 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2273 			    ddi_get_name_addr(vdp->xdf_dip)));
2274 			return (DDI_FAILURE);
2275 		}
2276 
2277 		vreq->v_dmahdl = dh;
2278 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2279 		/*FALLTHRU*/
2280 
2281 	case VREQ_DMAHDL_ALLOCED:
2282 		/*
2283 		 * alloc dma handle for 512-byte aligned buf
2284 		 */
2285 		if (!ALIGNED_XFER(bp)) {
2286 			/*
2287 			 * XXPV: we need to temporarily enlarge the seg
2288 			 * boundary and s/g length to work round CR6381968
2289 			 */
2290 			dmaattr = xb_dma_attr;
2291 			dmaattr.dma_attr_seg = (uint64_t)-1;
2292 			dmaattr.dma_attr_sgllen = INT_MAX;
2293 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2294 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2295 			if (rc != DDI_SUCCESS) {
2296 				SETDMACBON(vdp);
2297 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2298 				    "handle alloc failed\n",
2299 				    ddi_get_name_addr(vdp->xdf_dip)));
2300 				return (DDI_FAILURE);
2301 			}
2302 			vreq->v_memdmahdl = mdh;
2303 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2304 		}
2305 		/*FALLTHRU*/
2306 
2307 	case VREQ_MEMDMAHDL_ALLOCED:
2308 		/*
2309 		 * alloc 512-byte aligned buf
2310 		 */
2311 		if (!ALIGNED_XFER(bp)) {
2312 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2313 				bp_mapin(bp);
2314 
2315 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2316 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2317 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2318 			    &aba, &bufsz, &abh);
2319 			if (rc != DDI_SUCCESS) {
2320 				SETDMACBON(vdp);
2321 				DPRINTF(DMA_DBG, (
2322 				    "xdf@%s: DMA mem allocation failed\n",
2323 				    ddi_get_name_addr(vdp->xdf_dip)));
2324 				return (DDI_FAILURE);
2325 			}
2326 
2327 			vreq->v_abuf = aba;
2328 			vreq->v_align = abh;
2329 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2330 
2331 			ASSERT(bufsz >= bp->b_bcount);
2332 			if (!IS_READ(bp))
2333 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2334 				    bp->b_bcount);
2335 		}
2336 		/*FALLTHRU*/
2337 
2338 	case VREQ_DMAMEM_ALLOCED:
2339 		/*
2340 		 * dma bind
2341 		 */
2342 		if (ALIGNED_XFER(bp)) {
2343 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2344 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2345 			    &dc, &ndcs);
2346 		} else {
2347 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2348 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2349 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2350 		}
2351 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2352 			/* get num of dma windows */
2353 			if (rc == DDI_DMA_PARTIAL_MAP) {
2354 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2355 				ASSERT(rc == DDI_SUCCESS);
2356 			} else {
2357 				ndws = 1;
2358 			}
2359 		} else {
2360 			SETDMACBON(vdp);
2361 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2362 			    ddi_get_name_addr(vdp->xdf_dip)));
2363 			return (DDI_FAILURE);
2364 		}
2365 
2366 		vreq->v_dmac = dc;
2367 		vreq->v_dmaw = 0;
2368 		vreq->v_ndmacs = ndcs;
2369 		vreq->v_ndmaws = ndws;
2370 		vreq->v_nslots = ndws;
2371 		vreq->v_status = VREQ_DMABUF_BOUND;
2372 		/*FALLTHRU*/
2373 
2374 	case VREQ_DMABUF_BOUND:
2375 		/*
2376 		 * get ge_slot, callback is set upon failure from gs_get(),
2377 		 * if not set previously
2378 		 */
2379 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2380 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2381 			    ddi_get_name_addr(vdp->xdf_dip)));
2382 			return (DDI_FAILURE);
2383 		}
2384 
2385 		vreq->v_gs = gs;
2386 		gs->vreq = vreq;
2387 		vreq->v_status = VREQ_GS_ALLOCED;
2388 		break;
2389 
2390 	case VREQ_GS_ALLOCED:
2391 		/* nothing need to be done */
2392 		break;
2393 
2394 	case VREQ_DMAWIN_DONE:
2395 		/*
2396 		 * move to the next dma window
2397 		 */
2398 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2399 
2400 		/* get a ge_slot for this DMA window */
2401 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2402 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2403 			    ddi_get_name_addr(vdp->xdf_dip)));
2404 			return (DDI_FAILURE);
2405 		}
2406 
2407 		vreq->v_gs = gs;
2408 		gs->vreq = vreq;
2409 		vreq->v_dmaw++;
2410 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2411 		    &vreq->v_dmac, &vreq->v_ndmacs);
2412 		ASSERT(rc == DDI_SUCCESS);
2413 		vreq->v_status = VREQ_GS_ALLOCED;
2414 		break;
2415 
2416 	default:
2417 		return (DDI_FAILURE);
2418 	}
2419 
2420 	return (DDI_SUCCESS);
2421 }
2422 
2423 static ge_slot_t *
2424 gs_get(xdf_t *vdp, int isread)
2425 {
2426 	grant_ref_t gh;
2427 	ge_slot_t *gs;
2428 
2429 	/* try to alloc GTEs needed in this slot, first */
2430 	if (gnttab_alloc_grant_references(
2431 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2432 		if (vdp->xdf_gnt_callback.next == NULL) {
2433 			SETDMACBON(vdp);
2434 			gnttab_request_free_callback(
2435 			    &vdp->xdf_gnt_callback,
2436 			    (void (*)(void *))xdf_dmacallback,
2437 			    (void *)vdp,
2438 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2439 		}
2440 		return (NULL);
2441 	}
2442 
2443 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2444 	if (gs == NULL) {
2445 		gnttab_free_grant_references(gh);
2446 		if (vdp->xdf_timeout_id == 0)
2447 			/* restart I/O after one second */
2448 			vdp->xdf_timeout_id =
2449 			    timeout(xdf_timeout_handler, vdp, hz);
2450 		return (NULL);
2451 	}
2452 
2453 	/* init gs_slot */
2454 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2455 	gs->oeid = vdp->xdf_peer;
2456 	gs->isread = isread;
2457 	gs->ghead = gh;
2458 	gs->ngrefs = 0;
2459 
2460 	return (gs);
2461 }
2462 
2463 static void
2464 gs_free(xdf_t *vdp, ge_slot_t *gs)
2465 {
2466 	int i;
2467 	grant_ref_t *gp = gs->ge;
2468 	int ngrefs = gs->ngrefs;
2469 	boolean_t isread = gs->isread;
2470 
2471 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2472 
2473 	/* release all grant table entry resources used in this slot */
2474 	for (i = 0; i < ngrefs; i++, gp++)
2475 		gnttab_end_foreign_access(*gp, !isread, 0);
2476 	gnttab_free_grant_references(gs->ghead);
2477 
2478 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2479 }
2480 
2481 static grant_ref_t
2482 gs_grant(ge_slot_t *gs, mfn_t mfn)
2483 {
2484 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2485 
2486 	ASSERT(gr != -1);
2487 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2488 	gs->ge[gs->ngrefs++] = gr;
2489 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2490 
2491 	return (gr);
2492 }
2493 
2494 static void
2495 unexpectedie(xdf_t *vdp)
2496 {
2497 	/* clean up I/Os in ring that have responses */
2498 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2499 		mutex_exit(&vdp->xdf_dev_lk);
2500 		(void) xdf_intr((caddr_t)vdp);
2501 		mutex_enter(&vdp->xdf_dev_lk);
2502 	}
2503 
2504 	/* free up all grant table entries */
2505 	while (!list_is_empty(&vdp->xdf_gs_act))
2506 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2507 
2508 	/*
2509 	 * move bp back to active list orderly
2510 	 * vreq_busy is updated in vreq_free()
2511 	 */
2512 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2513 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2514 		buf_t *bp = vreq->v_buf;
2515 
2516 		bp->av_back = NULL;
2517 		bp->b_resid = bp->b_bcount;
2518 		if (vdp->xdf_f_act == NULL) {
2519 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2520 		} else {
2521 			/* move to the head of list */
2522 			bp->av_forw = vdp->xdf_f_act;
2523 			vdp->xdf_f_act = bp;
2524 		}
2525 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2526 		vreq_free(vdp, vreq);
2527 	}
2528 }
2529 
2530 static void
2531 xdfmin(struct buf *bp)
2532 {
2533 	if (bp->b_bcount > xdf_maxphys)
2534 		bp->b_bcount = xdf_maxphys;
2535 }
2536