xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision a04cabea38af23dd1a0e76c56ca44260af2285e4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  *
33  * This driver exports Solaris disk device nodes, accepts IO requests from
34  * those nodes, and services those requests by talking to a backend device
35  * in another domain.
36  *
37  * Communication with the backend device is done via a ringbuffer (which is
38  * managed via xvdi interfaces) and dma memory (which is managed via ddi
39  * interfaces).
40  *
41  * Communication with the backend device is dependent upon establishing a
42  * connection to the backend device.  This connection process involves
43  * reading device configuration information from xenbus and publishing
44  * some frontend runtime configuration parameters via the xenbus (for
45  * consumption by the backend).  Once we've published runtime configuration
46  * information via the xenbus, the backend device can enter the connected
47  * state and we'll enter the XD_CONNECTED state.  But before we can allow
48  * random IO to begin, we need to do IO to the backend device to determine
49  * the device label and if flush operations are supported.  Once this is
50  * done we enter the XD_READY state and can process any IO operations.
51  *
52  * We receive notifications of xenbus state changes for the backend device
53  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
54  * is single threaded, meaning that we can't receive new notification of
55  * other end state changes while we're processing an outstanding
56  * notification of an other end state change.  Therefore we can't do any
57  * blocking operations from the xdf_oe_change() callback.  This is why we
58  * have a separate taskq (xdf_ready_tq) which exists to do the necessary
59  * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
60  * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
61  * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
62  * generated by the xdf_ready_tq_thread thread have priority over all
63  * other IO requests.
64  *
65  * We also communicate with the backend device via the xenbus "media-req"
66  * (XBP_MEDIA_REQ) property.  For more information on this see the
67  * comments in blkif.h.
68  */
69 
70 #include <io/xdf.h>
71 
72 #include <sys/conf.h>
73 #include <sys/dkio.h>
74 #include <sys/promif.h>
75 #include <sys/sysmacros.h>
76 #include <sys/kstat.h>
77 #include <sys/mach_mmu.h>
78 #ifdef XPV_HVM_DRIVER
79 #include <sys/xpv_support.h>
80 #include <sys/sunndi.h>
81 #else /* !XPV_HVM_DRIVER */
82 #include <sys/evtchn_impl.h>
83 #endif /* !XPV_HVM_DRIVER */
84 #include <public/io/xenbus.h>
85 #include <xen/sys/xenbus_impl.h>
86 #include <sys/scsi/generic/inquiry.h>
87 #include <xen/io/blkif_impl.h>
88 #include <sys/fdio.h>
89 #include <sys/cdio.h>
90 
/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements.  On non-DEBUG builds the
 * macro expands to nothing, so its argument is not evaluated at all.
 */
#ifdef DEBUG
#define	DEBUG_EVAL(x)	(x)
#else /* !DEBUG */
#define	DEBUG_EVAL(x)
#endif /* !DEBUG */

/*
 * Polling parameters used by xdf_ring_drain_locked().
 *
 * NOTE: despite its name, XDF_DRAIN_MSEC_DELAY is expressed in
 * microseconds: it is handed to drv_usectohz().  50*1000 usec is the
 * 0.05 sec noted below, and XDF_DRAIN_RETRY_COUNT polls at that interval
 * give the 10 sec upper bound.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */

#define	INVALID_DOMID	((domid_t)-1)

/* values stored in v_req_t.v_flush_diskcache by vreq_setup() */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/*
 * Both flush mechanisms require xdf_feature_barrier; xdf_flush_supported
 * then selects between an explicit cache flush and a write barrier.
 */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A write barrier request is recognised by its data pointer: it is a
 * write whose buffer is the driver's own xdf_cache_flush_block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A cache flush request is a zero length write.
 *
 * NOTE(review): this macro references 'vdp' although it only takes 'bp'
 * as a parameter, so it can only be used where a variable named vdp is
 * in scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * A request is done when its current DMA window has completed and either
 * it is a cache flush request or that window was the last one.
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t associated with a buf is stashed in the buf's av_back */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
125 
/* consulted by xdf_intr() to decide whether to restart I/O itself */
extern int		do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int		xdf_debug = 0;
volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t			xdf_major;
static void		*xdf_ssp;
static kmem_cache_t	*xdf_vreq_cache;	/* backs v_req_t allocations */
static kmem_cache_t	*xdf_gs_cache;		/* backs ge_slot_t allocations */
static int		xdf_maxphys = XB_MAXPHYS;
static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int		xdf_fbrewrites;	/* flush block re-write count */

/* misc public functions (used by xdf_shell.c) */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);

/* callbacks from common label (handed to cmlb_attach()) */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
154 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * vreq_setup() uses these attributes for the I/O DMA handle, and a
 * relaxed copy of them (larger segment boundary and s/g list length) for
 * the handle backing the bounce buffer of unaligned transfers.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};

/*
 * Device access attributes passed to ddi_dma_mem_alloc() when
 * vreq_setup() allocates the bounce buffer for an unaligned transfer.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
179 
180 static void
181 xdf_timeout_handler(void *arg)
182 {
183 	xdf_t *vdp = arg;
184 
185 	mutex_enter(&vdp->xdf_dev_lk);
186 	vdp->xdf_timeout_id = 0;
187 	mutex_exit(&vdp->xdf_dev_lk);
188 
189 	/* new timeout thread could be re-scheduled */
190 	xdf_io_start(vdp);
191 }
192 
193 /*
194  * callback func when DMA/GTE resources is available
195  *
196  * Note: we only register one callback function to grant table subsystem
197  * since we only have one 'struct gnttab_free_callback' in xdf_t.
198  */
199 static int
200 xdf_dmacallback(caddr_t arg)
201 {
202 	xdf_t *vdp = (xdf_t *)arg;
203 	ASSERT(vdp != NULL);
204 
205 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
206 	    vdp->xdf_addr));
207 
208 	ddi_trigger_softintr(vdp->xdf_softintr_id);
209 	return (DDI_DMA_CALLBACK_DONE);
210 }
211 
212 static ge_slot_t *
213 gs_get(xdf_t *vdp, int isread)
214 {
215 	grant_ref_t gh;
216 	ge_slot_t *gs;
217 
218 	/* try to alloc GTEs needed in this slot, first */
219 	if (gnttab_alloc_grant_references(
220 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
221 		if (vdp->xdf_gnt_callback.next == NULL) {
222 			SETDMACBON(vdp);
223 			gnttab_request_free_callback(
224 			    &vdp->xdf_gnt_callback,
225 			    (void (*)(void *))xdf_dmacallback,
226 			    (void *)vdp,
227 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
228 		}
229 		return (NULL);
230 	}
231 
232 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
233 	if (gs == NULL) {
234 		gnttab_free_grant_references(gh);
235 		if (vdp->xdf_timeout_id == 0)
236 			/* restart I/O after one second */
237 			vdp->xdf_timeout_id =
238 			    timeout(xdf_timeout_handler, vdp, hz);
239 		return (NULL);
240 	}
241 
242 	/* init gs_slot */
243 	gs->gs_oeid = vdp->xdf_peer;
244 	gs->gs_isread = isread;
245 	gs->gs_ghead = gh;
246 	gs->gs_ngrefs = 0;
247 
248 	return (gs);
249 }
250 
251 static void
252 gs_free(ge_slot_t *gs)
253 {
254 	int		i;
255 
256 	/* release all grant table entry resources used in this slot */
257 	for (i = 0; i < gs->gs_ngrefs; i++)
258 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
259 	gnttab_free_grant_references(gs->gs_ghead);
260 	list_remove(&gs->gs_vreq->v_gs, gs);
261 	kmem_cache_free(xdf_gs_cache, gs);
262 }
263 
264 static grant_ref_t
265 gs_grant(ge_slot_t *gs, mfn_t mfn)
266 {
267 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
268 
269 	ASSERT(gr != -1);
270 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
271 	gs->gs_ge[gs->gs_ngrefs++] = gr;
272 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
273 
274 	return (gr);
275 }
276 
277 /*
278  * Alloc a vreq for this bp
279  * bp->av_back contains the pointer to the vreq upon return
280  */
281 static v_req_t *
282 vreq_get(xdf_t *vdp, buf_t *bp)
283 {
284 	v_req_t *vreq = NULL;
285 
286 	ASSERT(BP_VREQ(bp) == NULL);
287 
288 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
289 	if (vreq == NULL) {
290 		if (vdp->xdf_timeout_id == 0)
291 			/* restart I/O after one second */
292 			vdp->xdf_timeout_id =
293 			    timeout(xdf_timeout_handler, vdp, hz);
294 		return (NULL);
295 	}
296 	bzero(vreq, sizeof (v_req_t));
297 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
298 	    offsetof(ge_slot_t, gs_vreq_link));
299 	vreq->v_buf = bp;
300 	vreq->v_status = VREQ_INIT;
301 	vreq->v_runq = B_FALSE;
302 	BP_VREQ_SET(bp, vreq);
303 	/* init of other fields in vreq is up to the caller */
304 
305 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
306 
307 	return (vreq);
308 }
309 
310 static void
311 vreq_free(xdf_t *vdp, v_req_t *vreq)
312 {
313 	buf_t	*bp = vreq->v_buf;
314 
315 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
316 	ASSERT(BP_VREQ(bp) == vreq);
317 
318 	list_remove(&vdp->xdf_vreq_act, vreq);
319 
320 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
321 		goto done;
322 
323 	switch (vreq->v_status) {
324 	case VREQ_DMAWIN_DONE:
325 	case VREQ_GS_ALLOCED:
326 	case VREQ_DMABUF_BOUND:
327 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
328 		/*FALLTHRU*/
329 	case VREQ_DMAMEM_ALLOCED:
330 		if (!ALIGNED_XFER(bp)) {
331 			ASSERT(vreq->v_abuf != NULL);
332 			if (!IS_ERROR(bp) && IS_READ(bp))
333 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
334 				    bp->b_bcount);
335 			ddi_dma_mem_free(&vreq->v_align);
336 		}
337 		/*FALLTHRU*/
338 	case VREQ_MEMDMAHDL_ALLOCED:
339 		if (!ALIGNED_XFER(bp))
340 			ddi_dma_free_handle(&vreq->v_memdmahdl);
341 		/*FALLTHRU*/
342 	case VREQ_DMAHDL_ALLOCED:
343 		ddi_dma_free_handle(&vreq->v_dmahdl);
344 		break;
345 	default:
346 		break;
347 	}
348 done:
349 	ASSERT(!vreq->v_runq);
350 	list_destroy(&vreq->v_gs);
351 	kmem_cache_free(xdf_vreq_cache, vreq);
352 }
353 
354 /*
355  * Snarf new data if our flush block was re-written
356  */
357 static void
358 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
359 {
360 	int nblks;
361 	boolean_t mapin;
362 
363 	if (IS_WRITE_BARRIER(vdp, bp))
364 		return; /* write was a flush write */
365 
366 	mapin = B_FALSE;
367 	nblks = bp->b_bcount >> DEV_BSHIFT;
368 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
369 		xdf_fbrewrites++;
370 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
371 			mapin = B_TRUE;
372 			bp_mapin(bp);
373 		}
374 		bcopy(bp->b_un.b_addr +
375 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
376 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
377 		if (mapin)
378 			bp_mapout(bp);
379 	}
380 }
381 
382 /*
383  * Initalize the DMA and grant table resources for the buf
384  */
385 static int
386 vreq_setup(xdf_t *vdp, v_req_t *vreq)
387 {
388 	int rc;
389 	ddi_dma_attr_t dmaattr;
390 	uint_t ndcs, ndws;
391 	ddi_dma_handle_t dh;
392 	ddi_dma_handle_t mdh;
393 	ddi_dma_cookie_t dc;
394 	ddi_acc_handle_t abh;
395 	caddr_t	aba;
396 	ge_slot_t *gs;
397 	size_t bufsz;
398 	off_t off;
399 	size_t sz;
400 	buf_t *bp = vreq->v_buf;
401 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
402 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
403 
404 	switch (vreq->v_status) {
405 	case VREQ_INIT:
406 		if (IS_FLUSH_DISKCACHE(bp)) {
407 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
408 				DPRINTF(DMA_DBG, ("xdf@%s: "
409 				    "get ge_slotfailed\n", vdp->xdf_addr));
410 				return (DDI_FAILURE);
411 			}
412 			vreq->v_blkno = 0;
413 			vreq->v_nslots = 1;
414 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
415 			vreq->v_status = VREQ_GS_ALLOCED;
416 			gs->gs_vreq = vreq;
417 			list_insert_head(&vreq->v_gs, gs);
418 			return (DDI_SUCCESS);
419 		}
420 
421 		if (IS_WRITE_BARRIER(vdp, bp))
422 			vreq->v_flush_diskcache = WRITE_BARRIER;
423 		vreq->v_blkno = bp->b_blkno +
424 		    (diskaddr_t)(uintptr_t)bp->b_private;
425 		/* See if we wrote new data to our flush block */
426 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
427 			check_fbwrite(vdp, bp, vreq->v_blkno);
428 		vreq->v_status = VREQ_INIT_DONE;
429 		/*FALLTHRU*/
430 
431 	case VREQ_INIT_DONE:
432 		/*
433 		 * alloc DMA handle
434 		 */
435 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
436 		    xdf_dmacallback, (caddr_t)vdp, &dh);
437 		if (rc != DDI_SUCCESS) {
438 			SETDMACBON(vdp);
439 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
440 			    vdp->xdf_addr));
441 			return (DDI_FAILURE);
442 		}
443 
444 		vreq->v_dmahdl = dh;
445 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
446 		/*FALLTHRU*/
447 
448 	case VREQ_DMAHDL_ALLOCED:
449 		/*
450 		 * alloc dma handle for 512-byte aligned buf
451 		 */
452 		if (!ALIGNED_XFER(bp)) {
453 			/*
454 			 * XXPV: we need to temporarily enlarge the seg
455 			 * boundary and s/g length to work round CR6381968
456 			 */
457 			dmaattr = xb_dma_attr;
458 			dmaattr.dma_attr_seg = (uint64_t)-1;
459 			dmaattr.dma_attr_sgllen = INT_MAX;
460 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
461 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
462 			if (rc != DDI_SUCCESS) {
463 				SETDMACBON(vdp);
464 				DPRINTF(DMA_DBG, ("xdf@%s: "
465 				    "unaligned buf DMAhandle alloc failed\n",
466 				    vdp->xdf_addr));
467 				return (DDI_FAILURE);
468 			}
469 			vreq->v_memdmahdl = mdh;
470 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
471 		}
472 		/*FALLTHRU*/
473 
474 	case VREQ_MEMDMAHDL_ALLOCED:
475 		/*
476 		 * alloc 512-byte aligned buf
477 		 */
478 		if (!ALIGNED_XFER(bp)) {
479 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
480 				bp_mapin(bp);
481 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
482 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
483 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
484 			    &aba, &bufsz, &abh);
485 			if (rc != DDI_SUCCESS) {
486 				SETDMACBON(vdp);
487 				DPRINTF(DMA_DBG, ("xdf@%s: "
488 				    "DMA mem allocation failed\n",
489 				    vdp->xdf_addr));
490 				return (DDI_FAILURE);
491 			}
492 
493 			vreq->v_abuf = aba;
494 			vreq->v_align = abh;
495 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
496 
497 			ASSERT(bufsz >= bp->b_bcount);
498 			if (!IS_READ(bp))
499 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
500 				    bp->b_bcount);
501 		}
502 		/*FALLTHRU*/
503 
504 	case VREQ_DMAMEM_ALLOCED:
505 		/*
506 		 * dma bind
507 		 */
508 		if (ALIGNED_XFER(bp)) {
509 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
510 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
511 			    &dc, &ndcs);
512 		} else {
513 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
514 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
515 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
516 		}
517 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
518 			/* get num of dma windows */
519 			if (rc == DDI_DMA_PARTIAL_MAP) {
520 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
521 				ASSERT(rc == DDI_SUCCESS);
522 			} else {
523 				ndws = 1;
524 			}
525 		} else {
526 			SETDMACBON(vdp);
527 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
528 			    vdp->xdf_addr));
529 			return (DDI_FAILURE);
530 		}
531 
532 		vreq->v_dmac = dc;
533 		vreq->v_dmaw = 0;
534 		vreq->v_ndmacs = ndcs;
535 		vreq->v_ndmaws = ndws;
536 		vreq->v_nslots = ndws;
537 		vreq->v_status = VREQ_DMABUF_BOUND;
538 		/*FALLTHRU*/
539 
540 	case VREQ_DMABUF_BOUND:
541 		/*
542 		 * get ge_slot, callback is set upon failure from gs_get(),
543 		 * if not set previously
544 		 */
545 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
546 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
547 			    vdp->xdf_addr));
548 			return (DDI_FAILURE);
549 		}
550 
551 		vreq->v_status = VREQ_GS_ALLOCED;
552 		gs->gs_vreq = vreq;
553 		list_insert_head(&vreq->v_gs, gs);
554 		break;
555 
556 	case VREQ_GS_ALLOCED:
557 		/* nothing need to be done */
558 		break;
559 
560 	case VREQ_DMAWIN_DONE:
561 		/*
562 		 * move to the next dma window
563 		 */
564 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
565 
566 		/* get a ge_slot for this DMA window */
567 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
568 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
569 			    vdp->xdf_addr));
570 			return (DDI_FAILURE);
571 		}
572 
573 		vreq->v_dmaw++;
574 		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
575 		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
576 		vreq->v_status = VREQ_GS_ALLOCED;
577 		gs->gs_vreq = vreq;
578 		list_insert_head(&vreq->v_gs, gs);
579 		break;
580 
581 	default:
582 		return (DDI_FAILURE);
583 	}
584 
585 	return (DDI_SUCCESS);
586 }
587 
588 static int
589 xdf_cmlb_attach(xdf_t *vdp)
590 {
591 	dev_info_t	*dip = vdp->xdf_dip;
592 
593 	return (cmlb_attach(dip, &xdf_lb_ops,
594 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
595 	    XD_IS_RM(vdp),
596 	    B_TRUE,
597 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
598 #if defined(XPV_HVM_DRIVER)
599 	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
600 	    CMLB_INTERNAL_MINOR_NODES,
601 #else /* !XPV_HVM_DRIVER */
602 	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
603 #endif /* !XPV_HVM_DRIVER */
604 	    vdp->xdf_vd_lbl, NULL));
605 }
606 
607 static void
608 xdf_io_err(buf_t *bp, int err, size_t resid)
609 {
610 	bioerror(bp, err);
611 	if (resid == 0)
612 		bp->b_resid = bp->b_bcount;
613 	biodone(bp);
614 }
615 
616 static void
617 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
618 {
619 	v_req_t *vreq = BP_VREQ(bp);
620 
621 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
622 
623 	if (vdp->xdf_xdev_iostat == NULL)
624 		return;
625 	if ((vreq != NULL) && vreq->v_runq) {
626 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
627 	} else {
628 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
629 	}
630 }
631 
632 static void
633 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
634 {
635 	v_req_t *vreq = BP_VREQ(bp);
636 
637 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
638 
639 	if (vdp->xdf_xdev_iostat == NULL)
640 		return;
641 
642 	if ((vreq != NULL) && vreq->v_runq) {
643 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
644 	} else {
645 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
646 	}
647 
648 	if (bp->b_flags & B_READ) {
649 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
650 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
651 	} else if (bp->b_flags & B_WRITE) {
652 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
653 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
654 	}
655 }
656 
657 static void
658 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
659 {
660 	v_req_t *vreq = BP_VREQ(bp);
661 
662 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
663 	ASSERT(!vreq->v_runq);
664 
665 	vreq->v_runq = B_TRUE;
666 	if (vdp->xdf_xdev_iostat == NULL)
667 		return;
668 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
669 }
670 
671 static void
672 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
673 {
674 	v_req_t *vreq = BP_VREQ(bp);
675 
676 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
677 	ASSERT(vreq->v_runq);
678 
679 	vreq->v_runq = B_FALSE;
680 	if (vdp->xdf_xdev_iostat == NULL)
681 		return;
682 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
683 }
684 
685 int
686 xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
687 {
688 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
689 	kstat_t		*kstat;
690 	buf_t		*bp;
691 
692 	if ((kstat = kstat_create(
693 	    ks_module, instance, NULL, "disk",
694 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
695 		return (-1);
696 
697 	/* See comment about locking in xdf_kstat_delete(). */
698 	mutex_enter(&vdp->xdf_iostat_lk);
699 	mutex_enter(&vdp->xdf_dev_lk);
700 
701 	/* only one kstat can exist at a time */
702 	if (vdp->xdf_xdev_iostat != NULL) {
703 		mutex_exit(&vdp->xdf_dev_lk);
704 		mutex_exit(&vdp->xdf_iostat_lk);
705 		kstat_delete(kstat);
706 		return (-1);
707 	}
708 
709 	vdp->xdf_xdev_iostat = kstat;
710 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
711 	kstat_install(vdp->xdf_xdev_iostat);
712 
713 	/*
714 	 * Now that we've created a kstat, we need to update the waitq and
715 	 * runq counts for the kstat to reflect our current state.
716 	 *
717 	 * For a buf_t structure to be on the runq, it must have a ring
718 	 * buffer slot associated with it.  To get a ring buffer slot the
719 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
720 	 * Then when it is granted a ring buffer slot, v_runq will be set to
721 	 * true.
722 	 *
723 	 * For a buf_t structure to be on the waitq, it must not be on the
724 	 * runq.  So to find all the buf_t's that should be on waitq, we
725 	 * walk the active buf list and add any buf_t's which aren't on the
726 	 * runq to the waitq.
727 	 */
728 	bp = vdp->xdf_f_act;
729 	while (bp != NULL) {
730 		xdf_kstat_enter(vdp, bp);
731 		bp = bp->av_forw;
732 	}
733 	if (vdp->xdf_ready_tq_bp != NULL)
734 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
735 
736 	mutex_exit(&vdp->xdf_dev_lk);
737 	mutex_exit(&vdp->xdf_iostat_lk);
738 	return (0);
739 }
740 
741 void
742 xdf_kstat_delete(dev_info_t *dip)
743 {
744 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
745 	kstat_t		*kstat;
746 	buf_t		*bp;
747 
748 	/*
749 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
750 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
751 	 * and the contents of the our kstat.  xdf_iostat_lk is used
752 	 * to protect the allocation and freeing of the actual kstat.
753 	 * xdf_dev_lk can't be used for this purpose because kstat
754 	 * readers use it to access the contents of the kstat and
755 	 * hence it can't be held when calling kstat_delete().
756 	 */
757 	mutex_enter(&vdp->xdf_iostat_lk);
758 	mutex_enter(&vdp->xdf_dev_lk);
759 
760 	if (vdp->xdf_xdev_iostat == NULL) {
761 		mutex_exit(&vdp->xdf_dev_lk);
762 		mutex_exit(&vdp->xdf_iostat_lk);
763 		return;
764 	}
765 
766 	/*
767 	 * We're about to destroy the kstat structures, so it isn't really
768 	 * necessary to update the runq and waitq counts.  But, since this
769 	 * isn't a hot code path we can afford to be a little pedantic and
770 	 * go ahead and decrement the runq and waitq kstat counters to zero
771 	 * before free'ing them.  This helps us ensure that we've gotten all
772 	 * our accounting correct.
773 	 *
774 	 * For an explanation of how we determine which buffers go on the
775 	 * runq vs which go on the waitq, see the comments in
776 	 * xdf_kstat_create().
777 	 */
778 	bp = vdp->xdf_f_act;
779 	while (bp != NULL) {
780 		xdf_kstat_exit(vdp, bp);
781 		bp = bp->av_forw;
782 	}
783 	if (vdp->xdf_ready_tq_bp != NULL)
784 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
785 
786 	kstat = vdp->xdf_xdev_iostat;
787 	vdp->xdf_xdev_iostat = NULL;
788 	mutex_exit(&vdp->xdf_dev_lk);
789 	kstat_delete(kstat);
790 	mutex_exit(&vdp->xdf_iostat_lk);
791 }
792 
793 /*
794  * Add an IO requests onto the active queue.
795  *
796  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
797  * are used to establish a connection to the backend, so they recieve
798  * priority over all other IOs.  Since xdf_ready_tq_thread only does
799  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
800  * given time and we record the buf associated with that request in
801  * xdf_ready_tq_bp.
802  */
803 static void
804 xdf_bp_push(xdf_t *vdp, buf_t *bp)
805 {
806 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
807 	ASSERT(bp->av_forw == NULL);
808 
809 	xdf_kstat_enter(vdp, bp);
810 
811 	if (curthread == vdp->xdf_ready_tq_thread) {
812 		/* new IO requests from the ready thread */
813 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
814 		vdp->xdf_ready_tq_bp = bp;
815 		return;
816 	}
817 
818 	/* this is normal IO request */
819 	ASSERT(bp != vdp->xdf_ready_tq_bp);
820 
821 	if (vdp->xdf_f_act == NULL) {
822 		/* this is only only IO on the active queue */
823 		ASSERT(vdp->xdf_l_act == NULL);
824 		ASSERT(vdp->xdf_i_act == NULL);
825 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
826 		return;
827 	}
828 
829 	/* add this IO to the tail of the active queue */
830 	vdp->xdf_l_act->av_forw = bp;
831 	vdp->xdf_l_act = bp;
832 	if (vdp->xdf_i_act == NULL)
833 		vdp->xdf_i_act = bp;
834 }
835 
836 static void
837 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
838 {
839 	buf_t	*bp_iter;
840 
841 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
842 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
843 
844 	if (vdp->xdf_ready_tq_bp == bp) {
845 		/* we're done with a ready thread IO request */
846 		ASSERT(bp->av_forw == NULL);
847 		vdp->xdf_ready_tq_bp = NULL;
848 		return;
849 	}
850 
851 	/* we're done with a normal IO request */
852 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
853 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
854 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
855 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
856 
857 	if (bp == vdp->xdf_f_act) {
858 		/* This IO was at the head of our active queue. */
859 		vdp->xdf_f_act = bp->av_forw;
860 		if (bp == vdp->xdf_l_act)
861 			vdp->xdf_l_act = NULL;
862 	} else {
863 		/* There IO finished before some other pending IOs. */
864 		bp_iter = vdp->xdf_f_act;
865 		while (bp != bp_iter->av_forw) {
866 			bp_iter = bp_iter->av_forw;
867 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
868 			ASSERT(bp_iter != vdp->xdf_i_act);
869 		}
870 		bp_iter->av_forw = bp->av_forw;
871 		if (bp == vdp->xdf_l_act)
872 			vdp->xdf_l_act = bp_iter;
873 	}
874 	bp->av_forw = NULL;
875 }
876 
877 static buf_t *
878 xdf_bp_next(xdf_t *vdp)
879 {
880 	v_req_t	*vreq;
881 	buf_t	*bp;
882 
883 	if (vdp->xdf_state == XD_CONNECTED) {
884 		/*
885 		 * If we're in the XD_CONNECTED state, we only service IOs
886 		 * from the xdf_ready_tq_thread thread.
887 		 */
888 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
889 			return (NULL);
890 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
891 			return (bp);
892 		return (NULL);
893 	}
894 
895 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
896 	if (vdp->xdf_state != XD_READY)
897 		return (NULL);
898 
899 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
900 	for (;;) {
901 		if ((bp = vdp->xdf_i_act) == NULL)
902 			return (NULL);
903 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
904 			return (bp);
905 
906 		/* advance the active buf index pointer */
907 		vdp->xdf_i_act = bp->av_forw;
908 	}
909 }
910 
911 static void
912 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
913 {
914 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
915 	v_req_t		*vreq = gs->gs_vreq;
916 	buf_t		*bp = vreq->v_buf;
917 
918 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
919 	ASSERT(BP_VREQ(bp) == vreq);
920 
921 	gs_free(gs);
922 
923 	if (bioerr != 0)
924 		bioerror(bp, bioerr);
925 	ASSERT(vreq->v_nslots > 0);
926 	if (--vreq->v_nslots > 0)
927 		return;
928 
929 	/* remove this IO from our active queue */
930 	xdf_bp_pop(vdp, bp);
931 
932 	ASSERT(vreq->v_runq);
933 	xdf_kstat_exit(vdp, bp);
934 	vreq->v_runq = B_FALSE;
935 	vreq_free(vdp, vreq);
936 
937 	if (IS_ERROR(bp)) {
938 		xdf_io_err(bp, geterror(bp), 0);
939 	} else if (bp->b_resid != 0) {
940 		/* Partial transfers are an error */
941 		xdf_io_err(bp, EIO, bp->b_resid);
942 	} else {
943 		biodone(bp);
944 	}
945 }
946 
947 /*
948  * xdf interrupt handler
949  */
950 static uint_t
951 xdf_intr_locked(xdf_t *vdp)
952 {
953 	xendev_ring_t *xbr;
954 	blkif_response_t *resp;
955 	int bioerr;
956 	uint64_t id;
957 	uint8_t op;
958 	uint16_t status;
959 	ddi_acc_handle_t acchdl;
960 
961 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
962 
963 	if ((xbr = vdp->xdf_xb_ring) == NULL)
964 		return (DDI_INTR_UNCLAIMED);
965 
966 	acchdl = vdp->xdf_xb_ring_hdl;
967 
968 	/*
969 	 * complete all requests which have a response
970 	 */
971 	while (resp = xvdi_ring_get_response(xbr)) {
972 		id = ddi_get64(acchdl, &resp->id);
973 		op = ddi_get8(acchdl, &resp->operation);
974 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
975 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
976 		    op, id, status));
977 
978 		if (status != BLKIF_RSP_OKAY) {
979 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
980 			    vdp->xdf_addr,
981 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
982 			bioerr = EIO;
983 		} else {
984 			bioerr = 0;
985 		}
986 
987 		xdf_io_fini(vdp, id, bioerr);
988 	}
989 	return (DDI_INTR_CLAIMED);
990 }
991 
992 /*
993  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
994  * block at a lower pil.
995  */
996 static uint_t
997 xdf_intr(caddr_t arg)
998 {
999 	xdf_t *vdp = (xdf_t *)arg;
1000 	int rv;
1001 
1002 	mutex_enter(&vdp->xdf_dev_lk);
1003 	rv = xdf_intr_locked(vdp);
1004 	mutex_exit(&vdp->xdf_dev_lk);
1005 
1006 	if (!do_polled_io)
1007 		xdf_io_start(vdp);
1008 
1009 	return (rv);
1010 }
1011 
1012 static void
1013 xdf_ring_push(xdf_t *vdp)
1014 {
1015 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1016 
1017 	if (vdp->xdf_xb_ring == NULL)
1018 		return;
1019 
1020 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1021 		DPRINTF(IO_DBG, (
1022 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1023 		    vdp->xdf_addr));
1024 	}
1025 
1026 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1027 		xvdi_notify_oe(vdp->xdf_dip);
1028 }
1029 
1030 static int
1031 xdf_ring_drain_locked(xdf_t *vdp)
1032 {
1033 	int		pollc, rv = 0;
1034 
1035 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1036 
1037 	if (xdf_debug & SUSRES_DBG)
1038 		xen_printf("xdf_ring_drain: start\n");
1039 
1040 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1041 		if (vdp->xdf_xb_ring == NULL)
1042 			goto out;
1043 
1044 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1045 			(void) xdf_intr_locked(vdp);
1046 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1047 			goto out;
1048 		xdf_ring_push(vdp);
1049 
1050 		/* file-backed devices can be slow */
1051 		mutex_exit(&vdp->xdf_dev_lk);
1052 #ifdef XPV_HVM_DRIVER
1053 		(void) HYPERVISOR_yield();
1054 #endif /* XPV_HVM_DRIVER */
1055 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1056 		mutex_enter(&vdp->xdf_dev_lk);
1057 	}
1058 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1059 
1060 out:
1061 	if (vdp->xdf_xb_ring != NULL) {
1062 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1063 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1064 			rv = EIO;
1065 	}
1066 	if (xdf_debug & SUSRES_DBG)
1067 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1068 		    vdp->xdf_addr, rv);
1069 	return (rv);
1070 }
1071 
1072 static int
1073 xdf_ring_drain(xdf_t *vdp)
1074 {
1075 	int rv;
1076 	mutex_enter(&vdp->xdf_dev_lk);
1077 	rv = xdf_ring_drain_locked(vdp);
1078 	mutex_exit(&vdp->xdf_dev_lk);
1079 	return (rv);
1080 }
1081 
1082 /*
1083  * Destroy all v_req_t, grant table entries, and our ring buffer.
1084  */
1085 static void
1086 xdf_ring_destroy(xdf_t *vdp)
1087 {
1088 	v_req_t		*vreq;
1089 	buf_t		*bp;
1090 	ge_slot_t	*gs;
1091 
1092 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1093 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1094 
1095 	if ((vdp->xdf_state != XD_INIT) &&
1096 	    (vdp->xdf_state != XD_CONNECTED) &&
1097 	    (vdp->xdf_state != XD_READY)) {
1098 		ASSERT(vdp->xdf_xb_ring == NULL);
1099 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1100 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1101 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1102 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1103 		return;
1104 	}
1105 
1106 	/*
1107 	 * We don't want to recieve async notifications from the backend
1108 	 * when it finishes processing ring entries.
1109 	 */
1110 #ifdef XPV_HVM_DRIVER
1111 	ec_unbind_evtchn(vdp->xdf_evtchn);
1112 #else /* !XPV_HVM_DRIVER */
1113 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1114 #endif /* !XPV_HVM_DRIVER */
1115 
1116 	/*
1117 	 * Drain any requests in the ring.  We need to do this before we
1118 	 * can free grant table entries, because if active ring entries
1119 	 * point to grants, then the backend could be trying to access
1120 	 * those grants.
1121 	 */
1122 	(void) xdf_ring_drain_locked(vdp);
1123 
1124 	/* We're done talking to the backend so free up our event channel */
1125 	xvdi_free_evtchn(vdp->xdf_dip);
1126 	vdp->xdf_evtchn = INVALID_EVTCHN;
1127 
1128 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1129 		bp = vreq->v_buf;
1130 		ASSERT(BP_VREQ(bp) == vreq);
1131 
1132 		/* Free up any grant table entries associaed with this IO */
1133 		while ((gs = list_head(&vreq->v_gs)) != NULL)
1134 			gs_free(gs);
1135 
1136 		/* If this IO was on the runq, move it back to the waitq. */
1137 		if (vreq->v_runq)
1138 			xdf_kstat_runq_to_waitq(vdp, bp);
1139 
1140 		/*
1141 		 * Reset any buf IO state since we're going to re-issue the
1142 		 * IO when we reconnect.
1143 		 */
1144 		vreq_free(vdp, vreq);
1145 		BP_VREQ_SET(bp, NULL);
1146 		bioerror(bp, 0);
1147 	}
1148 
1149 	/* reset the active queue index pointer */
1150 	vdp->xdf_i_act = vdp->xdf_f_act;
1151 
1152 	/* Destroy the ring */
1153 	xvdi_free_ring(vdp->xdf_xb_ring);
1154 	vdp->xdf_xb_ring = NULL;
1155 	vdp->xdf_xb_ring_hdl = NULL;
1156 	vdp->xdf_peer = INVALID_DOMID;
1157 }
1158 
1159 void
1160 xdfmin(struct buf *bp)
1161 {
1162 	if (bp->b_bcount > xdf_maxphys)
1163 		bp->b_bcount = xdf_maxphys;
1164 }
1165 
1166 /*
1167  * Check if we have a pending "eject" media request.
1168  */
1169 static int
1170 xdf_eject_pending(xdf_t *vdp)
1171 {
1172 	dev_info_t	*dip = vdp->xdf_dip;
1173 	char		*xsname, *str;
1174 
1175 	if (!vdp->xdf_media_req_supported)
1176 		return (B_FALSE);
1177 
1178 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1179 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1180 		return (B_FALSE);
1181 
1182 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1183 		strfree(str);
1184 		return (B_FALSE);
1185 	}
1186 	strfree(str);
1187 	return (B_TRUE);
1188 }
1189 
1190 /*
1191  * Generate a media request.
1192  */
1193 static int
1194 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1195 {
1196 	dev_info_t	*dip = vdp->xdf_dip;
1197 	char		*xsname;
1198 
1199 	/*
1200 	 * we can't be holding xdf_dev_lk because xenbus_printf() can
1201 	 * block while waiting for a PIL 1 interrupt message.  this
1202 	 * would cause a deadlock with xdf_intr() which needs to grab
1203 	 * xdf_dev_lk as well and runs at PIL 5.
1204 	 */
1205 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1206 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1207 
1208 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1209 		return (ENXIO);
1210 
1211 	/* Check if we support media requests */
1212 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1213 		return (ENOTTY);
1214 
1215 	/* If an eject is pending then don't allow any new requests */
1216 	if (xdf_eject_pending(vdp))
1217 		return (ENXIO);
1218 
1219 	/* Make sure that there is media present */
1220 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1221 		return (ENXIO);
1222 
1223 	/* We only allow operations when the device is ready and connected */
1224 	if (vdp->xdf_state != XD_READY)
1225 		return (EIO);
1226 
1227 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1228 		return (EIO);
1229 
1230 	return (0);
1231 }
1232 
1233 /*
1234  * populate a single blkif_request_t w/ a buf
1235  */
1236 static void
1237 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1238 {
1239 	grant_ref_t	gr;
1240 	uint8_t		fsect, lsect;
1241 	size_t		bcnt;
1242 	paddr_t		dma_addr;
1243 	off_t		blk_off;
1244 	dev_info_t	*dip = vdp->xdf_dip;
1245 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1246 	v_req_t		*vreq = BP_VREQ(bp);
1247 	uint64_t	blkno = vreq->v_blkno;
1248 	uint_t		ndmacs = vreq->v_ndmacs;
1249 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1250 	int		seg = 0;
1251 	int		isread = IS_READ(bp);
1252 	ge_slot_t	*gs = list_head(&vreq->v_gs);
1253 
1254 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1255 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1256 
1257 	if (isread)
1258 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1259 	else {
1260 		switch (vreq->v_flush_diskcache) {
1261 		case FLUSH_DISKCACHE:
1262 			ddi_put8(acchdl, &rreq->operation,
1263 			    BLKIF_OP_FLUSH_DISKCACHE);
1264 			ddi_put16(acchdl, &rreq->handle, vdev);
1265 			ddi_put64(acchdl, &rreq->id,
1266 			    (uint64_t)(uintptr_t)(gs));
1267 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1268 			vreq->v_status = VREQ_DMAWIN_DONE;
1269 			return;
1270 		case WRITE_BARRIER:
1271 			ddi_put8(acchdl, &rreq->operation,
1272 			    BLKIF_OP_WRITE_BARRIER);
1273 			break;
1274 		default:
1275 			if (!vdp->xdf_wce)
1276 				ddi_put8(acchdl, &rreq->operation,
1277 				    BLKIF_OP_WRITE_BARRIER);
1278 			else
1279 				ddi_put8(acchdl, &rreq->operation,
1280 				    BLKIF_OP_WRITE);
1281 			break;
1282 		}
1283 	}
1284 
1285 	ddi_put16(acchdl, &rreq->handle, vdev);
1286 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1287 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1288 
1289 	/*
1290 	 * loop until all segments are populated or no more dma cookie in buf
1291 	 */
1292 	for (;;) {
1293 		/*
1294 		 * Each segment of a blkif request can transfer up to
1295 		 * one 4K page of data.
1296 		 */
1297 		bcnt = vreq->v_dmac.dmac_size;
1298 		dma_addr = vreq->v_dmac.dmac_laddress;
1299 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1300 		fsect = blk_off >> XB_BSHIFT;
1301 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1302 
1303 		ASSERT(bcnt <= PAGESIZE);
1304 		ASSERT((bcnt % XB_BSIZE) == 0);
1305 		ASSERT((blk_off & XB_BMASK) == 0);
1306 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1307 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1308 
1309 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1310 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1311 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1312 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1313 
1314 		DPRINTF(IO_DBG, (
1315 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1316 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1317 		DPRINTF(IO_DBG, (
1318 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1319 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1320 
1321 		blkno += (bcnt >> XB_BSHIFT);
1322 		seg++;
1323 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1324 		if (--ndmacs) {
1325 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1326 			continue;
1327 		}
1328 
1329 		vreq->v_status = VREQ_DMAWIN_DONE;
1330 		vreq->v_blkno = blkno;
1331 		break;
1332 	}
1333 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1334 	DPRINTF(IO_DBG, (
1335 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1336 	    vdp->xdf_addr, rreq->id));
1337 }
1338 
1339 static void
1340 xdf_io_start(xdf_t *vdp)
1341 {
1342 	struct buf	*bp;
1343 	v_req_t		*vreq;
1344 	blkif_request_t	*rreq;
1345 	boolean_t	rreqready = B_FALSE;
1346 
1347 	mutex_enter(&vdp->xdf_dev_lk);
1348 
1349 	/*
1350 	 * Populate the ring request(s).  Loop until there is no buf to
1351 	 * transfer or no free slot available in I/O ring.
1352 	 */
1353 	for (;;) {
1354 		/* don't start any new IO if we're suspending */
1355 		if (vdp->xdf_suspending)
1356 			break;
1357 		if ((bp = xdf_bp_next(vdp)) == NULL)
1358 			break;
1359 
1360 		/* if the buf doesn't already have a vreq, allocate one */
1361 		if (((vreq = BP_VREQ(bp)) == NULL) &&
1362 		    ((vreq = vreq_get(vdp, bp)) == NULL))
1363 			break;
1364 
1365 		/* alloc DMA/GTE resources */
1366 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1367 			break;
1368 
1369 		/* get next blkif_request in the ring */
1370 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1371 			break;
1372 		bzero(rreq, sizeof (blkif_request_t));
1373 		rreqready = B_TRUE;
1374 
1375 		/* populate blkif_request with this buf */
1376 		xdf_process_rreq(vdp, bp, rreq);
1377 
1378 		/*
1379 		 * This buffer/vreq pair is has been allocated a ring buffer
1380 		 * resources, so if it isn't already in our runq, add it.
1381 		 */
1382 		if (!vreq->v_runq)
1383 			xdf_kstat_waitq_to_runq(vdp, bp);
1384 	}
1385 
1386 	/* Send the request(s) to the backend */
1387 	if (rreqready)
1388 		xdf_ring_push(vdp);
1389 
1390 	mutex_exit(&vdp->xdf_dev_lk);
1391 }
1392 
1393 
1394 /* check if partition is open, -1 - check all partitions on the disk */
1395 static boolean_t
1396 xdf_isopen(xdf_t *vdp, int partition)
1397 {
1398 	int i;
1399 	ulong_t parbit;
1400 	boolean_t rval = B_FALSE;
1401 
1402 	ASSERT((partition == -1) ||
1403 	    ((partition >= 0) || (partition < XDF_PEXT)));
1404 
1405 	if (partition == -1)
1406 		parbit = (ulong_t)-1;
1407 	else
1408 		parbit = 1 << partition;
1409 
1410 	for (i = 0; i < OTYPCNT; i++) {
1411 		if (vdp->xdf_vd_open[i] & parbit)
1412 			rval = B_TRUE;
1413 	}
1414 
1415 	return (rval);
1416 }
1417 
1418 /*
1419  * The connection should never be closed as long as someone is holding
1420  * us open, there is pending IO, or someone is waiting waiting for a
1421  * connection.
1422  */
1423 static boolean_t
1424 xdf_busy(xdf_t *vdp)
1425 {
1426 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1427 
1428 	if ((vdp->xdf_xb_ring != NULL) &&
1429 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1430 		ASSERT(vdp->xdf_state != XD_CLOSED);
1431 		return (B_TRUE);
1432 	}
1433 
1434 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1435 		ASSERT(vdp->xdf_state != XD_CLOSED);
1436 		return (B_TRUE);
1437 	}
1438 
1439 	if (xdf_isopen(vdp, -1)) {
1440 		ASSERT(vdp->xdf_state != XD_CLOSED);
1441 		return (B_TRUE);
1442 	}
1443 
1444 	if (vdp->xdf_connect_req > 0) {
1445 		ASSERT(vdp->xdf_state != XD_CLOSED);
1446 		return (B_TRUE);
1447 	}
1448 
1449 	return (B_FALSE);
1450 }
1451 
1452 static void
1453 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1454 {
1455 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1456 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1457 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1458 	    vdp->xdf_addr, vdp->xdf_state, new_state));
1459 	vdp->xdf_state = new_state;
1460 	cv_broadcast(&vdp->xdf_dev_cv);
1461 }
1462 
1463 static void
1464 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1465 {
1466 	dev_info_t	*dip = vdp->xdf_dip;
1467 	boolean_t	busy;
1468 
1469 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1470 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1471 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1472 
1473 	/* Check if we're already there. */
1474 	if (vdp->xdf_state == new_state)
1475 		return;
1476 
1477 	mutex_enter(&vdp->xdf_dev_lk);
1478 	busy = xdf_busy(vdp);
1479 
1480 	/* If we're already closed then there's nothing todo. */
1481 	if (vdp->xdf_state == XD_CLOSED) {
1482 		ASSERT(!busy);
1483 		xdf_set_state(vdp, new_state);
1484 		mutex_exit(&vdp->xdf_dev_lk);
1485 		return;
1486 	}
1487 
1488 #ifdef DEBUG
1489 	/* UhOh.  Warn the user that something bad has happened. */
1490 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1491 	    (vdp->xdf_xdev_nblocks != 0)) {
1492 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1493 		    vdp->xdf_addr);
1494 	}
1495 #endif /* DEBUG */
1496 
1497 	xdf_ring_destroy(vdp);
1498 
1499 	/* If we're busy then we can only go into the unknown state */
1500 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1501 	mutex_exit(&vdp->xdf_dev_lk);
1502 
1503 	/* if we're closed now, let the other end know */
1504 	if (vdp->xdf_state == XD_CLOSED)
1505 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1506 }
1507 
1508 
1509 /*
1510  * Kick-off connect process
1511  * Status should be XD_UNKNOWN or XD_CLOSED
1512  * On success, status will be changed to XD_INIT
1513  * On error, it will be changed to XD_UNKNOWN
1514  */
1515 static int
1516 xdf_setstate_init(xdf_t *vdp)
1517 {
1518 	dev_info_t		*dip = vdp->xdf_dip;
1519 	xenbus_transaction_t	xbt;
1520 	grant_ref_t		gref;
1521 	char			*xsname, *str;
1522 	int 			rv;
1523 
1524 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1525 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1526 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1527 	    (vdp->xdf_state == XD_CLOSED));
1528 
1529 	DPRINTF(DDI_DBG,
1530 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1531 
1532 	/*
1533 	 * If an eject is pending then don't allow a new connection.
1534 	 * (Only the backend can clear media request eject request.)
1535 	 */
1536 	if (xdf_eject_pending(vdp))
1537 		return (DDI_FAILURE);
1538 
1539 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1540 		goto errout;
1541 
1542 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1543 		goto errout;
1544 
1545 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1546 
1547 	/*
1548 	 * Sanity check for the existance of the xenbus device-type property.
1549 	 * This property might not exist if we our xenbus device nodes was
1550 	 * force destroyed while we were still connected to the backend.
1551 	 */
1552 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1553 		goto errout;
1554 	strfree(str);
1555 
1556 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1557 		goto errout;
1558 
1559 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1560 #ifdef XPV_HVM_DRIVER
1561 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1562 #else /* !XPV_HVM_DRIVER */
1563 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1564 	    DDI_SUCCESS) {
1565 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1566 		    "failed to add intr handler", vdp->xdf_addr);
1567 		goto errout1;
1568 	}
1569 #endif /* !XPV_HVM_DRIVER */
1570 
1571 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1572 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1573 	    DDI_SUCCESS) {
1574 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1575 		    vdp->xdf_addr);
1576 		goto errout2;
1577 	}
1578 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1579 
1580 	/*
1581 	 * Write into xenstore the info needed by backend
1582 	 */
1583 trans_retry:
1584 	if (xenbus_transaction_start(&xbt)) {
1585 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1586 		    vdp->xdf_addr);
1587 		xvdi_fatal_error(dip, EIO, "connect transaction init");
1588 		goto fail_trans;
1589 	}
1590 
1591 	/*
1592 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1593 	 * domains. However, it is not written for HVM domains, so let's
1594 	 * write it here.
1595 	 */
1596 	if (((rv = xenbus_printf(xbt, xsname,
1597 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1598 	    ((rv = xenbus_printf(xbt, xsname,
1599 	    XBP_RING_REF, "%u", gref)) != 0) ||
1600 	    ((rv = xenbus_printf(xbt, xsname,
1601 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1602 	    ((rv = xenbus_printf(xbt, xsname,
1603 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1604 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1605 		(void) xenbus_transaction_end(xbt, 1);
1606 		xvdi_fatal_error(dip, rv, "connect transaction setup");
1607 		goto fail_trans;
1608 	}
1609 
1610 	/* kick-off connect process */
1611 	if (rv = xenbus_transaction_end(xbt, 0)) {
1612 		if (rv == EAGAIN)
1613 			goto trans_retry;
1614 		xvdi_fatal_error(dip, rv, "connect transaction commit");
1615 		goto fail_trans;
1616 	}
1617 
1618 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1619 	mutex_enter(&vdp->xdf_dev_lk);
1620 	xdf_set_state(vdp, XD_INIT);
1621 	mutex_exit(&vdp->xdf_dev_lk);
1622 
1623 	return (DDI_SUCCESS);
1624 
1625 fail_trans:
1626 	xvdi_free_ring(vdp->xdf_xb_ring);
1627 errout2:
1628 #ifdef XPV_HVM_DRIVER
1629 	ec_unbind_evtchn(vdp->xdf_evtchn);
1630 #else /* !XPV_HVM_DRIVER */
1631 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1632 #endif /* !XPV_HVM_DRIVER */
1633 errout1:
1634 	xvdi_free_evtchn(dip);
1635 	vdp->xdf_evtchn = INVALID_EVTCHN;
1636 errout:
1637 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1638 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1639 	    vdp->xdf_addr);
1640 	return (DDI_FAILURE);
1641 }
1642 
1643 int
1644 xdf_get_flush_block(xdf_t *vdp)
1645 {
1646 	/*
1647 	 * Get a DEV_BSIZE aligned bufer
1648 	 */
1649 	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1650 	vdp->xdf_cache_flush_block =
1651 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1652 	    (int)vdp->xdf_xdev_secsize);
1653 
1654 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1655 	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1656 		return (DDI_FAILURE);
1657 	return (DDI_SUCCESS);
1658 }
1659 
1660 static void
1661 xdf_setstate_ready(void *arg)
1662 {
1663 	xdf_t	*vdp = (xdf_t *)arg;
1664 
1665 	vdp->xdf_ready_tq_thread = curthread;
1666 
1667 	/*
1668 	 * We've created all the minor nodes via cmlb_attach() using default
1669 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1670 	 * in case there's anyone (say, booting thread) ever trying to open
1671 	 * it before connected to backend. We will refresh all those minor
1672 	 * nodes w/ latest info we've got now when we are almost connected.
1673 	 */
1674 	mutex_enter(&vdp->xdf_dev_lk);
1675 	if (vdp->xdf_cmbl_reattach) {
1676 		vdp->xdf_cmbl_reattach = B_FALSE;
1677 
1678 		mutex_exit(&vdp->xdf_dev_lk);
1679 		if (xdf_cmlb_attach(vdp) != 0) {
1680 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1681 			return;
1682 		}
1683 		mutex_enter(&vdp->xdf_dev_lk);
1684 	}
1685 
1686 	/* If we're not still trying to get to the ready state, then bail. */
1687 	if (vdp->xdf_state != XD_CONNECTED) {
1688 		mutex_exit(&vdp->xdf_dev_lk);
1689 		return;
1690 	}
1691 	mutex_exit(&vdp->xdf_dev_lk);
1692 
1693 	/*
1694 	 * If backend has feature-barrier, see if it supports disk
1695 	 * cache flush op.
1696 	 */
1697 	vdp->xdf_flush_supported = B_FALSE;
1698 	if (vdp->xdf_feature_barrier) {
1699 		/*
1700 		 * Pretend we already know flush is supported so probe
1701 		 * will attempt the correct op.
1702 		 */
1703 		vdp->xdf_flush_supported = B_TRUE;
1704 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1705 			vdp->xdf_flush_supported = B_TRUE;
1706 		} else {
1707 			vdp->xdf_flush_supported = B_FALSE;
1708 			/*
1709 			 * If the other end does not support the cache flush op
1710 			 * then we must use a barrier-write to force disk
1711 			 * cache flushing.  Barrier writes require that a data
1712 			 * block actually be written.
1713 			 * Cache a block to barrier-write when we are
1714 			 * asked to perform a flush.
1715 			 * XXX - would it be better to just copy 1 block
1716 			 * (512 bytes) from whatever write we did last
1717 			 * and rewrite that block?
1718 			 */
1719 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1720 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1721 				return;
1722 			}
1723 		}
1724 	}
1725 
1726 	mutex_enter(&vdp->xdf_cb_lk);
1727 	mutex_enter(&vdp->xdf_dev_lk);
1728 	if (vdp->xdf_state == XD_CONNECTED)
1729 		xdf_set_state(vdp, XD_READY);
1730 	mutex_exit(&vdp->xdf_dev_lk);
1731 
1732 	/* Restart any currently queued up io */
1733 	xdf_io_start(vdp);
1734 
1735 	mutex_exit(&vdp->xdf_cb_lk);
1736 }
1737 
1738 /*
1739  * synthetic geometry
1740  */
1741 #define	XDF_NSECTS	256
1742 #define	XDF_NHEADS	16
1743 
1744 static void
1745 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1746 {
1747 	xdf_t *vdp;
1748 	uint_t ncyl;
1749 
1750 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1751 
1752 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1753 
1754 	bzero(geomp, sizeof (*geomp));
1755 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1756 	geomp->g_acyl = 0;
1757 	geomp->g_nhead = XDF_NHEADS;
1758 	geomp->g_nsect = XDF_NSECTS;
1759 	geomp->g_secsize = vdp->xdf_xdev_secsize;
1760 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1761 	geomp->g_intrlv = 0;
1762 	geomp->g_rpm = 7200;
1763 }
1764 
1765 /*
1766  * Finish other initialization after we've connected to backend
1767  * Status should be XD_INIT before calling this routine
1768  * On success, status should be changed to XD_CONNECTED.
1769  * On error, status should stay XD_INIT
1770  */
1771 static int
1772 xdf_setstate_connected(xdf_t *vdp)
1773 {
1774 	dev_info_t	*dip = vdp->xdf_dip;
1775 	cmlb_geom_t	pgeom;
1776 	diskaddr_t	nblocks = 0;
1777 	uint_t		secsize = 0;
1778 	char		*oename, *xsname, *str;
1779 	uint_t		dinfo;
1780 
1781 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1782 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1783 	ASSERT(vdp->xdf_state == XD_INIT);
1784 
1785 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1786 	    ((oename = xvdi_get_oename(dip)) == NULL))
1787 		return (DDI_FAILURE);
1788 
1789 	/* Make sure the other end is XenbusStateConnected */
1790 	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1791 		return (DDI_FAILURE);
1792 
1793 	/* Determine if feature barrier is supported by backend */
1794 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1795 		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1796 		    vdp->xdf_addr);
1797 
1798 	/*
1799 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1800 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1801 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1802 	 * we always set VDISK_CDROM, regardless of if it's present in
1803 	 * the xenbus info parameter.
1804 	 */
1805 	if (xenbus_gather(XBT_NULL, oename,
1806 	    XBP_SECTORS, "%"SCNu64, &nblocks,
1807 	    XBP_SECTOR_SIZE, "%u", &secsize,
1808 	    XBP_INFO, "%u", &dinfo,
1809 	    NULL) != 0) {
1810 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1811 		    "cannot read backend info", vdp->xdf_addr);
1812 		return (DDI_FAILURE);
1813 	}
1814 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1815 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1816 		    vdp->xdf_addr);
1817 		return (DDI_FAILURE);
1818 	}
1819 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1820 		dinfo |= VDISK_CDROM;
1821 	strfree(str);
1822 
1823 	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1824 		secsize = DEV_BSIZE;
1825 	vdp->xdf_xdev_nblocks = nblocks;
1826 	vdp->xdf_xdev_secsize = secsize;
1827 #ifdef _ILP32
1828 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1829 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1830 		    "backend disk device too large with %llu blocks for"
1831 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1832 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1833 		return (DDI_FAILURE);
1834 	}
1835 #endif
1836 
1837 	/*
1838 	 * If the physical geometry for a fixed disk has been explicity
1839 	 * set then make sure that the specified physical geometry isn't
1840 	 * larger than the device we connected to.
1841 	 */
1842 	if (vdp->xdf_pgeom_fixed &&
1843 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1844 		cmn_err(CE_WARN,
1845 		    "xdf@%s: connect failed, fixed geometry too large",
1846 		    vdp->xdf_addr);
1847 		return (DDI_FAILURE);
1848 	}
1849 
1850 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1851 
1852 	/* mark vbd is ready for I/O */
1853 	mutex_enter(&vdp->xdf_dev_lk);
1854 	xdf_set_state(vdp, XD_CONNECTED);
1855 
1856 	/* check if the cmlb label should be updated */
1857 	xdf_synthetic_pgeom(dip, &pgeom);
1858 	if ((vdp->xdf_dinfo != dinfo) ||
1859 	    (!vdp->xdf_pgeom_fixed &&
1860 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1861 		vdp->xdf_cmbl_reattach = B_TRUE;
1862 
1863 		vdp->xdf_dinfo = dinfo;
1864 		if (!vdp->xdf_pgeom_fixed)
1865 			vdp->xdf_pgeom = pgeom;
1866 	}
1867 
1868 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1869 		if (vdp->xdf_xdev_nblocks == 0) {
1870 			vdp->xdf_mstate = DKIO_EJECTED;
1871 			cv_broadcast(&vdp->xdf_mstate_cv);
1872 		} else {
1873 			vdp->xdf_mstate = DKIO_INSERTED;
1874 			cv_broadcast(&vdp->xdf_mstate_cv);
1875 		}
1876 	} else {
1877 		if (vdp->xdf_mstate != DKIO_NONE) {
1878 			vdp->xdf_mstate = DKIO_NONE;
1879 			cv_broadcast(&vdp->xdf_mstate_cv);
1880 		}
1881 	}
1882 
1883 	mutex_exit(&vdp->xdf_dev_lk);
1884 
1885 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1886 	    (uint64_t)vdp->xdf_xdev_nblocks);
1887 
1888 	/* Restart any currently queued up io */
1889 	xdf_io_start(vdp);
1890 
1891 	/*
1892 	 * To get to the ready state we have to do IO to the backend device,
1893 	 * but we can't initiate IO from the other end change callback thread
1894 	 * (which is the current context we're executing in.)  This is because
1895 	 * if the other end disconnects while we're doing IO from the callback
1896 	 * thread, then we can't recieve that disconnect event and we hang
1897 	 * waiting for an IO that can never complete.
1898 	 */
1899 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1900 	    DDI_SLEEP);
1901 
1902 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1903 	return (DDI_SUCCESS);
1904 }
1905 
1906 /*ARGSUSED*/
1907 static void
1908 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1909 {
1910 	XenbusState new_state = *(XenbusState *)impl_data;
1911 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1912 
1913 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1914 	    vdp->xdf_addr, new_state));
1915 
1916 	mutex_enter(&vdp->xdf_cb_lk);
1917 
1918 	/* We assume that this callback is single threaded */
1919 	ASSERT(vdp->xdf_oe_change_thread == NULL);
1920 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1921 
1922 	/* ignore any backend state changes if we're suspending/suspended */
1923 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1924 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1925 		mutex_exit(&vdp->xdf_cb_lk);
1926 		return;
1927 	}
1928 
1929 	switch (new_state) {
1930 	case XenbusStateUnknown:
1931 	case XenbusStateInitialising:
1932 	case XenbusStateInitWait:
1933 	case XenbusStateInitialised:
1934 		if (vdp->xdf_state == XD_INIT)
1935 			break;
1936 
1937 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1938 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1939 			break;
1940 		ASSERT(vdp->xdf_state == XD_INIT);
1941 		break;
1942 
1943 	case XenbusStateConnected:
1944 		if ((vdp->xdf_state == XD_CONNECTED) ||
1945 		    (vdp->xdf_state == XD_READY))
1946 			break;
1947 
1948 		if (vdp->xdf_state != XD_INIT) {
1949 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1950 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1951 				break;
1952 			ASSERT(vdp->xdf_state == XD_INIT);
1953 		}
1954 
1955 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1956 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1957 			break;
1958 		}
1959 		ASSERT(vdp->xdf_state == XD_CONNECTED);
1960 		break;
1961 
1962 	case XenbusStateClosing:
1963 		if (xdf_isopen(vdp, -1)) {
1964 			cmn_err(CE_NOTE,
1965 			    "xdf@%s: hot-unplug failed, still in use",
1966 			    vdp->xdf_addr);
1967 			break;
1968 		}
1969 		/*FALLTHROUGH*/
1970 	case XenbusStateClosed:
1971 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1972 		break;
1973 	}
1974 
1975 	/* notify anybody waiting for oe state change */
1976 	cv_broadcast(&vdp->xdf_dev_cv);
1977 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1978 	mutex_exit(&vdp->xdf_cb_lk);
1979 }
1980 
/*
 * Try to bring the device into the XD_READY state.
 *
 * vdp	device soft state
 * wait	if B_FALSE, make at most one connection attempt and return;
 *	if B_TRUE, keep waiting until the device is ready or the wait
 *	primitive returns 0
 *
 * Both xdf_cb_lk and xdf_dev_lk must be held on entry and are held again
 * on return, but both are dropped and re-taken along the way.
 *
 * Exactly one of the threads executing this function acts as the
 * "connection thread" (xdf_connect_thread) and drives the state machine
 * via xdf_setstate_init()/xdf_setstate_connected(); all others just sleep
 * on xdf_dev_cv.  xdf_connect_req is raised for the duration of the call,
 * which makes xdf_busy() report the device as busy.
 *
 * Returns the device state (vdp->xdf_state) at the time of return; the
 * caller is expected to compare it against XD_READY.
 */
static int
xdf_connect_locked(xdf_t *vdp, boolean_t wait)
{
	/* "reset" is the number of 0.1 sec timeouts before we start over */
	int	rv, timeouts = 0, reset = 20;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	/* we can't connect once we're in the closed state */
	if (vdp->xdf_state == XD_CLOSED)
		return (XD_CLOSED);

	vdp->xdf_connect_req++;
	while (vdp->xdf_state != XD_READY) {
		/*
		 * The setstate routines below must be called with
		 * xdf_cb_lk held but without xdf_dev_lk.
		 */
		mutex_exit(&vdp->xdf_dev_lk);

		/* only one thread at a time can be the connection thread */
		if (vdp->xdf_connect_thread == NULL)
			vdp->xdf_connect_thread = curthread;

		if (vdp->xdf_connect_thread == curthread) {
			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
				/*
				 * If we haven't established a connection
				 * within the reset time, then disconnect
				 * so we can try again, and double the reset
				 * time.  The reset time starts at 2 sec.
				 */
				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
				reset *= 2;
			}
			/* advance the state machine as far as we can */
			if (vdp->xdf_state == XD_UNKNOWN)
				(void) xdf_setstate_init(vdp);
			if (vdp->xdf_state == XD_INIT)
				(void) xdf_setstate_connected(vdp);
		}

		mutex_enter(&vdp->xdf_dev_lk);
		if (!wait || (vdp->xdf_state == XD_READY))
			goto out;

		/*
		 * Drop xdf_cb_lk before sleeping; we only wait while
		 * holding xdf_dev_lk, which the cv releases for us.
		 */
		mutex_exit((&vdp->xdf_cb_lk));
		if (vdp->xdf_connect_thread != curthread) {
			/* not the connection thread: sleep until woken */
			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
		} else {
			/* delay for 0.1 sec */
			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
			    TR_CLOCK_TICK);
			/* -1 is counted as an expired timeout */
			if (rv == -1)
				timeouts++;
		}
		/* re-take the locks in the proper order: cb_lk, then dev_lk */
		mutex_exit((&vdp->xdf_dev_lk));
		mutex_enter((&vdp->xdf_cb_lk));
		mutex_enter((&vdp->xdf_dev_lk));
		/*
		 * A return of 0 from the *_sig wait ends the attempt
		 * (presumably the wait was interrupted by a signal -- see
		 * cv_wait_sig(9F)).
		 */
		if (rv == 0)
			goto out;
	}

out:
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (vdp->xdf_connect_thread == curthread) {
		/*
		 * wake up someone else so they can become the connection
		 * thread.
		 */
		cv_signal(&vdp->xdf_dev_cv);
		vdp->xdf_connect_thread = NULL;
	}

	/*
	 * Try to lock the media.  xdf_media_req() must not be called with
	 * xdf_dev_lk held, and its result is deliberately ignored.
	 */
	mutex_exit((&vdp->xdf_dev_lk));
	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
	mutex_enter((&vdp->xdf_dev_lk));

	vdp->xdf_connect_req--;
	return (vdp->xdf_state);
}
2061 
2062 static uint_t
2063 xdf_iorestart(caddr_t arg)
2064 {
2065 	xdf_t *vdp = (xdf_t *)arg;
2066 
2067 	ASSERT(vdp != NULL);
2068 
2069 	mutex_enter(&vdp->xdf_dev_lk);
2070 	ASSERT(ISDMACBON(vdp));
2071 	SETDMACBOFF(vdp);
2072 	mutex_exit(&vdp->xdf_dev_lk);
2073 
2074 	xdf_io_start(vdp);
2075 
2076 	return (DDI_INTR_CLAIMED);
2077 }
2078 
2079 #if defined(XPV_HVM_DRIVER)
2080 
/*
 * One entry per device registered through xdf_hvm_add().  Entries can be
 * looked up either by device path or by dip, see i_xdf_hvm_find().
 */
typedef struct xdf_hvm_entry {
	/* list linkage (presumably for xdf_hvm_list -- TODO confirm) */
	list_node_t	xdf_he_list;
	/* private copy of the device's ddi_pathname() */
	char		*xdf_he_path;
	/* the device this entry describes */
	dev_info_t	*xdf_he_dip;
} xdf_hvm_entry_t;

/* All registered entries; xdf_hvm_list_lock is held while accessing it. */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2089 
2090 static xdf_hvm_entry_t *
2091 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2092 {
2093 	xdf_hvm_entry_t	*i;
2094 
2095 	ASSERT((path != NULL) || (dip != NULL));
2096 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2097 
2098 	i = list_head(&xdf_hvm_list);
2099 	while (i != NULL) {
2100 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2101 			i = list_next(&xdf_hvm_list, i);
2102 			continue;
2103 		}
2104 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2105 			i = list_next(&xdf_hvm_list, i);
2106 			continue;
2107 		}
2108 		break;
2109 	}
2110 	return (i);
2111 }
2112 
2113 dev_info_t *
2114 xdf_hvm_hold(const char *path)
2115 {
2116 	xdf_hvm_entry_t	*i;
2117 	dev_info_t	*dip;
2118 
2119 	mutex_enter(&xdf_hvm_list_lock);
2120 	i = i_xdf_hvm_find(path, NULL);
2121 	if (i == NULL) {
2122 		mutex_exit(&xdf_hvm_list_lock);
2123 		return (B_FALSE);
2124 	}
2125 	ndi_hold_devi(dip = i->xdf_he_dip);
2126 	mutex_exit(&xdf_hvm_list_lock);
2127 	return (dip);
2128 }
2129 
2130 static void
2131 xdf_hvm_add(dev_info_t *dip)
2132 {
2133 	xdf_hvm_entry_t	*i;
2134 	char		*path;
2135 
2136 	/* figure out the path for the dip */
2137 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2138 	(void) ddi_pathname(dip, path);
2139 
2140 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2141 	i->xdf_he_dip = dip;
2142 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2143 
2144 	mutex_enter(&xdf_hvm_list_lock);
2145 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2146 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2147 	list_insert_head(&xdf_hvm_list, i);
2148 	mutex_exit(&xdf_hvm_list_lock);
2149 
2150 	kmem_free(path, MAXPATHLEN);
2151 }
2152 
2153 static void
2154 xdf_hvm_rm(dev_info_t *dip)
2155 {
2156 	xdf_hvm_entry_t	*i;
2157 
2158 	mutex_enter(&xdf_hvm_list_lock);
2159 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2160 	list_remove(&xdf_hvm_list, i);
2161 	mutex_exit(&xdf_hvm_list_lock);
2162 
2163 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2164 	kmem_free(i, sizeof (*i));
2165 }
2166 
2167 static void
2168 xdf_hvm_init(void)
2169 {
2170 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2171 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2172 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2173 }
2174 
2175 static void
2176 xdf_hvm_fini(void)
2177 {
2178 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2179 	list_destroy(&xdf_hvm_list);
2180 	mutex_destroy(&xdf_hvm_list_lock);
2181 }
2182 
2183 boolean_t
2184 xdf_hvm_connect(dev_info_t *dip)
2185 {
2186 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2187 	char	*oename, *str;
2188 	int	rv;
2189 
2190 	mutex_enter(&vdp->xdf_cb_lk);
2191 
2192 	/*
2193 	 * Before try to establish a connection we need to wait for the
2194 	 * backend hotplug scripts to have run.  Once they are run the
2195 	 * "<oename>/hotplug-status" property will be set to "connected".
2196 	 */
2197 	for (;;) {
2198 		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2199 
2200 		/*
2201 		 * Get the xenbus path to the backend device.  Note that
2202 		 * we can't cache this path (and we look it up on each pass
2203 		 * through this loop) because it could change during
2204 		 * suspend, resume, and migration operations.
2205 		 */
2206 		if ((oename = xvdi_get_oename(dip)) == NULL) {
2207 			mutex_exit(&vdp->xdf_cb_lk);
2208 			return (B_FALSE);
2209 		}
2210 
2211 		str = NULL;
2212 		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2213 		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2214 			break;
2215 
2216 		if (str != NULL)
2217 			strfree(str);
2218 
2219 		/* wait for an update to "<oename>/hotplug-status" */
2220 		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2221 			/* we got interrupted by a signal */
2222 			mutex_exit(&vdp->xdf_cb_lk);
2223 			return (B_FALSE);
2224 		}
2225 	}
2226 
2227 	/* Good news.  The backend hotplug scripts have been run. */
2228 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2229 	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2230 	strfree(str);
2231 
2232 	/*
2233 	 * If we're emulating a cd device and if the backend doesn't support
2234 	 * media request opreations, then we're not going to bother trying
2235 	 * to establish a connection for a couple reasons.  First off, media
2236 	 * requests support is required to support operations like eject and
2237 	 * media locking.  Second, other backend platforms like Linux don't
2238 	 * support hvm pv cdrom access.  They don't even have a backend pv
2239 	 * driver for cdrom device nodes, so we don't want to block forever
2240 	 * waiting for a connection to a backend driver that doesn't exist.
2241 	 */
2242 	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2243 		mutex_exit(&vdp->xdf_cb_lk);
2244 		return (B_FALSE);
2245 	}
2246 
2247 	mutex_enter(&vdp->xdf_dev_lk);
2248 	rv = xdf_connect_locked(vdp, B_TRUE);
2249 	mutex_exit(&vdp->xdf_dev_lk);
2250 	mutex_exit(&vdp->xdf_cb_lk);
2251 
2252 	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2253 }
2254 
2255 int
2256 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2257 {
2258 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2259 
2260 	/* sanity check the requested physical geometry */
2261 	mutex_enter(&vdp->xdf_dev_lk);
2262 	if ((geomp->g_secsize != XB_BSIZE) ||
2263 	    (geomp->g_capacity == 0)) {
2264 		mutex_exit(&vdp->xdf_dev_lk);
2265 		return (EINVAL);
2266 	}
2267 
2268 	/*
2269 	 * If we've already connected to the backend device then make sure
2270 	 * we're not defining a physical geometry larger than our backend
2271 	 * device.
2272 	 */
2273 	if ((vdp->xdf_xdev_nblocks != 0) &&
2274 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2275 		mutex_exit(&vdp->xdf_dev_lk);
2276 		return (EINVAL);
2277 	}
2278 
2279 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2280 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2281 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2282 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2283 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2284 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2285 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2286 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2287 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2288 
2289 	vdp->xdf_pgeom_fixed = B_TRUE;
2290 	mutex_exit(&vdp->xdf_dev_lk);
2291 
2292 	/* force a re-validation */
2293 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2294 
2295 	return (0);
2296 }
2297 
2298 boolean_t
2299 xdf_is_cd(dev_info_t *dip)
2300 {
2301 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2302 	boolean_t	rv;
2303 
2304 	mutex_enter(&vdp->xdf_cb_lk);
2305 	rv = XD_IS_CD(vdp);
2306 	mutex_exit(&vdp->xdf_cb_lk);
2307 	return (rv);
2308 }
2309 
2310 boolean_t
2311 xdf_is_rm(dev_info_t *dip)
2312 {
2313 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2314 	boolean_t	rv;
2315 
2316 	mutex_enter(&vdp->xdf_cb_lk);
2317 	rv = XD_IS_RM(vdp);
2318 	mutex_exit(&vdp->xdf_cb_lk);
2319 	return (rv);
2320 }
2321 
2322 boolean_t
2323 xdf_media_req_supported(dev_info_t *dip)
2324 {
2325 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2326 	boolean_t	rv;
2327 
2328 	mutex_enter(&vdp->xdf_cb_lk);
2329 	rv = vdp->xdf_media_req_supported;
2330 	mutex_exit(&vdp->xdf_cb_lk);
2331 	return (rv);
2332 }
2333 
2334 #endif /* XPV_HVM_DRIVER */
2335 
2336 static int
2337 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2338 {
2339 	xdf_t *vdp;
2340 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2341 
2342 	if (vdp == NULL)
2343 		return (ENXIO);
2344 
2345 	mutex_enter(&vdp->xdf_dev_lk);
2346 	*capp = vdp->xdf_pgeom.g_capacity;
2347 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2348 	mutex_exit(&vdp->xdf_dev_lk);
2349 	return (0);
2350 }
2351 
2352 static int
2353 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2354 {
2355 	xdf_t *vdp;
2356 
2357 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2358 		return (ENXIO);
2359 	*geomp = vdp->xdf_pgeom;
2360 	return (0);
2361 }
2362 
/*
 * Virtual geometry request handler.  There is no real HBA, so there is
 * no geometry available from it; neither argument is used and the
 * request always fails with EINVAL.
 */
/*ARGSUSED*/
static int
xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	return (EINVAL);
}
2372 
2373 static int
2374 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2375 {
2376 	xdf_t *vdp;
2377 
2378 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2379 		return (ENXIO);
2380 
2381 	if (XD_IS_RO(vdp))
2382 		tgattributep->media_is_writable = 0;
2383 	else
2384 		tgattributep->media_is_writable = 1;
2385 	tgattributep->media_is_rotational = 0;
2386 	return (0);
2387 }
2388 
2389 /* ARGSUSED3 */
2390 int
2391 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2392 {
2393 	int instance;
2394 	xdf_t   *vdp;
2395 
2396 	instance = ddi_get_instance(dip);
2397 
2398 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2399 		return (ENXIO);
2400 
2401 	switch (cmd) {
2402 	case TG_GETPHYGEOM:
2403 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2404 	case TG_GETVIRTGEOM:
2405 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2406 	case TG_GETCAPACITY:
2407 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2408 	case TG_GETBLOCKSIZE:
2409 		mutex_enter(&vdp->xdf_cb_lk);
2410 		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
2411 		mutex_exit(&vdp->xdf_cb_lk);
2412 		return (0);
2413 	case TG_GETATTR:
2414 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2415 	default:
2416 		return (ENOTTY);
2417 	}
2418 }
2419 
2420 /* ARGSUSED5 */
2421 int
2422 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2423     diskaddr_t start, size_t reqlen, void *tg_cookie)
2424 {
2425 	xdf_t *vdp;
2426 	struct buf *bp;
2427 	int err = 0;
2428 
2429 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2430 
2431 	/* We don't allow IO from the oe_change callback thread */
2432 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2433 
2434 	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2435 	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2436 		return (EINVAL);
2437 
2438 	bp = getrbuf(KM_SLEEP);
2439 	if (cmd == TG_READ)
2440 		bp->b_flags = B_BUSY | B_READ;
2441 	else
2442 		bp->b_flags = B_BUSY | B_WRITE;
2443 
2444 	bp->b_un.b_addr = bufp;
2445 	bp->b_bcount = reqlen;
2446 	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2447 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2448 
2449 	mutex_enter(&vdp->xdf_dev_lk);
2450 	xdf_bp_push(vdp, bp);
2451 	mutex_exit(&vdp->xdf_dev_lk);
2452 	xdf_io_start(vdp);
2453 	if (curthread == vdp->xdf_ready_tq_thread)
2454 		(void) xdf_ring_drain(vdp);
2455 	err = biowait(bp);
2456 	ASSERT(bp->b_flags & B_DONE);
2457 	freerbuf(bp);
2458 	return (err);
2459 }
2460 
2461 /*
2462  * Lock the current media.  Set the media state to "lock".
2463  * (Media locks are only respected by the backend driver.)
2464  */
2465 static int
2466 xdf_ioctl_mlock(xdf_t *vdp)
2467 {
2468 	int rv;
2469 	mutex_enter(&vdp->xdf_cb_lk);
2470 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2471 	mutex_exit(&vdp->xdf_cb_lk);
2472 	return (rv);
2473 }
2474 
2475 /*
2476  * Release a media lock.  Set the media state to "none".
2477  */
2478 static int
2479 xdf_ioctl_munlock(xdf_t *vdp)
2480 {
2481 	int rv;
2482 	mutex_enter(&vdp->xdf_cb_lk);
2483 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2484 	mutex_exit(&vdp->xdf_cb_lk);
2485 	return (rv);
2486 }
2487 
2488 /*
2489  * Eject the current media.  Ignores any media locks.  (Media locks
2490  * are only for benifit of the the backend.)
2491  */
2492 static int
2493 xdf_ioctl_eject(xdf_t *vdp)
2494 {
2495 	int rv;
2496 
2497 	mutex_enter(&vdp->xdf_cb_lk);
2498 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2499 		mutex_exit(&vdp->xdf_cb_lk);
2500 		return (rv);
2501 	}
2502 
2503 	/*
2504 	 * We've set the media requests xenbus parameter to eject, so now
2505 	 * disconnect from the backend, wait for the backend to clear
2506 	 * the media requets xenbus paramter, and then we can reconnect
2507 	 * to the backend.
2508 	 */
2509 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2510 	mutex_enter(&vdp->xdf_dev_lk);
2511 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2512 		mutex_exit(&vdp->xdf_dev_lk);
2513 		mutex_exit(&vdp->xdf_cb_lk);
2514 		return (EIO);
2515 	}
2516 	mutex_exit(&vdp->xdf_dev_lk);
2517 	mutex_exit(&vdp->xdf_cb_lk);
2518 	return (0);
2519 }
2520 
2521 /*
2522  * Watch for media state changes.  This can be an insertion of a device
2523  * (triggered by a 'xm block-configure' request in another domain) or
2524  * the ejection of a device (triggered by a local "eject" operation).
2525  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2526  */
2527 static int
2528 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2529 {
2530 	enum dkio_state		prev_state;
2531 
2532 	mutex_enter(&vdp->xdf_cb_lk);
2533 	prev_state = vdp->xdf_mstate;
2534 
2535 	if (vdp->xdf_mstate == mstate) {
2536 		while (vdp->xdf_mstate == prev_state) {
2537 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2538 			    &vdp->xdf_cb_lk) == 0) {
2539 				mutex_exit(&vdp->xdf_cb_lk);
2540 				return (EINTR);
2541 			}
2542 		}
2543 	}
2544 
2545 	if ((prev_state != DKIO_INSERTED) &&
2546 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2547 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2548 		mutex_exit(&vdp->xdf_cb_lk);
2549 		return (0);
2550 	}
2551 
2552 	mutex_exit(&vdp->xdf_cb_lk);
2553 	return (0);
2554 }
2555 
2556 /*ARGSUSED*/
2557 static int
2558 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2559     int *rvalp)
2560 {
2561 	minor_t		minor = getminor(dev);
2562 	int		part = XDF_PART(minor);
2563 	xdf_t		*vdp;
2564 	int		rv;
2565 
2566 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2567 	    (!xdf_isopen(vdp, part)))
2568 		return (ENXIO);
2569 
2570 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2571 	    vdp->xdf_addr, cmd, cmd));
2572 
2573 	switch (cmd) {
2574 	default:
2575 		return (ENOTTY);
2576 	case DKIOCG_PHYGEOM:
2577 	case DKIOCG_VIRTGEOM:
2578 	case DKIOCGGEOM:
2579 	case DKIOCSGEOM:
2580 	case DKIOCGAPART:
2581 	case DKIOCSAPART:
2582 	case DKIOCGVTOC:
2583 	case DKIOCSVTOC:
2584 	case DKIOCPARTINFO:
2585 	case DKIOCGEXTVTOC:
2586 	case DKIOCSEXTVTOC:
2587 	case DKIOCEXTPARTINFO:
2588 	case DKIOCGMBOOT:
2589 	case DKIOCSMBOOT:
2590 	case DKIOCGETEFI:
2591 	case DKIOCSETEFI:
2592 	case DKIOCSETEXTPART:
2593 	case DKIOCPARTITION:
2594 		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2595 		    rvalp, NULL));
2596 	case FDEJECT:
2597 	case DKIOCEJECT:
2598 	case CDROMEJECT:
2599 		return (xdf_ioctl_eject(vdp));
2600 	case DKIOCLOCK:
2601 		return (xdf_ioctl_mlock(vdp));
2602 	case DKIOCUNLOCK:
2603 		return (xdf_ioctl_munlock(vdp));
2604 	case CDROMREADOFFSET: {
2605 		int offset = 0;
2606 		if (!XD_IS_CD(vdp))
2607 			return (ENOTTY);
2608 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2609 			return (EFAULT);
2610 		return (0);
2611 	}
2612 	case DKIOCGMEDIAINFO: {
2613 		struct dk_minfo media_info;
2614 
2615 		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2616 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2617 		if (XD_IS_CD(vdp))
2618 			media_info.dki_media_type = DK_CDROM;
2619 		else
2620 			media_info.dki_media_type = DK_FIXED_DISK;
2621 
2622 		if (ddi_copyout(&media_info, (void *)arg,
2623 		    sizeof (struct dk_minfo), mode))
2624 			return (EFAULT);
2625 		return (0);
2626 	}
2627 	case DKIOCINFO: {
2628 		struct dk_cinfo info;
2629 
2630 		/* controller information */
2631 		if (XD_IS_CD(vdp))
2632 			info.dki_ctype = DKC_CDROM;
2633 		else
2634 			info.dki_ctype = DKC_VBD;
2635 
2636 		info.dki_cnum = 0;
2637 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2638 
2639 		/* unit information */
2640 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2641 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2642 		info.dki_flags = DKI_FMTVOL;
2643 		info.dki_partition = part;
2644 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2645 		info.dki_addr = 0;
2646 		info.dki_space = 0;
2647 		info.dki_prio = 0;
2648 		info.dki_vec = 0;
2649 
2650 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2651 			return (EFAULT);
2652 		return (0);
2653 	}
2654 	case DKIOCSTATE: {
2655 		enum dkio_state mstate;
2656 
2657 		if (ddi_copyin((void *)arg, &mstate,
2658 		    sizeof (mstate), mode) != 0)
2659 			return (EFAULT);
2660 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2661 			return (rv);
2662 		mstate = vdp->xdf_mstate;
2663 		if (ddi_copyout(&mstate, (void *)arg,
2664 		    sizeof (mstate), mode) != 0)
2665 			return (EFAULT);
2666 		return (0);
2667 	}
2668 	case DKIOCREMOVABLE: {
2669 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2670 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2671 			return (EFAULT);
2672 		return (0);
2673 	}
2674 	case DKIOCGETWCE: {
2675 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2676 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2677 			return (EFAULT);
2678 		return (0);
2679 	}
2680 	case DKIOCSETWCE: {
2681 		int i;
2682 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2683 			return (EFAULT);
2684 		vdp->xdf_wce = VOID2BOOLEAN(i);
2685 		return (0);
2686 	}
2687 	case DKIOCFLUSHWRITECACHE: {
2688 		struct dk_callback *dkc = (struct dk_callback *)arg;
2689 
2690 		if (vdp->xdf_flush_supported) {
2691 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2692 			    NULL, 0, 0, (void *)dev);
2693 		} else if (vdp->xdf_feature_barrier &&
2694 		    !xdf_barrier_flush_disable) {
2695 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2696 			    vdp->xdf_cache_flush_block, xdf_flush_block,
2697 			    vdp->xdf_xdev_secsize, (void *)dev);
2698 		} else {
2699 			return (ENOTTY);
2700 		}
2701 		if ((mode & FKIOCTL) && (dkc != NULL) &&
2702 		    (dkc->dkc_callback != NULL)) {
2703 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2704 			/* need to return 0 after calling callback */
2705 			rv = 0;
2706 		}
2707 		return (rv);
2708 	}
2709 	}
2710 	/*NOTREACHED*/
2711 }
2712 
2713 static int
2714 xdf_strategy(struct buf *bp)
2715 {
2716 	xdf_t	*vdp;
2717 	minor_t minor;
2718 	diskaddr_t p_blkct, p_blkst;
2719 	daddr_t blkno;
2720 	ulong_t nblks;
2721 	int part;
2722 
2723 	minor = getminor(bp->b_edev);
2724 	part = XDF_PART(minor);
2725 	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2726 
2727 	mutex_enter(&vdp->xdf_dev_lk);
2728 	if (!xdf_isopen(vdp, part)) {
2729 		mutex_exit(&vdp->xdf_dev_lk);
2730 		xdf_io_err(bp, ENXIO, 0);
2731 		return (0);
2732 	}
2733 
2734 	/* We don't allow IO from the oe_change callback thread */
2735 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2736 
2737 	/* Check for writes to a read only device */
2738 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2739 		mutex_exit(&vdp->xdf_dev_lk);
2740 		xdf_io_err(bp, EROFS, 0);
2741 		return (0);
2742 	}
2743 
2744 	/* Check if this I/O is accessing a partition or the entire disk */
2745 	if ((long)bp->b_private == XB_SLICE_NONE) {
2746 		/* This I/O is using an absolute offset */
2747 		p_blkct = vdp->xdf_xdev_nblocks;
2748 		p_blkst = 0;
2749 	} else {
2750 		/* This I/O is using a partition relative offset */
2751 		mutex_exit(&vdp->xdf_dev_lk);
2752 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2753 		    &p_blkst, NULL, NULL, NULL)) {
2754 			xdf_io_err(bp, ENXIO, 0);
2755 			return (0);
2756 		}
2757 		mutex_enter(&vdp->xdf_dev_lk);
2758 	}
2759 
2760 	/*
2761 	 * Adjust the real blkno and bcount according to the underline
2762 	 * physical sector size.
2763 	 */
2764 	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2765 
2766 	/* check for a starting block beyond the disk or partition limit */
2767 	if (blkno > p_blkct) {
2768 		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2769 		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2770 		mutex_exit(&vdp->xdf_dev_lk);
2771 		xdf_io_err(bp, EINVAL, 0);
2772 		return (0);
2773 	}
2774 
2775 	/* Legacy: don't set error flag at this case */
2776 	if (blkno == p_blkct) {
2777 		mutex_exit(&vdp->xdf_dev_lk);
2778 		bp->b_resid = bp->b_bcount;
2779 		biodone(bp);
2780 		return (0);
2781 	}
2782 
2783 	/* sanitize the input buf */
2784 	bioerror(bp, 0);
2785 	bp->b_resid = 0;
2786 	bp->av_back = bp->av_forw = NULL;
2787 
2788 	/* Adjust for partial transfer, this will result in an error later */
2789 	if (vdp->xdf_xdev_secsize != 0 &&
2790 	    vdp->xdf_xdev_secsize != XB_BSIZE) {
2791 		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2792 	} else {
2793 		nblks = bp->b_bcount >> XB_BSHIFT;
2794 	}
2795 
2796 	if ((blkno + nblks) > p_blkct) {
2797 		if (vdp->xdf_xdev_secsize != 0 &&
2798 		    vdp->xdf_xdev_secsize != XB_BSIZE) {
2799 			bp->b_resid =
2800 			    ((blkno + nblks) - p_blkct) *
2801 			    vdp->xdf_xdev_secsize;
2802 		} else {
2803 			bp->b_resid =
2804 			    ((blkno + nblks) - p_blkct) <<
2805 			    XB_BSHIFT;
2806 		}
2807 		bp->b_bcount -= bp->b_resid;
2808 	}
2809 
2810 	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2811 	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2812 
2813 	/* Fix up the buf struct */
2814 	bp->b_flags |= B_BUSY;
2815 	bp->b_private = (void *)(uintptr_t)p_blkst;
2816 
2817 	xdf_bp_push(vdp, bp);
2818 	mutex_exit(&vdp->xdf_dev_lk);
2819 	xdf_io_start(vdp);
2820 	if (do_polled_io)
2821 		(void) xdf_ring_drain(vdp);
2822 	return (0);
2823 }
2824 
2825 /*ARGSUSED*/
2826 static int
2827 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2828 {
2829 	xdf_t	*vdp;
2830 	minor_t minor;
2831 	diskaddr_t p_blkcnt;
2832 	int part;
2833 
2834 	minor = getminor(dev);
2835 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2836 		return (ENXIO);
2837 
2838 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2839 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2840 
2841 	part = XDF_PART(minor);
2842 	if (!xdf_isopen(vdp, part))
2843 		return (ENXIO);
2844 
2845 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2846 	    NULL, NULL, NULL, NULL))
2847 		return (ENXIO);
2848 
2849 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2850 		return (ENOSPC);
2851 
2852 	if (U_INVAL(uiop))
2853 		return (EINVAL);
2854 
2855 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2856 }
2857 
2858 /*ARGSUSED*/
2859 static int
2860 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2861 {
2862 	xdf_t *vdp;
2863 	minor_t minor;
2864 	diskaddr_t p_blkcnt;
2865 	int part;
2866 
2867 	minor = getminor(dev);
2868 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2869 		return (ENXIO);
2870 
2871 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2872 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2873 
2874 	part = XDF_PART(minor);
2875 	if (!xdf_isopen(vdp, part))
2876 		return (ENXIO);
2877 
2878 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2879 	    NULL, NULL, NULL, NULL))
2880 		return (ENXIO);
2881 
2882 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2883 		return (ENOSPC);
2884 
2885 	if (U_INVAL(uiop))
2886 		return (EINVAL);
2887 
2888 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2889 }
2890 
2891 /*ARGSUSED*/
2892 static int
2893 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2894 {
2895 	xdf_t	*vdp;
2896 	minor_t minor;
2897 	struct uio *uiop = aiop->aio_uio;
2898 	diskaddr_t p_blkcnt;
2899 	int part;
2900 
2901 	minor = getminor(dev);
2902 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2903 		return (ENXIO);
2904 
2905 	part = XDF_PART(minor);
2906 	if (!xdf_isopen(vdp, part))
2907 		return (ENXIO);
2908 
2909 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2910 	    NULL, NULL, NULL, NULL))
2911 		return (ENXIO);
2912 
2913 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2914 		return (ENOSPC);
2915 
2916 	if (U_INVAL(uiop))
2917 		return (EINVAL);
2918 
2919 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2920 }
2921 
2922 /*ARGSUSED*/
2923 static int
2924 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2925 {
2926 	xdf_t *vdp;
2927 	minor_t minor;
2928 	struct uio *uiop = aiop->aio_uio;
2929 	diskaddr_t p_blkcnt;
2930 	int part;
2931 
2932 	minor = getminor(dev);
2933 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2934 		return (ENXIO);
2935 
2936 	part = XDF_PART(minor);
2937 	if (!xdf_isopen(vdp, part))
2938 		return (ENXIO);
2939 
2940 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2941 	    NULL, NULL, NULL, NULL))
2942 		return (ENXIO);
2943 
2944 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2945 		return (ENOSPC);
2946 
2947 	if (U_INVAL(uiop))
2948 		return (EINVAL);
2949 
2950 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2951 }
2952 
2953 static int
2954 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2955 {
2956 	struct buf dumpbuf, *dbp = &dumpbuf;
2957 	xdf_t	*vdp;
2958 	minor_t minor;
2959 	int err = 0;
2960 	int part;
2961 	diskaddr_t p_blkcnt, p_blkst;
2962 
2963 	minor = getminor(dev);
2964 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2965 		return (ENXIO);
2966 
2967 	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2968 	    vdp->xdf_addr, (void *)addr, blkno, nblk));
2969 
2970 	/* We don't allow IO from the oe_change callback thread */
2971 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2972 
2973 	part = XDF_PART(minor);
2974 	if (!xdf_isopen(vdp, part))
2975 		return (ENXIO);
2976 
2977 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
2978 	    NULL, NULL, NULL))
2979 		return (ENXIO);
2980 
2981 	if ((blkno + nblk) >
2982 	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
2983 		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
2984 		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
2985 		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
2986 		return (EINVAL);
2987 	}
2988 
2989 	bioinit(dbp);
2990 	dbp->b_flags = B_BUSY;
2991 	dbp->b_un.b_addr = addr;
2992 	dbp->b_bcount = nblk << DEV_BSHIFT;
2993 	dbp->b_blkno = blkno;
2994 	dbp->b_edev = dev;
2995 	dbp->b_private = (void *)(uintptr_t)p_blkst;
2996 
2997 	mutex_enter(&vdp->xdf_dev_lk);
2998 	xdf_bp_push(vdp, dbp);
2999 	mutex_exit(&vdp->xdf_dev_lk);
3000 	xdf_io_start(vdp);
3001 	err = xdf_ring_drain(vdp);
3002 	biofini(dbp);
3003 	return (err);
3004 }
3005 
3006 /*ARGSUSED*/
3007 static int
3008 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3009 {
3010 	minor_t	minor;
3011 	xdf_t	*vdp;
3012 	int part;
3013 	ulong_t parbit;
3014 
3015 	minor = getminor(dev);
3016 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3017 		return (ENXIO);
3018 
3019 	mutex_enter(&vdp->xdf_dev_lk);
3020 	part = XDF_PART(minor);
3021 	if (!xdf_isopen(vdp, part)) {
3022 		mutex_exit(&vdp->xdf_dev_lk);
3023 		return (ENXIO);
3024 	}
3025 	parbit = 1 << part;
3026 
3027 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3028 	if (otyp == OTYP_LYR) {
3029 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3030 		if (--vdp->xdf_vd_lyropen[part] == 0)
3031 			vdp->xdf_vd_open[otyp] &= ~parbit;
3032 	} else {
3033 		vdp->xdf_vd_open[otyp] &= ~parbit;
3034 	}
3035 	vdp->xdf_vd_exclopen &= ~parbit;
3036 
3037 	mutex_exit(&vdp->xdf_dev_lk);
3038 	return (0);
3039 }
3040 
3041 static int
3042 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3043 {
3044 	minor_t	minor;
3045 	xdf_t	*vdp;
3046 	int part;
3047 	ulong_t parbit;
3048 	diskaddr_t p_blkct = 0;
3049 	boolean_t firstopen;
3050 	boolean_t nodelay;
3051 
3052 	minor = getminor(*devp);
3053 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3054 		return (ENXIO);
3055 
3056 	nodelay = (flag & (FNDELAY | FNONBLOCK));
3057 
3058 	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3059 
3060 	/* do cv_wait until connected or failed */
3061 	mutex_enter(&vdp->xdf_cb_lk);
3062 	mutex_enter(&vdp->xdf_dev_lk);
3063 	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3064 		mutex_exit(&vdp->xdf_dev_lk);
3065 		mutex_exit(&vdp->xdf_cb_lk);
3066 		return (ENXIO);
3067 	}
3068 	mutex_exit(&vdp->xdf_cb_lk);
3069 
3070 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3071 		mutex_exit(&vdp->xdf_dev_lk);
3072 		return (EROFS);
3073 	}
3074 
3075 	part = XDF_PART(minor);
3076 	parbit = 1 << part;
3077 	if ((vdp->xdf_vd_exclopen & parbit) ||
3078 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3079 		mutex_exit(&vdp->xdf_dev_lk);
3080 		return (EBUSY);
3081 	}
3082 
3083 	/* are we the first one to open this node? */
3084 	firstopen = !xdf_isopen(vdp, -1);
3085 
3086 	if (otyp == OTYP_LYR)
3087 		vdp->xdf_vd_lyropen[part]++;
3088 
3089 	vdp->xdf_vd_open[otyp] |= parbit;
3090 
3091 	if (flag & FEXCL)
3092 		vdp->xdf_vd_exclopen |= parbit;
3093 
3094 	mutex_exit(&vdp->xdf_dev_lk);
3095 
3096 	/* force a re-validation */
3097 	if (firstopen)
3098 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3099 
3100 	/* If this is a non-blocking open then we're done */
3101 	if (nodelay)
3102 		return (0);
3103 
3104 	/*
3105 	 * This is a blocking open, so we require:
3106 	 * - that the disk have a valid label on it
3107 	 * - that the size of the partition that we're opening is non-zero
3108 	 */
3109 	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3110 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3111 		(void) xdf_close(*devp, flag, otyp, credp);
3112 		return (ENXIO);
3113 	}
3114 
3115 	return (0);
3116 }
3117 
3118 /*ARGSUSED*/
3119 static void
3120 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3121 {
3122 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3123 	cv_broadcast(&vdp->xdf_hp_status_cv);
3124 }
3125 
3126 static int
3127 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3128     char *name, caddr_t valuep, int *lengthp)
3129 {
3130 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3131 
3132 	/*
3133 	 * Sanity check that if a dev_t or dip were specified that they
3134 	 * correspond to this device driver.  On debug kernels we'll
3135 	 * panic and on non-debug kernels we'll return failure.
3136 	 */
3137 	ASSERT(ddi_driver_major(dip) == xdf_major);
3138 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3139 	if ((ddi_driver_major(dip) != xdf_major) ||
3140 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3141 		return (DDI_PROP_NOT_FOUND);
3142 
3143 	if (vdp == NULL)
3144 		return (ddi_prop_op(dev, dip, prop_op, flags,
3145 		    name, valuep, lengthp));
3146 
3147 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3148 	    dev, dip, prop_op, flags, name, valuep, lengthp,
3149 	    XDF_PART(getminor(dev)), NULL));
3150 }
3151 
3152 /*ARGSUSED*/
3153 static int
3154 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3155 {
3156 	int	instance = XDF_INST(getminor((dev_t)arg));
3157 	xdf_t	*vbdp;
3158 
3159 	switch (cmd) {
3160 	case DDI_INFO_DEVT2DEVINFO:
3161 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3162 			*rp = NULL;
3163 			return (DDI_FAILURE);
3164 		}
3165 		*rp = vbdp->xdf_dip;
3166 		return (DDI_SUCCESS);
3167 
3168 	case DDI_INFO_DEVT2INSTANCE:
3169 		*rp = (void *)(uintptr_t)instance;
3170 		return (DDI_SUCCESS);
3171 
3172 	default:
3173 		return (DDI_FAILURE);
3174 	}
3175 }
3176 
3177 /*ARGSUSED*/
3178 static int
3179 xdf_resume(dev_info_t *dip)
3180 {
3181 	xdf_t	*vdp;
3182 	char	*oename;
3183 
3184 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3185 		goto err;
3186 
3187 	if (xdf_debug & SUSRES_DBG)
3188 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3189 
3190 	mutex_enter(&vdp->xdf_cb_lk);
3191 
3192 	if (xvdi_resume(dip) != DDI_SUCCESS) {
3193 		mutex_exit(&vdp->xdf_cb_lk);
3194 		goto err;
3195 	}
3196 
3197 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3198 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3199 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3200 		mutex_exit(&vdp->xdf_cb_lk);
3201 		goto err;
3202 	}
3203 
3204 	mutex_enter(&vdp->xdf_dev_lk);
3205 	ASSERT(vdp->xdf_state != XD_READY);
3206 	xdf_set_state(vdp, XD_UNKNOWN);
3207 	mutex_exit(&vdp->xdf_dev_lk);
3208 
3209 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3210 		mutex_exit(&vdp->xdf_cb_lk);
3211 		goto err;
3212 	}
3213 
3214 	mutex_exit(&vdp->xdf_cb_lk);
3215 
3216 	if (xdf_debug & SUSRES_DBG)
3217 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3218 	return (DDI_SUCCESS);
3219 err:
3220 	if (xdf_debug & SUSRES_DBG)
3221 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3222 	return (DDI_FAILURE);
3223 }
3224 
3225 static int
3226 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3227 {
3228 	int			n, instance = ddi_get_instance(dip);
3229 	ddi_iblock_cookie_t	ibc, softibc;
3230 	boolean_t		dev_iscd = B_FALSE;
3231 	xdf_t			*vdp;
3232 	char			*oename, *xsname, *str;
3233 
3234 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3235 	    "xdf_debug", 0)) != 0)
3236 		xdf_debug = n;
3237 
3238 	switch (cmd) {
3239 	case DDI_RESUME:
3240 		return (xdf_resume(dip));
3241 	case DDI_ATTACH:
3242 		break;
3243 	default:
3244 		return (DDI_FAILURE);
3245 	}
3246 	/* DDI_ATTACH */
3247 
3248 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
3249 	    ((oename = xvdi_get_oename(dip)) == NULL))
3250 		return (DDI_FAILURE);
3251 
3252 	/*
3253 	 * Disable auto-detach.  This is necessary so that we don't get
3254 	 * detached while we're disconnected from the back end.
3255 	 */
3256 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3257 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3258 		return (DDI_FAILURE);
3259 
3260 	/* driver handles kernel-issued IOCTLs */
3261 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3262 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3263 		return (DDI_FAILURE);
3264 
3265 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3266 		return (DDI_FAILURE);
3267 
3268 	if (ddi_get_soft_iblock_cookie(dip,
3269 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3270 		return (DDI_FAILURE);
3271 
3272 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3273 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3274 		    ddi_get_name_addr(dip));
3275 		return (DDI_FAILURE);
3276 	}
3277 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3278 		dev_iscd = B_TRUE;
3279 	strfree(str);
3280 
3281 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3282 		return (DDI_FAILURE);
3283 
3284 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3285 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3286 	ddi_set_driver_private(dip, vdp);
3287 	vdp->xdf_dip = dip;
3288 	vdp->xdf_addr = ddi_get_name_addr(dip);
3289 	vdp->xdf_suspending = B_FALSE;
3290 	vdp->xdf_media_req_supported = B_FALSE;
3291 	vdp->xdf_peer = INVALID_DOMID;
3292 	vdp->xdf_evtchn = INVALID_EVTCHN;
3293 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3294 	    offsetof(v_req_t, v_link));
3295 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3296 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3297 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3298 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3299 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3300 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3301 	vdp->xdf_cmbl_reattach = B_TRUE;
3302 	if (dev_iscd) {
3303 		vdp->xdf_dinfo |= VDISK_CDROM;
3304 		vdp->xdf_mstate = DKIO_EJECTED;
3305 	} else {
3306 		vdp->xdf_mstate = DKIO_NONE;
3307 	}
3308 
3309 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3310 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3311 		goto errout0;
3312 
3313 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3314 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3315 		goto errout0;
3316 
3317 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3318 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3319 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3320 		    ddi_get_name_addr(dip));
3321 		goto errout0;
3322 	}
3323 
3324 	/*
3325 	 * Initialize the physical geometry stucture.  Note that currently
3326 	 * we don't know the size of the backend device so the number
3327 	 * of blocks on the device will be initialized to zero.  Once
3328 	 * we connect to the backend device we'll update the physical
3329 	 * geometry to reflect the real size of the device.
3330 	 */
3331 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3332 	vdp->xdf_pgeom_fixed = B_FALSE;
3333 
3334 	/*
3335 	 * create default device minor nodes: non-removable disk
3336 	 * we will adjust minor nodes after we are connected w/ backend
3337 	 */
3338 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3339 	if (xdf_cmlb_attach(vdp) != 0) {
3340 		cmn_err(CE_WARN,
3341 		    "xdf@%s: attach failed, cmlb attach failed",
3342 		    ddi_get_name_addr(dip));
3343 		goto errout0;
3344 	}
3345 
3346 	/*
3347 	 * We ship with cache-enabled disks
3348 	 */
3349 	vdp->xdf_wce = B_TRUE;
3350 
3351 	mutex_enter(&vdp->xdf_cb_lk);
3352 	/* Watch backend XenbusState change */
3353 	if (xvdi_add_event_handler(dip,
3354 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3355 		mutex_exit(&vdp->xdf_cb_lk);
3356 		goto errout0;
3357 	}
3358 
3359 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3360 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3361 		    ddi_get_name_addr(dip));
3362 		mutex_exit(&vdp->xdf_cb_lk);
3363 		goto errout1;
3364 	}
3365 	mutex_exit(&vdp->xdf_cb_lk);
3366 
3367 #if defined(XPV_HVM_DRIVER)
3368 
3369 	xdf_hvm_add(dip);
3370 
3371 	/* Report our version to dom0.  */
3372 	if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3373 	    HVMPV_XDF_VERS))
3374 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
3375 
3376 #endif /* XPV_HVM_DRIVER */
3377 
3378 	/* create kstat for iostat(1M) */
3379 	if (xdf_kstat_create(dip, "xdf", instance) != 0) {
3380 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3381 		    ddi_get_name_addr(dip));
3382 		goto errout1;
3383 	}
3384 
3385 
3386 	ddi_report_dev(dip);
3387 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3388 	return (DDI_SUCCESS);
3389 
3390 errout1:
3391 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3392 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3393 errout0:
3394 	if (vdp->xdf_vd_lbl != NULL) {
3395 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
3396 		cmlb_free_handle(&vdp->xdf_vd_lbl);
3397 		vdp->xdf_vd_lbl = NULL;
3398 	}
3399 	if (vdp->xdf_softintr_id != NULL)
3400 		ddi_remove_softintr(vdp->xdf_softintr_id);
3401 	xvdi_remove_xb_watch_handlers(dip);
3402 	if (vdp->xdf_ready_tq != NULL)
3403 		ddi_taskq_destroy(vdp->xdf_ready_tq);
3404 	mutex_destroy(&vdp->xdf_cb_lk);
3405 	mutex_destroy(&vdp->xdf_dev_lk);
3406 	cv_destroy(&vdp->xdf_dev_cv);
3407 	cv_destroy(&vdp->xdf_hp_status_cv);
3408 	ddi_soft_state_free(xdf_ssp, instance);
3409 	ddi_set_driver_private(dip, NULL);
3410 	ddi_prop_remove_all(dip);
3411 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3412 	return (DDI_FAILURE);
3413 }
3414 
3415 static int
3416 xdf_suspend(dev_info_t *dip)
3417 {
3418 	int		instance = ddi_get_instance(dip);
3419 	xdf_t		*vdp;
3420 
3421 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3422 		return (DDI_FAILURE);
3423 
3424 	if (xdf_debug & SUSRES_DBG)
3425 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3426 
3427 	xvdi_suspend(dip);
3428 
3429 	mutex_enter(&vdp->xdf_cb_lk);
3430 	mutex_enter(&vdp->xdf_dev_lk);
3431 
3432 	vdp->xdf_suspending = B_TRUE;
3433 	xdf_ring_destroy(vdp);
3434 	xdf_set_state(vdp, XD_SUSPEND);
3435 	vdp->xdf_suspending = B_FALSE;
3436 
3437 	mutex_exit(&vdp->xdf_dev_lk);
3438 	mutex_exit(&vdp->xdf_cb_lk);
3439 
3440 	if (xdf_debug & SUSRES_DBG)
3441 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3442 
3443 	return (DDI_SUCCESS);
3444 }
3445 
3446 static int
3447 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3448 {
3449 	xdf_t *vdp;
3450 	int instance;
3451 
3452 	switch (cmd) {
3453 
3454 	case DDI_PM_SUSPEND:
3455 		break;
3456 
3457 	case DDI_SUSPEND:
3458 		return (xdf_suspend(dip));
3459 
3460 	case DDI_DETACH:
3461 		break;
3462 
3463 	default:
3464 		return (DDI_FAILURE);
3465 	}
3466 
3467 	instance = ddi_get_instance(dip);
3468 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3469 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3470 
3471 	if (vdp == NULL)
3472 		return (DDI_FAILURE);
3473 
3474 	mutex_enter(&vdp->xdf_cb_lk);
3475 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3476 	if (vdp->xdf_state != XD_CLOSED) {
3477 		mutex_exit(&vdp->xdf_cb_lk);
3478 		return (DDI_FAILURE);
3479 	}
3480 	mutex_exit(&vdp->xdf_cb_lk);
3481 
3482 	ASSERT(!ISDMACBON(vdp));
3483 
3484 #if defined(XPV_HVM_DRIVER)
3485 	xdf_hvm_rm(dip);
3486 #endif /* XPV_HVM_DRIVER */
3487 
3488 	if (vdp->xdf_timeout_id != 0)
3489 		(void) untimeout(vdp->xdf_timeout_id);
3490 
3491 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3492 	ddi_taskq_destroy(vdp->xdf_ready_tq);
3493 
3494 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3495 	cmlb_free_handle(&vdp->xdf_vd_lbl);
3496 
3497 	/* we'll support backend running in domU later */
3498 #ifdef	DOMU_BACKEND
3499 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3500 #endif
3501 
3502 	list_destroy(&vdp->xdf_vreq_act);
3503 	ddi_prop_remove_all(dip);
3504 	xdf_kstat_delete(dip);
3505 	ddi_remove_softintr(vdp->xdf_softintr_id);
3506 	xvdi_remove_xb_watch_handlers(dip);
3507 	ddi_set_driver_private(dip, NULL);
3508 	cv_destroy(&vdp->xdf_dev_cv);
3509 	mutex_destroy(&vdp->xdf_cb_lk);
3510 	mutex_destroy(&vdp->xdf_dev_lk);
3511 	if (vdp->xdf_cache_flush_block != NULL)
3512 		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3513 	ddi_soft_state_free(xdf_ssp, instance);
3514 	return (DDI_SUCCESS);
3515 }
3516 
3517 /*
3518  * Driver linkage structures.
3519  */
/*
 * Character/block entry points for xdf.  The initializer is positional;
 * the slot names in the trailing comments follow cb_ops(9S).
 */
static struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
3540 
/*
 * Device operations vector for xdf.  It ties the autoconfiguration entry
 * points (xdf_getinfo, xdf_attach, xdf_detach) to the cb_ops table above
 * and is referenced by the modldrv structure below.
 */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported, /* devo_quiesce */
};
3555 
3556 /*
3557  * Module linkage structures.
3558  */
/* Loadable-driver linkage: describes xdf as a device driver module. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver",	/* short description */
	&xdf_devops		/* driver specific ops */
};
3564 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info() by
 * the module entry points; its linkage list holds only modldrv and is
 * NULL-terminated.
 */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
3568 
3569 /*
3570  * standard module entry points
3571  */
3572 int
3573 _init(void)
3574 {
3575 	int rc;
3576 
3577 	xdf_major = ddi_name_to_major("xdf");
3578 	if (xdf_major == (major_t)-1)
3579 		return (EINVAL);
3580 
3581 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3582 		return (rc);
3583 
3584 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3585 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3586 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3587 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3588 
3589 #if defined(XPV_HVM_DRIVER)
3590 	xdf_hvm_init();
3591 #endif /* XPV_HVM_DRIVER */
3592 
3593 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3594 #if defined(XPV_HVM_DRIVER)
3595 		xdf_hvm_fini();
3596 #endif /* XPV_HVM_DRIVER */
3597 		kmem_cache_destroy(xdf_vreq_cache);
3598 		kmem_cache_destroy(xdf_gs_cache);
3599 		ddi_soft_state_fini(&xdf_ssp);
3600 		return (rc);
3601 	}
3602 
3603 	return (rc);
3604 }
3605 
3606 int
3607 _fini(void)
3608 {
3609 	int err;
3610 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3611 		return (err);
3612 
3613 #if defined(XPV_HVM_DRIVER)
3614 	xdf_hvm_fini();
3615 #endif /* XPV_HVM_DRIVER */
3616 
3617 	kmem_cache_destroy(xdf_vreq_cache);
3618 	kmem_cache_destroy(xdf_gs_cache);
3619 	ddi_soft_state_fini(&xdf_ssp);
3620 
3621 	return (0);
3622 }
3623 
3624 int
3625 _info(struct modinfo *modinfop)
3626 {
3627 	return (mod_info(&xdf_modlinkage, modinfop));
3628 }
3629