xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision c211fc479225fa54805cf480633bf6689ca9a2db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  *
 * This driver exports Solaris disk device nodes, accepts IO requests from
34  * those nodes, and services those requests by talking to a backend device
35  * in another domain.
36  *
37  * Communication with the backend device is done via a ringbuffer (which is
38  * managed via xvdi interfaces) and dma memory (which is managed via ddi
39  * interfaces).
40  *
 * Communication with the backend device is dependent upon establishing a
42  * connection to the backend device.  This connection process involves
43  * reading device configuration information from xenbus and publishing
44  * some frontend runtime configuration parameters via the xenbus (for
45  * consumption by the backend).  Once we've published runtime configuration
46  * information via the xenbus, the backend device can enter the connected
47  * state and we'll enter the XD_CONNECTED state.  But before we can allow
48  * random IO to begin, we need to do IO to the backend device to determine
49  * the device label and if flush operations are supported.  Once this is
50  * done we enter the XD_READY state and can process any IO operations.
51  *
 * We receive notifications of xenbus state changes for the backend device
 * (aka, the "other end") via the xdf_oe_change() callback.  This callback
 * is single threaded, meaning that we can't receive new notification of
 * other end state changes while we're processing an outstanding
 * notification of an other end state change.  Therefore we can't do any
 * blocking operations from the xdf_oe_change() callback.  This is why we
 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
 * generated by the xdf_ready_tq_thread thread have priority over all
 * other IO requests.
64  *
65  * We also communicate with the backend device via the xenbus "media-req"
66  * (XBP_MEDIA_REQ) property.  For more information on this see the
67  * comments in blkif.h.
68  */
69 
70 #include <io/xdf.h>
71 
72 #include <sys/conf.h>
73 #include <sys/dkio.h>
74 #include <sys/promif.h>
75 #include <sys/sysmacros.h>
76 #include <sys/kstat.h>
77 #include <sys/mach_mmu.h>
78 #ifdef XPV_HVM_DRIVER
79 #include <sys/xpv_support.h>
80 #include <sys/sunndi.h>
81 #else /* !XPV_HVM_DRIVER */
82 #include <sys/evtchn_impl.h>
83 #endif /* !XPV_HVM_DRIVER */
84 #include <public/io/xenbus.h>
85 #include <xen/sys/xenbus_impl.h>
86 #include <sys/scsi/generic/inquiry.h>
87 #include <xen/io/blkif_impl.h>
88 #include <sys/fdio.h>
89 #include <sys/cdio.h>
90 
/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements.
 *
 * When DEBUG is defined the argument is evaluated as a parenthesized
 * expression; otherwise the macro expands to nothing, so the argument
 * (including any side effects it has) disappears from the build.
 */
#ifdef DEBUG
#define	DEBUG_EVAL(x)	(x)
#else /* !DEBUG */
#define	DEBUG_EVAL(x)
#endif /* !DEBUG */
100 
/*
 * Ring drain polling parameters used by xdf_ring_drain_locked().
 * NOTE: despite its name, XDF_DRAIN_MSEC_DELAY is handed to
 * drv_usectohz(), i.e. the value is expressed in microseconds.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */

/* domain id value meaning "no peer domain" */
#define	INVALID_DOMID	((domid_t)-1)
/* values stored in v_req_t.v_flush_diskcache by vreq_setup() */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/* barrier feature is set but the flush-supported flag is not */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* both the barrier feature and the flush-supported flag are set */
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A write whose data pointer is the driver's own cached copy of the
 * flush block (xdf_cache_flush_block) while write barriers are in use.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A zero-length write while flush-diskcache is in use.
 * NOTE(review): this macro only takes 'bp' but its expansion references
 * 'vdp', so it can only be used where a variable named 'vdp' is in scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
117 
/*
 * A vreq is done once its current DMA window has completed
 * (VREQ_DMAWIN_DONE) and either it is a flush-diskcache request or the
 * completed window was the last one (v_dmaw + 1 == v_ndmaws).
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t associated with a buf is stashed in the buf's av_back field */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
125 
/* defined elsewhere; when non-zero xdf_intr() does not restart IO itself */
extern int		do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int		xdf_debug = 0;	/* bitmask tested against *_DBG flags */
volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t			xdf_major;
static void		*xdf_ssp;
static kmem_cache_t	*xdf_vreq_cache;	/* backs v_req_t allocations */
static kmem_cache_t	*xdf_gs_cache;		/* backs ge_slot_t allocations */
static int		xdf_maxphys = XB_MAXPHYS;
/* block number compared against writes in check_fbwrite() */
static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int		xdf_fbrewrites;	/* flush block re-write count */
140 
/* misc public functions (used by xdf_shell.c) */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);

/*
 * callbacks from common label; this ops vector is handed to cmlb_attach()
 * by xdf_cmlb_attach()
 */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
154 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * vreq_setup() uses these attributes directly for the IO DMA handle and
 * uses a modified copy (larger segment boundary and s/g list length) for
 * the bounce buffer handle of unaligned transfers.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
173 
/*
 * Device access attributes passed to ddi_dma_mem_alloc() when vreq_setup()
 * allocates the aligned bounce buffer for an unaligned transfer.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
179 
180 static void
181 xdf_timeout_handler(void *arg)
182 {
183 	xdf_t *vdp = arg;
184 
185 	mutex_enter(&vdp->xdf_dev_lk);
186 	vdp->xdf_timeout_id = 0;
187 	mutex_exit(&vdp->xdf_dev_lk);
188 
189 	/* new timeout thread could be re-scheduled */
190 	xdf_io_start(vdp);
191 }
192 
193 /*
194  * callback func when DMA/GTE resources is available
195  *
196  * Note: we only register one callback function to grant table subsystem
197  * since we only have one 'struct gnttab_free_callback' in xdf_t.
198  */
199 static int
200 xdf_dmacallback(caddr_t arg)
201 {
202 	xdf_t *vdp = (xdf_t *)arg;
203 	ASSERT(vdp != NULL);
204 
205 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
206 	    vdp->xdf_addr));
207 
208 	ddi_trigger_softintr(vdp->xdf_softintr_id);
209 	return (DDI_DMA_CALLBACK_DONE);
210 }
211 
212 static ge_slot_t *
213 gs_get(xdf_t *vdp, int isread)
214 {
215 	grant_ref_t gh;
216 	ge_slot_t *gs;
217 
218 	/* try to alloc GTEs needed in this slot, first */
219 	if (gnttab_alloc_grant_references(
220 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
221 		if (vdp->xdf_gnt_callback.next == NULL) {
222 			SETDMACBON(vdp);
223 			gnttab_request_free_callback(
224 			    &vdp->xdf_gnt_callback,
225 			    (void (*)(void *))xdf_dmacallback,
226 			    (void *)vdp,
227 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
228 		}
229 		return (NULL);
230 	}
231 
232 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
233 	if (gs == NULL) {
234 		gnttab_free_grant_references(gh);
235 		if (vdp->xdf_timeout_id == 0)
236 			/* restart I/O after one second */
237 			vdp->xdf_timeout_id =
238 			    timeout(xdf_timeout_handler, vdp, hz);
239 		return (NULL);
240 	}
241 
242 	/* init gs_slot */
243 	gs->gs_oeid = vdp->xdf_peer;
244 	gs->gs_isread = isread;
245 	gs->gs_ghead = gh;
246 	gs->gs_ngrefs = 0;
247 
248 	return (gs);
249 }
250 
251 static void
252 gs_free(ge_slot_t *gs)
253 {
254 	int		i;
255 
256 	/* release all grant table entry resources used in this slot */
257 	for (i = 0; i < gs->gs_ngrefs; i++)
258 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
259 	gnttab_free_grant_references(gs->gs_ghead);
260 	list_remove(&gs->gs_vreq->v_gs, gs);
261 	kmem_cache_free(xdf_gs_cache, gs);
262 }
263 
264 static grant_ref_t
265 gs_grant(ge_slot_t *gs, mfn_t mfn)
266 {
267 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
268 
269 	ASSERT(gr != -1);
270 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
271 	gs->gs_ge[gs->gs_ngrefs++] = gr;
272 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
273 
274 	return (gr);
275 }
276 
277 /*
278  * Alloc a vreq for this bp
279  * bp->av_back contains the pointer to the vreq upon return
280  */
281 static v_req_t *
282 vreq_get(xdf_t *vdp, buf_t *bp)
283 {
284 	v_req_t *vreq = NULL;
285 
286 	ASSERT(BP_VREQ(bp) == NULL);
287 
288 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
289 	if (vreq == NULL) {
290 		if (vdp->xdf_timeout_id == 0)
291 			/* restart I/O after one second */
292 			vdp->xdf_timeout_id =
293 			    timeout(xdf_timeout_handler, vdp, hz);
294 		return (NULL);
295 	}
296 	bzero(vreq, sizeof (v_req_t));
297 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
298 	    offsetof(ge_slot_t, gs_vreq_link));
299 	vreq->v_buf = bp;
300 	vreq->v_status = VREQ_INIT;
301 	vreq->v_runq = B_FALSE;
302 	BP_VREQ_SET(bp, vreq);
303 	/* init of other fields in vreq is up to the caller */
304 
305 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
306 
307 	return (vreq);
308 }
309 
310 static void
311 vreq_free(xdf_t *vdp, v_req_t *vreq)
312 {
313 	buf_t	*bp = vreq->v_buf;
314 
315 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
316 	ASSERT(BP_VREQ(bp) == vreq);
317 
318 	list_remove(&vdp->xdf_vreq_act, vreq);
319 
320 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
321 		goto done;
322 
323 	switch (vreq->v_status) {
324 	case VREQ_DMAWIN_DONE:
325 	case VREQ_GS_ALLOCED:
326 	case VREQ_DMABUF_BOUND:
327 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
328 		/*FALLTHRU*/
329 	case VREQ_DMAMEM_ALLOCED:
330 		if (!ALIGNED_XFER(bp)) {
331 			ASSERT(vreq->v_abuf != NULL);
332 			if (!IS_ERROR(bp) && IS_READ(bp))
333 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
334 				    bp->b_bcount);
335 			ddi_dma_mem_free(&vreq->v_align);
336 		}
337 		/*FALLTHRU*/
338 	case VREQ_MEMDMAHDL_ALLOCED:
339 		if (!ALIGNED_XFER(bp))
340 			ddi_dma_free_handle(&vreq->v_memdmahdl);
341 		/*FALLTHRU*/
342 	case VREQ_DMAHDL_ALLOCED:
343 		ddi_dma_free_handle(&vreq->v_dmahdl);
344 		break;
345 	default:
346 		break;
347 	}
348 done:
349 	ASSERT(!vreq->v_runq);
350 	list_destroy(&vreq->v_gs);
351 	kmem_cache_free(xdf_vreq_cache, vreq);
352 }
353 
354 /*
355  * Snarf new data if our flush block was re-written
356  */
357 static void
358 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
359 {
360 	int nblks;
361 	boolean_t mapin;
362 
363 	if (IS_WRITE_BARRIER(vdp, bp))
364 		return; /* write was a flush write */
365 
366 	mapin = B_FALSE;
367 	nblks = bp->b_bcount >> DEV_BSHIFT;
368 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
369 		xdf_fbrewrites++;
370 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
371 			mapin = B_TRUE;
372 			bp_mapin(bp);
373 		}
374 		bcopy(bp->b_un.b_addr +
375 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
376 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
377 		if (mapin)
378 			bp_mapout(bp);
379 	}
380 }
381 
/*
 * Initialize the DMA and grant table resources for the buf
 *
 * This is a resumable state machine keyed on vreq->v_status.  Each case
 * acquires one resource, records the progress in v_status and falls
 * through to the next step.  When a step fails DDI_FAILURE is returned
 * with v_status left at the last completed step, so that a later call
 * with the same vreq resumes where this one stopped (and so that
 * vreq_free() knows what to release).
 *
 * Returns DDI_SUCCESS once the vreq has a ge_slot for its current DMA
 * window (v_status == VREQ_GS_ALLOCED), DDI_FAILURE otherwise.
 */
static int
vreq_setup(xdf_t *vdp, v_req_t *vreq)
{
	int rc;
	ddi_dma_attr_t dmaattr;
	uint_t ndcs, ndws;
	ddi_dma_handle_t dh;
	ddi_dma_handle_t mdh;
	ddi_dma_cookie_t dc;
	ddi_acc_handle_t abh;
	caddr_t	aba;
	ge_slot_t *gs;
	size_t bufsz;
	off_t off;
	size_t sz;
	buf_t *bp = vreq->v_buf;
	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;

	switch (vreq->v_status) {
	case VREQ_INIT:
		/*
		 * A flush-diskcache request carries no data, so it needs no
		 * DMA resources: grab a single ge_slot and we're done.
		 * (IS_FLUSH_DISKCACHE() implicitly uses the local 'vdp'.)
		 */
		if (IS_FLUSH_DISKCACHE(bp)) {
			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "get ge_slotfailed\n", vdp->xdf_addr));
				return (DDI_FAILURE);
			}
			vreq->v_blkno = 0;
			vreq->v_nslots = 1;
			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
			vreq->v_status = VREQ_GS_ALLOCED;
			gs->gs_vreq = vreq;
			list_insert_head(&vreq->v_gs, gs);
			return (DDI_SUCCESS);
		}

		if (IS_WRITE_BARRIER(vdp, bp))
			vreq->v_flush_diskcache = WRITE_BARRIER;
		/* starting block: b_blkno plus the offset kept in b_private */
		vreq->v_blkno = bp->b_blkno +
		    (diskaddr_t)(uintptr_t)bp->b_private;
		/* See if we wrote new data to our flush block */
		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
			check_fbwrite(vdp, bp, vreq->v_blkno);
		vreq->v_status = VREQ_INIT_DONE;
		/*FALLTHRU*/

	case VREQ_INIT_DONE:
		/*
		 * alloc DMA handle
		 */
		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
		    xdf_dmacallback, (caddr_t)vdp, &dh);
		if (rc != DDI_SUCCESS) {
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		vreq->v_dmahdl = dh;
		vreq->v_status = VREQ_DMAHDL_ALLOCED;
		/*FALLTHRU*/

	case VREQ_DMAHDL_ALLOCED:
		/*
		 * alloc dma handle for 512-byte aligned buf
		 * (only needed when the caller's buffer is not aligned; for
		 * aligned transfers v_status is left unchanged here)
		 */
		if (!ALIGNED_XFER(bp)) {
			/*
			 * XXPV: we need to temporarily enlarge the seg
			 * boundary and s/g length to work round CR6381968
			 */
			dmaattr = xb_dma_attr;
			dmaattr.dma_attr_seg = (uint64_t)-1;
			dmaattr.dma_attr_sgllen = INT_MAX;
			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
			    xdf_dmacallback, (caddr_t)vdp, &mdh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "unaligned buf DMAhandle alloc failed\n",
				    vdp->xdf_addr));
				return (DDI_FAILURE);
			}
			vreq->v_memdmahdl = mdh;
			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
		}
		/*FALLTHRU*/

	case VREQ_MEMDMAHDL_ALLOCED:
		/*
		 * alloc 512-byte aligned buf
		 * (a bounce buffer that stands in for the unaligned caller
		 * buffer during the transfer)
		 */
		if (!ALIGNED_XFER(bp)) {
			/* we need a kernel address to copy to/from */
			if (bp->b_flags & (B_PAGEIO | B_PHYS))
				bp_mapin(bp);

			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
			    &aba, &bufsz, &abh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "DMA mem allocation failed\n",
				    vdp->xdf_addr));
				return (DDI_FAILURE);
			}

			vreq->v_abuf = aba;
			vreq->v_align = abh;
			vreq->v_status = VREQ_DMAMEM_ALLOCED;

			ASSERT(bufsz >= bp->b_bcount);
			/* for writes, stage the data in the bounce buffer */
			if (!IS_READ(bp))
				bcopy(bp->b_un.b_addr, vreq->v_abuf,
				    bp->b_bcount);
		}
		/*FALLTHRU*/

	case VREQ_DMAMEM_ALLOCED:
		/*
		 * dma bind
		 * (the buf itself for aligned transfers, the bounce buffer
		 * otherwise)
		 */
		if (ALIGNED_XFER(bp)) {
			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
			    &dc, &ndcs);
		} else {
			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
		}
		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
			/* get num of dma windows */
			if (rc == DDI_DMA_PARTIAL_MAP) {
				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
				ASSERT(rc == DDI_SUCCESS);
			} else {
				ndws = 1;
			}
		} else {
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		/* start at the first window; one ge_slot per DMA window */
		vreq->v_dmac = dc;
		vreq->v_dmaw = 0;
		vreq->v_ndmacs = ndcs;
		vreq->v_ndmaws = ndws;
		vreq->v_nslots = ndws;
		vreq->v_status = VREQ_DMABUF_BOUND;
		/*FALLTHRU*/

	case VREQ_DMABUF_BOUND:
		/*
		 * get ge_slot, callback is set upon failure from gs_get(),
		 * if not set previously
		 */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		vreq->v_status = VREQ_GS_ALLOCED;
		gs->gs_vreq = vreq;
		list_insert_head(&vreq->v_gs, gs);
		break;

	case VREQ_GS_ALLOCED:
		/* nothing need to be done */
		break;

	case VREQ_DMAWIN_DONE:
		/*
		 * move to the next dma window
		 */
		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);

		/* get a ge_slot for this DMA window */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		/* only advance the window once the slot is in hand */
		vreq->v_dmaw++;
		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
		vreq->v_status = VREQ_GS_ALLOCED;
		gs->gs_vreq = vreq;
		list_insert_head(&vreq->v_gs, gs);
		break;

	default:
		/* unknown state */
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
588 
589 static int
590 xdf_cmlb_attach(xdf_t *vdp)
591 {
592 	dev_info_t	*dip = vdp->xdf_dip;
593 
594 	return (cmlb_attach(dip, &xdf_lb_ops,
595 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
596 	    XD_IS_RM(vdp),
597 	    B_TRUE,
598 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
599 #if defined(XPV_HVM_DRIVER)
600 	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
601 	    CMLB_INTERNAL_MINOR_NODES,
602 #else /* !XPV_HVM_DRIVER */
603 	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
604 #endif /* !XPV_HVM_DRIVER */
605 	    vdp->xdf_vd_lbl, NULL));
606 }
607 
608 static void
609 xdf_io_err(buf_t *bp, int err, size_t resid)
610 {
611 	bioerror(bp, err);
612 	if (resid == 0)
613 		bp->b_resid = bp->b_bcount;
614 	biodone(bp);
615 }
616 
617 static void
618 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
619 {
620 	v_req_t *vreq = BP_VREQ(bp);
621 
622 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
623 
624 	if (vdp->xdf_xdev_iostat == NULL)
625 		return;
626 	if ((vreq != NULL) && vreq->v_runq) {
627 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
628 	} else {
629 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
630 	}
631 }
632 
633 static void
634 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
635 {
636 	v_req_t *vreq = BP_VREQ(bp);
637 
638 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
639 
640 	if (vdp->xdf_xdev_iostat == NULL)
641 		return;
642 	if ((vreq != NULL) && vreq->v_runq) {
643 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
644 	} else {
645 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
646 	}
647 }
648 
649 static void
650 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
651 {
652 	v_req_t *vreq = BP_VREQ(bp);
653 
654 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
655 	ASSERT(!vreq->v_runq);
656 
657 	vreq->v_runq = B_TRUE;
658 	if (vdp->xdf_xdev_iostat == NULL)
659 		return;
660 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
661 }
662 
663 static void
664 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
665 {
666 	v_req_t *vreq = BP_VREQ(bp);
667 
668 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
669 	ASSERT(vreq->v_runq);
670 
671 	vreq->v_runq = B_FALSE;
672 	if (vdp->xdf_xdev_iostat == NULL)
673 		return;
674 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
675 }
676 
677 int
678 xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
679 {
680 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
681 	kstat_t		*kstat;
682 	buf_t		*bp;
683 
684 	if ((kstat = kstat_create(
685 	    ks_module, instance, NULL, "disk",
686 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
687 		return (-1);
688 
689 	/* See comment about locking in xdf_kstat_delete(). */
690 	mutex_enter(&vdp->xdf_iostat_lk);
691 	mutex_enter(&vdp->xdf_dev_lk);
692 
693 	/* only one kstat can exist at a time */
694 	if (vdp->xdf_xdev_iostat != NULL) {
695 		mutex_exit(&vdp->xdf_dev_lk);
696 		mutex_exit(&vdp->xdf_iostat_lk);
697 		kstat_delete(kstat);
698 		return (-1);
699 	}
700 
701 	vdp->xdf_xdev_iostat = kstat;
702 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
703 	kstat_install(vdp->xdf_xdev_iostat);
704 
705 	/*
706 	 * Now that we've created a kstat, we need to update the waitq and
707 	 * runq counts for the kstat to reflect our current state.
708 	 *
709 	 * For a buf_t structure to be on the runq, it must have a ring
710 	 * buffer slot associated with it.  To get a ring buffer slot the
711 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
712 	 * Then when it is granted a ring buffer slot, v_runq will be set to
713 	 * true.
714 	 *
715 	 * For a buf_t structure to be on the waitq, it must not be on the
716 	 * runq.  So to find all the buf_t's that should be on waitq, we
717 	 * walk the active buf list and add any buf_t's which aren't on the
718 	 * runq to the waitq.
719 	 */
720 	bp = vdp->xdf_f_act;
721 	while (bp != NULL) {
722 		xdf_kstat_enter(vdp, bp);
723 		bp = bp->av_forw;
724 	}
725 	if (vdp->xdf_ready_tq_bp != NULL)
726 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
727 
728 	mutex_exit(&vdp->xdf_dev_lk);
729 	mutex_exit(&vdp->xdf_iostat_lk);
730 	return (0);
731 }
732 
733 void
734 xdf_kstat_delete(dev_info_t *dip)
735 {
736 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
737 	kstat_t		*kstat;
738 	buf_t		*bp;
739 
740 	/*
741 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
742 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
743 	 * and the contents of the our kstat.  xdf_iostat_lk is used
744 	 * to protect the allocation and freeing of the actual kstat.
745 	 * xdf_dev_lk can't be used for this purpose because kstat
746 	 * readers use it to access the contents of the kstat and
747 	 * hence it can't be held when calling kstat_delete().
748 	 */
749 	mutex_enter(&vdp->xdf_iostat_lk);
750 	mutex_enter(&vdp->xdf_dev_lk);
751 
752 	if (vdp->xdf_xdev_iostat == NULL) {
753 		mutex_exit(&vdp->xdf_dev_lk);
754 		mutex_exit(&vdp->xdf_iostat_lk);
755 		return;
756 	}
757 
758 	/*
759 	 * We're about to destroy the kstat structures, so it isn't really
760 	 * necessary to update the runq and waitq counts.  But, since this
761 	 * isn't a hot code path we can afford to be a little pedantic and
762 	 * go ahead and decrement the runq and waitq kstat counters to zero
763 	 * before free'ing them.  This helps us ensure that we've gotten all
764 	 * our accounting correct.
765 	 *
766 	 * For an explanation of how we determine which buffers go on the
767 	 * runq vs which go on the waitq, see the comments in
768 	 * xdf_kstat_create().
769 	 */
770 	bp = vdp->xdf_f_act;
771 	while (bp != NULL) {
772 		xdf_kstat_exit(vdp, bp);
773 		bp = bp->av_forw;
774 	}
775 	if (vdp->xdf_ready_tq_bp != NULL)
776 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
777 
778 	kstat = vdp->xdf_xdev_iostat;
779 	vdp->xdf_xdev_iostat = NULL;
780 	mutex_exit(&vdp->xdf_dev_lk);
781 	kstat_delete(kstat);
782 	mutex_exit(&vdp->xdf_iostat_lk);
783 }
784 
785 /*
786  * Add an IO requests onto the active queue.
787  *
788  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
789  * are used to establish a connection to the backend, so they recieve
790  * priority over all other IOs.  Since xdf_ready_tq_thread only does
791  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
792  * given time and we record the buf associated with that request in
793  * xdf_ready_tq_bp.
794  */
795 static void
796 xdf_bp_push(xdf_t *vdp, buf_t *bp)
797 {
798 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
799 	ASSERT(bp->av_forw == NULL);
800 
801 	xdf_kstat_enter(vdp, bp);
802 
803 	if (curthread == vdp->xdf_ready_tq_thread) {
804 		/* new IO requests from the ready thread */
805 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
806 		vdp->xdf_ready_tq_bp = bp;
807 		return;
808 	}
809 
810 	/* this is normal IO request */
811 	ASSERT(bp != vdp->xdf_ready_tq_bp);
812 
813 	if (vdp->xdf_f_act == NULL) {
814 		/* this is only only IO on the active queue */
815 		ASSERT(vdp->xdf_l_act == NULL);
816 		ASSERT(vdp->xdf_i_act == NULL);
817 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
818 		return;
819 	}
820 
821 	/* add this IO to the tail of the active queue */
822 	vdp->xdf_l_act->av_forw = bp;
823 	vdp->xdf_l_act = bp;
824 	if (vdp->xdf_i_act == NULL)
825 		vdp->xdf_i_act = bp;
826 }
827 
828 static void
829 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
830 {
831 	buf_t	*bp_iter;
832 
833 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
834 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
835 
836 	if (vdp->xdf_ready_tq_bp == bp) {
837 		/* we're done with a ready thread IO request */
838 		ASSERT(bp->av_forw == NULL);
839 		vdp->xdf_ready_tq_bp = NULL;
840 		return;
841 	}
842 
843 	/* we're done with a normal IO request */
844 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
845 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
846 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
847 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
848 
849 	if (bp == vdp->xdf_f_act) {
850 		/* This IO was at the head of our active queue. */
851 		vdp->xdf_f_act = bp->av_forw;
852 		if (bp == vdp->xdf_l_act)
853 			vdp->xdf_l_act = NULL;
854 	} else {
855 		/* There IO finished before some other pending IOs. */
856 		bp_iter = vdp->xdf_f_act;
857 		while (bp != bp_iter->av_forw) {
858 			bp_iter = bp_iter->av_forw;
859 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
860 			ASSERT(bp_iter != vdp->xdf_i_act);
861 		}
862 		bp_iter->av_forw = bp->av_forw;
863 		if (bp == vdp->xdf_l_act)
864 			vdp->xdf_l_act = bp_iter;
865 	}
866 	bp->av_forw = NULL;
867 }
868 
869 static buf_t *
870 xdf_bp_next(xdf_t *vdp)
871 {
872 	v_req_t	*vreq;
873 	buf_t	*bp;
874 
875 	if (vdp->xdf_state == XD_CONNECTED) {
876 		/*
877 		 * If we're in the XD_CONNECTED state, we only service IOs
878 		 * from the xdf_ready_tq_thread thread.
879 		 */
880 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
881 			return (NULL);
882 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
883 			return (bp);
884 		return (NULL);
885 	}
886 
887 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
888 	if (vdp->xdf_state != XD_READY)
889 		return (NULL);
890 
891 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
892 	for (;;) {
893 		if ((bp = vdp->xdf_i_act) == NULL)
894 			return (NULL);
895 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
896 			return (bp);
897 
898 		/* advance the active buf index pointer */
899 		vdp->xdf_i_act = bp->av_forw;
900 	}
901 }
902 
903 static void
904 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
905 {
906 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
907 	v_req_t		*vreq = gs->gs_vreq;
908 	buf_t		*bp = vreq->v_buf;
909 
910 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
911 	ASSERT(BP_VREQ(bp) == vreq);
912 
913 	gs_free(gs);
914 
915 	if (bioerr != 0)
916 		bioerror(bp, bioerr);
917 	ASSERT(vreq->v_nslots > 0);
918 	if (--vreq->v_nslots > 0)
919 		return;
920 
921 	/* remove this IO from our active queue */
922 	xdf_bp_pop(vdp, bp);
923 
924 	ASSERT(vreq->v_runq);
925 	xdf_kstat_exit(vdp, bp);
926 	vreq->v_runq = B_FALSE;
927 	vreq_free(vdp, vreq);
928 
929 	if (IS_ERROR(bp)) {
930 		xdf_io_err(bp, geterror(bp), 0);
931 	} else if (bp->b_resid != 0) {
932 		/* Partial transfers are an error */
933 		xdf_io_err(bp, EIO, bp->b_resid);
934 	} else {
935 		biodone(bp);
936 	}
937 }
938 
939 /*
940  * xdf interrupt handler
941  */
942 static uint_t
943 xdf_intr_locked(xdf_t *vdp)
944 {
945 	xendev_ring_t *xbr;
946 	blkif_response_t *resp;
947 	int bioerr;
948 	uint64_t id;
949 	uint8_t op;
950 	uint16_t status;
951 	ddi_acc_handle_t acchdl;
952 
953 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
954 
955 	if ((xbr = vdp->xdf_xb_ring) == NULL)
956 		return (DDI_INTR_UNCLAIMED);
957 
958 	acchdl = vdp->xdf_xb_ring_hdl;
959 
960 	/*
961 	 * complete all requests which have a response
962 	 */
963 	while (resp = xvdi_ring_get_response(xbr)) {
964 		id = ddi_get64(acchdl, &resp->id);
965 		op = ddi_get8(acchdl, &resp->operation);
966 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
967 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
968 		    op, id, status));
969 
970 		if (status != BLKIF_RSP_OKAY) {
971 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
972 			    vdp->xdf_addr,
973 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
974 			bioerr = EIO;
975 		} else {
976 			bioerr = 0;
977 		}
978 
979 		xdf_io_fini(vdp, id, bioerr);
980 	}
981 	return (DDI_INTR_CLAIMED);
982 }
983 
984 static uint_t
985 xdf_intr(caddr_t arg)
986 {
987 	xdf_t *vdp = (xdf_t *)arg;
988 	int rv;
989 
990 	mutex_enter(&vdp->xdf_dev_lk);
991 	rv = xdf_intr_locked(vdp);
992 	mutex_exit(&vdp->xdf_dev_lk);
993 
994 	if (!do_polled_io)
995 		xdf_io_start(vdp);
996 
997 	return (rv);
998 }
999 
1000 static void
1001 xdf_ring_push(xdf_t *vdp)
1002 {
1003 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1004 
1005 	if (vdp->xdf_xb_ring == NULL)
1006 		return;
1007 
1008 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1009 		DPRINTF(IO_DBG, (
1010 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1011 		    vdp->xdf_addr));
1012 	}
1013 
1014 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1015 		xvdi_notify_oe(vdp->xdf_dip);
1016 }
1017 
1018 static int
1019 xdf_ring_drain_locked(xdf_t *vdp)
1020 {
1021 	int		pollc, rv = 0;
1022 
1023 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1024 
1025 	if (xdf_debug & SUSRES_DBG)
1026 		xen_printf("xdf_ring_drain: start\n");
1027 
1028 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1029 		if (vdp->xdf_xb_ring == NULL)
1030 			goto out;
1031 
1032 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1033 			(void) xdf_intr_locked(vdp);
1034 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1035 			goto out;
1036 		xdf_ring_push(vdp);
1037 
1038 		/* file-backed devices can be slow */
1039 		mutex_exit(&vdp->xdf_dev_lk);
1040 #ifdef XPV_HVM_DRIVER
1041 		(void) HYPERVISOR_yield();
1042 #endif /* XPV_HVM_DRIVER */
1043 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1044 		mutex_enter(&vdp->xdf_dev_lk);
1045 	}
1046 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1047 
1048 out:
1049 	if (vdp->xdf_xb_ring != NULL) {
1050 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1051 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1052 			rv = EIO;
1053 	}
1054 	if (xdf_debug & SUSRES_DBG)
1055 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1056 		    vdp->xdf_addr, rv);
1057 	return (rv);
1058 }
1059 
1060 static int
1061 xdf_ring_drain(xdf_t *vdp)
1062 {
1063 	int rv;
1064 	mutex_enter(&vdp->xdf_dev_lk);
1065 	rv = xdf_ring_drain_locked(vdp);
1066 	mutex_exit(&vdp->xdf_dev_lk);
1067 	return (rv);
1068 }
1069 
1070 /*
1071  * Destroy all v_req_t, grant table entries, and our ring buffer.
1072  */
1073 static void
1074 xdf_ring_destroy(xdf_t *vdp)
1075 {
1076 	v_req_t		*vreq;
1077 	buf_t		*bp;
1078 	ge_slot_t	*gs;
1079 
1080 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1081 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1082 
1083 	if ((vdp->xdf_state != XD_INIT) &&
1084 	    (vdp->xdf_state != XD_CONNECTED) &&
1085 	    (vdp->xdf_state != XD_READY)) {
1086 		ASSERT(vdp->xdf_xb_ring == NULL);
1087 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1088 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1089 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1090 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1091 		return;
1092 	}
1093 
1094 	/*
1095 	 * We don't want to recieve async notifications from the backend
1096 	 * when it finishes processing ring entries.
1097 	 */
1098 #ifdef XPV_HVM_DRIVER
1099 	ec_unbind_evtchn(vdp->xdf_evtchn);
1100 #else /* !XPV_HVM_DRIVER */
1101 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1102 #endif /* !XPV_HVM_DRIVER */
1103 
1104 	/*
1105 	 * Drain any requests in the ring.  We need to do this before we
1106 	 * can free grant table entries, because if active ring entries
1107 	 * point to grants, then the backend could be trying to access
1108 	 * those grants.
1109 	 */
1110 	(void) xdf_ring_drain_locked(vdp);
1111 
1112 	/* We're done talking to the backend so free up our event channel */
1113 	xvdi_free_evtchn(vdp->xdf_dip);
1114 	vdp->xdf_evtchn = INVALID_EVTCHN;
1115 
1116 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1117 		bp = vreq->v_buf;
1118 		ASSERT(BP_VREQ(bp) == vreq);
1119 
1120 		/* Free up any grant table entries associaed with this IO */
1121 		while ((gs = list_head(&vreq->v_gs)) != NULL)
1122 			gs_free(gs);
1123 
1124 		/* If this IO was on the runq, move it back to the waitq. */
1125 		if (vreq->v_runq)
1126 			xdf_kstat_runq_to_waitq(vdp, bp);
1127 
1128 		/*
1129 		 * Reset any buf IO state since we're going to re-issue the
1130 		 * IO when we reconnect.
1131 		 */
1132 		vreq_free(vdp, vreq);
1133 		BP_VREQ_SET(bp, NULL);
1134 		bioerror(bp, 0);
1135 	}
1136 
1137 	/* reset the active queue index pointer */
1138 	vdp->xdf_i_act = vdp->xdf_f_act;
1139 
1140 	/* Destroy the ring */
1141 	xvdi_free_ring(vdp->xdf_xb_ring);
1142 	vdp->xdf_xb_ring = NULL;
1143 	vdp->xdf_xb_ring_hdl = NULL;
1144 	vdp->xdf_peer = INVALID_DOMID;
1145 }
1146 
1147 void
1148 xdfmin(struct buf *bp)
1149 {
1150 	if (bp->b_bcount > xdf_maxphys)
1151 		bp->b_bcount = xdf_maxphys;
1152 }
1153 
1154 /*
1155  * Check if we have a pending "eject" media request.
1156  */
1157 static int
1158 xdf_eject_pending(xdf_t *vdp)
1159 {
1160 	dev_info_t	*dip = vdp->xdf_dip;
1161 	char		*xsname, *str;
1162 
1163 	if (!vdp->xdf_media_req_supported)
1164 		return (B_FALSE);
1165 
1166 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1167 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1168 		return (B_FALSE);
1169 
1170 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1171 		strfree(str);
1172 		return (B_FALSE);
1173 	}
1174 	strfree(str);
1175 	return (B_TRUE);
1176 }
1177 
1178 /*
1179  * Generate a media request.
1180  */
1181 static int
1182 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1183 {
1184 	dev_info_t	*dip = vdp->xdf_dip;
1185 	char		*xsname;
1186 
1187 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1188 
1189 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1190 		return (ENXIO);
1191 
1192 	/* Check if we support media requests */
1193 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1194 		return (ENOTTY);
1195 
1196 	/* If an eject is pending then don't allow any new requests */
1197 	if (xdf_eject_pending(vdp))
1198 		return (ENXIO);
1199 
1200 	/* Make sure that there is media present */
1201 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1202 		return (ENXIO);
1203 
1204 	/* We only allow operations when the device is ready and connected */
1205 	if (vdp->xdf_state != XD_READY)
1206 		return (EIO);
1207 
1208 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1209 		return (EIO);
1210 
1211 	return (0);
1212 }
1213 
1214 /*
1215  * populate a single blkif_request_t w/ a buf
1216  */
1217 static void
1218 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1219 {
1220 	grant_ref_t	gr;
1221 	uint8_t		fsect, lsect;
1222 	size_t		bcnt;
1223 	paddr_t		dma_addr;
1224 	off_t		blk_off;
1225 	dev_info_t	*dip = vdp->xdf_dip;
1226 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1227 	v_req_t		*vreq = BP_VREQ(bp);
1228 	uint64_t	blkno = vreq->v_blkno;
1229 	uint_t		ndmacs = vreq->v_ndmacs;
1230 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1231 	int		seg = 0;
1232 	int		isread = IS_READ(bp);
1233 	ge_slot_t	*gs = list_head(&vreq->v_gs);
1234 
1235 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1236 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1237 
1238 	if (isread)
1239 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1240 	else {
1241 		switch (vreq->v_flush_diskcache) {
1242 		case FLUSH_DISKCACHE:
1243 			ddi_put8(acchdl, &rreq->operation,
1244 			    BLKIF_OP_FLUSH_DISKCACHE);
1245 			ddi_put16(acchdl, &rreq->handle, vdev);
1246 			ddi_put64(acchdl, &rreq->id,
1247 			    (uint64_t)(uintptr_t)(gs));
1248 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1249 			vreq->v_status = VREQ_DMAWIN_DONE;
1250 			return;
1251 		case WRITE_BARRIER:
1252 			ddi_put8(acchdl, &rreq->operation,
1253 			    BLKIF_OP_WRITE_BARRIER);
1254 			break;
1255 		default:
1256 			if (!vdp->xdf_wce)
1257 				ddi_put8(acchdl, &rreq->operation,
1258 				    BLKIF_OP_WRITE_BARRIER);
1259 			else
1260 				ddi_put8(acchdl, &rreq->operation,
1261 				    BLKIF_OP_WRITE);
1262 			break;
1263 		}
1264 	}
1265 
1266 	ddi_put16(acchdl, &rreq->handle, vdev);
1267 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1268 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1269 
1270 	/*
1271 	 * loop until all segments are populated or no more dma cookie in buf
1272 	 */
1273 	for (;;) {
1274 		/*
1275 		 * Each segment of a blkif request can transfer up to
1276 		 * one 4K page of data.
1277 		 */
1278 		bcnt = vreq->v_dmac.dmac_size;
1279 		dma_addr = vreq->v_dmac.dmac_laddress;
1280 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1281 		fsect = blk_off >> XB_BSHIFT;
1282 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1283 
1284 		ASSERT(bcnt <= PAGESIZE);
1285 		ASSERT((bcnt % XB_BSIZE) == 0);
1286 		ASSERT((blk_off & XB_BMASK) == 0);
1287 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1288 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1289 
1290 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1291 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1292 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1293 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1294 
1295 		DPRINTF(IO_DBG, (
1296 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1297 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1298 		DPRINTF(IO_DBG, (
1299 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1300 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1301 
1302 		blkno += (bcnt >> XB_BSHIFT);
1303 		seg++;
1304 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1305 		if (--ndmacs) {
1306 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1307 			continue;
1308 		}
1309 
1310 		vreq->v_status = VREQ_DMAWIN_DONE;
1311 		vreq->v_blkno = blkno;
1312 		break;
1313 	}
1314 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1315 	DPRINTF(IO_DBG, (
1316 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1317 	    vdp->xdf_addr, rreq->id));
1318 }
1319 
1320 static void
1321 xdf_io_start(xdf_t *vdp)
1322 {
1323 	struct buf	*bp;
1324 	v_req_t		*vreq;
1325 	blkif_request_t	*rreq;
1326 	boolean_t	rreqready = B_FALSE;
1327 
1328 	mutex_enter(&vdp->xdf_dev_lk);
1329 
1330 	/*
1331 	 * Populate the ring request(s).  Loop until there is no buf to
1332 	 * transfer or no free slot available in I/O ring.
1333 	 */
1334 	for (;;) {
1335 		/* don't start any new IO if we're suspending */
1336 		if (vdp->xdf_suspending)
1337 			break;
1338 		if ((bp = xdf_bp_next(vdp)) == NULL)
1339 			break;
1340 
1341 		/* if the buf doesn't already have a vreq, allocate one */
1342 		if (((vreq = BP_VREQ(bp)) == NULL) &&
1343 		    ((vreq = vreq_get(vdp, bp)) == NULL))
1344 			break;
1345 
1346 		/* alloc DMA/GTE resources */
1347 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1348 			break;
1349 
1350 		/* get next blkif_request in the ring */
1351 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1352 			break;
1353 		bzero(rreq, sizeof (blkif_request_t));
1354 		rreqready = B_TRUE;
1355 
1356 		/* populate blkif_request with this buf */
1357 		xdf_process_rreq(vdp, bp, rreq);
1358 
1359 		/*
1360 		 * This buffer/vreq pair is has been allocated a ring buffer
1361 		 * resources, so if it isn't already in our runq, add it.
1362 		 */
1363 		if (!vreq->v_runq)
1364 			xdf_kstat_waitq_to_runq(vdp, bp);
1365 	}
1366 
1367 	/* Send the request(s) to the backend */
1368 	if (rreqready)
1369 		xdf_ring_push(vdp);
1370 
1371 	mutex_exit(&vdp->xdf_dev_lk);
1372 }
1373 
1374 
1375 /* check if partition is open, -1 - check all partitions on the disk */
1376 static boolean_t
1377 xdf_isopen(xdf_t *vdp, int partition)
1378 {
1379 	int i;
1380 	ulong_t parbit;
1381 	boolean_t rval = B_FALSE;
1382 
1383 	ASSERT((partition == -1) ||
1384 	    ((partition >= 0) || (partition < XDF_PEXT)));
1385 
1386 	if (partition == -1)
1387 		parbit = (ulong_t)-1;
1388 	else
1389 		parbit = 1 << partition;
1390 
1391 	for (i = 0; i < OTYPCNT; i++) {
1392 		if (vdp->xdf_vd_open[i] & parbit)
1393 			rval = B_TRUE;
1394 	}
1395 
1396 	return (rval);
1397 }
1398 
1399 /*
1400  * The connection should never be closed as long as someone is holding
1401  * us open, there is pending IO, or someone is waiting waiting for a
1402  * connection.
1403  */
1404 static boolean_t
1405 xdf_busy(xdf_t *vdp)
1406 {
1407 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1408 
1409 	if ((vdp->xdf_xb_ring != NULL) &&
1410 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1411 		ASSERT(vdp->xdf_state != XD_CLOSED);
1412 		return (B_TRUE);
1413 	}
1414 
1415 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1416 		ASSERT(vdp->xdf_state != XD_CLOSED);
1417 		return (B_TRUE);
1418 	}
1419 
1420 	if (xdf_isopen(vdp, -1)) {
1421 		ASSERT(vdp->xdf_state != XD_CLOSED);
1422 		return (B_TRUE);
1423 	}
1424 
1425 	if (vdp->xdf_connect_req > 0) {
1426 		ASSERT(vdp->xdf_state != XD_CLOSED);
1427 		return (B_TRUE);
1428 	}
1429 
1430 	return (B_FALSE);
1431 }
1432 
1433 static void
1434 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1435 {
1436 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1437 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1438 	    vdp->xdf_addr, vdp->xdf_state, new_state));
1439 	vdp->xdf_state = new_state;
1440 	cv_broadcast(&vdp->xdf_dev_cv);
1441 }
1442 
1443 static void
1444 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1445 {
1446 	dev_info_t	*dip = vdp->xdf_dip;
1447 	boolean_t	busy;
1448 
1449 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1450 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1451 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1452 
1453 	/* Check if we're already there. */
1454 	if (vdp->xdf_state == new_state)
1455 		return;
1456 
1457 	mutex_enter(&vdp->xdf_dev_lk);
1458 	busy = xdf_busy(vdp);
1459 
1460 	/* If we're already closed then there's nothing todo. */
1461 	if (vdp->xdf_state == XD_CLOSED) {
1462 		ASSERT(!busy);
1463 		xdf_set_state(vdp, new_state);
1464 		mutex_exit(&vdp->xdf_dev_lk);
1465 		return;
1466 	}
1467 
1468 #ifdef DEBUG
1469 	/* UhOh.  Warn the user that something bad has happened. */
1470 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1471 	    (vdp->xdf_xdev_nblocks != 0)) {
1472 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1473 		    vdp->xdf_addr);
1474 	}
1475 #endif /* DEBUG */
1476 
1477 	xdf_ring_destroy(vdp);
1478 
1479 	/* If we're busy then we can only go into the unknown state */
1480 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1481 	mutex_exit(&vdp->xdf_dev_lk);
1482 
1483 	/* if we're closed now, let the other end know */
1484 	if (vdp->xdf_state == XD_CLOSED)
1485 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1486 }
1487 
1488 
1489 /*
1490  * Kick-off connect process
1491  * Status should be XD_UNKNOWN or XD_CLOSED
1492  * On success, status will be changed to XD_INIT
1493  * On error, it will be changed to XD_UNKNOWN
1494  */
1495 static int
1496 xdf_setstate_init(xdf_t *vdp)
1497 {
1498 	dev_info_t		*dip = vdp->xdf_dip;
1499 	xenbus_transaction_t	xbt;
1500 	grant_ref_t		gref;
1501 	char			*xsname, *str;
1502 	int 			rv;
1503 
1504 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1505 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1506 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1507 	    (vdp->xdf_state == XD_CLOSED));
1508 
1509 	DPRINTF(DDI_DBG,
1510 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1511 
1512 	/*
1513 	 * If an eject is pending then don't allow a new connection, but
1514 	 * we want to return without displaying an error message.
1515 	 */
1516 	if (xdf_eject_pending(vdp)) {
1517 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1518 		return (DDI_FAILURE);
1519 	}
1520 
1521 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1522 		goto errout;
1523 
1524 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1525 		goto errout;
1526 
1527 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1528 
1529 	/*
1530 	 * Sanity check for the existance of the xenbus device-type property.
1531 	 * This property might not exist if we our xenbus device nodes was
1532 	 * force destroyed while we were still connected to the backend.
1533 	 */
1534 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1535 		goto errout;
1536 	strfree(str);
1537 
1538 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1539 		goto errout;
1540 
1541 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1542 #ifdef XPV_HVM_DRIVER
1543 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1544 #else /* !XPV_HVM_DRIVER */
1545 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1546 	    DDI_SUCCESS) {
1547 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1548 		    "failed to add intr handler", vdp->xdf_addr);
1549 		goto errout1;
1550 	}
1551 #endif /* !XPV_HVM_DRIVER */
1552 
1553 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1554 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1555 	    DDI_SUCCESS) {
1556 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1557 		    vdp->xdf_addr);
1558 		goto errout2;
1559 	}
1560 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1561 
1562 	/*
1563 	 * Write into xenstore the info needed by backend
1564 	 */
1565 trans_retry:
1566 	if (xenbus_transaction_start(&xbt)) {
1567 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1568 		    vdp->xdf_addr);
1569 		xvdi_fatal_error(dip, EIO, "connect transaction init");
1570 		goto fail_trans;
1571 	}
1572 
1573 	/*
1574 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1575 	 * domains. However, it is not written for HVM domains, so let's
1576 	 * write it here.
1577 	 */
1578 	if (((rv = xenbus_printf(xbt, xsname,
1579 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1580 	    ((rv = xenbus_printf(xbt, xsname,
1581 	    XBP_RING_REF, "%u", gref)) != 0) ||
1582 	    ((rv = xenbus_printf(xbt, xsname,
1583 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1584 	    ((rv = xenbus_printf(xbt, xsname,
1585 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1586 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1587 		(void) xenbus_transaction_end(xbt, 1);
1588 		xvdi_fatal_error(dip, rv, "connect transaction setup");
1589 		goto fail_trans;
1590 	}
1591 
1592 	/* kick-off connect process */
1593 	if (rv = xenbus_transaction_end(xbt, 0)) {
1594 		if (rv == EAGAIN)
1595 			goto trans_retry;
1596 		xvdi_fatal_error(dip, rv, "connect transaction commit");
1597 		goto fail_trans;
1598 	}
1599 
1600 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1601 	mutex_enter(&vdp->xdf_dev_lk);
1602 	xdf_set_state(vdp, XD_INIT);
1603 	mutex_exit(&vdp->xdf_dev_lk);
1604 
1605 	return (DDI_SUCCESS);
1606 
1607 fail_trans:
1608 	xvdi_free_ring(vdp->xdf_xb_ring);
1609 errout2:
1610 #ifdef XPV_HVM_DRIVER
1611 	ec_unbind_evtchn(vdp->xdf_evtchn);
1612 #else /* !XPV_HVM_DRIVER */
1613 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1614 #endif /* !XPV_HVM_DRIVER */
1615 errout1:
1616 	xvdi_free_evtchn(dip);
1617 	vdp->xdf_evtchn = INVALID_EVTCHN;
1618 errout:
1619 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1620 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1621 	    vdp->xdf_addr);
1622 	return (DDI_FAILURE);
1623 }
1624 
1625 int
1626 xdf_get_flush_block(xdf_t *vdp)
1627 {
1628 	/*
1629 	 * Get a DEV_BSIZE aligned bufer
1630 	 */
1631 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1632 	vdp->xdf_cache_flush_block =
1633 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1634 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1635 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1636 		return (DDI_FAILURE);
1637 	return (DDI_SUCCESS);
1638 }
1639 
1640 static void
1641 xdf_setstate_ready(void *arg)
1642 {
1643 	xdf_t	*vdp = (xdf_t *)arg;
1644 
1645 	vdp->xdf_ready_tq_thread = curthread;
1646 
1647 	/*
1648 	 * We've created all the minor nodes via cmlb_attach() using default
1649 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1650 	 * in case there's anyone (say, booting thread) ever trying to open
1651 	 * it before connected to backend. We will refresh all those minor
1652 	 * nodes w/ latest info we've got now when we are almost connected.
1653 	 */
1654 	mutex_enter(&vdp->xdf_dev_lk);
1655 	if (vdp->xdf_cmbl_reattach) {
1656 		vdp->xdf_cmbl_reattach = B_FALSE;
1657 
1658 		mutex_exit(&vdp->xdf_dev_lk);
1659 		if (xdf_cmlb_attach(vdp) != 0) {
1660 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1661 			return;
1662 		}
1663 		mutex_enter(&vdp->xdf_dev_lk);
1664 	}
1665 
1666 	/* If we're not still trying to get to the ready state, then bail. */
1667 	if (vdp->xdf_state != XD_CONNECTED) {
1668 		mutex_exit(&vdp->xdf_dev_lk);
1669 		return;
1670 	}
1671 	mutex_exit(&vdp->xdf_dev_lk);
1672 
1673 	/*
1674 	 * If backend has feature-barrier, see if it supports disk
1675 	 * cache flush op.
1676 	 */
1677 	vdp->xdf_flush_supported = B_FALSE;
1678 	if (vdp->xdf_feature_barrier) {
1679 		/*
1680 		 * Pretend we already know flush is supported so probe
1681 		 * will attempt the correct op.
1682 		 */
1683 		vdp->xdf_flush_supported = B_TRUE;
1684 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1685 			vdp->xdf_flush_supported = B_TRUE;
1686 		} else {
1687 			vdp->xdf_flush_supported = B_FALSE;
1688 			/*
1689 			 * If the other end does not support the cache flush op
1690 			 * then we must use a barrier-write to force disk
1691 			 * cache flushing.  Barrier writes require that a data
1692 			 * block actually be written.
1693 			 * Cache a block to barrier-write when we are
1694 			 * asked to perform a flush.
1695 			 * XXX - would it be better to just copy 1 block
1696 			 * (512 bytes) from whatever write we did last
1697 			 * and rewrite that block?
1698 			 */
1699 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1700 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1701 				return;
1702 			}
1703 		}
1704 	}
1705 
1706 	mutex_enter(&vdp->xdf_cb_lk);
1707 	mutex_enter(&vdp->xdf_dev_lk);
1708 	if (vdp->xdf_state == XD_CONNECTED)
1709 		xdf_set_state(vdp, XD_READY);
1710 	mutex_exit(&vdp->xdf_dev_lk);
1711 
1712 	/* Restart any currently queued up io */
1713 	xdf_io_start(vdp);
1714 
1715 	mutex_exit(&vdp->xdf_cb_lk);
1716 }
1717 
1718 /*
1719  * synthetic geometry
1720  */
1721 #define	XDF_NSECTS	256
1722 #define	XDF_NHEADS	16
1723 
1724 static void
1725 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1726 {
1727 	xdf_t *vdp;
1728 	uint_t ncyl;
1729 
1730 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1731 
1732 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1733 
1734 	bzero(geomp, sizeof (*geomp));
1735 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1736 	geomp->g_acyl = 0;
1737 	geomp->g_nhead = XDF_NHEADS;
1738 	geomp->g_nsect = XDF_NSECTS;
1739 	geomp->g_secsize = XB_BSIZE;
1740 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1741 	geomp->g_intrlv = 0;
1742 	geomp->g_rpm = 7200;
1743 }
1744 
1745 /*
1746  * Finish other initialization after we've connected to backend
1747  * Status should be XD_INIT before calling this routine
1748  * On success, status should be changed to XD_CONNECTED.
1749  * On error, status should stay XD_INIT
1750  */
1751 static int
1752 xdf_setstate_connected(xdf_t *vdp)
1753 {
1754 	dev_info_t	*dip = vdp->xdf_dip;
1755 	cmlb_geom_t	pgeom;
1756 	diskaddr_t	nblocks = 0;
1757 	char		*oename, *xsname, *str;
1758 	uint_t		dinfo;
1759 
1760 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1761 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1762 	ASSERT(vdp->xdf_state == XD_INIT);
1763 
1764 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1765 	    ((oename = xvdi_get_oename(dip)) == NULL))
1766 		return (DDI_FAILURE);
1767 
1768 	/* Determine if feature barrier is supported by backend */
1769 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1770 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1771 		    vdp->xdf_addr);
1772 
1773 	/*
1774 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1775 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1776 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1777 	 * we always set VDISK_CDROM, regardless of if it's present in
1778 	 * the xenbus info parameter.
1779 	 */
1780 	if (xenbus_gather(XBT_NULL, oename,
1781 	    XBP_SECTORS, "%"SCNu64, &nblocks,
1782 	    XBP_INFO, "%u", &dinfo,
1783 	    NULL) != 0) {
1784 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1785 		    "cannot read backend info", vdp->xdf_addr);
1786 		return (DDI_FAILURE);
1787 	}
1788 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1789 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1790 		    vdp->xdf_addr);
1791 		return (DDI_FAILURE);
1792 	}
1793 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1794 		dinfo |= VDISK_CDROM;
1795 	strfree(str);
1796 
1797 	vdp->xdf_xdev_nblocks = nblocks;
1798 #ifdef _ILP32
1799 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1800 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1801 		    "backend disk device too large with %llu blocks for"
1802 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1803 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1804 		return (DDI_FAILURE);
1805 	}
1806 #endif
1807 
1808 	/*
1809 	 * If the physical geometry for a fixed disk has been explicity
1810 	 * set then make sure that the specified physical geometry isn't
1811 	 * larger than the device we connected to.
1812 	 */
1813 	if (vdp->xdf_pgeom_fixed &&
1814 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1815 		cmn_err(CE_WARN,
1816 		    "xdf@%s: connect failed, fixed geometry too large",
1817 		    vdp->xdf_addr);
1818 		return (DDI_FAILURE);
1819 	}
1820 
1821 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1822 
1823 	/* mark vbd is ready for I/O */
1824 	mutex_enter(&vdp->xdf_dev_lk);
1825 	xdf_set_state(vdp, XD_CONNECTED);
1826 
1827 	/* check if the cmlb label should be updated */
1828 	xdf_synthetic_pgeom(dip, &pgeom);
1829 	if ((vdp->xdf_dinfo != dinfo) ||
1830 	    (!vdp->xdf_pgeom_fixed &&
1831 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1832 		vdp->xdf_cmbl_reattach = B_TRUE;
1833 
1834 		vdp->xdf_dinfo = dinfo;
1835 		if (!vdp->xdf_pgeom_fixed)
1836 			vdp->xdf_pgeom = pgeom;
1837 	}
1838 
1839 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1840 		if (vdp->xdf_xdev_nblocks == 0) {
1841 			vdp->xdf_mstate = DKIO_EJECTED;
1842 			cv_broadcast(&vdp->xdf_mstate_cv);
1843 		} else {
1844 			vdp->xdf_mstate = DKIO_INSERTED;
1845 			cv_broadcast(&vdp->xdf_mstate_cv);
1846 		}
1847 	} else {
1848 		if (vdp->xdf_mstate != DKIO_NONE) {
1849 			vdp->xdf_mstate = DKIO_NONE;
1850 			cv_broadcast(&vdp->xdf_mstate_cv);
1851 		}
1852 	}
1853 
1854 	mutex_exit(&vdp->xdf_dev_lk);
1855 
1856 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1857 	    (uint64_t)vdp->xdf_xdev_nblocks);
1858 
1859 	/* Restart any currently queued up io */
1860 	xdf_io_start(vdp);
1861 
1862 	/*
1863 	 * To get to the ready state we have to do IO to the backend device,
1864 	 * but we can't initiate IO from the other end change callback thread
1865 	 * (which is the current context we're executing in.)  This is because
1866 	 * if the other end disconnects while we're doing IO from the callback
1867 	 * thread, then we can't recieve that disconnect event and we hang
1868 	 * waiting for an IO that can never complete.
1869 	 */
1870 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1871 	    DDI_SLEEP);
1872 
1873 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1874 	return (DDI_SUCCESS);
1875 }
1876 
1877 /*ARGSUSED*/
1878 static void
1879 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1880 {
1881 	XenbusState new_state = *(XenbusState *)impl_data;
1882 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1883 
1884 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1885 	    vdp->xdf_addr, new_state));
1886 
1887 	mutex_enter(&vdp->xdf_cb_lk);
1888 
1889 	/* We assume that this callback is single threaded */
1890 	ASSERT(vdp->xdf_oe_change_thread == NULL);
1891 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1892 
1893 	/* ignore any backend state changes if we're suspending/suspended */
1894 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1895 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1896 		mutex_exit(&vdp->xdf_cb_lk);
1897 		return;
1898 	}
1899 
1900 	switch (new_state) {
1901 	case XenbusStateUnknown:
1902 	case XenbusStateInitialising:
1903 	case XenbusStateInitWait:
1904 	case XenbusStateInitialised:
1905 		if (vdp->xdf_state == XD_INIT)
1906 			break;
1907 
1908 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1909 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1910 			break;
1911 		ASSERT(vdp->xdf_state == XD_INIT);
1912 		break;
1913 
1914 	case XenbusStateConnected:
1915 		if ((vdp->xdf_state == XD_CONNECTED) ||
1916 		    (vdp->xdf_state == XD_READY))
1917 			break;
1918 
1919 		if (vdp->xdf_state != XD_INIT) {
1920 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1921 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1922 				break;
1923 			ASSERT(vdp->xdf_state == XD_INIT);
1924 		}
1925 
1926 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1927 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1928 			break;
1929 		}
1930 		ASSERT(vdp->xdf_state == XD_CONNECTED);
1931 		break;
1932 
1933 	case XenbusStateClosing:
1934 		if (xdf_isopen(vdp, -1)) {
1935 			cmn_err(CE_NOTE,
1936 			    "xdf@%s: hot-unplug failed, still in use",
1937 			    vdp->xdf_addr);
1938 			break;
1939 		}
1940 		/*FALLTHROUGH*/
1941 	case XenbusStateClosed:
1942 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1943 		break;
1944 	}
1945 
1946 	/* notify anybody waiting for oe state change */
1947 	cv_broadcast(&vdp->xdf_dev_cv);
1948 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1949 	mutex_exit(&vdp->xdf_cb_lk);
1950 }
1951 
1952 static int
1953 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1954 {
1955 	int	rv;
1956 
1957 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1958 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1959 
1960 	/* we can't connect once we're in the closed state */
1961 	if (vdp->xdf_state == XD_CLOSED)
1962 		return (XD_CLOSED);
1963 
1964 	vdp->xdf_connect_req++;
1965 	while (vdp->xdf_state != XD_READY) {
1966 		mutex_exit(&vdp->xdf_dev_lk);
1967 		if (vdp->xdf_state == XD_UNKNOWN)
1968 			(void) xdf_setstate_init(vdp);
1969 		mutex_enter(&vdp->xdf_dev_lk);
1970 
1971 		if (!wait || (vdp->xdf_state == XD_READY))
1972 			goto out;
1973 
1974 		mutex_exit((&vdp->xdf_cb_lk));
1975 		rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
1976 		mutex_exit((&vdp->xdf_dev_lk));
1977 		mutex_enter((&vdp->xdf_cb_lk));
1978 		mutex_enter((&vdp->xdf_dev_lk));
1979 		if (rv == 0)
1980 			goto out;
1981 	}
1982 
1983 out:
1984 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1985 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1986 
1987 	/* Try to lock the media */
1988 	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
1989 
1990 	vdp->xdf_connect_req--;
1991 	return (vdp->xdf_state);
1992 }
1993 
1994 static uint_t
1995 xdf_iorestart(caddr_t arg)
1996 {
1997 	xdf_t *vdp = (xdf_t *)arg;
1998 
1999 	ASSERT(vdp != NULL);
2000 
2001 	mutex_enter(&vdp->xdf_dev_lk);
2002 	ASSERT(ISDMACBON(vdp));
2003 	SETDMACBOFF(vdp);
2004 	mutex_exit(&vdp->xdf_dev_lk);
2005 
2006 	xdf_io_start(vdp);
2007 
2008 	return (DDI_INTR_CLAIMED);
2009 }
2010 
2011 #if defined(XPV_HVM_DRIVER)
2012 
2013 typedef struct xdf_hvm_entry {
2014 	list_node_t	xdf_he_list;
2015 	char		*xdf_he_path;
2016 	dev_info_t	*xdf_he_dip;
2017 } xdf_hvm_entry_t;
2018 
2019 static list_t xdf_hvm_list;
2020 static kmutex_t xdf_hvm_list_lock;
2021 
2022 static xdf_hvm_entry_t *
2023 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2024 {
2025 	xdf_hvm_entry_t	*i;
2026 
2027 	ASSERT((path != NULL) || (dip != NULL));
2028 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2029 
2030 	i = list_head(&xdf_hvm_list);
2031 	while (i != NULL) {
2032 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2033 			i = list_next(&xdf_hvm_list, i);
2034 			continue;
2035 		}
2036 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2037 			i = list_next(&xdf_hvm_list, i);
2038 			continue;
2039 		}
2040 		break;
2041 	}
2042 	return (i);
2043 }
2044 
2045 dev_info_t *
2046 xdf_hvm_hold(const char *path)
2047 {
2048 	xdf_hvm_entry_t	*i;
2049 	dev_info_t	*dip;
2050 
2051 	mutex_enter(&xdf_hvm_list_lock);
2052 	i = i_xdf_hvm_find(path, NULL);
2053 	if (i == NULL) {
2054 		mutex_exit(&xdf_hvm_list_lock);
2055 		return (B_FALSE);
2056 	}
2057 	ndi_hold_devi(dip = i->xdf_he_dip);
2058 	mutex_exit(&xdf_hvm_list_lock);
2059 	return (dip);
2060 }
2061 
2062 static void
2063 xdf_hvm_add(dev_info_t *dip)
2064 {
2065 	xdf_hvm_entry_t	*i;
2066 	char		*path;
2067 
2068 	/* figure out the path for the dip */
2069 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2070 	(void) ddi_pathname(dip, path);
2071 
2072 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2073 	i->xdf_he_dip = dip;
2074 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2075 
2076 	mutex_enter(&xdf_hvm_list_lock);
2077 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2078 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2079 	list_insert_head(&xdf_hvm_list, i);
2080 	mutex_exit(&xdf_hvm_list_lock);
2081 
2082 	kmem_free(path, MAXPATHLEN);
2083 }
2084 
2085 static void
2086 xdf_hvm_rm(dev_info_t *dip)
2087 {
2088 	xdf_hvm_entry_t	*i;
2089 
2090 	mutex_enter(&xdf_hvm_list_lock);
2091 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2092 	list_remove(&xdf_hvm_list, i);
2093 	mutex_exit(&xdf_hvm_list_lock);
2094 
2095 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2096 	kmem_free(i, sizeof (*i));
2097 }
2098 
2099 static void
2100 xdf_hvm_init(void)
2101 {
2102 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2103 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2104 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2105 }
2106 
2107 static void
2108 xdf_hvm_fini(void)
2109 {
2110 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2111 	list_destroy(&xdf_hvm_list);
2112 	mutex_destroy(&xdf_hvm_list_lock);
2113 }
2114 
2115 boolean_t
2116 xdf_hvm_connect(dev_info_t *dip)
2117 {
2118 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2119 	char	*oename, *str;
2120 	int	rv;
2121 
2122 	mutex_enter(&vdp->xdf_cb_lk);
2123 	mutex_enter(&vdp->xdf_dev_lk);
2124 
2125 	/*
2126 	 * Before try to establish a connection we need to wait for the
2127 	 * backend hotplug scripts to have run.  Once they are run the
2128 	 * "<oename>/hotplug-status" property will be set to "connected".
2129 	 */
2130 	for (;;) {
2131 		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2132 		ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2133 
2134 		/*
2135 		 * Get the xenbus path to the backend device.  Note that
2136 		 * we can't cache this path (and we look it up on each pass
2137 		 * through this loop) because it could change during
2138 		 * suspend, resume, and migration operations.
2139 		 */
2140 		if ((oename = xvdi_get_oename(dip)) == NULL) {
2141 			mutex_exit(&vdp->xdf_dev_lk);
2142 			mutex_exit(&vdp->xdf_cb_lk);
2143 			return (B_FALSE);
2144 		}
2145 
2146 		str = NULL;
2147 		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2148 		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2149 			break;
2150 
2151 		if (str != NULL)
2152 			strfree(str);
2153 
2154 		/* wait for an update to "<oename>/hotplug-status" */
2155 		mutex_exit(&vdp->xdf_dev_lk);
2156 		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2157 			/* we got interrupted by a signal */
2158 			mutex_exit(&vdp->xdf_cb_lk);
2159 			return (B_FALSE);
2160 		}
2161 		mutex_enter(&vdp->xdf_dev_lk);
2162 	}
2163 
2164 	/* Good news.  The backend hotplug scripts have been run. */
2165 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2166 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2167 	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2168 	strfree(str);
2169 
2170 	/*
2171 	 * If we're emulating a cd device and if the backend doesn't support
2172 	 * media request opreations, then we're not going to bother trying
2173 	 * to establish a connection for a couple reasons.  First off, media
2174 	 * requests support is required to support operations like eject and
2175 	 * media locking.  Second, other backend platforms like Linux don't
2176 	 * support hvm pv cdrom access.  They don't even have a backend pv
2177 	 * driver for cdrom device nodes, so we don't want to block forever
2178 	 * waiting for a connection to a backend driver that doesn't exist.
2179 	 */
2180 	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2181 		mutex_exit(&vdp->xdf_dev_lk);
2182 		mutex_exit(&vdp->xdf_cb_lk);
2183 		return (B_FALSE);
2184 	}
2185 
2186 	rv = xdf_connect_locked(vdp, B_TRUE);
2187 	mutex_exit(&vdp->xdf_dev_lk);
2188 	mutex_exit(&vdp->xdf_cb_lk);
2189 
2190 	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2191 }
2192 
2193 int
2194 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2195 {
2196 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2197 
2198 	/* sanity check the requested physical geometry */
2199 	mutex_enter(&vdp->xdf_dev_lk);
2200 	if ((geomp->g_secsize != XB_BSIZE) ||
2201 	    (geomp->g_capacity == 0)) {
2202 		mutex_exit(&vdp->xdf_dev_lk);
2203 		return (EINVAL);
2204 	}
2205 
2206 	/*
2207 	 * If we've already connected to the backend device then make sure
2208 	 * we're not defining a physical geometry larger than our backend
2209 	 * device.
2210 	 */
2211 	if ((vdp->xdf_xdev_nblocks != 0) &&
2212 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2213 		mutex_exit(&vdp->xdf_dev_lk);
2214 		return (EINVAL);
2215 	}
2216 
2217 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2218 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2219 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2220 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2221 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2222 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2223 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2224 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2225 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2226 
2227 	vdp->xdf_pgeom_fixed = B_TRUE;
2228 	mutex_exit(&vdp->xdf_dev_lk);
2229 
2230 	/* force a re-validation */
2231 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2232 
2233 	return (0);
2234 }
2235 
2236 boolean_t
2237 xdf_is_cd(dev_info_t *dip)
2238 {
2239 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2240 	boolean_t	rv;
2241 
2242 	mutex_enter(&vdp->xdf_cb_lk);
2243 	rv = XD_IS_CD(vdp);
2244 	mutex_exit(&vdp->xdf_cb_lk);
2245 	return (rv);
2246 }
2247 
2248 boolean_t
2249 xdf_is_rm(dev_info_t *dip)
2250 {
2251 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2252 	boolean_t	rv;
2253 
2254 	mutex_enter(&vdp->xdf_cb_lk);
2255 	rv = XD_IS_RM(vdp);
2256 	mutex_exit(&vdp->xdf_cb_lk);
2257 	return (rv);
2258 }
2259 
2260 boolean_t
2261 xdf_media_req_supported(dev_info_t *dip)
2262 {
2263 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2264 	boolean_t	rv;
2265 
2266 	mutex_enter(&vdp->xdf_cb_lk);
2267 	rv = vdp->xdf_media_req_supported;
2268 	mutex_exit(&vdp->xdf_cb_lk);
2269 	return (rv);
2270 }
2271 
2272 #endif /* XPV_HVM_DRIVER */
2273 
2274 static int
2275 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2276 {
2277 	xdf_t *vdp;
2278 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2279 
2280 	if (vdp == NULL)
2281 		return (ENXIO);
2282 
2283 	mutex_enter(&vdp->xdf_dev_lk);
2284 	*capp = vdp->xdf_pgeom.g_capacity;
2285 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2286 	mutex_exit(&vdp->xdf_dev_lk);
2287 	return (0);
2288 }
2289 
2290 static int
2291 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2292 {
2293 	xdf_t *vdp;
2294 
2295 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2296 		return (ENXIO);
2297 	*geomp = vdp->xdf_pgeom;
2298 	return (0);
2299 }
2300 
2301 /*
2302  * No real HBA, no geometry available from it
2303  */
2304 /*ARGSUSED*/
2305 static int
2306 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2307 {
2308 	return (EINVAL);
2309 }
2310 
2311 static int
2312 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2313 {
2314 	xdf_t *vdp;
2315 
2316 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2317 		return (ENXIO);
2318 
2319 	if (XD_IS_RO(vdp))
2320 		tgattributep->media_is_writable = 0;
2321 	else
2322 		tgattributep->media_is_writable = 1;
2323 	return (0);
2324 }
2325 
2326 /* ARGSUSED3 */
2327 int
2328 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2329 {
2330 	switch (cmd) {
2331 	case TG_GETPHYGEOM:
2332 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2333 	case TG_GETVIRTGEOM:
2334 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2335 	case TG_GETCAPACITY:
2336 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2337 	case TG_GETBLOCKSIZE:
2338 		*(uint32_t *)arg = XB_BSIZE;
2339 		return (0);
2340 	case TG_GETATTR:
2341 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2342 	default:
2343 		return (ENOTTY);
2344 	}
2345 }
2346 
2347 /* ARGSUSED5 */
2348 int
2349 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2350     diskaddr_t start, size_t reqlen, void *tg_cookie)
2351 {
2352 	xdf_t *vdp;
2353 	struct buf *bp;
2354 	int err = 0;
2355 
2356 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2357 
2358 	/* We don't allow IO from the oe_change callback thread */
2359 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2360 
2361 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2362 		return (EINVAL);
2363 
2364 	bp = getrbuf(KM_SLEEP);
2365 	if (cmd == TG_READ)
2366 		bp->b_flags = B_BUSY | B_READ;
2367 	else
2368 		bp->b_flags = B_BUSY | B_WRITE;
2369 	bp->b_un.b_addr = bufp;
2370 	bp->b_bcount = reqlen;
2371 	bp->b_blkno = start;
2372 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2373 
2374 	mutex_enter(&vdp->xdf_dev_lk);
2375 	xdf_bp_push(vdp, bp);
2376 	mutex_exit(&vdp->xdf_dev_lk);
2377 	xdf_io_start(vdp);
2378 	if (curthread == vdp->xdf_ready_tq_thread)
2379 		(void) xdf_ring_drain(vdp);
2380 	err = biowait(bp);
2381 	ASSERT(bp->b_flags & B_DONE);
2382 	freerbuf(bp);
2383 	return (err);
2384 }
2385 
2386 /*
2387  * Lock the current media.  Set the media state to "lock".
2388  * (Media locks are only respected by the backend driver.)
2389  */
2390 static int
2391 xdf_ioctl_mlock(xdf_t *vdp)
2392 {
2393 	int rv;
2394 	mutex_enter(&vdp->xdf_cb_lk);
2395 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2396 	mutex_exit(&vdp->xdf_cb_lk);
2397 	return (rv);
2398 }
2399 
2400 /*
2401  * Release a media lock.  Set the media state to "none".
2402  */
2403 static int
2404 xdf_ioctl_munlock(xdf_t *vdp)
2405 {
2406 	int rv;
2407 	mutex_enter(&vdp->xdf_cb_lk);
2408 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2409 	mutex_exit(&vdp->xdf_cb_lk);
2410 	return (rv);
2411 }
2412 
2413 /*
2414  * Eject the current media.  Ignores any media locks.  (Media locks
2415  * are only for benifit of the the backend.)
2416  */
2417 static int
2418 xdf_ioctl_eject(xdf_t *vdp)
2419 {
2420 	int rv;
2421 
2422 	mutex_enter(&vdp->xdf_cb_lk);
2423 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2424 		mutex_exit(&vdp->xdf_cb_lk);
2425 		return (rv);
2426 	}
2427 
2428 	/*
2429 	 * We've set the media requests xenbus parameter to eject, so now
2430 	 * disconnect from the backend, wait for the backend to clear
2431 	 * the media requets xenbus paramter, and then we can reconnect
2432 	 * to the backend.
2433 	 */
2434 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2435 	mutex_enter(&vdp->xdf_dev_lk);
2436 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2437 		mutex_exit(&vdp->xdf_dev_lk);
2438 		mutex_exit(&vdp->xdf_cb_lk);
2439 		return (EIO);
2440 	}
2441 	mutex_exit(&vdp->xdf_dev_lk);
2442 	mutex_exit(&vdp->xdf_cb_lk);
2443 	return (0);
2444 }
2445 
2446 /*
2447  * Watch for media state changes.  This can be an insertion of a device
2448  * (triggered by a 'xm block-configure' request in another domain) or
2449  * the ejection of a device (triggered by a local "eject" operation).
2450  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2451  */
2452 static int
2453 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2454 {
2455 	enum dkio_state		prev_state;
2456 
2457 	mutex_enter(&vdp->xdf_cb_lk);
2458 	prev_state = vdp->xdf_mstate;
2459 
2460 	if (vdp->xdf_mstate == mstate) {
2461 		while (vdp->xdf_mstate == prev_state) {
2462 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2463 			    &vdp->xdf_cb_lk) == 0) {
2464 				mutex_exit(&vdp->xdf_cb_lk);
2465 				return (EINTR);
2466 			}
2467 		}
2468 	}
2469 
2470 	if ((prev_state != DKIO_INSERTED) &&
2471 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2472 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2473 		mutex_exit(&vdp->xdf_cb_lk);
2474 		return (0);
2475 	}
2476 
2477 	mutex_exit(&vdp->xdf_cb_lk);
2478 	return (0);
2479 }
2480 
2481 /*ARGSUSED*/
2482 static int
2483 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2484     int *rvalp)
2485 {
2486 	minor_t		minor = getminor(dev);
2487 	int		part = XDF_PART(minor);
2488 	xdf_t		*vdp;
2489 	int		rv;
2490 
2491 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2492 	    (!xdf_isopen(vdp, part)))
2493 		return (ENXIO);
2494 
2495 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2496 	    vdp->xdf_addr, cmd, cmd));
2497 
2498 	switch (cmd) {
2499 	default:
2500 		return (ENOTTY);
2501 	case DKIOCG_PHYGEOM:
2502 	case DKIOCG_VIRTGEOM:
2503 	case DKIOCGGEOM:
2504 	case DKIOCSGEOM:
2505 	case DKIOCGAPART:
2506 	case DKIOCSAPART:
2507 	case DKIOCGVTOC:
2508 	case DKIOCSVTOC:
2509 	case DKIOCPARTINFO:
2510 	case DKIOCGEXTVTOC:
2511 	case DKIOCSEXTVTOC:
2512 	case DKIOCEXTPARTINFO:
2513 	case DKIOCGMBOOT:
2514 	case DKIOCSMBOOT:
2515 	case DKIOCGETEFI:
2516 	case DKIOCSETEFI:
2517 	case DKIOCPARTITION:
2518 		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2519 		    rvalp, NULL));
2520 	case FDEJECT:
2521 	case DKIOCEJECT:
2522 	case CDROMEJECT:
2523 		return (xdf_ioctl_eject(vdp));
2524 	case DKIOCLOCK:
2525 		return (xdf_ioctl_mlock(vdp));
2526 	case DKIOCUNLOCK:
2527 		return (xdf_ioctl_munlock(vdp));
2528 	case CDROMREADOFFSET: {
2529 		int offset = 0;
2530 		if (!XD_IS_CD(vdp))
2531 			return (ENOTTY);
2532 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2533 			return (EFAULT);
2534 		return (0);
2535 	}
2536 	case DKIOCGMEDIAINFO: {
2537 		struct dk_minfo media_info;
2538 
2539 		media_info.dki_lbsize = DEV_BSIZE;
2540 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2541 		if (XD_IS_CD(vdp))
2542 			media_info.dki_media_type = DK_CDROM;
2543 		else
2544 			media_info.dki_media_type = DK_FIXED_DISK;
2545 
2546 		if (ddi_copyout(&media_info, (void *)arg,
2547 		    sizeof (struct dk_minfo), mode))
2548 			return (EFAULT);
2549 		return (0);
2550 	}
2551 	case DKIOCINFO: {
2552 		struct dk_cinfo info;
2553 
2554 		/* controller information */
2555 		if (XD_IS_CD(vdp))
2556 			info.dki_ctype = DKC_CDROM;
2557 		else
2558 			info.dki_ctype = DKC_VBD;
2559 
2560 		info.dki_cnum = 0;
2561 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2562 
2563 		/* unit information */
2564 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2565 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2566 		info.dki_flags = DKI_FMTVOL;
2567 		info.dki_partition = part;
2568 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2569 		info.dki_addr = 0;
2570 		info.dki_space = 0;
2571 		info.dki_prio = 0;
2572 		info.dki_vec = 0;
2573 
2574 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2575 			return (EFAULT);
2576 		return (0);
2577 	}
2578 	case DKIOCSTATE: {
2579 		enum dkio_state mstate;
2580 
2581 		if (ddi_copyin((void *)arg, &mstate,
2582 		    sizeof (mstate), mode) != 0)
2583 			return (EFAULT);
2584 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2585 			return (rv);
2586 		mstate = vdp->xdf_mstate;
2587 		if (ddi_copyout(&mstate, (void *)arg,
2588 		    sizeof (mstate), mode) != 0)
2589 			return (EFAULT);
2590 		return (0);
2591 	}
2592 	case DKIOCREMOVABLE: {
2593 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2594 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2595 			return (EFAULT);
2596 		return (0);
2597 	}
2598 	case DKIOCGETWCE: {
2599 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2600 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2601 			return (EFAULT);
2602 		return (0);
2603 	}
2604 	case DKIOCSETWCE: {
2605 		int i;
2606 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2607 			return (EFAULT);
2608 		vdp->xdf_wce = VOID2BOOLEAN(i);
2609 		return (0);
2610 	}
2611 	case DKIOCFLUSHWRITECACHE: {
2612 		struct dk_callback *dkc = (struct dk_callback *)arg;
2613 
2614 		if (vdp->xdf_flush_supported) {
2615 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2616 			    NULL, 0, 0, (void *)dev);
2617 		} else if (vdp->xdf_feature_barrier &&
2618 		    !xdf_barrier_flush_disable) {
2619 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2620 			    vdp->xdf_cache_flush_block, xdf_flush_block,
2621 			    DEV_BSIZE, (void *)dev);
2622 		} else {
2623 			return (ENOTTY);
2624 		}
2625 		if ((mode & FKIOCTL) && (dkc != NULL) &&
2626 		    (dkc->dkc_callback != NULL)) {
2627 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2628 			/* need to return 0 after calling callback */
2629 			rv = 0;
2630 		}
2631 		return (rv);
2632 	}
2633 	}
2634 	/*NOTREACHED*/
2635 }
2636 
2637 static int
2638 xdf_strategy(struct buf *bp)
2639 {
2640 	xdf_t	*vdp;
2641 	minor_t minor;
2642 	diskaddr_t p_blkct, p_blkst;
2643 	ulong_t nblks;
2644 	int part;
2645 
2646 	minor = getminor(bp->b_edev);
2647 	part = XDF_PART(minor);
2648 	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2649 
2650 	mutex_enter(&vdp->xdf_dev_lk);
2651 	if (!xdf_isopen(vdp, part)) {
2652 		mutex_exit(&vdp->xdf_dev_lk);
2653 		xdf_io_err(bp, ENXIO, 0);
2654 		return (0);
2655 	}
2656 
2657 	/* We don't allow IO from the oe_change callback thread */
2658 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2659 
2660 	/* Check for writes to a read only device */
2661 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2662 		mutex_exit(&vdp->xdf_dev_lk);
2663 		xdf_io_err(bp, EROFS, 0);
2664 		return (0);
2665 	}
2666 
2667 	/* Check if this I/O is accessing a partition or the entire disk */
2668 	if ((long)bp->b_private == XB_SLICE_NONE) {
2669 		/* This I/O is using an absolute offset */
2670 		p_blkct = vdp->xdf_xdev_nblocks;
2671 		p_blkst = 0;
2672 	} else {
2673 		/* This I/O is using a partition relative offset */
2674 		mutex_exit(&vdp->xdf_dev_lk);
2675 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2676 		    &p_blkst, NULL, NULL, NULL)) {
2677 			xdf_io_err(bp, ENXIO, 0);
2678 			return (0);
2679 		}
2680 		mutex_enter(&vdp->xdf_dev_lk);
2681 	}
2682 
2683 	/* check for a starting block beyond the disk or partition limit */
2684 	if (bp->b_blkno > p_blkct) {
2685 		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2686 		    vdp->xdf_addr, (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
2687 		xdf_io_err(bp, EINVAL, 0);
2688 		return (0);
2689 	}
2690 
2691 	/* Legacy: don't set error flag at this case */
2692 	if (bp->b_blkno == p_blkct) {
2693 		bp->b_resid = bp->b_bcount;
2694 		biodone(bp);
2695 		return (0);
2696 	}
2697 
2698 	/* sanitize the input buf */
2699 	bioerror(bp, 0);
2700 	bp->b_resid = 0;
2701 	bp->av_back = bp->av_forw = NULL;
2702 
2703 	/* Adjust for partial transfer, this will result in an error later */
2704 	nblks = bp->b_bcount >> XB_BSHIFT;
2705 	if ((bp->b_blkno + nblks) > p_blkct) {
2706 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
2707 		bp->b_bcount -= bp->b_resid;
2708 	}
2709 
2710 	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2711 	    vdp->xdf_addr, (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
2712 
2713 	/* Fix up the buf struct */
2714 	bp->b_flags |= B_BUSY;
2715 	bp->b_private = (void *)(uintptr_t)p_blkst;
2716 
2717 	xdf_bp_push(vdp, bp);
2718 	mutex_exit(&vdp->xdf_dev_lk);
2719 	xdf_io_start(vdp);
2720 	if (do_polled_io)
2721 		(void) xdf_ring_drain(vdp);
2722 	return (0);
2723 }
2724 
2725 /*ARGSUSED*/
2726 static int
2727 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2728 {
2729 	xdf_t	*vdp;
2730 	minor_t minor;
2731 	diskaddr_t p_blkcnt;
2732 	int part;
2733 
2734 	minor = getminor(dev);
2735 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2736 		return (ENXIO);
2737 
2738 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2739 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2740 
2741 	part = XDF_PART(minor);
2742 	if (!xdf_isopen(vdp, part))
2743 		return (ENXIO);
2744 
2745 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2746 	    NULL, NULL, NULL, NULL))
2747 		return (ENXIO);
2748 
2749 	if (U_INVAL(uiop))
2750 		return (EINVAL);
2751 
2752 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2753 }
2754 
2755 /*ARGSUSED*/
2756 static int
2757 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2758 {
2759 	xdf_t *vdp;
2760 	minor_t minor;
2761 	diskaddr_t p_blkcnt;
2762 	int part;
2763 
2764 	minor = getminor(dev);
2765 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2766 		return (ENXIO);
2767 
2768 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2769 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2770 
2771 	part = XDF_PART(minor);
2772 	if (!xdf_isopen(vdp, part))
2773 		return (ENXIO);
2774 
2775 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2776 	    NULL, NULL, NULL, NULL))
2777 		return (ENXIO);
2778 
2779 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2780 		return (ENOSPC);
2781 
2782 	if (U_INVAL(uiop))
2783 		return (EINVAL);
2784 
2785 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2786 }
2787 
2788 /*ARGSUSED*/
2789 static int
2790 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2791 {
2792 	xdf_t	*vdp;
2793 	minor_t minor;
2794 	struct uio *uiop = aiop->aio_uio;
2795 	diskaddr_t p_blkcnt;
2796 	int part;
2797 
2798 	minor = getminor(dev);
2799 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2800 		return (ENXIO);
2801 
2802 	part = XDF_PART(minor);
2803 	if (!xdf_isopen(vdp, part))
2804 		return (ENXIO);
2805 
2806 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2807 	    NULL, NULL, NULL, NULL))
2808 		return (ENXIO);
2809 
2810 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2811 		return (ENOSPC);
2812 
2813 	if (U_INVAL(uiop))
2814 		return (EINVAL);
2815 
2816 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2817 }
2818 
2819 /*ARGSUSED*/
2820 static int
2821 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2822 {
2823 	xdf_t *vdp;
2824 	minor_t minor;
2825 	struct uio *uiop = aiop->aio_uio;
2826 	diskaddr_t p_blkcnt;
2827 	int part;
2828 
2829 	minor = getminor(dev);
2830 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2831 		return (ENXIO);
2832 
2833 	part = XDF_PART(minor);
2834 	if (!xdf_isopen(vdp, part))
2835 		return (ENXIO);
2836 
2837 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2838 	    NULL, NULL, NULL, NULL))
2839 		return (ENXIO);
2840 
2841 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2842 		return (ENOSPC);
2843 
2844 	if (U_INVAL(uiop))
2845 		return (EINVAL);
2846 
2847 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2848 }
2849 
2850 static int
2851 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2852 {
2853 	struct buf dumpbuf, *dbp = &dumpbuf;
2854 	xdf_t	*vdp;
2855 	minor_t minor;
2856 	int err = 0;
2857 	int part;
2858 	diskaddr_t p_blkcnt, p_blkst;
2859 
2860 	minor = getminor(dev);
2861 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2862 		return (ENXIO);
2863 
2864 	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2865 	    vdp->xdf_addr, (void *)addr, blkno, nblk));
2866 
2867 	/* We don't allow IO from the oe_change callback thread */
2868 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2869 
2870 	part = XDF_PART(minor);
2871 	if (!xdf_isopen(vdp, part))
2872 		return (ENXIO);
2873 
2874 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
2875 	    NULL, NULL, NULL))
2876 		return (ENXIO);
2877 
2878 	if ((blkno + nblk) > p_blkcnt) {
2879 		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
2880 		    vdp->xdf_addr, blkno + nblk, (uint64_t)p_blkcnt);
2881 		return (EINVAL);
2882 	}
2883 
2884 	bioinit(dbp);
2885 	dbp->b_flags = B_BUSY;
2886 	dbp->b_un.b_addr = addr;
2887 	dbp->b_bcount = nblk << DEV_BSHIFT;
2888 	dbp->b_blkno = blkno;
2889 	dbp->b_edev = dev;
2890 	dbp->b_private = (void *)(uintptr_t)p_blkst;
2891 
2892 	mutex_enter(&vdp->xdf_dev_lk);
2893 	xdf_bp_push(vdp, dbp);
2894 	mutex_exit(&vdp->xdf_dev_lk);
2895 	xdf_io_start(vdp);
2896 	err = xdf_ring_drain(vdp);
2897 	biofini(dbp);
2898 	return (err);
2899 }
2900 
2901 /*ARGSUSED*/
2902 static int
2903 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
2904 {
2905 	minor_t	minor;
2906 	xdf_t	*vdp;
2907 	int part;
2908 	ulong_t parbit;
2909 
2910 	minor = getminor(dev);
2911 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2912 		return (ENXIO);
2913 
2914 	mutex_enter(&vdp->xdf_dev_lk);
2915 	part = XDF_PART(minor);
2916 	if (!xdf_isopen(vdp, part)) {
2917 		mutex_exit(&vdp->xdf_dev_lk);
2918 		return (ENXIO);
2919 	}
2920 	parbit = 1 << part;
2921 
2922 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
2923 	if (otyp == OTYP_LYR) {
2924 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
2925 		if (--vdp->xdf_vd_lyropen[part] == 0)
2926 			vdp->xdf_vd_open[otyp] &= ~parbit;
2927 	} else {
2928 		vdp->xdf_vd_open[otyp] &= ~parbit;
2929 	}
2930 	vdp->xdf_vd_exclopen &= ~parbit;
2931 
2932 	mutex_exit(&vdp->xdf_dev_lk);
2933 	return (0);
2934 }
2935 
2936 static int
2937 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2938 {
2939 	minor_t	minor;
2940 	xdf_t	*vdp;
2941 	int part;
2942 	ulong_t parbit;
2943 	diskaddr_t p_blkct = 0;
2944 	boolean_t firstopen;
2945 	boolean_t nodelay;
2946 
2947 	minor = getminor(*devp);
2948 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2949 		return (ENXIO);
2950 
2951 	nodelay = (flag & (FNDELAY | FNONBLOCK));
2952 
2953 	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
2954 
2955 	/* do cv_wait until connected or failed */
2956 	mutex_enter(&vdp->xdf_cb_lk);
2957 	mutex_enter(&vdp->xdf_dev_lk);
2958 	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
2959 		mutex_exit(&vdp->xdf_dev_lk);
2960 		mutex_exit(&vdp->xdf_cb_lk);
2961 		return (ENXIO);
2962 	}
2963 	mutex_exit(&vdp->xdf_cb_lk);
2964 
2965 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
2966 		mutex_exit(&vdp->xdf_dev_lk);
2967 		return (EROFS);
2968 	}
2969 
2970 	part = XDF_PART(minor);
2971 	parbit = 1 << part;
2972 	if ((vdp->xdf_vd_exclopen & parbit) ||
2973 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
2974 		mutex_exit(&vdp->xdf_dev_lk);
2975 		return (EBUSY);
2976 	}
2977 
2978 	/* are we the first one to open this node? */
2979 	firstopen = !xdf_isopen(vdp, -1);
2980 
2981 	if (otyp == OTYP_LYR)
2982 		vdp->xdf_vd_lyropen[part]++;
2983 
2984 	vdp->xdf_vd_open[otyp] |= parbit;
2985 
2986 	if (flag & FEXCL)
2987 		vdp->xdf_vd_exclopen |= parbit;
2988 
2989 	mutex_exit(&vdp->xdf_dev_lk);
2990 
2991 	/* force a re-validation */
2992 	if (firstopen)
2993 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2994 
2995 	/* If this is a non-blocking open then we're done */
2996 	if (nodelay)
2997 		return (0);
2998 
2999 	/*
3000 	 * This is a blocking open, so we require:
3001 	 * - that the disk have a valid label on it
3002 	 * - that the size of the partition that we're opening is non-zero
3003 	 */
3004 	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3005 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3006 		(void) xdf_close(*devp, flag, otyp, credp);
3007 		return (ENXIO);
3008 	}
3009 
3010 	return (0);
3011 }
3012 
3013 /*ARGSUSED*/
3014 static void
3015 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3016 {
3017 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3018 	cv_broadcast(&vdp->xdf_hp_status_cv);
3019 }
3020 
3021 static int
3022 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3023 	char *name, caddr_t valuep, int *lengthp)
3024 {
3025 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3026 
3027 	/*
3028 	 * Sanity check that if a dev_t or dip were specified that they
3029 	 * correspond to this device driver.  On debug kernels we'll
3030 	 * panic and on non-debug kernels we'll return failure.
3031 	 */
3032 	ASSERT(ddi_driver_major(dip) == xdf_major);
3033 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3034 	if ((ddi_driver_major(dip) != xdf_major) ||
3035 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3036 		return (DDI_PROP_NOT_FOUND);
3037 
3038 	if (vdp == NULL)
3039 		return (ddi_prop_op(dev, dip, prop_op, flags,
3040 		    name, valuep, lengthp));
3041 
3042 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3043 	    dev, dip, prop_op, flags, name, valuep, lengthp,
3044 	    XDF_PART(getminor(dev)), NULL));
3045 }
3046 
3047 /*ARGSUSED*/
3048 static int
3049 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3050 {
3051 	int	instance = XDF_INST(getminor((dev_t)arg));
3052 	xdf_t	*vbdp;
3053 
3054 	switch (cmd) {
3055 	case DDI_INFO_DEVT2DEVINFO:
3056 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3057 			*rp = NULL;
3058 			return (DDI_FAILURE);
3059 		}
3060 		*rp = vbdp->xdf_dip;
3061 		return (DDI_SUCCESS);
3062 
3063 	case DDI_INFO_DEVT2INSTANCE:
3064 		*rp = (void *)(uintptr_t)instance;
3065 		return (DDI_SUCCESS);
3066 
3067 	default:
3068 		return (DDI_FAILURE);
3069 	}
3070 }
3071 
3072 /*ARGSUSED*/
3073 static int
3074 xdf_resume(dev_info_t *dip)
3075 {
3076 	xdf_t	*vdp;
3077 	char	*oename;
3078 
3079 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3080 		goto err;
3081 
3082 	if (xdf_debug & SUSRES_DBG)
3083 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3084 
3085 	mutex_enter(&vdp->xdf_cb_lk);
3086 
3087 	if (xvdi_resume(dip) != DDI_SUCCESS) {
3088 		mutex_exit(&vdp->xdf_cb_lk);
3089 		goto err;
3090 	}
3091 
3092 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3093 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3094 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3095 		mutex_exit(&vdp->xdf_cb_lk);
3096 		goto err;
3097 	}
3098 
3099 	mutex_enter(&vdp->xdf_dev_lk);
3100 	ASSERT(vdp->xdf_state != XD_READY);
3101 	xdf_set_state(vdp, XD_UNKNOWN);
3102 	mutex_exit(&vdp->xdf_dev_lk);
3103 
3104 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3105 		mutex_exit(&vdp->xdf_cb_lk);
3106 		goto err;
3107 	}
3108 
3109 	mutex_exit(&vdp->xdf_cb_lk);
3110 
3111 	if (xdf_debug & SUSRES_DBG)
3112 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3113 	return (DDI_SUCCESS);
3114 err:
3115 	if (xdf_debug & SUSRES_DBG)
3116 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3117 	return (DDI_FAILURE);
3118 }
3119 
3120 static int
3121 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3122 {
3123 	int			n, instance = ddi_get_instance(dip);
3124 	ddi_iblock_cookie_t	ibc, softibc;
3125 	boolean_t		dev_iscd = B_FALSE;
3126 	xdf_t			*vdp;
3127 	char			*oename, *xsname, *str;
3128 
3129 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3130 	    "xdf_debug", 0)) != 0)
3131 		xdf_debug = n;
3132 
3133 	switch (cmd) {
3134 	case DDI_RESUME:
3135 		return (xdf_resume(dip));
3136 	case DDI_ATTACH:
3137 		break;
3138 	default:
3139 		return (DDI_FAILURE);
3140 	}
3141 	/* DDI_ATTACH */
3142 
3143 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
3144 	    ((oename = xvdi_get_oename(dip)) == NULL))
3145 		return (DDI_FAILURE);
3146 
3147 	/*
3148 	 * Disable auto-detach.  This is necessary so that we don't get
3149 	 * detached while we're disconnected from the back end.
3150 	 */
3151 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3152 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3153 		return (DDI_FAILURE);
3154 
3155 	/* driver handles kernel-issued IOCTLs */
3156 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3157 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3158 		return (DDI_FAILURE);
3159 
3160 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3161 		return (DDI_FAILURE);
3162 
3163 	if (ddi_get_soft_iblock_cookie(dip,
3164 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3165 		return (DDI_FAILURE);
3166 
3167 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3168 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3169 		    ddi_get_name_addr(dip));
3170 		return (DDI_FAILURE);
3171 	}
3172 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3173 		dev_iscd = B_TRUE;
3174 	strfree(str);
3175 
3176 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3177 		return (DDI_FAILURE);
3178 
3179 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3180 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3181 	ddi_set_driver_private(dip, vdp);
3182 	vdp->xdf_dip = dip;
3183 	vdp->xdf_addr = ddi_get_name_addr(dip);
3184 	vdp->xdf_suspending = B_FALSE;
3185 	vdp->xdf_media_req_supported = B_FALSE;
3186 	vdp->xdf_peer = INVALID_DOMID;
3187 	vdp->xdf_evtchn = INVALID_EVTCHN;
3188 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3189 	    offsetof(v_req_t, v_link));
3190 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3191 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3192 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3193 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3194 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3195 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3196 	vdp->xdf_cmbl_reattach = B_TRUE;
3197 	if (dev_iscd) {
3198 		vdp->xdf_dinfo |= VDISK_CDROM;
3199 		vdp->xdf_mstate = DKIO_EJECTED;
3200 	} else {
3201 		vdp->xdf_mstate = DKIO_NONE;
3202 	}
3203 
3204 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3205 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3206 		goto errout0;
3207 
3208 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3209 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3210 		goto errout0;
3211 
3212 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3213 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3214 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3215 		    ddi_get_name_addr(dip));
3216 		goto errout0;
3217 	}
3218 
3219 	/*
3220 	 * Initialize the physical geometry stucture.  Note that currently
3221 	 * we don't know the size of the backend device so the number
3222 	 * of blocks on the device will be initialized to zero.  Once
3223 	 * we connect to the backend device we'll update the physical
3224 	 * geometry to reflect the real size of the device.
3225 	 */
3226 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3227 	vdp->xdf_pgeom_fixed = B_FALSE;
3228 
3229 	/*
3230 	 * create default device minor nodes: non-removable disk
3231 	 * we will adjust minor nodes after we are connected w/ backend
3232 	 */
3233 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3234 	if (xdf_cmlb_attach(vdp) != 0) {
3235 		cmn_err(CE_WARN,
3236 		    "xdf@%s: attach failed, cmlb attach failed",
3237 		    ddi_get_name_addr(dip));
3238 		goto errout0;
3239 	}
3240 
3241 	/*
3242 	 * We ship with cache-enabled disks
3243 	 */
3244 	vdp->xdf_wce = B_TRUE;
3245 
3246 	mutex_enter(&vdp->xdf_cb_lk);
3247 	/* Watch backend XenbusState change */
3248 	if (xvdi_add_event_handler(dip,
3249 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3250 		mutex_exit(&vdp->xdf_cb_lk);
3251 		goto errout0;
3252 	}
3253 
3254 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3255 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3256 		    ddi_get_name_addr(dip));
3257 		mutex_exit(&vdp->xdf_cb_lk);
3258 		goto errout1;
3259 	}
3260 	mutex_exit(&vdp->xdf_cb_lk);
3261 
3262 #if defined(XPV_HVM_DRIVER)
3263 
3264 	xdf_hvm_add(dip);
3265 
3266 	/* Report our version to dom0.  */
3267 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
3268 	    HVMPV_XDF_VERS))
3269 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
3270 
3271 #else /* !XPV_HVM_DRIVER */
3272 
3273 	/* create kstat for iostat(1M) */
3274 	if (xdf_kstat_create(dip, "xdf", instance) != 0) {
3275 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3276 		    ddi_get_name_addr(dip));
3277 		goto errout1;
3278 	}
3279 
3280 #endif /* !XPV_HVM_DRIVER */
3281 
3282 	ddi_report_dev(dip);
3283 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3284 	return (DDI_SUCCESS);
3285 
3286 errout1:
3287 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3288 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3289 errout0:
3290 	if (vdp->xdf_vd_lbl != NULL) {
3291 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
3292 		cmlb_free_handle(&vdp->xdf_vd_lbl);
3293 		vdp->xdf_vd_lbl = NULL;
3294 	}
3295 	if (vdp->xdf_softintr_id != NULL)
3296 		ddi_remove_softintr(vdp->xdf_softintr_id);
3297 	xvdi_remove_xb_watch_handlers(dip);
3298 	if (vdp->xdf_ready_tq != NULL)
3299 		ddi_taskq_destroy(vdp->xdf_ready_tq);
3300 	mutex_destroy(&vdp->xdf_cb_lk);
3301 	mutex_destroy(&vdp->xdf_dev_lk);
3302 	cv_destroy(&vdp->xdf_dev_cv);
3303 	cv_destroy(&vdp->xdf_hp_status_cv);
3304 	ddi_soft_state_free(xdf_ssp, instance);
3305 	ddi_set_driver_private(dip, NULL);
3306 	ddi_prop_remove_all(dip);
3307 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3308 	return (DDI_FAILURE);
3309 }
3310 
3311 static int
3312 xdf_suspend(dev_info_t *dip)
3313 {
3314 	int		instance = ddi_get_instance(dip);
3315 	xdf_t		*vdp;
3316 
3317 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3318 		return (DDI_FAILURE);
3319 
3320 	if (xdf_debug & SUSRES_DBG)
3321 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3322 
3323 	xvdi_suspend(dip);
3324 
3325 	mutex_enter(&vdp->xdf_cb_lk);
3326 	mutex_enter(&vdp->xdf_dev_lk);
3327 
3328 	vdp->xdf_suspending = B_TRUE;
3329 	xdf_ring_destroy(vdp);
3330 	xdf_set_state(vdp, XD_SUSPEND);
3331 	vdp->xdf_suspending = B_FALSE;
3332 
3333 	mutex_exit(&vdp->xdf_dev_lk);
3334 	mutex_exit(&vdp->xdf_cb_lk);
3335 
3336 	if (xdf_debug & SUSRES_DBG)
3337 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3338 
3339 	return (DDI_SUCCESS);
3340 }
3341 
3342 static int
3343 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3344 {
3345 	xdf_t *vdp;
3346 	int instance;
3347 
3348 	switch (cmd) {
3349 
3350 	case DDI_PM_SUSPEND:
3351 		break;
3352 
3353 	case DDI_SUSPEND:
3354 		return (xdf_suspend(dip));
3355 
3356 	case DDI_DETACH:
3357 		break;
3358 
3359 	default:
3360 		return (DDI_FAILURE);
3361 	}
3362 
3363 	instance = ddi_get_instance(dip);
3364 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3365 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3366 
3367 	if (vdp == NULL)
3368 		return (DDI_FAILURE);
3369 
3370 	mutex_enter(&vdp->xdf_cb_lk);
3371 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3372 	if (vdp->xdf_state != XD_CLOSED) {
3373 		mutex_exit(&vdp->xdf_cb_lk);
3374 		return (DDI_FAILURE);
3375 	}
3376 	mutex_exit(&vdp->xdf_cb_lk);
3377 
3378 	ASSERT(!ISDMACBON(vdp));
3379 
3380 #if defined(XPV_HVM_DRIVER)
3381 	xdf_hvm_rm(dip);
3382 #endif /* XPV_HVM_DRIVER */
3383 
3384 	if (vdp->xdf_timeout_id != 0)
3385 		(void) untimeout(vdp->xdf_timeout_id);
3386 
3387 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3388 	ddi_taskq_destroy(vdp->xdf_ready_tq);
3389 
3390 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3391 	cmlb_free_handle(&vdp->xdf_vd_lbl);
3392 
3393 	/* we'll support backend running in domU later */
3394 #ifdef	DOMU_BACKEND
3395 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3396 #endif
3397 
3398 	list_destroy(&vdp->xdf_vreq_act);
3399 	ddi_prop_remove_all(dip);
3400 	xdf_kstat_delete(dip);
3401 	ddi_remove_softintr(vdp->xdf_softintr_id);
3402 	xvdi_remove_xb_watch_handlers(dip);
3403 	ddi_set_driver_private(dip, NULL);
3404 	cv_destroy(&vdp->xdf_dev_cv);
3405 	mutex_destroy(&vdp->xdf_cb_lk);
3406 	mutex_destroy(&vdp->xdf_dev_lk);
3407 	if (vdp->xdf_cache_flush_block != NULL)
3408 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
3409 	ddi_soft_state_free(xdf_ssp, instance);
3410 	return (DDI_SUCCESS);
3411 }
3412 
3413 /*
3414  * Driver linkage structures.
3415  */
3416 static struct cb_ops xdf_cbops = {
3417 	xdf_open,
3418 	xdf_close,
3419 	xdf_strategy,
3420 	nodev,
3421 	xdf_dump,
3422 	xdf_read,
3423 	xdf_write,
3424 	xdf_ioctl,
3425 	nodev,
3426 	nodev,
3427 	nodev,
3428 	nochpoll,
3429 	xdf_prop_op,
3430 	NULL,
3431 	D_MP | D_NEW | D_64BIT,
3432 	CB_REV,
3433 	xdf_aread,
3434 	xdf_awrite
3435 };
3436 
3437 struct dev_ops xdf_devops = {
3438 	DEVO_REV,		/* devo_rev */
3439 	0,			/* devo_refcnt */
3440 	xdf_getinfo,		/* devo_getinfo */
3441 	nulldev,		/* devo_identify */
3442 	nulldev,		/* devo_probe */
3443 	xdf_attach,		/* devo_attach */
3444 	xdf_detach,		/* devo_detach */
3445 	nodev,			/* devo_reset */
3446 	&xdf_cbops,		/* devo_cb_ops */
3447 	NULL,			/* devo_bus_ops */
3448 	NULL,			/* devo_power */
3449 	ddi_quiesce_not_supported, /* devo_quiesce */
3450 };
3451 
3452 /*
3453  * Module linkage structures.
3454  */
3455 static struct modldrv modldrv = {
3456 	&mod_driverops,		/* Type of module.  This one is a driver */
3457 	"virtual block driver",	/* short description */
3458 	&xdf_devops		/* driver specific ops */
3459 };
3460 
3461 static struct modlinkage xdf_modlinkage = {
3462 	MODREV_1, (void *)&modldrv, NULL
3463 };
3464 
3465 /*
3466  * standard module entry points
3467  */
3468 int
3469 _init(void)
3470 {
3471 	int rc;
3472 
3473 	xdf_major = ddi_name_to_major("xdf");
3474 	if (xdf_major == (major_t)-1)
3475 		return (EINVAL);
3476 
3477 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3478 		return (rc);
3479 
3480 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3481 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3482 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3483 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3484 
3485 #if defined(XPV_HVM_DRIVER)
3486 	xdf_hvm_init();
3487 #endif /* XPV_HVM_DRIVER */
3488 
3489 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3490 #if defined(XPV_HVM_DRIVER)
3491 		xdf_hvm_fini();
3492 #endif /* XPV_HVM_DRIVER */
3493 		kmem_cache_destroy(xdf_vreq_cache);
3494 		kmem_cache_destroy(xdf_gs_cache);
3495 		ddi_soft_state_fini(&xdf_ssp);
3496 		return (rc);
3497 	}
3498 
3499 	return (rc);
3500 }
3501 
3502 int
3503 _fini(void)
3504 {
3505 
3506 	int err;
3507 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3508 		return (err);
3509 
3510 #if defined(XPV_HVM_DRIVER)
3511 	xdf_hvm_fini();
3512 #endif /* XPV_HVM_DRIVER */
3513 
3514 	kmem_cache_destroy(xdf_vreq_cache);
3515 	kmem_cache_destroy(xdf_gs_cache);
3516 	ddi_soft_state_fini(&xdf_ssp);
3517 
3518 	return (0);
3519 }
3520 
3521 int
3522 _info(struct modinfo *modinfop)
3523 {
3524 	return (mod_info(&xdf_modlinkage, modinfop));
3525 }
3526