xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision bd0ce624be4492bab2f6c53383a40618647aba28)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2017 Nexenta Systems, Inc.
30  */
31 
32 /*
33  * xdf.c - Xen Virtual Block Device Driver
34  * TODO:
35  *	- support alternate block size (currently only DEV_BSIZE supported)
36  *	- revalidate geometry for removable devices
37  *
38  * This driver exports disk device nodes, accepts IO requests from those
39  * nodes, and services those requests by talking to a backend device
40  * in another domain.
41  *
42  * Communication with the backend device is done via a ringbuffer (which is
43  * managed via xvdi interfaces) and dma memory (which is managed via ddi
44  * interfaces).
45  *
46  * Communication with the backend device is dependent upon establishing a
47  * connection to the backend device.  This connection process involves
48  * reading device configuration information from xenbus and publishing
49  * some frontend runtime configuration parameters via the xenbus (for
50  * consumption by the backend).  Once we've published runtime configuration
51  * information via the xenbus, the backend device can enter the connected
52  * state and we'll enter the XD_CONNECTED state.  But before we can allow
53  * random IO to begin, we need to do IO to the backend device to determine
54  * the device label and if flush operations are supported.  Once this is
55  * done we enter the XD_READY state and can process any IO operations.
56  *
57  * We receive notifications of xenbus state changes for the backend device
58  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
59  * is single threaded, meaning that we can't receive new notification of
60  * other end state changes while we're processing an outstanding
61  * notification of an other end state change.  Therefore we can't do any
62  * blocking operations from the xdf_oe_change() callback.  This is why we
63  * have a separate taskq (xdf_ready_tq) which exists to do the necessary
64  * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
65  * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
66  * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
67  * generated by the xdf_ready_tq_thread thread have priority over all
68  * other IO requests.
69  *
70  * We also communicate with the backend device via the xenbus "media-req"
71  * (XBP_MEDIA_REQ) property.  For more information on this see the
72  * comments in blkif.h.
73  */
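
/*
 * A simplified sketch of the connection sequence described above (error
 * and teardown paths omitted):
 *
 *	xdf_oe_change() callback                xdf_ready_tq_thread
 *	------------------------                -------------------
 *	backend enters an early xenbus state
 *	    xdf_setstate_init()      -> XD_INIT
 *	backend enters XenbusStateConnected
 *	    xdf_setstate_connected() -> XD_CONNECTED
 *	    dispatch xdf_setstate_ready() ....  xdf_lb_rdwr() probes the disk
 *	                                        label and flush support
 *	                                        -> XD_READY (normal IO allowed)
 */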
74 
75 #include <io/xdf.h>
76 
77 #include <sys/conf.h>
78 #include <sys/dkio.h>
79 #include <sys/promif.h>
80 #include <sys/sysmacros.h>
81 #include <sys/kstat.h>
82 #include <sys/mach_mmu.h>
83 #ifdef XPV_HVM_DRIVER
84 #include <sys/xpv_support.h>
85 #else /* !XPV_HVM_DRIVER */
86 #include <sys/evtchn_impl.h>
87 #endif /* !XPV_HVM_DRIVER */
88 #include <sys/sunndi.h>
89 #include <public/io/xenbus.h>
90 #include <xen/sys/xenbus_impl.h>
91 #include <sys/scsi/generic/inquiry.h>
92 #include <xen/io/blkif_impl.h>
93 #include <sys/fdio.h>
94 #include <sys/cdio.h>
95 
96 /*
97  * DEBUG_EVAL can be used to include debug-only statements without
98  * having to use '#ifdef DEBUG' statements
99  */
100 #ifdef DEBUG
101 #define	DEBUG_EVAL(x)	(x)
102 #else /* !DEBUG */
103 #define	DEBUG_EVAL(x)
104 #endif /* !DEBUG */
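
/*
 * For example, xdf_oe_change() below uses
 *	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
 * to maintain a debug-only record of the callback thread without wrapping
 * the assignment in '#ifdef DEBUG'.
 */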
105 
106 #define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
107 #define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */
108 #define	XDF_STATE_TIMEOUT		(30*1000*1000)	/* 30.00 sec */
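
/*
 * Worked out: xdf_ring_drain_locked() below delays XDF_DRAIN_MSEC_DELAY
 * microseconds (50 ms) per retry and retries up to XDF_DRAIN_RETRY_COUNT
 * times, i.e. 200 * 0.05 sec = ~10 seconds before it reports a timeout.
 */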
109 
110 #define	INVALID_DOMID	((domid_t)-1)
111 #define	FLUSH_DISKCACHE	0x1
112 #define	WRITE_BARRIER	0x2
113 #define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
114 #define	USE_WRITE_BARRIER(vdp)						\
115 	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
116 #define	USE_FLUSH_DISKCACHE(vdp)					\
117 	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
118 #define	IS_WRITE_BARRIER(vdp, bp)					\
119 	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
120 	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
121 #define	IS_FLUSH_DISKCACHE(bp)						\
122 	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
123 
124 #define	VREQ_DONE(vreq)							\
125 	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
126 	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
127 	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))
128 
129 #define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
130 #define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
131 
132 extern int		do_polled_io;
133 
134 /* run-time tunables that we don't want the compiler to optimize away */
135 volatile int		xdf_debug = 0;
136 volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;
137 
138 /* per module globals */
139 major_t			xdf_major;
140 static void		*xdf_ssp;
141 static kmem_cache_t	*xdf_vreq_cache;
142 static kmem_cache_t	*xdf_gs_cache;
143 static int		xdf_maxphys = XB_MAXPHYS;
144 static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
145 static int		xdf_fbrewrites;	/* flush block re-write count */
146 
147 /* misc public functions */
148 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
149 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
150 
151 /*  misc private functions */
152 static void xdf_io_start(xdf_t *);
153 static void xdf_devid_setup(xdf_t *);
154 
155 /* callbacks from common label */
156 static cmlb_tg_ops_t xdf_lb_ops = {
157 	TG_DK_OPS_VERSION_1,
158 	xdf_lb_rdwr,
159 	xdf_lb_getinfo
160 };
161 
162 /*
163  * I/O buffer DMA attributes
164  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
165  */
166 static ddi_dma_attr_t xb_dma_attr = {
167 	DMA_ATTR_V0,
168 	(uint64_t)0,			/* lowest address */
169 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
170 	(uint64_t)0xffffff,		/* DMA counter limit max */
171 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
172 	XB_BSIZE - 1,			/* bitmap of burst sizes */
173 	XB_BSIZE,			/* min transfer */
174 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
175 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
176 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
177 	XB_BSIZE,			/* granularity */
178 	0,				/* flags (reserved) */
179 };
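
/*
 * A back-of-the-envelope check of the constraint stated above: each DMA
 * cookie is capped at one page (dma_attr_seg = PAGEOFFSET) and a window
 * may hold at most BLKIF_MAX_SEGMENTS_PER_REQUEST cookies
 * (dma_attr_sgllen), so a single window never needs more segments than
 * one blkif request can describe.  (This assumes XB_MAX_XFER in xdf.h is
 * sized as BLKIF_MAX_SEGMENTS_PER_REQUEST pages.)
 */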
180 
181 static ddi_device_acc_attr_t xc_acc_attr = {
182 	DDI_DEVICE_ATTR_V0,
183 	DDI_NEVERSWAP_ACC,
184 	DDI_STRICTORDER_ACC
185 };
186 
187 static void
188 xdf_timeout_handler(void *arg)
189 {
190 	xdf_t *vdp = arg;
191 
192 	mutex_enter(&vdp->xdf_dev_lk);
193 	vdp->xdf_timeout_id = 0;
194 	mutex_exit(&vdp->xdf_dev_lk);
195 
196 	/* new timeout thread could be re-scheduled */
197 	xdf_io_start(vdp);
198 }
199 
200 /*
201  * callback func invoked when DMA/GTE resources are available
202  *
203  * Note: we only register one callback function with the grant table subsystem
204  * since we only have one 'struct gnttab_free_callback' in xdf_t.
205  */
206 static int
207 xdf_dmacallback(caddr_t arg)
208 {
209 	xdf_t *vdp = (xdf_t *)arg;
210 	ASSERT(vdp != NULL);
211 
212 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
213 	    vdp->xdf_addr));
214 
215 	ddi_trigger_softintr(vdp->xdf_softintr_id);
216 	return (DDI_DMA_CALLBACK_DONE);
217 }
218 
219 static ge_slot_t *
220 gs_get(xdf_t *vdp, int isread)
221 {
222 	grant_ref_t gh;
223 	ge_slot_t *gs;
224 
225 	/* try to alloc GTEs needed in this slot, first */
226 	if (gnttab_alloc_grant_references(
227 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
228 		if (vdp->xdf_gnt_callback.next == NULL) {
229 			SETDMACBON(vdp);
230 			gnttab_request_free_callback(
231 			    &vdp->xdf_gnt_callback,
232 			    (void (*)(void *))xdf_dmacallback,
233 			    (void *)vdp,
234 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
235 		}
236 		return (NULL);
237 	}
238 
239 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
240 	if (gs == NULL) {
241 		gnttab_free_grant_references(gh);
242 		if (vdp->xdf_timeout_id == 0)
243 			/* restart I/O after one second */
244 			vdp->xdf_timeout_id =
245 			    timeout(xdf_timeout_handler, vdp, hz);
246 		return (NULL);
247 	}
248 
249 	/* init gs_slot */
250 	gs->gs_oeid = vdp->xdf_peer;
251 	gs->gs_isread = isread;
252 	gs->gs_ghead = gh;
253 	gs->gs_ngrefs = 0;
254 
255 	return (gs);
256 }
257 
258 static void
259 gs_free(ge_slot_t *gs)
260 {
261 	int		i;
262 
263 	/* release all grant table entry resources used in this slot */
264 	for (i = 0; i < gs->gs_ngrefs; i++)
265 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
266 	gnttab_free_grant_references(gs->gs_ghead);
267 	list_remove(&gs->gs_vreq->v_gs, gs);
268 	kmem_cache_free(xdf_gs_cache, gs);
269 }
270 
271 static grant_ref_t
272 gs_grant(ge_slot_t *gs, mfn_t mfn)
273 {
274 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
275 
276 	ASSERT(gr != -1);
277 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
278 	gs->gs_ge[gs->gs_ngrefs++] = gr;
279 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
280 
281 	return (gr);
282 }
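
/*
 * Typical ge_slot_t lifecycle, as used by xdf_process_rreq() and
 * xdf_io_fini() below (a sketch; error handling omitted):
 *
 *	gs = gs_get(vdp, IS_READ(bp));		reserve grant references
 *	for each DMA cookie in the window:
 *		gr = gs_grant(gs, mfn);		grant one page to the backend
 *		(store gr in a blkif request segment)
 *	gs_free(gs);				on completion, end foreign
 *						access and release the refs
 */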
283 
284 /*
285  * Alloc a vreq for this bp
286  * bp->av_back contains the pointer to the vreq upon return
287  */
288 static v_req_t *
289 vreq_get(xdf_t *vdp, buf_t *bp)
290 {
291 	v_req_t *vreq = NULL;
292 
293 	ASSERT(BP_VREQ(bp) == NULL);
294 
295 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
296 	if (vreq == NULL) {
297 		if (vdp->xdf_timeout_id == 0)
298 			/* restart I/O after one second */
299 			vdp->xdf_timeout_id =
300 			    timeout(xdf_timeout_handler, vdp, hz);
301 		return (NULL);
302 	}
303 	bzero(vreq, sizeof (v_req_t));
304 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
305 	    offsetof(ge_slot_t, gs_vreq_link));
306 	vreq->v_buf = bp;
307 	vreq->v_status = VREQ_INIT;
308 	vreq->v_runq = B_FALSE;
309 	BP_VREQ_SET(bp, vreq);
310 	/* init of other fields in vreq is up to the caller */
311 
312 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
313 
314 	return (vreq);
315 }
316 
317 static void
318 vreq_free(xdf_t *vdp, v_req_t *vreq)
319 {
320 	buf_t	*bp = vreq->v_buf;
321 
322 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
323 	ASSERT(BP_VREQ(bp) == vreq);
324 
325 	list_remove(&vdp->xdf_vreq_act, vreq);
326 
327 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
328 		goto done;
329 
330 	switch (vreq->v_status) {
331 	case VREQ_DMAWIN_DONE:
332 	case VREQ_GS_ALLOCED:
333 	case VREQ_DMABUF_BOUND:
334 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
335 		/*FALLTHRU*/
336 	case VREQ_DMAMEM_ALLOCED:
337 		if (!ALIGNED_XFER(bp)) {
338 			ASSERT(vreq->v_abuf != NULL);
339 			if (!IS_ERROR(bp) && IS_READ(bp))
340 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
341 				    bp->b_bcount);
342 			ddi_dma_mem_free(&vreq->v_align);
343 		}
344 		/*FALLTHRU*/
345 	case VREQ_MEMDMAHDL_ALLOCED:
346 		if (!ALIGNED_XFER(bp))
347 			ddi_dma_free_handle(&vreq->v_memdmahdl);
348 		/*FALLTHRU*/
349 	case VREQ_DMAHDL_ALLOCED:
350 		ddi_dma_free_handle(&vreq->v_dmahdl);
351 		break;
352 	default:
353 		break;
354 	}
355 done:
356 	ASSERT(!vreq->v_runq);
357 	list_destroy(&vreq->v_gs);
358 	kmem_cache_free(xdf_vreq_cache, vreq);
359 }
360 
361 /*
362  * Snarf new data if our flush block was re-written
363  */
364 static void
365 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
366 {
367 	int nblks;
368 	boolean_t mapin;
369 
370 	if (IS_WRITE_BARRIER(vdp, bp))
371 		return; /* write was a flush write */
372 
373 	mapin = B_FALSE;
374 	nblks = bp->b_bcount >> DEV_BSHIFT;
375 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
376 		xdf_fbrewrites++;
377 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
378 			mapin = B_TRUE;
379 			bp_mapin(bp);
380 		}
381 		bcopy(bp->b_un.b_addr +
382 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
383 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
384 		if (mapin)
385 			bp_mapout(bp);
386 	}
387 }
388 
389 /*
390  * Initialize the DMA and grant table resources for the buf
391  */
392 static int
393 vreq_setup(xdf_t *vdp, v_req_t *vreq)
394 {
395 	int rc;
396 	ddi_dma_attr_t dmaattr;
397 	uint_t ndcs, ndws;
398 	ddi_dma_handle_t dh;
399 	ddi_dma_handle_t mdh;
400 	ddi_dma_cookie_t dc;
401 	ddi_acc_handle_t abh;
402 	caddr_t	aba;
403 	ge_slot_t *gs;
404 	size_t bufsz;
405 	off_t off;
406 	size_t sz;
407 	buf_t *bp = vreq->v_buf;
408 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
409 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
410 
411 	switch (vreq->v_status) {
412 	case VREQ_INIT:
413 		if (IS_FLUSH_DISKCACHE(bp)) {
414 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
415 				DPRINTF(DMA_DBG, ("xdf@%s: "
416 				    "get ge_slot failed\n", vdp->xdf_addr));
417 				return (DDI_FAILURE);
418 			}
419 			vreq->v_blkno = 0;
420 			vreq->v_nslots = 1;
421 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
422 			vreq->v_status = VREQ_GS_ALLOCED;
423 			gs->gs_vreq = vreq;
424 			list_insert_head(&vreq->v_gs, gs);
425 			return (DDI_SUCCESS);
426 		}
427 
428 		if (IS_WRITE_BARRIER(vdp, bp))
429 			vreq->v_flush_diskcache = WRITE_BARRIER;
430 		vreq->v_blkno = bp->b_blkno +
431 		    (diskaddr_t)(uintptr_t)bp->b_private;
432 		/* See if we wrote new data to our flush block */
433 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
434 			check_fbwrite(vdp, bp, vreq->v_blkno);
435 		vreq->v_status = VREQ_INIT_DONE;
436 		/*FALLTHRU*/
437 
438 	case VREQ_INIT_DONE:
439 		/*
440 		 * alloc DMA handle
441 		 */
442 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
443 		    xdf_dmacallback, (caddr_t)vdp, &dh);
444 		if (rc != DDI_SUCCESS) {
445 			SETDMACBON(vdp);
446 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
447 			    vdp->xdf_addr));
448 			return (DDI_FAILURE);
449 		}
450 
451 		vreq->v_dmahdl = dh;
452 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
453 		/*FALLTHRU*/
454 
455 	case VREQ_DMAHDL_ALLOCED:
456 		/*
457 		 * alloc dma handle for 512-byte aligned buf
458 		 */
459 		if (!ALIGNED_XFER(bp)) {
460 			/*
461 			 * XXPV: we need to temporarily enlarge the seg
462 			 * boundary and s/g length to work around CR6381968
463 			 */
464 			dmaattr = xb_dma_attr;
465 			dmaattr.dma_attr_seg = (uint64_t)-1;
466 			dmaattr.dma_attr_sgllen = INT_MAX;
467 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
468 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
469 			if (rc != DDI_SUCCESS) {
470 				SETDMACBON(vdp);
471 				DPRINTF(DMA_DBG, ("xdf@%s: "
472 				    "unaligned buf DMAhandle alloc failed\n",
473 				    vdp->xdf_addr));
474 				return (DDI_FAILURE);
475 			}
476 			vreq->v_memdmahdl = mdh;
477 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
478 		}
479 		/*FALLTHRU*/
480 
481 	case VREQ_MEMDMAHDL_ALLOCED:
482 		/*
483 		 * alloc 512-byte aligned buf
484 		 */
485 		if (!ALIGNED_XFER(bp)) {
486 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
487 				bp_mapin(bp);
488 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
489 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
490 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
491 			    &aba, &bufsz, &abh);
492 			if (rc != DDI_SUCCESS) {
493 				SETDMACBON(vdp);
494 				DPRINTF(DMA_DBG, ("xdf@%s: "
495 				    "DMA mem allocation failed\n",
496 				    vdp->xdf_addr));
497 				return (DDI_FAILURE);
498 			}
499 
500 			vreq->v_abuf = aba;
501 			vreq->v_align = abh;
502 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
503 
504 			ASSERT(bufsz >= bp->b_bcount);
505 			if (!IS_READ(bp))
506 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
507 				    bp->b_bcount);
508 		}
509 		/*FALLTHRU*/
510 
511 	case VREQ_DMAMEM_ALLOCED:
512 		/*
513 		 * dma bind
514 		 */
515 		if (ALIGNED_XFER(bp)) {
516 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
517 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
518 			    &dc, &ndcs);
519 		} else {
520 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
521 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
522 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
523 		}
524 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
525 			/* get num of dma windows */
526 			if (rc == DDI_DMA_PARTIAL_MAP) {
527 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
528 				ASSERT(rc == DDI_SUCCESS);
529 			} else {
530 				ndws = 1;
531 			}
532 		} else {
533 			SETDMACBON(vdp);
534 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
535 			    vdp->xdf_addr));
536 			return (DDI_FAILURE);
537 		}
538 
539 		vreq->v_dmac = dc;
540 		vreq->v_dmaw = 0;
541 		vreq->v_ndmacs = ndcs;
542 		vreq->v_ndmaws = ndws;
543 		vreq->v_nslots = ndws;
544 		vreq->v_status = VREQ_DMABUF_BOUND;
545 		/*FALLTHRU*/
546 
547 	case VREQ_DMABUF_BOUND:
548 		/*
549 		 * get ge_slot, callback is set upon failure from gs_get(),
550 		 * if not set previously
551 		 */
552 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
553 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
554 			    vdp->xdf_addr));
555 			return (DDI_FAILURE);
556 		}
557 
558 		vreq->v_status = VREQ_GS_ALLOCED;
559 		gs->gs_vreq = vreq;
560 		list_insert_head(&vreq->v_gs, gs);
561 		break;
562 
563 	case VREQ_GS_ALLOCED:
564 		/* nothing needs to be done */
565 		break;
566 
567 	case VREQ_DMAWIN_DONE:
568 		/*
569 		 * move to the next dma window
570 		 */
571 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
572 
573 		/* get a ge_slot for this DMA window */
574 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
575 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
576 			    vdp->xdf_addr));
577 			return (DDI_FAILURE);
578 		}
579 
580 		vreq->v_dmaw++;
581 		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
582 		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
583 		vreq->v_status = VREQ_GS_ALLOCED;
584 		gs->gs_vreq = vreq;
585 		list_insert_head(&vreq->v_gs, gs);
586 		break;
587 
588 	default:
589 		return (DDI_FAILURE);
590 	}
591 
592 	return (DDI_SUCCESS);
593 }
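
/*
 * A sketch of the v_status progression implemented above (vreq_setup()
 * drives the allocation states; xdf_process_rreq() later marks
 * VREQ_DMAWIN_DONE once a window's segments have been filled in):
 *
 *	VREQ_INIT -> VREQ_INIT_DONE -> VREQ_DMAHDL_ALLOCED
 *	    [-> VREQ_MEMDMAHDL_ALLOCED -> VREQ_DMAMEM_ALLOCED, unaligned only]
 *	    -> VREQ_DMABUF_BOUND -> VREQ_GS_ALLOCED -> VREQ_DMAWIN_DONE
 *
 * For multi-window transfers, VREQ_DMAWIN_DONE feeds back into
 * VREQ_GS_ALLOCED until the last DMA window has been processed.
 */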
594 
595 static int
596 xdf_cmlb_attach(xdf_t *vdp)
597 {
598 	dev_info_t	*dip = vdp->xdf_dip;
599 
600 	return (cmlb_attach(dip, &xdf_lb_ops,
601 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
602 	    XD_IS_RM(vdp),
603 	    B_TRUE,
604 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
605 #ifdef XPV_HVM_DRIVER
606 	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT),
607 #else /* XPV_HVM_DRIVER */
608 	    0,
609 #endif /* XPV_HVM_DRIVER */
610 	    vdp->xdf_vd_lbl, NULL));
611 }
612 
613 static void
614 xdf_io_err(buf_t *bp, int err, size_t resid)
615 {
616 	bioerror(bp, err);
617 	if (resid == 0)
618 		bp->b_resid = bp->b_bcount;
619 	biodone(bp);
620 }
621 
622 static void
623 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
624 {
625 	v_req_t *vreq = BP_VREQ(bp);
626 
627 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
628 
629 	if (vdp->xdf_xdev_iostat == NULL)
630 		return;
631 	if ((vreq != NULL) && vreq->v_runq) {
632 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
633 	} else {
634 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
635 	}
636 }
637 
638 static void
639 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
640 {
641 	v_req_t *vreq = BP_VREQ(bp);
642 
643 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
644 
645 	if (vdp->xdf_xdev_iostat == NULL)
646 		return;
647 
648 	if ((vreq != NULL) && vreq->v_runq) {
649 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
650 	} else {
651 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
652 	}
653 
654 	if (bp->b_flags & B_READ) {
655 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
656 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
657 	} else if (bp->b_flags & B_WRITE) {
658 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
659 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
660 	}
661 }
662 
663 static void
664 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
665 {
666 	v_req_t *vreq = BP_VREQ(bp);
667 
668 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
669 	ASSERT(!vreq->v_runq);
670 
671 	vreq->v_runq = B_TRUE;
672 	if (vdp->xdf_xdev_iostat == NULL)
673 		return;
674 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
675 }
676 
677 static void
678 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
679 {
680 	v_req_t *vreq = BP_VREQ(bp);
681 
682 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
683 	ASSERT(vreq->v_runq);
684 
685 	vreq->v_runq = B_FALSE;
686 	if (vdp->xdf_xdev_iostat == NULL)
687 		return;
688 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
689 }
690 
691 int
692 xdf_kstat_create(dev_info_t *dip)
693 {
694 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
695 	kstat_t		*kstat;
696 	buf_t		*bp;
697 
698 	if ((kstat = kstat_create("xdf", ddi_get_instance(dip), NULL, "disk",
699 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
700 		return (-1);
701 
702 	/* See comment about locking in xdf_kstat_delete(). */
703 	mutex_enter(&vdp->xdf_iostat_lk);
704 	mutex_enter(&vdp->xdf_dev_lk);
705 
706 	/* only one kstat can exist at a time */
707 	if (vdp->xdf_xdev_iostat != NULL) {
708 		mutex_exit(&vdp->xdf_dev_lk);
709 		mutex_exit(&vdp->xdf_iostat_lk);
710 		kstat_delete(kstat);
711 		return (-1);
712 	}
713 
714 	vdp->xdf_xdev_iostat = kstat;
715 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
716 	kstat_install(vdp->xdf_xdev_iostat);
717 
718 	/*
719 	 * Now that we've created a kstat, we need to update the waitq and
720 	 * runq counts for the kstat to reflect our current state.
721 	 *
722 	 * For a buf_t structure to be on the runq, it must have a ring
723 	 * buffer slot associated with it.  To get a ring buffer slot the
724 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
725 	 * Then when it is granted a ring buffer slot, v_runq will be set to
726 	 * true.
727 	 *
728 	 * For a buf_t structure to be on the waitq, it must not be on the
729 	 * runq.  So to find all the buf_t's that should be on waitq, we
730 	 * walk the active buf list and add any buf_t's which aren't on the
731 	 * runq to the waitq.
732 	 */
733 	bp = vdp->xdf_f_act;
734 	while (bp != NULL) {
735 		xdf_kstat_enter(vdp, bp);
736 		bp = bp->av_forw;
737 	}
738 	if (vdp->xdf_ready_tq_bp != NULL)
739 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
740 
741 	mutex_exit(&vdp->xdf_dev_lk);
742 	mutex_exit(&vdp->xdf_iostat_lk);
743 	return (0);
744 }
745 
746 void
747 xdf_kstat_delete(dev_info_t *dip)
748 {
749 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
750 	kstat_t		*kstat;
751 	buf_t		*bp;
752 
753 	/*
754 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
755 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
756 	 * and the contents of our kstat.  xdf_iostat_lk is used
757 	 * to protect the allocation and freeing of the actual kstat.
758 	 * xdf_dev_lk can't be used for this purpose because kstat
759 	 * readers use it to access the contents of the kstat and
760 	 * hence it can't be held when calling kstat_delete().
761 	 */
762 	mutex_enter(&vdp->xdf_iostat_lk);
763 	mutex_enter(&vdp->xdf_dev_lk);
764 
765 	if (vdp->xdf_xdev_iostat == NULL) {
766 		mutex_exit(&vdp->xdf_dev_lk);
767 		mutex_exit(&vdp->xdf_iostat_lk);
768 		return;
769 	}
770 
771 	/*
772 	 * We're about to destroy the kstat structures, so it isn't really
773 	 * necessary to update the runq and waitq counts.  But, since this
774 	 * isn't a hot code path we can afford to be a little pedantic and
775 	 * go ahead and decrement the runq and waitq kstat counters to zero
776 	 * before freeing them.  This helps us ensure that we've gotten all
777 	 * our accounting correct.
778 	 *
779 	 * For an explanation of how we determine which buffers go on the
780 	 * runq vs which go on the waitq, see the comments in
781 	 * xdf_kstat_create().
782 	 */
783 	bp = vdp->xdf_f_act;
784 	while (bp != NULL) {
785 		xdf_kstat_exit(vdp, bp);
786 		bp = bp->av_forw;
787 	}
788 	if (vdp->xdf_ready_tq_bp != NULL)
789 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
790 
791 	kstat = vdp->xdf_xdev_iostat;
792 	vdp->xdf_xdev_iostat = NULL;
793 	mutex_exit(&vdp->xdf_dev_lk);
794 	kstat_delete(kstat);
795 	mutex_exit(&vdp->xdf_iostat_lk);
796 }
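
/*
 * The locking order used by xdf_kstat_create() and xdf_kstat_delete()
 * above, in sketch form:
 *
 *	mutex_enter(&vdp->xdf_iostat_lk);	protects kstat alloc/free
 *	mutex_enter(&vdp->xdf_dev_lk);		protects xdf_xdev_iostat and
 *						the kstat contents
 *	...
 *	mutex_exit(&vdp->xdf_dev_lk);		dropped before kstat_delete()
 *	kstat_delete(kstat);			so kstat readers can't deadlock
 *	mutex_exit(&vdp->xdf_iostat_lk);
 */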
797 
798 /*
799  * Add an IO request onto the active queue.
800  *
801  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
802  * are used to establish a connection to the backend, so they receive
803  * priority over all other IOs.  Since xdf_ready_tq_thread only does
804  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
805  * given time and we record the buf associated with that request in
806  * xdf_ready_tq_bp.
807  */
808 static void
809 xdf_bp_push(xdf_t *vdp, buf_t *bp)
810 {
811 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
812 	ASSERT(bp->av_forw == NULL);
813 
814 	xdf_kstat_enter(vdp, bp);
815 
816 	if (curthread == vdp->xdf_ready_tq_thread) {
817 		/* new IO requests from the ready thread */
818 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
819 		vdp->xdf_ready_tq_bp = bp;
820 		return;
821 	}
822 
823 	/* this is a normal IO request */
824 	ASSERT(bp != vdp->xdf_ready_tq_bp);
825 
826 	if (vdp->xdf_f_act == NULL) {
827 		/* this is the only IO on the active queue */
828 		ASSERT(vdp->xdf_l_act == NULL);
829 		ASSERT(vdp->xdf_i_act == NULL);
830 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
831 		return;
832 	}
833 
834 	/* add this IO to the tail of the active queue */
835 	vdp->xdf_l_act->av_forw = bp;
836 	vdp->xdf_l_act = bp;
837 	if (vdp->xdf_i_act == NULL)
838 		vdp->xdf_i_act = bp;
839 }
840 
841 static void
842 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
843 {
844 	buf_t	*bp_iter;
845 
846 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
847 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
848 
849 	if (vdp->xdf_ready_tq_bp == bp) {
850 		/* we're done with a ready thread IO request */
851 		ASSERT(bp->av_forw == NULL);
852 		vdp->xdf_ready_tq_bp = NULL;
853 		return;
854 	}
855 
856 	/* we're done with a normal IO request */
857 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
858 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
859 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
860 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
861 
862 	if (bp == vdp->xdf_f_act) {
863 		/* This IO was at the head of our active queue. */
864 		vdp->xdf_f_act = bp->av_forw;
865 		if (bp == vdp->xdf_l_act)
866 			vdp->xdf_l_act = NULL;
867 	} else {
868 		/* This IO finished before some other pending IOs. */
869 		bp_iter = vdp->xdf_f_act;
870 		while (bp != bp_iter->av_forw) {
871 			bp_iter = bp_iter->av_forw;
872 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
873 			ASSERT(bp_iter != vdp->xdf_i_act);
874 		}
875 		bp_iter->av_forw = bp->av_forw;
876 		if (bp == vdp->xdf_l_act)
877 			vdp->xdf_l_act = bp_iter;
878 	}
879 	bp->av_forw = NULL;
880 }
881 
882 static buf_t *
883 xdf_bp_next(xdf_t *vdp)
884 {
885 	v_req_t	*vreq;
886 	buf_t	*bp;
887 
888 	if (vdp->xdf_state == XD_CONNECTED) {
889 		/*
890 		 * If we're in the XD_CONNECTED state, we only service IOs
891 		 * from the xdf_ready_tq_thread thread.
892 		 */
893 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
894 			return (NULL);
895 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
896 			return (bp);
897 		return (NULL);
898 	}
899 
900 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
901 	if (vdp->xdf_state != XD_READY)
902 		return (NULL);
903 
904 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
905 	for (;;) {
906 		if ((bp = vdp->xdf_i_act) == NULL)
907 			return (NULL);
908 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
909 			return (bp);
910 
911 		/* advance the active buf index pointer */
912 		vdp->xdf_i_act = bp->av_forw;
913 	}
914 }
915 
916 static void
917 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
918 {
919 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
920 	v_req_t		*vreq = gs->gs_vreq;
921 	buf_t		*bp = vreq->v_buf;
922 
923 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
924 	ASSERT(BP_VREQ(bp) == vreq);
925 
926 	gs_free(gs);
927 
928 	if (bioerr != 0)
929 		bioerror(bp, bioerr);
930 	ASSERT(vreq->v_nslots > 0);
931 	if (--vreq->v_nslots > 0)
932 		return;
933 
934 	/* remove this IO from our active queue */
935 	xdf_bp_pop(vdp, bp);
936 
937 	ASSERT(vreq->v_runq);
938 	xdf_kstat_exit(vdp, bp);
939 	vreq->v_runq = B_FALSE;
940 	vreq_free(vdp, vreq);
941 
942 	if (IS_ERROR(bp)) {
943 		xdf_io_err(bp, geterror(bp), 0);
944 	} else if (bp->b_resid != 0) {
945 		/* Partial transfers are an error */
946 		xdf_io_err(bp, EIO, bp->b_resid);
947 	} else {
948 		biodone(bp);
949 	}
950 }
951 
952 /*
953  * xdf interrupt handler
954  */
955 static uint_t
956 xdf_intr_locked(xdf_t *vdp)
957 {
958 	xendev_ring_t *xbr;
959 	blkif_response_t *resp;
960 	int bioerr;
961 	uint64_t id;
962 	uint8_t op;
963 	uint16_t status;
964 	ddi_acc_handle_t acchdl;
965 
966 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
967 
968 	if ((xbr = vdp->xdf_xb_ring) == NULL)
969 		return (DDI_INTR_UNCLAIMED);
970 
971 	acchdl = vdp->xdf_xb_ring_hdl;
972 
973 	/*
974 	 * complete all requests which have a response
975 	 */
976 	while (resp = xvdi_ring_get_response(xbr)) {
977 		id = ddi_get64(acchdl, &resp->id);
978 		op = ddi_get8(acchdl, &resp->operation);
979 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
980 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
981 		    op, id, status));
982 
983 		if (status != BLKIF_RSP_OKAY) {
984 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
985 			    vdp->xdf_addr,
986 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
987 			bioerr = EIO;
988 		} else {
989 			bioerr = 0;
990 		}
991 
992 		xdf_io_fini(vdp, id, bioerr);
993 	}
994 	return (DDI_INTR_CLAIMED);
995 }
996 
997 /*
998  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
999  * block at a lower pil.
1000  */
1001 static uint_t
1002 xdf_intr(caddr_t arg)
1003 {
1004 	xdf_t *vdp = (xdf_t *)arg;
1005 	int rv;
1006 
1007 	mutex_enter(&vdp->xdf_dev_lk);
1008 	rv = xdf_intr_locked(vdp);
1009 	mutex_exit(&vdp->xdf_dev_lk);
1010 
1011 	if (!do_polled_io)
1012 		xdf_io_start(vdp);
1013 
1014 	return (rv);
1015 }
1016 
1017 static void
1018 xdf_ring_push(xdf_t *vdp)
1019 {
1020 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1021 
1022 	if (vdp->xdf_xb_ring == NULL)
1023 		return;
1024 
1025 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1026 		DPRINTF(IO_DBG, (
1027 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1028 		    vdp->xdf_addr));
1029 	}
1030 
1031 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1032 		xvdi_notify_oe(vdp->xdf_dip);
1033 }
1034 
1035 static int
1036 xdf_ring_drain_locked(xdf_t *vdp)
1037 {
1038 	int		pollc, rv = 0;
1039 
1040 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1041 
1042 	if (xdf_debug & SUSRES_DBG)
1043 		xen_printf("xdf_ring_drain: start\n");
1044 
1045 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1046 		if (vdp->xdf_xb_ring == NULL)
1047 			goto out;
1048 
1049 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1050 			(void) xdf_intr_locked(vdp);
1051 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1052 			goto out;
1053 		xdf_ring_push(vdp);
1054 
1055 		/* file-backed devices can be slow */
1056 		mutex_exit(&vdp->xdf_dev_lk);
1057 #ifdef XPV_HVM_DRIVER
1058 		(void) HYPERVISOR_yield();
1059 #endif /* XPV_HVM_DRIVER */
1060 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1061 		mutex_enter(&vdp->xdf_dev_lk);
1062 	}
1063 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1064 
1065 out:
1066 	if (vdp->xdf_xb_ring != NULL) {
1067 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1068 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1069 			rv = EIO;
1070 	}
1071 	if (xdf_debug & SUSRES_DBG)
1072 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1073 		    vdp->xdf_addr, rv);
1074 	return (rv);
1075 }
1076 
1077 static int
1078 xdf_ring_drain(xdf_t *vdp)
1079 {
1080 	int rv;
1081 	mutex_enter(&vdp->xdf_dev_lk);
1082 	rv = xdf_ring_drain_locked(vdp);
1083 	mutex_exit(&vdp->xdf_dev_lk);
1084 	return (rv);
1085 }
1086 
1087 /*
1088  * Destroy all v_req_t, grant table entries, and our ring buffer.
1089  */
1090 static void
1091 xdf_ring_destroy(xdf_t *vdp)
1092 {
1093 	v_req_t		*vreq;
1094 	buf_t		*bp;
1095 	ge_slot_t	*gs;
1096 
1097 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1098 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1099 
1100 	if ((vdp->xdf_state != XD_INIT) &&
1101 	    (vdp->xdf_state != XD_CONNECTED) &&
1102 	    (vdp->xdf_state != XD_READY)) {
1103 		ASSERT(vdp->xdf_xb_ring == NULL);
1104 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1105 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1106 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1107 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1108 		return;
1109 	}
1110 
1111 	/*
1112 	 * We don't want to receive async notifications from the backend
1113 	 * when it finishes processing ring entries.
1114 	 */
1115 #ifdef XPV_HVM_DRIVER
1116 	ec_unbind_evtchn(vdp->xdf_evtchn);
1117 #else /* !XPV_HVM_DRIVER */
1118 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1119 #endif /* !XPV_HVM_DRIVER */
1120 
1121 	/*
1122 	 * Drain any requests in the ring.  We need to do this before we
1123 	 * can free grant table entries, because if active ring entries
1124 	 * point to grants, then the backend could be trying to access
1125 	 * those grants.
1126 	 */
1127 	(void) xdf_ring_drain_locked(vdp);
1128 
1129 	/* We're done talking to the backend so free up our event channel */
1130 	xvdi_free_evtchn(vdp->xdf_dip);
1131 	vdp->xdf_evtchn = INVALID_EVTCHN;
1132 
1133 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1134 		bp = vreq->v_buf;
1135 		ASSERT(BP_VREQ(bp) == vreq);
1136 
1137 		/* Free up any grant table entries associated with this IO */
1138 		while ((gs = list_head(&vreq->v_gs)) != NULL)
1139 			gs_free(gs);
1140 
1141 		/* If this IO was on the runq, move it back to the waitq. */
1142 		if (vreq->v_runq)
1143 			xdf_kstat_runq_to_waitq(vdp, bp);
1144 
1145 		/*
1146 		 * Reset any buf IO state since we're going to re-issue the
1147 		 * IO when we reconnect.
1148 		 */
1149 		vreq_free(vdp, vreq);
1150 		BP_VREQ_SET(bp, NULL);
1151 		bioerror(bp, 0);
1152 	}
1153 
1154 	/* reset the active queue index pointer */
1155 	vdp->xdf_i_act = vdp->xdf_f_act;
1156 
1157 	/* Destroy the ring */
1158 	xvdi_free_ring(vdp->xdf_xb_ring);
1159 	vdp->xdf_xb_ring = NULL;
1160 	vdp->xdf_xb_ring_hdl = NULL;
1161 	vdp->xdf_peer = INVALID_DOMID;
1162 }
1163 
1164 void
1165 xdfmin(struct buf *bp)
1166 {
1167 	if (bp->b_bcount > xdf_maxphys)
1168 		bp->b_bcount = xdf_maxphys;
1169 }
1170 
1171 /*
1172  * Check if we have a pending "eject" media request.
1173  */
1174 static int
1175 xdf_eject_pending(xdf_t *vdp)
1176 {
1177 	dev_info_t	*dip = vdp->xdf_dip;
1178 	char		*xsname, *str;
1179 
1180 	if (!vdp->xdf_media_req_supported)
1181 		return (B_FALSE);
1182 
1183 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1184 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1185 		return (B_FALSE);
1186 
1187 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1188 		strfree(str);
1189 		return (B_FALSE);
1190 	}
1191 	strfree(str);
1192 	return (B_TRUE);
1193 }
1194 
1195 /*
1196  * Generate a media request.
1197  */
1198 static int
1199 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1200 {
1201 	dev_info_t	*dip = vdp->xdf_dip;
1202 	char		*xsname;
1203 
1204 	/*
1205 	 * we can't be holding xdf_dev_lk because xenbus_printf() can
1206 	 * block while waiting for a PIL 1 interrupt message.  this
1207 	 * would cause a deadlock with xdf_intr() which needs to grab
1208 	 * xdf_dev_lk as well and runs at PIL 5.
1209 	 */
1210 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1211 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1212 
1213 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1214 		return (ENXIO);
1215 
1216 	/* Check if we support media requests */
1217 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1218 		return (ENOTTY);
1219 
1220 	/* If an eject is pending then don't allow any new requests */
1221 	if (xdf_eject_pending(vdp))
1222 		return (ENXIO);
1223 
1224 	/* Make sure that there is media present */
1225 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1226 		return (ENXIO);
1227 
1228 	/* We only allow operations when the device is ready and connected */
1229 	if (vdp->xdf_state != XD_READY)
1230 		return (EIO);
1231 
1232 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1233 		return (EIO);
1234 
1235 	return (0);
1236 }
1237 
1238 /*
1239  * populate a single blkif_request_t w/ a buf
1240  */
1241 static void
1242 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1243 {
1244 	grant_ref_t	gr;
1245 	uint8_t		fsect, lsect;
1246 	size_t		bcnt;
1247 	paddr_t		dma_addr;
1248 	off_t		blk_off;
1249 	dev_info_t	*dip = vdp->xdf_dip;
1250 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1251 	v_req_t		*vreq = BP_VREQ(bp);
1252 	uint64_t	blkno = vreq->v_blkno;
1253 	uint_t		ndmacs = vreq->v_ndmacs;
1254 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1255 	int		seg = 0;
1256 	int		isread = IS_READ(bp);
1257 	ge_slot_t	*gs = list_head(&vreq->v_gs);
1258 
1259 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1260 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1261 
1262 	if (isread)
1263 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1264 	else {
1265 		switch (vreq->v_flush_diskcache) {
1266 		case FLUSH_DISKCACHE:
1267 			ddi_put8(acchdl, &rreq->operation,
1268 			    BLKIF_OP_FLUSH_DISKCACHE);
1269 			ddi_put16(acchdl, &rreq->handle, vdev);
1270 			ddi_put64(acchdl, &rreq->id,
1271 			    (uint64_t)(uintptr_t)(gs));
1272 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1273 			vreq->v_status = VREQ_DMAWIN_DONE;
1274 			return;
1275 		case WRITE_BARRIER:
1276 			ddi_put8(acchdl, &rreq->operation,
1277 			    BLKIF_OP_WRITE_BARRIER);
1278 			break;
1279 		default:
1280 			if (!vdp->xdf_wce)
1281 				ddi_put8(acchdl, &rreq->operation,
1282 				    BLKIF_OP_WRITE_BARRIER);
1283 			else
1284 				ddi_put8(acchdl, &rreq->operation,
1285 				    BLKIF_OP_WRITE);
1286 			break;
1287 		}
1288 	}
1289 
1290 	ddi_put16(acchdl, &rreq->handle, vdev);
1291 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1292 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1293 
1294 	/*
1295 	 * loop until all segments are populated or there are no more dma cookies in the buf
1296 	 */
1297 	for (;;) {
1298 		/*
1299 		 * Each segment of a blkif request can transfer up to
1300 		 * one 4K page of data.
1301 		 */
1302 		bcnt = vreq->v_dmac.dmac_size;
1303 		dma_addr = vreq->v_dmac.dmac_laddress;
1304 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1305 		fsect = blk_off >> XB_BSHIFT;
1306 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1307 
1308 		ASSERT(bcnt <= PAGESIZE);
1309 		ASSERT((bcnt % XB_BSIZE) == 0);
1310 		ASSERT((blk_off & XB_BMASK) == 0);
1311 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1312 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1313 
1314 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1315 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1316 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1317 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1318 
1319 		DPRINTF(IO_DBG, (
1320 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1321 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1322 		DPRINTF(IO_DBG, (
1323 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1324 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1325 
1326 		blkno += (bcnt >> XB_BSHIFT);
1327 		seg++;
1328 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1329 		if (--ndmacs) {
1330 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1331 			continue;
1332 		}
1333 
1334 		vreq->v_status = VREQ_DMAWIN_DONE;
1335 		vreq->v_blkno = blkno;
1336 		break;
1337 	}
1338 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1339 	DPRINTF(IO_DBG, (
1340 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1341 	    vdp->xdf_addr, rreq->id));
1342 }
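
/*
 * Worked example of the segment arithmetic above, assuming XB_BSIZE is 512
 * (so XB_BSHIFT is 9): a page-aligned 4K DMA cookie has blk_off = 0, so
 * fsect = 0 and lsect = 0 + (4096 >> 9) - 1 = 7, i.e. the segment covers
 * sectors 0-7 of the granted page.
 */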
1343 
1344 static void
1345 xdf_io_start(xdf_t *vdp)
1346 {
1347 	struct buf	*bp;
1348 	v_req_t		*vreq;
1349 	blkif_request_t	*rreq;
1350 	boolean_t	rreqready = B_FALSE;
1351 
1352 	mutex_enter(&vdp->xdf_dev_lk);
1353 
1354 	/*
1355 	 * Populate the ring request(s).  Loop until there is no buf to
1356 	 * transfer or no free slot available in I/O ring.
1357 	 */
1358 	for (;;) {
1359 		/* don't start any new IO if we're suspending */
1360 		if (vdp->xdf_suspending)
1361 			break;
1362 		if ((bp = xdf_bp_next(vdp)) == NULL)
1363 			break;
1364 
1365 		/* if the buf doesn't already have a vreq, allocate one */
1366 		if (((vreq = BP_VREQ(bp)) == NULL) &&
1367 		    ((vreq = vreq_get(vdp, bp)) == NULL))
1368 			break;
1369 
1370 		/* alloc DMA/GTE resources */
1371 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1372 			break;
1373 
1374 		/* get next blkif_request in the ring */
1375 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1376 			break;
1377 		bzero(rreq, sizeof (blkif_request_t));
1378 		rreqready = B_TRUE;
1379 
1380 		/* populate blkif_request with this buf */
1381 		xdf_process_rreq(vdp, bp, rreq);
1382 
1383 		/*
1384 		 * This buffer/vreq pair has been allocated ring buffer
1385 		 * resources, so if it isn't already in our runq, add it.
1386 		 */
1387 		if (!vreq->v_runq)
1388 			xdf_kstat_waitq_to_runq(vdp, bp);
1389 	}
1390 
1391 	/* Send the request(s) to the backend */
1392 	if (rreqready)
1393 		xdf_ring_push(vdp);
1394 
1395 	mutex_exit(&vdp->xdf_dev_lk);
1396 }
1397 
1398 
1399 /* check if a partition is open; -1 means check all partitions on the disk */
1400 static boolean_t
1401 xdf_isopen(xdf_t *vdp, int partition)
1402 {
1403 	int i;
1404 	ulong_t parbit;
1405 	boolean_t rval = B_FALSE;
1406 
1407 	ASSERT((partition == -1) ||
1408 	    ((partition >= 0) && (partition < XDF_PEXT)));
1409 
1410 	if (partition == -1)
1411 		parbit = (ulong_t)-1;
1412 	else
1413 		parbit = 1 << partition;
1414 
1415 	for (i = 0; i < OTYPCNT; i++) {
1416 		if (vdp->xdf_vd_open[i] & parbit)
1417 			rval = B_TRUE;
1418 	}
1419 
1420 	return (rval);
1421 }
1422 
1423 /*
1424  * The connection should never be closed as long as someone is holding
1425  * us open, there is pending IO, or someone is waiting for a
1426  * connection.
1427  */
1428 static boolean_t
1429 xdf_busy(xdf_t *vdp)
1430 {
1431 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1432 
1433 	if ((vdp->xdf_xb_ring != NULL) &&
1434 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1435 		ASSERT(vdp->xdf_state != XD_CLOSED);
1436 		return (B_TRUE);
1437 	}
1438 
1439 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1440 		ASSERT(vdp->xdf_state != XD_CLOSED);
1441 		return (B_TRUE);
1442 	}
1443 
1444 	if (xdf_isopen(vdp, -1)) {
1445 		ASSERT(vdp->xdf_state != XD_CLOSED);
1446 		return (B_TRUE);
1447 	}
1448 
1449 	if (vdp->xdf_connect_req > 0) {
1450 		ASSERT(vdp->xdf_state != XD_CLOSED);
1451 		return (B_TRUE);
1452 	}
1453 
1454 	return (B_FALSE);
1455 }
1456 
1457 static void
1458 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1459 {
1460 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1461 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1462 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1463 	    vdp->xdf_addr, vdp->xdf_state, new_state));
1464 	vdp->xdf_state = new_state;
1465 	cv_broadcast(&vdp->xdf_dev_cv);
1466 }
1467 
1468 static void
1469 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1470 {
1471 	dev_info_t	*dip = vdp->xdf_dip;
1472 	boolean_t	busy;
1473 
1474 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1475 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1476 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1477 
1478 	/* Check if we're already there. */
1479 	if (vdp->xdf_state == new_state)
1480 		return;
1481 
1482 	mutex_enter(&vdp->xdf_dev_lk);
1483 	busy = xdf_busy(vdp);
1484 
1485 	/* If we're already closed then there's nothing to do. */
1486 	if (vdp->xdf_state == XD_CLOSED) {
1487 		ASSERT(!busy);
1488 		xdf_set_state(vdp, new_state);
1489 		mutex_exit(&vdp->xdf_dev_lk);
1490 		return;
1491 	}
1492 
1493 #ifdef DEBUG
1494 	/* UhOh.  Warn the user that something bad has happened. */
1495 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1496 	    (vdp->xdf_xdev_nblocks != 0)) {
1497 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1498 		    vdp->xdf_addr);
1499 	}
1500 #endif /* DEBUG */
1501 
1502 	xdf_ring_destroy(vdp);
1503 
1504 	/* If we're busy then we can only go into the unknown state */
1505 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1506 	mutex_exit(&vdp->xdf_dev_lk);
1507 
1508 	/* if we're closed now, let the other end know */
1509 	if (vdp->xdf_state == XD_CLOSED)
1510 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1511 }
1512 
1513 
1514 /*
1515  * Kick-off connect process
1516  * Status should be XD_UNKNOWN or XD_CLOSED
1517  * On success, status will be changed to XD_INIT
1518  * On error, it will be changed to XD_UNKNOWN
1519  */
1520 static int
1521 xdf_setstate_init(xdf_t *vdp)
1522 {
1523 	dev_info_t		*dip = vdp->xdf_dip;
1524 	xenbus_transaction_t	xbt;
1525 	grant_ref_t		gref;
1526 	char			*xsname, *str;
1527 	int 			rv;
1528 
1529 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1530 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1531 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1532 	    (vdp->xdf_state == XD_CLOSED));
1533 
1534 	DPRINTF(DDI_DBG,
1535 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1536 
1537 	/*
1538 	 * If an eject is pending then don't allow a new connection.
1539 	 * (Only the backend can clear a pending eject media request.)
1540 	 */
1541 	if (xdf_eject_pending(vdp))
1542 		return (DDI_FAILURE);
1543 
1544 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1545 		goto errout;
1546 
1547 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1548 		goto errout;
1549 
1550 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1551 
1552 	/*
1553 	 * Sanity check for the existence of the xenbus device-type property.
1554 	 * This property might not exist if our xenbus device nodes were
1555 	 * force destroyed while we were still connected to the backend.
1556 	 */
1557 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1558 		goto errout;
1559 	strfree(str);
1560 
1561 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1562 		goto errout;
1563 
1564 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1565 #ifdef XPV_HVM_DRIVER
1566 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1567 #else /* !XPV_HVM_DRIVER */
1568 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1569 	    DDI_SUCCESS) {
1570 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1571 		    "failed to add intr handler", vdp->xdf_addr);
1572 		goto errout1;
1573 	}
1574 #endif /* !XPV_HVM_DRIVER */
1575 
1576 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1577 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1578 	    DDI_SUCCESS) {
1579 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1580 		    vdp->xdf_addr);
1581 		goto errout2;
1582 	}
1583 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1584 
1585 	/*
1586 	 * Write into xenstore the info needed by backend
1587 	 */
1588 trans_retry:
1589 	if (xenbus_transaction_start(&xbt)) {
1590 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1591 		    vdp->xdf_addr);
1592 		xvdi_fatal_error(dip, EIO, "connect transaction init");
1593 		goto fail_trans;
1594 	}
1595 
1596 	/*
1597 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1598 	 * domains. However, it is not written for HVM domains, so let's
1599 	 * write it here.
1600 	 */
1601 	if (((rv = xenbus_printf(xbt, xsname,
1602 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1603 	    ((rv = xenbus_printf(xbt, xsname,
1604 	    XBP_RING_REF, "%u", gref)) != 0) ||
1605 	    ((rv = xenbus_printf(xbt, xsname,
1606 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1607 	    ((rv = xenbus_printf(xbt, xsname,
1608 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1609 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1610 		(void) xenbus_transaction_end(xbt, 1);
1611 		xvdi_fatal_error(dip, rv, "connect transaction setup");
1612 		goto fail_trans;
1613 	}
1614 
1615 	/* kick-off connect process */
1616 	if (rv = xenbus_transaction_end(xbt, 0)) {
1617 		if (rv == EAGAIN)
1618 			goto trans_retry;
1619 		xvdi_fatal_error(dip, rv, "connect transaction commit");
1620 		goto fail_trans;
1621 	}
1622 
1623 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1624 	mutex_enter(&vdp->xdf_dev_lk);
1625 	xdf_set_state(vdp, XD_INIT);
1626 	mutex_exit(&vdp->xdf_dev_lk);
1627 
1628 	return (DDI_SUCCESS);
1629 
1630 fail_trans:
1631 	xvdi_free_ring(vdp->xdf_xb_ring);
1632 errout2:
1633 #ifdef XPV_HVM_DRIVER
1634 	ec_unbind_evtchn(vdp->xdf_evtchn);
1635 #else /* !XPV_HVM_DRIVER */
1636 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1637 #endif /* !XPV_HVM_DRIVER */
1638 errout1:
1639 	xvdi_free_evtchn(dip);
1640 	vdp->xdf_evtchn = INVALID_EVTCHN;
1641 errout:
1642 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1643 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1644 	    vdp->xdf_addr);
1645 	return (DDI_FAILURE);
1646 }
1647 
1648 int
1649 xdf_get_flush_block(xdf_t *vdp)
1650 {
1651 	/*
1652 	 * Get a sector-size (xdf_xdev_secsize) aligned buffer
1653 	 */
1654 	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1655 	vdp->xdf_cache_flush_block =
1656 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1657 	    (int)vdp->xdf_xdev_secsize);
1658 
1659 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1660 	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1661 		return (DDI_FAILURE);
1662 	return (DDI_SUCCESS);
1663 }
1664 
1665 static void
1666 xdf_setstate_ready(void *arg)
1667 {
1668 	xdf_t	*vdp = (xdf_t *)arg;
1669 
1670 	vdp->xdf_ready_tq_thread = curthread;
1671 
1672 	/*
1673 	 * We've created all the minor nodes via cmlb_attach() using default
1674 	 * values in xdf_attach() to make it possible to block in xdf_open(),
1675 	 * in case anyone (say, the booting thread) ever tries to open the
1676 	 * device before we are connected to the backend. Now that we are
1677 	 * almost connected, refresh all those minor nodes with the latest info.
1678 	 */
1679 	mutex_enter(&vdp->xdf_dev_lk);
1680 	if (vdp->xdf_cmbl_reattach) {
1681 		vdp->xdf_cmbl_reattach = B_FALSE;
1682 
1683 		mutex_exit(&vdp->xdf_dev_lk);
1684 		if (xdf_cmlb_attach(vdp) != 0) {
1685 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1686 			return;
1687 		}
1688 		mutex_enter(&vdp->xdf_dev_lk);
1689 	}
1690 
1691 	/* If we're not still trying to get to the ready state, then bail. */
1692 	if (vdp->xdf_state != XD_CONNECTED) {
1693 		mutex_exit(&vdp->xdf_dev_lk);
1694 		return;
1695 	}
1696 	mutex_exit(&vdp->xdf_dev_lk);
1697 
1698 	/*
1699 	 * If backend has feature-barrier, see if it supports disk
1700 	 * cache flush op.
1701 	 */
1702 	vdp->xdf_flush_supported = B_FALSE;
1703 	if (vdp->xdf_feature_barrier) {
1704 		/*
1705 		 * Pretend we already know flush is supported so probe
1706 		 * will attempt the correct op.
1707 		 */
1708 		vdp->xdf_flush_supported = B_TRUE;
1709 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1710 			vdp->xdf_flush_supported = B_TRUE;
1711 		} else {
1712 			vdp->xdf_flush_supported = B_FALSE;
1713 			/*
1714 			 * If the other end does not support the cache flush op
1715 			 * then we must use a barrier-write to force disk
1716 			 * cache flushing.  Barrier writes require that a data
1717 			 * block actually be written.
1718 			 * Cache a block to barrier-write when we are
1719 			 * asked to perform a flush.
1720 			 * XXX - would it be better to just copy 1 block
1721 			 * (512 bytes) from whatever write we did last
1722 			 * and rewrite that block?
1723 			 */
1724 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1725 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1726 				return;
1727 			}
1728 		}
1729 	}
1730 
1731 	mutex_enter(&vdp->xdf_cb_lk);
1732 	mutex_enter(&vdp->xdf_dev_lk);
1733 	if (vdp->xdf_state == XD_CONNECTED)
1734 		xdf_set_state(vdp, XD_READY);
1735 	mutex_exit(&vdp->xdf_dev_lk);
1736 
1737 	/* Restart any currently queued up io */
1738 	xdf_io_start(vdp);
1739 
1740 	mutex_exit(&vdp->xdf_cb_lk);
1741 }
1742 
1743 /*
1744  * synthetic geometry
1745  */
1746 #define	XDF_NSECTS	256
1747 #define	XDF_NHEADS	16
1748 
1749 static void
1750 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1751 {
1752 	xdf_t *vdp;
1753 	uint_t ncyl;
1754 
1755 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1756 
1757 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1758 
1759 	bzero(geomp, sizeof (*geomp));
1760 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1761 	geomp->g_acyl = 0;
1762 	geomp->g_nhead = XDF_NHEADS;
1763 	geomp->g_nsect = XDF_NSECTS;
1764 	geomp->g_secsize = vdp->xdf_xdev_secsize;
1765 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1766 	geomp->g_intrlv = 0;
1767 	geomp->g_rpm = 7200;
1768 }
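
/*
 * For example, a backend device of 41943040 512-byte blocks (20 GiB) is
 * presented with 16 heads and 256 sectors per track, so
 * ncyl = 41943040 / (16 * 256) = 10240 cylinders, with g_capacity set to
 * the full 41943040 blocks.
 */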
1769 
1770 /*
1771  * Finish other initialization after we've connected to backend
1772  * Status should be XD_INIT before calling this routine
1773  * On success, status should be changed to XD_CONNECTED.
1774  * On error, status should stay XD_INIT
1775  */
1776 static int
1777 xdf_setstate_connected(xdf_t *vdp)
1778 {
1779 	dev_info_t	*dip = vdp->xdf_dip;
1780 	cmlb_geom_t	pgeom;
1781 	diskaddr_t	nblocks = 0;
1782 	uint_t		secsize = 0;
1783 	char		*oename, *xsname, *str;
1784 	uint_t		dinfo;
1785 
1786 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1787 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1788 	ASSERT(vdp->xdf_state == XD_INIT);
1789 
1790 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1791 	    ((oename = xvdi_get_oename(dip)) == NULL))
1792 		return (DDI_FAILURE);
1793 
1794 	/* Make sure the other end is XenbusStateConnected */
1795 	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1796 		return (DDI_FAILURE);
1797 
1798 	/* Determine if feature barrier is supported by backend */
1799 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1800 		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1801 		    vdp->xdf_addr);
1802 
1803 	/*
1804 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1805 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1806 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1807 	 * we always set VDISK_CDROM, regardless of if it's present in
1808 	 * the xenbus info parameter.
1809 	 */
1810 	if (xenbus_gather(XBT_NULL, oename,
1811 	    XBP_SECTORS, "%"SCNu64, &nblocks,
1812 	    XBP_SECTOR_SIZE, "%u", &secsize,
1813 	    XBP_INFO, "%u", &dinfo,
1814 	    NULL) != 0) {
1815 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1816 		    "cannot read backend info", vdp->xdf_addr);
1817 		return (DDI_FAILURE);
1818 	}
1819 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1820 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1821 		    vdp->xdf_addr);
1822 		return (DDI_FAILURE);
1823 	}
1824 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1825 		dinfo |= VDISK_CDROM;
1826 	strfree(str);
1827 
1828 	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1829 		secsize = DEV_BSIZE;
1830 	vdp->xdf_xdev_nblocks = nblocks;
1831 	vdp->xdf_xdev_secsize = secsize;
1832 #ifdef _ILP32
1833 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1834 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1835 		    "backend disk device too large with %llu blocks for"
1836 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1837 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1838 		return (DDI_FAILURE);
1839 	}
1840 #endif
1841 
1842 	/*
1843 	 * If the physical geometry for a fixed disk has been explicitly
1844 	 * set then make sure that the specified physical geometry isn't
1845 	 * larger than the device we connected to.
1846 	 */
1847 	if (vdp->xdf_pgeom_fixed &&
1848 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1849 		cmn_err(CE_WARN,
1850 		    "xdf@%s: connect failed, fixed geometry too large",
1851 		    vdp->xdf_addr);
1852 		return (DDI_FAILURE);
1853 	}
1854 
1855 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1856 
1857 	/* mark vbd is ready for I/O */
1858 	mutex_enter(&vdp->xdf_dev_lk);
1859 	xdf_set_state(vdp, XD_CONNECTED);
1860 
1861 	/* check if the cmlb label should be updated */
1862 	xdf_synthetic_pgeom(dip, &pgeom);
1863 	if ((vdp->xdf_dinfo != dinfo) ||
1864 	    (!vdp->xdf_pgeom_fixed &&
1865 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1866 		vdp->xdf_cmbl_reattach = B_TRUE;
1867 
1868 		vdp->xdf_dinfo = dinfo;
1869 		if (!vdp->xdf_pgeom_fixed)
1870 			vdp->xdf_pgeom = pgeom;
1871 	}
1872 
1873 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1874 		if (vdp->xdf_xdev_nblocks == 0) {
1875 			vdp->xdf_mstate = DKIO_EJECTED;
1876 			cv_broadcast(&vdp->xdf_mstate_cv);
1877 		} else {
1878 			vdp->xdf_mstate = DKIO_INSERTED;
1879 			cv_broadcast(&vdp->xdf_mstate_cv);
1880 		}
1881 	} else {
1882 		if (vdp->xdf_mstate != DKIO_NONE) {
1883 			vdp->xdf_mstate = DKIO_NONE;
1884 			cv_broadcast(&vdp->xdf_mstate_cv);
1885 		}
1886 	}
1887 
1888 	mutex_exit(&vdp->xdf_dev_lk);
1889 
1890 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1891 	    (uint64_t)vdp->xdf_xdev_nblocks);
1892 
1893 	/* Restart any currently queued up io */
1894 	xdf_io_start(vdp);
1895 
1896 	/*
1897 	 * To get to the ready state we have to do IO to the backend device,
1898 	 * but we can't initiate IO from the other end change callback thread
1899 	 * (which is the current context we're executing in.)  This is because
1900 	 * if the other end disconnects while we're doing IO from the callback
1901 	 * thread, then we can't receive that disconnect event and we hang
1902 	 * waiting for an IO that can never complete.
1903 	 */
1904 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1905 	    DDI_SLEEP);
1906 
1907 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1908 	return (DDI_SUCCESS);
1909 }
1910 
1911 /*ARGSUSED*/
1912 static void
1913 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1914 {
1915 	XenbusState new_state = *(XenbusState *)impl_data;
1916 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1917 
1918 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1919 	    vdp->xdf_addr, new_state));
1920 
1921 	mutex_enter(&vdp->xdf_cb_lk);
1922 
1923 	/* We assume that this callback is single threaded */
1924 	ASSERT(vdp->xdf_oe_change_thread == NULL);
1925 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1926 
1927 	/* ignore any backend state changes if we're suspending/suspended */
1928 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1929 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1930 		mutex_exit(&vdp->xdf_cb_lk);
1931 		return;
1932 	}
1933 
1934 	switch (new_state) {
1935 	case XenbusStateUnknown:
1936 	case XenbusStateInitialising:
1937 	case XenbusStateInitWait:
1938 	case XenbusStateInitialised:
1939 		if (vdp->xdf_state == XD_INIT)
1940 			break;
1941 
1942 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1943 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1944 			break;
1945 		ASSERT(vdp->xdf_state == XD_INIT);
1946 		break;
1947 
1948 	case XenbusStateConnected:
1949 		if ((vdp->xdf_state == XD_CONNECTED) ||
1950 		    (vdp->xdf_state == XD_READY))
1951 			break;
1952 
1953 		if (vdp->xdf_state != XD_INIT) {
1954 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1955 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1956 				break;
1957 			ASSERT(vdp->xdf_state == XD_INIT);
1958 		}
1959 
1960 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1961 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1962 			break;
1963 		}
1964 		ASSERT(vdp->xdf_state == XD_CONNECTED);
1965 		break;
1966 
1967 	case XenbusStateClosing:
1968 		if (xdf_isopen(vdp, -1)) {
1969 			cmn_err(CE_NOTE,
1970 			    "xdf@%s: hot-unplug failed, still in use",
1971 			    vdp->xdf_addr);
1972 			break;
1973 		}
1974 		/*FALLTHROUGH*/
1975 	case XenbusStateClosed:
1976 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1977 		break;
1978 	}
1979 
1980 	/* notify anybody waiting for oe state change */
1981 	cv_broadcast(&vdp->xdf_dev_cv);
1982 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1983 	mutex_exit(&vdp->xdf_cb_lk);
1984 }
1985 
1986 static int
1987 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1988 {
1989 	int	rv, timeouts = 0, reset = 20;
1990 
1991 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1992 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1993 
1994 	/* we can't connect once we're in the closed state */
1995 	if (vdp->xdf_state == XD_CLOSED)
1996 		return (XD_CLOSED);
1997 
1998 	vdp->xdf_connect_req++;
1999 	while (vdp->xdf_state != XD_READY) {
2000 		mutex_exit(&vdp->xdf_dev_lk);
2001 
2002 		/* only one thread at a time can be the connection thread */
2003 		if (vdp->xdf_connect_thread == NULL)
2004 			vdp->xdf_connect_thread = curthread;
2005 
2006 		if (vdp->xdf_connect_thread == curthread) {
2007 			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
2008 				/*
2009 				 * If we haven't established a connection
2010 				 * within the reset time, then disconnect
2011 				 * so we can try again, and double the reset
2012 				 * time.  The reset time starts at 2 sec.
2013 				 */
2014 				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2015 				reset *= 2;
2016 			}
2017 			if (vdp->xdf_state == XD_UNKNOWN)
2018 				(void) xdf_setstate_init(vdp);
2019 			if (vdp->xdf_state == XD_INIT)
2020 				(void) xdf_setstate_connected(vdp);
2021 		}
2022 
2023 		mutex_enter(&vdp->xdf_dev_lk);
2024 		if (!wait || (vdp->xdf_state == XD_READY))
2025 			goto out;
2026 
2027 		mutex_exit((&vdp->xdf_cb_lk));
2028 		if (vdp->xdf_connect_thread != curthread) {
2029 			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2030 		} else {
2031 			/* delay for 0.1 sec */
2032 			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2033 			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2034 			    TR_CLOCK_TICK);
2035 			if (rv == -1)
2036 				timeouts++;
2037 		}
2038 		mutex_exit((&vdp->xdf_dev_lk));
2039 		mutex_enter((&vdp->xdf_cb_lk));
2040 		mutex_enter((&vdp->xdf_dev_lk));
2041 		if (rv == 0)
2042 			goto out;
2043 	}
2044 
2045 out:
2046 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2047 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2048 
2049 	if (vdp->xdf_connect_thread == curthread) {
2050 		/*
2051 		 * wake up someone else so they can become the connection
2052 		 * thread.
2053 		 */
2054 		cv_signal(&vdp->xdf_dev_cv);
2055 		vdp->xdf_connect_thread = NULL;
2056 	}
2057 
2058 	/* Try to lock the media */
2059 	mutex_exit((&vdp->xdf_dev_lk));
2060 	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2061 	mutex_enter((&vdp->xdf_dev_lk));
2062 
2063 	vdp->xdf_connect_req--;
2064 	return (vdp->xdf_state);
2065 }
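/*
 * Timing sketch for the retry logic in xdf_connect_locked() above: the
 * connection thread polls in 100ms ticks and "reset" starts at 20 ticks,
 * so a forced disconnect/re-init is attempted after roughly 2 seconds of
 * total waiting; since "reset" doubles each time, later retries land at
 * roughly 4s, 8s, 16s, ... of total wait, until the device reaches
 * XD_READY or the waiter is interrupted.
 */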
2066 
2067 static uint_t
2068 xdf_iorestart(caddr_t arg)
2069 {
2070 	xdf_t *vdp = (xdf_t *)arg;
2071 
2072 	ASSERT(vdp != NULL);
2073 
2074 	mutex_enter(&vdp->xdf_dev_lk);
2075 	ASSERT(ISDMACBON(vdp));
2076 	SETDMACBOFF(vdp);
2077 	mutex_exit(&vdp->xdf_dev_lk);
2078 
2079 	xdf_io_start(vdp);
2080 
2081 	return (DDI_INTR_CLAIMED);
2082 }
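/*
 * xdf_iorestart() is the low-level soft interrupt handler registered in
 * xdf_attach() via ddi_add_softintr().  It appears to be triggered once a
 * previously deferred DMA resource callback (the DMACB flag) can be
 * retried: it clears the flag under xdf_dev_lk and kicks xdf_io_start()
 * to resubmit any queued IO.
 */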
2083 
2084 #ifdef XPV_HVM_DRIVER
2085 
2086 typedef struct xdf_hvm_entry {
2087 	list_node_t	xdf_he_list;
2088 	char		*xdf_he_path;
2089 	dev_info_t	*xdf_he_dip;
2090 } xdf_hvm_entry_t;
2091 
2092 static list_t xdf_hvm_list;
2093 static kmutex_t xdf_hvm_list_lock;
2094 
2095 static xdf_hvm_entry_t *
2096 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2097 {
2098 	xdf_hvm_entry_t	*i;
2099 
2100 	ASSERT((path != NULL) || (dip != NULL));
2101 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2102 
2103 	i = list_head(&xdf_hvm_list);
2104 	while (i != NULL) {
2105 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2106 			i = list_next(&xdf_hvm_list, i);
2107 			continue;
2108 		}
2109 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2110 			i = list_next(&xdf_hvm_list, i);
2111 			continue;
2112 		}
2113 		break;
2114 	}
2115 	return (i);
2116 }
2117 
2118 dev_info_t *
2119 xdf_hvm_hold(const char *path)
2120 {
2121 	xdf_hvm_entry_t	*i;
2122 	dev_info_t	*dip;
2123 
2124 	mutex_enter(&xdf_hvm_list_lock);
2125 	i = i_xdf_hvm_find(path, NULL);
2126 	if (i == NULL) {
2127 		mutex_exit(&xdf_hvm_list_lock);
2128 		return (NULL);
2129 	}
2130 	ndi_hold_devi(dip = i->xdf_he_dip);
2131 	mutex_exit(&xdf_hvm_list_lock);
2132 	return (dip);
2133 }
2134 
2135 static void
2136 xdf_hvm_add(dev_info_t *dip)
2137 {
2138 	xdf_hvm_entry_t	*i;
2139 	char		*path;
2140 
2141 	/* figure out the path for the dip */
2142 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2143 	(void) ddi_pathname(dip, path);
2144 
2145 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2146 	i->xdf_he_dip = dip;
2147 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2148 
2149 	mutex_enter(&xdf_hvm_list_lock);
2150 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2151 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2152 	list_insert_head(&xdf_hvm_list, i);
2153 	mutex_exit(&xdf_hvm_list_lock);
2154 
2155 	kmem_free(path, MAXPATHLEN);
2156 }
2157 
2158 static void
2159 xdf_hvm_rm(dev_info_t *dip)
2160 {
2161 	xdf_hvm_entry_t	*i;
2162 
2163 	mutex_enter(&xdf_hvm_list_lock);
2164 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2165 	list_remove(&xdf_hvm_list, i);
2166 	mutex_exit(&xdf_hvm_list_lock);
2167 
2168 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2169 	kmem_free(i, sizeof (*i));
2170 }
2171 
2172 static void
2173 xdf_hvm_init(void)
2174 {
2175 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2176 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2177 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2178 }
2179 
2180 static void
2181 xdf_hvm_fini(void)
2182 {
2183 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2184 	list_destroy(&xdf_hvm_list);
2185 	mutex_destroy(&xdf_hvm_list_lock);
2186 }
2187 
2188 boolean_t
2189 xdf_hvm_connect(dev_info_t *dip)
2190 {
2191 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2192 	char	*oename, *str;
2193 	int	rv;
2194 
2195 	mutex_enter(&vdp->xdf_cb_lk);
2196 
2197 	/*
2198 	 * Before trying to establish a connection, we need to wait for the
2199 	 * backend hotplug scripts to have run.  Once they are run the
2200 	 * "<oename>/hotplug-status" property will be set to "connected".
2201 	 */
2202 	for (;;) {
2203 		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2204 
2205 		/*
2206 		 * Get the xenbus path to the backend device.  Note that
2207 		 * we can't cache this path (and we look it up on each pass
2208 		 * through this loop) because it could change during
2209 		 * suspend, resume, and migration operations.
2210 		 */
2211 		if ((oename = xvdi_get_oename(dip)) == NULL) {
2212 			mutex_exit(&vdp->xdf_cb_lk);
2213 			return (B_FALSE);
2214 		}
2215 
2216 		str = NULL;
2217 		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2218 		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2219 			break;
2220 
2221 		if (str != NULL)
2222 			strfree(str);
2223 
2224 		/* wait for an update to "<oename>/hotplug-status" */
2225 		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2226 			/* we got interrupted by a signal */
2227 			mutex_exit(&vdp->xdf_cb_lk);
2228 			return (B_FALSE);
2229 		}
2230 	}
2231 
2232 	/* Good news.  The backend hotplug scripts have been run. */
2233 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2234 	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2235 	strfree(str);
2236 
2237 	/*
2238 	 * If we're emulating a cd device and if the backend doesn't support
2239 	 * media request operations, then we're not going to bother trying
2240 	 * to establish a connection, for a couple of reasons.  First, media
2241 	 * request support is required for operations like eject and
2242 	 * media locking.  Second, other backend platforms like Linux don't
2243 	 * support hvm pv cdrom access.  They don't even have a backend pv
2244 	 * driver for cdrom device nodes, so we don't want to block forever
2245 	 * waiting for a connection to a backend driver that doesn't exist.
2246 	 */
2247 	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2248 		mutex_exit(&vdp->xdf_cb_lk);
2249 		return (B_FALSE);
2250 	}
2251 
2252 	mutex_enter(&vdp->xdf_dev_lk);
2253 	rv = xdf_connect_locked(vdp, B_TRUE);
2254 	mutex_exit(&vdp->xdf_dev_lk);
2255 	mutex_exit(&vdp->xdf_cb_lk);
2256 
2257 	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2258 }
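/*
 * Usage sketch (hypothetical caller; the path below is illustrative
 * only): an HVM framework driver that wants to hand IO off to the PV
 * path would typically do something like
 *
 *	dev_info_t *dip = xdf_hvm_hold("/xpvd/xdf@51712");
 *	if ((dip != NULL) && xdf_hvm_connect(dip))
 *		... route future IO through the xdf node ...
 *
 * xdf_hvm_hold() takes an ndi hold on the dip, so the caller would
 * presumably drop it with ndi_rele_devi() when it is done with the node.
 */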
2259 
2260 int
2261 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2262 {
2263 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2264 
2265 	/* sanity check the requested physical geometry */
2266 	mutex_enter(&vdp->xdf_dev_lk);
2267 	if ((geomp->g_secsize != XB_BSIZE) ||
2268 	    (geomp->g_capacity == 0)) {
2269 		mutex_exit(&vdp->xdf_dev_lk);
2270 		return (EINVAL);
2271 	}
2272 
2273 	/*
2274 	 * If we've already connected to the backend device then make sure
2275 	 * we're not defining a physical geometry larger than our backend
2276 	 * device.
2277 	 */
2278 	if ((vdp->xdf_xdev_nblocks != 0) &&
2279 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2280 		mutex_exit(&vdp->xdf_dev_lk);
2281 		return (EINVAL);
2282 	}
2283 
2284 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2285 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2286 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2287 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2288 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2289 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2290 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2291 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2292 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2293 
2294 	vdp->xdf_pgeom_fixed = B_TRUE;
2295 	mutex_exit(&vdp->xdf_dev_lk);
2296 
2297 	/* force a re-validation */
2298 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2299 
2300 	return (0);
2301 }
2302 
2303 boolean_t
2304 xdf_is_cd(dev_info_t *dip)
2305 {
2306 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2307 	boolean_t	rv;
2308 
2309 	mutex_enter(&vdp->xdf_cb_lk);
2310 	rv = XD_IS_CD(vdp);
2311 	mutex_exit(&vdp->xdf_cb_lk);
2312 	return (rv);
2313 }
2314 
2315 boolean_t
2316 xdf_is_rm(dev_info_t *dip)
2317 {
2318 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2319 	boolean_t	rv;
2320 
2321 	mutex_enter(&vdp->xdf_cb_lk);
2322 	rv = XD_IS_RM(vdp);
2323 	mutex_exit(&vdp->xdf_cb_lk);
2324 	return (rv);
2325 }
2326 
2327 boolean_t
2328 xdf_media_req_supported(dev_info_t *dip)
2329 {
2330 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2331 	boolean_t	rv;
2332 
2333 	mutex_enter(&vdp->xdf_cb_lk);
2334 	rv = vdp->xdf_media_req_supported;
2335 	mutex_exit(&vdp->xdf_cb_lk);
2336 	return (rv);
2337 }
2338 
2339 #endif /* XPV_HVM_DRIVER */
2340 
2341 static int
2342 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2343 {
2344 	xdf_t *vdp;
2345 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2346 
2347 	if (vdp == NULL)
2348 		return (ENXIO);
2349 
2350 	mutex_enter(&vdp->xdf_dev_lk);
2351 	*capp = vdp->xdf_pgeom.g_capacity;
2352 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2353 	mutex_exit(&vdp->xdf_dev_lk);
2354 	return (0);
2355 }
2356 
2357 static int
2358 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2359 {
2360 	xdf_t *vdp;
2361 
2362 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2363 		return (ENXIO);
2364 	*geomp = vdp->xdf_pgeom;
2365 	return (0);
2366 }
2367 
2368 /*
2369  * No real HBA, no geometry available from it
2370  */
2371 /*ARGSUSED*/
2372 static int
2373 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2374 {
2375 	return (EINVAL);
2376 }
2377 
2378 static int
2379 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2380 {
2381 	xdf_t *vdp;
2382 
2383 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2384 		return (ENXIO);
2385 
2386 	if (XD_IS_RO(vdp))
2387 		tgattributep->media_is_writable = 0;
2388 	else
2389 		tgattributep->media_is_writable = 1;
2390 	tgattributep->media_is_rotational = 0;
2391 	return (0);
2392 }
2393 
2394 /* ARGSUSED3 */
2395 int
2396 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2397 {
2398 	int instance;
2399 	xdf_t   *vdp;
2400 
2401 	instance = ddi_get_instance(dip);
2402 
2403 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2404 		return (ENXIO);
2405 
2406 	switch (cmd) {
2407 	case TG_GETPHYGEOM:
2408 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2409 	case TG_GETVIRTGEOM:
2410 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2411 	case TG_GETCAPACITY:
2412 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2413 	case TG_GETBLOCKSIZE:
2414 		mutex_enter(&vdp->xdf_cb_lk);
2415 		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
2416 		mutex_exit(&vdp->xdf_cb_lk);
2417 		return (0);
2418 	case TG_GETATTR:
2419 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2420 	default:
2421 		return (ENOTTY);
2422 	}
2423 }
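/*
 * xdf_lb_getinfo() is invoked by cmlb through the registered tg_ops
 * vector.  A minimal sketch of such calls (not actual cmlb code):
 *
 *	diskaddr_t	cap;
 *	uint32_t	bsize;
 *
 *	(void) xdf_lb_getinfo(dip, TG_GETCAPACITY, &cap, NULL);
 *	(void) xdf_lb_getinfo(dip, TG_GETBLOCKSIZE, &bsize, NULL);
 *
 * which return the capacity recorded in xdf_pgeom and the backend sector
 * size published during xdf_setstate_connected().
 */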
2424 
2425 /* ARGSUSED5 */
2426 int
2427 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2428     diskaddr_t start, size_t reqlen, void *tg_cookie)
2429 {
2430 	xdf_t *vdp;
2431 	struct buf *bp;
2432 	int err = 0;
2433 
2434 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2435 
2436 	/* We don't allow IO from the oe_change callback thread */
2437 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2438 
2439 	/*
2440 	 * Having a secsize of 0 means that the device isn't connected yet.
2441 	 * FIXME This happens for CD devices, and there's nothing we
2442 	 * can do about it at the moment.
2443 	 */
2444 	if (vdp->xdf_xdev_secsize == 0)
2445 		return (EIO);
2446 
2447 	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2448 	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2449 		return (EINVAL);
2450 
2451 	bp = getrbuf(KM_SLEEP);
2452 	if (cmd == TG_READ)
2453 		bp->b_flags = B_BUSY | B_READ;
2454 	else
2455 		bp->b_flags = B_BUSY | B_WRITE;
2456 
2457 	bp->b_un.b_addr = bufp;
2458 	bp->b_bcount = reqlen;
2459 	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2460 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2461 
2462 	mutex_enter(&vdp->xdf_dev_lk);
2463 	xdf_bp_push(vdp, bp);
2464 	mutex_exit(&vdp->xdf_dev_lk);
2465 	xdf_io_start(vdp);
2466 	if (curthread == vdp->xdf_ready_tq_thread)
2467 		(void) xdf_ring_drain(vdp);
2468 	err = biowait(bp);
2469 	ASSERT(bp->b_flags & B_DONE);
2470 	freerbuf(bp);
2471 	return (err);
2472 }
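/*
 * xdf_lb_rdwr() is the synchronous IO path used for label, devid, and
 * cache-flush operations.  For example, xdf_devid_read() below reads a
 * single NBPSCTR-sized devid sector from block "blk" with:
 *
 *	xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk, NBPSCTR, NULL);
 *
 * The request is wrapped in a temporary buf, pushed onto the IO queue,
 * and biowait()ed on, so callers must be able to block.
 */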
2473 
2474 /*
2475  * Lock the current media.  Set the media state to "lock".
2476  * (Media locks are only respected by the backend driver.)
2477  */
2478 static int
2479 xdf_ioctl_mlock(xdf_t *vdp)
2480 {
2481 	int rv;
2482 	mutex_enter(&vdp->xdf_cb_lk);
2483 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2484 	mutex_exit(&vdp->xdf_cb_lk);
2485 	return (rv);
2486 }
2487 
2488 /*
2489  * Release a media lock.  Set the media state to "none".
2490  */
2491 static int
2492 xdf_ioctl_munlock(xdf_t *vdp)
2493 {
2494 	int rv;
2495 	mutex_enter(&vdp->xdf_cb_lk);
2496 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2497 	mutex_exit(&vdp->xdf_cb_lk);
2498 	return (rv);
2499 }
2500 
2501 /*
2502  * Eject the current media.  Ignores any media locks.  (Media locks
2503  * are only for the benefit of the backend.)
2504  */
2505 static int
2506 xdf_ioctl_eject(xdf_t *vdp)
2507 {
2508 	int rv;
2509 
2510 	mutex_enter(&vdp->xdf_cb_lk);
2511 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2512 		mutex_exit(&vdp->xdf_cb_lk);
2513 		return (rv);
2514 	}
2515 
2516 	/*
2517 	 * We've set the media request xenbus parameter to eject, so now
2518 	 * disconnect from the backend, wait for the backend to clear
2519 	 * the media request xenbus parameter, and then we can reconnect
2520 	 * to the backend.
2521 	 */
2522 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2523 	mutex_enter(&vdp->xdf_dev_lk);
2524 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2525 		mutex_exit(&vdp->xdf_dev_lk);
2526 		mutex_exit(&vdp->xdf_cb_lk);
2527 		return (EIO);
2528 	}
2529 	mutex_exit(&vdp->xdf_dev_lk);
2530 	mutex_exit(&vdp->xdf_cb_lk);
2531 	return (0);
2532 }
2533 
2534 /*
2535  * Watch for media state changes.  This can be an insertion of a device
2536  * (triggered by a 'xm block-configure' request in another domain) or
2537  * the ejection of a device (triggered by a local "eject" operation).
2538  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2539  */
2540 static int
2541 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2542 {
2543 	enum dkio_state		prev_state;
2544 
2545 	mutex_enter(&vdp->xdf_cb_lk);
2546 	prev_state = vdp->xdf_mstate;
2547 
2548 	if (vdp->xdf_mstate == mstate) {
2549 		while (vdp->xdf_mstate == prev_state) {
2550 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2551 			    &vdp->xdf_cb_lk) == 0) {
2552 				mutex_exit(&vdp->xdf_cb_lk);
2553 				return (EINTR);
2554 			}
2555 		}
2556 	}
2557 
2558 	if ((prev_state != DKIO_INSERTED) &&
2559 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2560 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2561 		mutex_exit(&vdp->xdf_cb_lk);
2562 		return (0);
2563 	}
2564 
2565 	mutex_exit(&vdp->xdf_cb_lk);
2566 	return (0);
2567 }
2568 
2569 /*ARGSUSED*/
2570 static int
2571 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2572     int *rvalp)
2573 {
2574 	minor_t		minor = getminor(dev);
2575 	int		part = XDF_PART(minor);
2576 	xdf_t		*vdp;
2577 	int		rv;
2578 
2579 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2580 	    (!xdf_isopen(vdp, part)))
2581 		return (ENXIO);
2582 
2583 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2584 	    vdp->xdf_addr, cmd, cmd));
2585 
2586 	switch (cmd) {
2587 	default:
2588 		return (ENOTTY);
2589 	case DKIOCG_PHYGEOM:
2590 	case DKIOCG_VIRTGEOM:
2591 	case DKIOCGGEOM:
2592 	case DKIOCSGEOM:
2593 	case DKIOCGAPART:
2594 	case DKIOCSAPART:
2595 	case DKIOCGVTOC:
2596 	case DKIOCSVTOC:
2597 	case DKIOCPARTINFO:
2598 	case DKIOCGEXTVTOC:
2599 	case DKIOCSEXTVTOC:
2600 	case DKIOCEXTPARTINFO:
2601 	case DKIOCGMBOOT:
2602 	case DKIOCSMBOOT:
2603 	case DKIOCGETEFI:
2604 	case DKIOCSETEFI:
2605 	case DKIOCSETEXTPART:
2606 	case DKIOCPARTITION:
2607 		rv = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2608 		    rvalp, NULL);
2609 		if (rv != 0)
2610 			return (rv);
2611 		/*
2612 		 * If we're labelling the disk, we have to update the geometry
2613 		 * in the cmlb data structures, and we also have to write a new
2614 		 * devid to the disk.  Note that writing an EFI label currently
2615 		 * requires 4 ioctls, and devid setup will fail on all but the
2616 		 * last.
2617 		 */
2618 		if (cmd == DKIOCSEXTVTOC || cmd == DKIOCSVTOC ||
2619 		    cmd == DKIOCSETEFI) {
2620 			rv = cmlb_validate(vdp->xdf_vd_lbl, 0, 0);
2621 			if (rv == 0) {
2622 				xdf_devid_setup(vdp);
2623 			} else {
2624 				cmn_err(CE_WARN,
2625 				    "xdf@%s, labeling failed on validate",
2626 				    vdp->xdf_addr);
2627 			}
2628 		}
2629 		return (rv);
2630 	case FDEJECT:
2631 	case DKIOCEJECT:
2632 	case CDROMEJECT:
2633 		return (xdf_ioctl_eject(vdp));
2634 	case DKIOCLOCK:
2635 		return (xdf_ioctl_mlock(vdp));
2636 	case DKIOCUNLOCK:
2637 		return (xdf_ioctl_munlock(vdp));
2638 	case CDROMREADOFFSET: {
2639 		int offset = 0;
2640 		if (!XD_IS_CD(vdp))
2641 			return (ENOTTY);
2642 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2643 			return (EFAULT);
2644 		return (0);
2645 	}
2646 	case DKIOCGMEDIAINFO: {
2647 		struct dk_minfo media_info;
2648 
2649 		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2650 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2651 		if (XD_IS_CD(vdp))
2652 			media_info.dki_media_type = DK_CDROM;
2653 		else
2654 			media_info.dki_media_type = DK_FIXED_DISK;
2655 
2656 		if (ddi_copyout(&media_info, (void *)arg,
2657 		    sizeof (struct dk_minfo), mode))
2658 			return (EFAULT);
2659 		return (0);
2660 	}
2661 	case DKIOCINFO: {
2662 		struct dk_cinfo info;
2663 
2664 		/* controller information */
2665 		if (XD_IS_CD(vdp))
2666 			info.dki_ctype = DKC_CDROM;
2667 		else
2668 			info.dki_ctype = DKC_VBD;
2669 
2670 		info.dki_cnum = 0;
2671 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2672 
2673 		/* unit information */
2674 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2675 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2676 		info.dki_flags = DKI_FMTVOL;
2677 		info.dki_partition = part;
2678 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2679 		info.dki_addr = 0;
2680 		info.dki_space = 0;
2681 		info.dki_prio = 0;
2682 		info.dki_vec = 0;
2683 
2684 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2685 			return (EFAULT);
2686 		return (0);
2687 	}
2688 	case DKIOCSTATE: {
2689 		enum dkio_state mstate;
2690 
2691 		if (ddi_copyin((void *)arg, &mstate,
2692 		    sizeof (mstate), mode) != 0)
2693 			return (EFAULT);
2694 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2695 			return (rv);
2696 		mstate = vdp->xdf_mstate;
2697 		if (ddi_copyout(&mstate, (void *)arg,
2698 		    sizeof (mstate), mode) != 0)
2699 			return (EFAULT);
2700 		return (0);
2701 	}
2702 	case DKIOCREMOVABLE: {
2703 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2704 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2705 			return (EFAULT);
2706 		return (0);
2707 	}
2708 	case DKIOCGETWCE: {
2709 		int i = BOOLEAN2VOID(vdp->xdf_wce);
2710 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2711 			return (EFAULT);
2712 		return (0);
2713 	}
2714 	case DKIOCSETWCE: {
2715 		int i;
2716 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2717 			return (EFAULT);
2718 		vdp->xdf_wce = VOID2BOOLEAN(i);
2719 		return (0);
2720 	}
2721 	case DKIOCFLUSHWRITECACHE: {
2722 		struct dk_callback *dkc = (struct dk_callback *)arg;
2723 
2724 		if (vdp->xdf_flush_supported) {
2725 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2726 			    NULL, 0, 0, (void *)dev);
2727 		} else if (vdp->xdf_feature_barrier &&
2728 		    !xdf_barrier_flush_disable) {
2729 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2730 			    vdp->xdf_cache_flush_block, xdf_flush_block,
2731 			    vdp->xdf_xdev_secsize, (void *)dev);
2732 		} else {
2733 			return (ENOTTY);
2734 		}
2735 		if ((mode & FKIOCTL) && (dkc != NULL) &&
2736 		    (dkc->dkc_callback != NULL)) {
2737 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2738 			/* need to return 0 after calling callback */
2739 			rv = 0;
2740 		}
2741 		return (rv);
2742 	}
2743 	}
2744 	/*NOTREACHED*/
2745 }
2746 
2747 static int
2748 xdf_strategy(struct buf *bp)
2749 {
2750 	xdf_t	*vdp;
2751 	minor_t minor;
2752 	diskaddr_t p_blkct, p_blkst;
2753 	daddr_t blkno;
2754 	ulong_t nblks;
2755 	int part;
2756 
2757 	minor = getminor(bp->b_edev);
2758 	part = XDF_PART(minor);
2759 	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2760 
2761 	mutex_enter(&vdp->xdf_dev_lk);
2762 	if (!xdf_isopen(vdp, part)) {
2763 		mutex_exit(&vdp->xdf_dev_lk);
2764 		xdf_io_err(bp, ENXIO, 0);
2765 		return (0);
2766 	}
2767 
2768 	/* We don't allow IO from the oe_change callback thread */
2769 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2770 
2771 	/* Check for writes to a read only device */
2772 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2773 		mutex_exit(&vdp->xdf_dev_lk);
2774 		xdf_io_err(bp, EROFS, 0);
2775 		return (0);
2776 	}
2777 
2778 	/* Check if this I/O is accessing a partition or the entire disk */
2779 	if ((long)bp->b_private == XB_SLICE_NONE) {
2780 		/* This I/O is using an absolute offset */
2781 		p_blkct = vdp->xdf_xdev_nblocks;
2782 		p_blkst = 0;
2783 	} else {
2784 		/* This I/O is using a partition relative offset */
2785 		mutex_exit(&vdp->xdf_dev_lk);
2786 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2787 		    &p_blkst, NULL, NULL, NULL)) {
2788 			xdf_io_err(bp, ENXIO, 0);
2789 			return (0);
2790 		}
2791 		mutex_enter(&vdp->xdf_dev_lk);
2792 	}
2793 
2794 	/*
2795 	 * Adjust the real blkno and bcount according to the underlying
2796 	 * physical sector size.
2797 	 */
2798 	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2799 
2800 	/* check for a starting block beyond the disk or partition limit */
2801 	if (blkno > p_blkct) {
2802 		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2803 		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2804 		mutex_exit(&vdp->xdf_dev_lk);
2805 		xdf_io_err(bp, EINVAL, 0);
2806 		return (0);
2807 	}
2808 
2809 	/* Legacy: don't set the error flag in this case */
2810 	if (blkno == p_blkct) {
2811 		mutex_exit(&vdp->xdf_dev_lk);
2812 		bp->b_resid = bp->b_bcount;
2813 		biodone(bp);
2814 		return (0);
2815 	}
2816 
2817 	/* sanitize the input buf */
2818 	bioerror(bp, 0);
2819 	bp->b_resid = 0;
2820 	bp->av_back = bp->av_forw = NULL;
2821 
2822 	/* Adjust for a partial transfer; this will result in an error later */
2823 	if (vdp->xdf_xdev_secsize != 0 &&
2824 	    vdp->xdf_xdev_secsize != XB_BSIZE) {
2825 		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2826 	} else {
2827 		nblks = bp->b_bcount >> XB_BSHIFT;
2828 	}
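	/*
	 * Example of the adjustment above (assuming XB_BSIZE is the usual
	 * 512-byte unit): with a 4096-byte backend sector, a request with
	 * b_blkno == 8 and b_bcount == 16384 maps to backend block
	 * 8 / (4096 / 512) == 1 and nblks == 16384 / 4096 == 4.
	 */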
2829 
2830 	if ((blkno + nblks) > p_blkct) {
2831 		if (vdp->xdf_xdev_secsize != 0 &&
2832 		    vdp->xdf_xdev_secsize != XB_BSIZE) {
2833 			bp->b_resid =
2834 			    ((blkno + nblks) - p_blkct) *
2835 			    vdp->xdf_xdev_secsize;
2836 		} else {
2837 			bp->b_resid =
2838 			    ((blkno + nblks) - p_blkct) <<
2839 			    XB_BSHIFT;
2840 		}
2841 		bp->b_bcount -= bp->b_resid;
2842 	}
2843 
2844 	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2845 	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2846 
2847 	/* Fix up the buf struct */
2848 	bp->b_flags |= B_BUSY;
2849 	bp->b_private = (void *)(uintptr_t)p_blkst;
2850 
2851 	xdf_bp_push(vdp, bp);
2852 	mutex_exit(&vdp->xdf_dev_lk);
2853 	xdf_io_start(vdp);
2854 	if (do_polled_io)
2855 		(void) xdf_ring_drain(vdp);
2856 	return (0);
2857 }
2858 
2859 /*ARGSUSED*/
2860 static int
2861 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2862 {
2863 	xdf_t	*vdp;
2864 	minor_t minor;
2865 	diskaddr_t p_blkcnt;
2866 	int part;
2867 
2868 	minor = getminor(dev);
2869 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2870 		return (ENXIO);
2871 
2872 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2873 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2874 
2875 	part = XDF_PART(minor);
2876 	if (!xdf_isopen(vdp, part))
2877 		return (ENXIO);
2878 
2879 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2880 	    NULL, NULL, NULL, NULL))
2881 		return (ENXIO);
2882 
2883 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2884 		return (ENOSPC);
2885 
2886 	if (U_INVAL(uiop))
2887 		return (EINVAL);
2888 
2889 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2890 }
2891 
2892 /*ARGSUSED*/
2893 static int
2894 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2895 {
2896 	xdf_t *vdp;
2897 	minor_t minor;
2898 	diskaddr_t p_blkcnt;
2899 	int part;
2900 
2901 	minor = getminor(dev);
2902 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2903 		return (ENXIO);
2904 
2905 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2906 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2907 
2908 	part = XDF_PART(minor);
2909 	if (!xdf_isopen(vdp, part))
2910 		return (ENXIO);
2911 
2912 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2913 	    NULL, NULL, NULL, NULL))
2914 		return (ENXIO);
2915 
2916 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2917 		return (ENOSPC);
2918 
2919 	if (U_INVAL(uiop))
2920 		return (EINVAL);
2921 
2922 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2923 }
2924 
2925 /*ARGSUSED*/
2926 static int
2927 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2928 {
2929 	xdf_t	*vdp;
2930 	minor_t minor;
2931 	struct uio *uiop = aiop->aio_uio;
2932 	diskaddr_t p_blkcnt;
2933 	int part;
2934 
2935 	minor = getminor(dev);
2936 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2937 		return (ENXIO);
2938 
2939 	part = XDF_PART(minor);
2940 	if (!xdf_isopen(vdp, part))
2941 		return (ENXIO);
2942 
2943 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2944 	    NULL, NULL, NULL, NULL))
2945 		return (ENXIO);
2946 
2947 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2948 		return (ENOSPC);
2949 
2950 	if (U_INVAL(uiop))
2951 		return (EINVAL);
2952 
2953 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2954 }
2955 
2956 /*ARGSUSED*/
2957 static int
2958 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2959 {
2960 	xdf_t *vdp;
2961 	minor_t minor;
2962 	struct uio *uiop = aiop->aio_uio;
2963 	diskaddr_t p_blkcnt;
2964 	int part;
2965 
2966 	minor = getminor(dev);
2967 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2968 		return (ENXIO);
2969 
2970 	part = XDF_PART(minor);
2971 	if (!xdf_isopen(vdp, part))
2972 		return (ENXIO);
2973 
2974 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2975 	    NULL, NULL, NULL, NULL))
2976 		return (ENXIO);
2977 
2978 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2979 		return (ENOSPC);
2980 
2981 	if (U_INVAL(uiop))
2982 		return (EINVAL);
2983 
2984 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2985 }
2986 
2987 static int
2988 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2989 {
2990 	struct buf dumpbuf, *dbp = &dumpbuf;
2991 	xdf_t	*vdp;
2992 	minor_t minor;
2993 	int err = 0;
2994 	int part;
2995 	diskaddr_t p_blkcnt, p_blkst;
2996 
2997 	minor = getminor(dev);
2998 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2999 		return (ENXIO);
3000 
3001 	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
3002 	    vdp->xdf_addr, (void *)addr, blkno, nblk));
3003 
3004 	/* We don't allow IO from the oe_change callback thread */
3005 	ASSERT(curthread != vdp->xdf_oe_change_thread);
3006 
3007 	part = XDF_PART(minor);
3008 	if (!xdf_isopen(vdp, part))
3009 		return (ENXIO);
3010 
3011 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
3012 	    NULL, NULL, NULL))
3013 		return (ENXIO);
3014 
3015 	if ((blkno + nblk) >
3016 	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
3017 		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
3018 		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
3019 		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
3020 		return (EINVAL);
3021 	}
3022 
3023 	bioinit(dbp);
3024 	dbp->b_flags = B_BUSY;
3025 	dbp->b_un.b_addr = addr;
3026 	dbp->b_bcount = nblk << DEV_BSHIFT;
3027 	dbp->b_blkno = blkno;
3028 	dbp->b_edev = dev;
3029 	dbp->b_private = (void *)(uintptr_t)p_blkst;
3030 
3031 	mutex_enter(&vdp->xdf_dev_lk);
3032 	xdf_bp_push(vdp, dbp);
3033 	mutex_exit(&vdp->xdf_dev_lk);
3034 	xdf_io_start(vdp);
3035 	err = xdf_ring_drain(vdp);
3036 	biofini(dbp);
3037 	return (err);
3038 }
3039 
3040 /*ARGSUSED*/
3041 static int
3042 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3043 {
3044 	minor_t	minor;
3045 	xdf_t	*vdp;
3046 	int part;
3047 	ulong_t parbit;
3048 
3049 	minor = getminor(dev);
3050 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3051 		return (ENXIO);
3052 
3053 	mutex_enter(&vdp->xdf_dev_lk);
3054 	part = XDF_PART(minor);
3055 	if (!xdf_isopen(vdp, part)) {
3056 		mutex_exit(&vdp->xdf_dev_lk);
3057 		return (ENXIO);
3058 	}
3059 	parbit = 1 << part;
3060 
3061 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3062 	if (otyp == OTYP_LYR) {
3063 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3064 		if (--vdp->xdf_vd_lyropen[part] == 0)
3065 			vdp->xdf_vd_open[otyp] &= ~parbit;
3066 	} else {
3067 		vdp->xdf_vd_open[otyp] &= ~parbit;
3068 	}
3069 	vdp->xdf_vd_exclopen &= ~parbit;
3070 
3071 	mutex_exit(&vdp->xdf_dev_lk);
3072 	return (0);
3073 }
3074 
3075 static int
3076 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3077 {
3078 	minor_t	minor;
3079 	xdf_t	*vdp;
3080 	int part;
3081 	ulong_t parbit;
3082 	diskaddr_t p_blkct = 0;
3083 	boolean_t firstopen;
3084 	boolean_t nodelay;
3085 
3086 	minor = getminor(*devp);
3087 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3088 		return (ENXIO);
3089 
3090 	nodelay = (flag & (FNDELAY | FNONBLOCK));
3091 
3092 	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3093 
3094 	/* do cv_wait until connected or failed */
3095 	mutex_enter(&vdp->xdf_cb_lk);
3096 	mutex_enter(&vdp->xdf_dev_lk);
3097 	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3098 		mutex_exit(&vdp->xdf_dev_lk);
3099 		mutex_exit(&vdp->xdf_cb_lk);
3100 		return (ENXIO);
3101 	}
3102 	mutex_exit(&vdp->xdf_cb_lk);
3103 
3104 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3105 		mutex_exit(&vdp->xdf_dev_lk);
3106 		return (EROFS);
3107 	}
3108 
3109 	part = XDF_PART(minor);
3110 	parbit = 1 << part;
3111 	if ((vdp->xdf_vd_exclopen & parbit) ||
3112 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3113 		mutex_exit(&vdp->xdf_dev_lk);
3114 		return (EBUSY);
3115 	}
3116 
3117 	/* are we the first one to open this node? */
3118 	firstopen = !xdf_isopen(vdp, -1);
3119 
3120 	if (otyp == OTYP_LYR)
3121 		vdp->xdf_vd_lyropen[part]++;
3122 
3123 	vdp->xdf_vd_open[otyp] |= parbit;
3124 
3125 	if (flag & FEXCL)
3126 		vdp->xdf_vd_exclopen |= parbit;
3127 
3128 	mutex_exit(&vdp->xdf_dev_lk);
3129 
3130 	/* force a re-validation */
3131 	if (firstopen)
3132 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3133 
3134 	/* If this is a non-blocking open then we're done */
3135 	if (nodelay)
3136 		return (0);
3137 
3138 	/*
3139 	 * This is a blocking open, so we require:
3140 	 * - that the disk have a valid label on it
3141 	 * - that the size of the partition that we're opening is non-zero
3142 	 */
3143 	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3144 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3145 		(void) xdf_close(*devp, flag, otyp, credp);
3146 		return (ENXIO);
3147 	}
3148 
3149 	return (0);
3150 }
3151 
3152 /*ARGSUSED*/
3153 static void
3154 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3155 {
3156 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3157 	cv_broadcast(&vdp->xdf_hp_status_cv);
3158 }
3159 
3160 static int
3161 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3162     char *name, caddr_t valuep, int *lengthp)
3163 {
3164 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3165 
3166 	/*
3167 	 * Sanity check that the dev_t or dip specified (if any) corresponds
3168 	 * to this device driver.  On debug kernels we'll
3169 	 * panic and on non-debug kernels we'll return failure.
3170 	 */
3171 	ASSERT(ddi_driver_major(dip) == xdf_major);
3172 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3173 	if ((ddi_driver_major(dip) != xdf_major) ||
3174 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3175 		return (DDI_PROP_NOT_FOUND);
3176 
3177 	if (vdp == NULL)
3178 		return (ddi_prop_op(dev, dip, prop_op, flags,
3179 		    name, valuep, lengthp));
3180 
3181 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3182 	    dev, dip, prop_op, flags, name, valuep, lengthp,
3183 	    XDF_PART(getminor(dev)), NULL));
3184 }
3185 
3186 /*ARGSUSED*/
3187 static int
3188 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3189 {
3190 	int	instance = XDF_INST(getminor((dev_t)arg));
3191 	xdf_t	*vbdp;
3192 
3193 	switch (cmd) {
3194 	case DDI_INFO_DEVT2DEVINFO:
3195 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3196 			*rp = NULL;
3197 			return (DDI_FAILURE);
3198 		}
3199 		*rp = vbdp->xdf_dip;
3200 		return (DDI_SUCCESS);
3201 
3202 	case DDI_INFO_DEVT2INSTANCE:
3203 		*rp = (void *)(uintptr_t)instance;
3204 		return (DDI_SUCCESS);
3205 
3206 	default:
3207 		return (DDI_FAILURE);
3208 	}
3209 }
3210 
3211 /*ARGSUSED*/
3212 static int
3213 xdf_resume(dev_info_t *dip)
3214 {
3215 	xdf_t	*vdp;
3216 	char	*oename;
3217 
3218 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3219 		goto err;
3220 
3221 	if (xdf_debug & SUSRES_DBG)
3222 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3223 
3224 	mutex_enter(&vdp->xdf_cb_lk);
3225 
3226 	if (xvdi_resume(dip) != DDI_SUCCESS) {
3227 		mutex_exit(&vdp->xdf_cb_lk);
3228 		goto err;
3229 	}
3230 
3231 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3232 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3233 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3234 		mutex_exit(&vdp->xdf_cb_lk);
3235 		goto err;
3236 	}
3237 
3238 	mutex_enter(&vdp->xdf_dev_lk);
3239 	ASSERT(vdp->xdf_state != XD_READY);
3240 	xdf_set_state(vdp, XD_UNKNOWN);
3241 	mutex_exit(&vdp->xdf_dev_lk);
3242 
3243 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3244 		mutex_exit(&vdp->xdf_cb_lk);
3245 		goto err;
3246 	}
3247 
3248 	mutex_exit(&vdp->xdf_cb_lk);
3249 
3250 	if (xdf_debug & SUSRES_DBG)
3251 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3252 	return (DDI_SUCCESS);
3253 err:
3254 	if (xdf_debug & SUSRES_DBG)
3255 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3256 	return (DDI_FAILURE);
3257 }
3258 
3259 /*
3260  * Uses the in-memory devid if one exists.
3261  *
3262  * Create a devid and write it on the first block of the last track of
3263  * the last cylinder.
3264  * Return DDI_SUCCESS or DDI_FAILURE.
3265  */
3266 static int
3267 xdf_devid_fabricate(xdf_t *vdp)
3268 {
3269 	ddi_devid_t	devid = vdp->xdf_tgt_devid; /* null if no devid */
3270 	struct dk_devid *dkdevidp = NULL; /* devid struct stored on disk */
3271 	diskaddr_t	blk;
3272 	uint_t		*ip, chksum;
3273 	int		i, devid_size;
3274 
3275 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3276 		goto err;
3277 
3278 	if (devid == NULL && ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0,
3279 	    NULL, &devid) != DDI_SUCCESS)
3280 		goto err;
3281 
3282 	/* allocate a buffer */
3283 	dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
3284 
3285 	/* Fill in the revision */
3286 	dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
3287 	dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
3288 
3289 	/* Copy in the device id */
3290 	devid_size = ddi_devid_sizeof(devid);
3291 	if (devid_size > DK_DEVID_SIZE)
3292 		goto err;
3293 	bcopy(devid, dkdevidp->dkd_devid, devid_size);
3294 
3295 	/* Calculate the chksum */
3296 	chksum = 0;
3297 	ip = (uint_t *)dkdevidp;
3298 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3299 		chksum ^= ip[i];
3300 
3301 	/* Fill in the checksum */
3302 	DKD_FORMCHKSUM(chksum, dkdevidp);
3303 
3304 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, dkdevidp, blk,
3305 	    NBPSCTR, NULL) != 0)
3306 		goto err;
3307 
3308 	kmem_free(dkdevidp, NBPSCTR);
3309 
3310 	vdp->xdf_tgt_devid = devid;
3311 	return (DDI_SUCCESS);
3312 
3313 err:
3314 	if (dkdevidp != NULL)
3315 		kmem_free(dkdevidp, NBPSCTR);
3316 	if (devid != NULL && vdp->xdf_tgt_devid == NULL)
3317 		ddi_devid_free(devid);
3318 	return (DDI_FAILURE);
3319 }
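/*
 * Note on the checksum used by xdf_devid_fabricate() above and
 * xdf_devid_read() below: it is simply the XOR of every 32-bit word in
 * the devid sector except the final one; DKD_FORMCHKSUM() stores the
 * result and DKD_GETCHKSUM() retrieves it for validation.
 */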
3320 
3321 /*
3322  * xdf_devid_read() is a local copy of xdfs_devid_read(), modified to use xdf
3323  * functions.
3324  *
3325  * Read a devid from the first block of the last track of
3326  * the last cylinder.  Make sure what we read is a valid devid.
3327  * Return DDI_SUCCESS or DDI_FAILURE.
3328  */
3329 static int
3330 xdf_devid_read(xdf_t *vdp)
3331 {
3332 	diskaddr_t	blk;
3333 	struct dk_devid *dkdevidp;
3334 	uint_t		*ip, chksum;
3335 	int		i;
3336 
3337 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3338 		return (DDI_FAILURE);
3339 
3340 	dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
3341 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk,
3342 	    NBPSCTR, NULL) != 0)
3343 		goto err;
3344 
3345 	/* Validate the revision */
3346 	if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
3347 	    (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
3348 		goto err;
3349 
3350 	/* Calculate the checksum */
3351 	chksum = 0;
3352 	ip = (uint_t *)dkdevidp;
3353 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3354 		chksum ^= ip[i];
3355 	if (DKD_GETCHKSUM(dkdevidp) != chksum)
3356 		goto err;
3357 
3358 	/* Validate the device id */
3359 	if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
3360 		goto err;
3361 
3362 	/* keep a copy of the device id */
3363 	i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
3364 	vdp->xdf_tgt_devid = kmem_alloc(i, KM_SLEEP);
3365 	bcopy(dkdevidp->dkd_devid, vdp->xdf_tgt_devid, i);
3366 	kmem_free(dkdevidp, NBPSCTR);
3367 	return (DDI_SUCCESS);
3368 
3369 err:
3370 	kmem_free(dkdevidp, NBPSCTR);
3371 	return (DDI_FAILURE);
3372 }
3373 
3374 /*
3375  * xdf_devid_setup() is a modified copy of cmdk_devid_setup().
3376  *
3377  * This function creates a devid if we don't already have one, and
3378  * registers it.  If we already have one, we make sure that it can be
3379  * read from the disk, otherwise we write it to the disk ourselves.  If
3380  * we didn't already have a devid, and we create one, we also need to
3381  * register it.
3382  */
3383 void
3384 xdf_devid_setup(xdf_t *vdp)
3385 {
3386 	int rc;
3387 	boolean_t existed = vdp->xdf_tgt_devid != NULL;
3388 
3389 	/* Read devid from the disk, if present */
3390 	rc = xdf_devid_read(vdp);
3391 
3392 	/* Otherwise write a devid (which we create if necessary) on the disk */
3393 	if (rc != DDI_SUCCESS)
3394 		rc = xdf_devid_fabricate(vdp);
3395 
3396 	/* If we created a devid or found it on the disk, register it */
3397 	if (rc == DDI_SUCCESS && !existed)
3398 		(void) ddi_devid_register(vdp->xdf_dip, vdp->xdf_tgt_devid);
3399 }
3400 
3401 static int
3402 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3403 {
3404 	int			n, instance = ddi_get_instance(dip);
3405 	ddi_iblock_cookie_t	ibc, softibc;
3406 	boolean_t		dev_iscd = B_FALSE;
3407 	xdf_t			*vdp;
3408 	char			*oename, *xsname, *str;
3409 	clock_t			timeout;
3410 	int			err = 0;
3411 
3412 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3413 	    "xdf_debug", 0)) != 0)
3414 		xdf_debug = n;
3415 
3416 	switch (cmd) {
3417 	case DDI_RESUME:
3418 		return (xdf_resume(dip));
3419 	case DDI_ATTACH:
3420 		break;
3421 	default:
3422 		return (DDI_FAILURE);
3423 	}
3424 	/* DDI_ATTACH */
3425 
3426 	if ((xsname = xvdi_get_xsname(dip)) == NULL ||
3427 	    (oename = xvdi_get_oename(dip)) == NULL)
3428 		return (DDI_FAILURE);
3429 
3430 	/*
3431 	 * Disable auto-detach.  This is necessary so that we don't get
3432 	 * detached while we're disconnected from the back end.
3433 	 */
3434 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3435 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3436 		return (DDI_FAILURE);
3437 
3438 	/* driver handles kernel-issued IOCTLs */
3439 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3440 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3441 		return (DDI_FAILURE);
3442 
3443 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3444 		return (DDI_FAILURE);
3445 
3446 	if (ddi_get_soft_iblock_cookie(dip,
3447 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3448 		return (DDI_FAILURE);
3449 
3450 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3451 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3452 		    ddi_get_name_addr(dip));
3453 		return (DDI_FAILURE);
3454 	}
3455 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3456 		dev_iscd = B_TRUE;
3457 	strfree(str);
3458 
3459 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3460 		return (DDI_FAILURE);
3461 
3462 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3463 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3464 	ddi_set_driver_private(dip, vdp);
3465 	vdp->xdf_dip = dip;
3466 	vdp->xdf_addr = ddi_get_name_addr(dip);
3467 	vdp->xdf_suspending = B_FALSE;
3468 	vdp->xdf_media_req_supported = B_FALSE;
3469 	vdp->xdf_peer = INVALID_DOMID;
3470 	vdp->xdf_evtchn = INVALID_EVTCHN;
3471 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3472 	    offsetof(v_req_t, v_link));
3473 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3474 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3475 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3476 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3477 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3478 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3479 	vdp->xdf_cmbl_reattach = B_TRUE;
3480 	if (dev_iscd) {
3481 		vdp->xdf_dinfo |= VDISK_CDROM;
3482 		vdp->xdf_mstate = DKIO_EJECTED;
3483 	} else {
3484 		vdp->xdf_mstate = DKIO_NONE;
3485 	}
3486 
3487 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3488 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3489 		goto errout0;
3490 
3491 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3492 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3493 		goto errout0;
3494 
3495 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3496 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3497 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3498 		    ddi_get_name_addr(dip));
3499 		goto errout0;
3500 	}
3501 
3502 	/*
3503 	 * Initialize the physical geometry structure.  Note that currently
3504 	 * we don't know the size of the backend device so the number
3505 	 * of blocks on the device will be initialized to zero.  Once
3506 	 * we connect to the backend device we'll update the physical
3507 	 * geometry to reflect the real size of the device.
3508 	 */
3509 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3510 	vdp->xdf_pgeom_fixed = B_FALSE;
3511 
3512 	/*
3513 	 * Create default device minor nodes: non-removable disk.
3514 	 * We will adjust minor nodes after we are connected w/ backend.
3515 	 *
3516 	 * FIXME creating device minor nodes is currently disabled for CD
3517 	 * devices, re-enable once the issues with xdf CD devices are fixed.
3518 	 */
3519 	if (!dev_iscd) {
3520 		cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3521 		if (xdf_cmlb_attach(vdp) != 0) {
3522 			cmn_err(CE_WARN,
3523 			    "xdf@%s: attach failed, cmlb attach failed",
3524 			    ddi_get_name_addr(dip));
3525 			goto errout0;
3526 		}
3527 	}
3528 
3529 	/* We ship with cache-enabled disks */
3530 	vdp->xdf_wce = B_TRUE;
3531 
3532 	mutex_enter(&vdp->xdf_cb_lk);
3533 	/* Watch backend XenbusState change */
3534 	if (xvdi_add_event_handler(dip,
3535 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3536 		mutex_exit(&vdp->xdf_cb_lk);
3537 		goto errout0;
3538 	}
3539 
3540 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3541 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3542 		    ddi_get_name_addr(dip));
3543 		mutex_exit(&vdp->xdf_cb_lk);
3544 		goto errout1;
3545 	}
3546 
3547 	/* Nothing else to do for CD devices */
3548 	if (dev_iscd) {
3549 		mutex_exit(&vdp->xdf_cb_lk);
3550 		goto done;
3551 	}
3552 
3553 	/*
3554 	 * In order to do cmlb_validate, we have to wait for the disk to
3555 	 * acknowledge the attach, so we can query the backend for the disk
3556 	 * geometry (see xdf_setstate_connected).
3557 	 *
3558 	 * We only wait 30 seconds; if this is the root disk, the boot
3559 	 * will fail, but it would fail anyway if the device never
3560 	 * connected.  If this is a non-boot disk, that disk will fail
3561 	 * to connect, but again, it would fail anyway.
3562 	 */
3563 	timeout = ddi_get_lbolt() + drv_usectohz(XDF_STATE_TIMEOUT);
3564 	while (vdp->xdf_state != XD_CONNECTED && vdp->xdf_state != XD_READY) {
3565 		if (cv_timedwait(&vdp->xdf_dev_cv, &vdp->xdf_cb_lk,
3566 		    timeout) < 0) {
3567 			cmn_err(CE_WARN, "xdf@%s: disk failed to connect",
3568 			    ddi_get_name_addr(dip));
3569 			mutex_exit(&vdp->xdf_cb_lk);
3570 			goto errout1;
3571 		}
3572 	}
3573 	mutex_exit(&vdp->xdf_cb_lk);
3574 
3575 	/*
3576 	 * We call cmlb_validate so that the geometry information in
3577 	 * vdp->xdf_vd_lbl is correct; this fills out the number of
3578 	 * alternate cylinders so that we have a place to write the
3579 	 * devid.
3580 	 */
3581 	if ((err = cmlb_validate(vdp->xdf_vd_lbl, 0, NULL)) != 0) {
3582 		cmn_err(CE_NOTE,
3583 		    "xdf@%s: cmlb_validate failed: %d",
3584 		    ddi_get_name_addr(dip), err);
3585 		/*
3586 		 * We can carry on even if cmlb_validate() returns EINVAL here,
3587 		 * as we'll rewrite the disk label anyway.
3588 		 */
3589 		if (err != EINVAL)
3590 			goto errout1;
3591 	}
3592 
3593 	/*
3594 	 * xdf_devid_setup will only write a devid if one isn't
3595 	 * already present.  If it fails to find or create one, we
3596 	 * create one in-memory so that when we label the disk later,
3597 	 * it will have a devid to use.  This is helpful to deal with
3598 	 * cases where people use the devids of their disks before
3599 	 * labelling them; note that this does cause problems if
3600 	 * people rely on the devids of unlabelled disks to persist
3601 	 * across reboot.
3602 	 */
3603 	xdf_devid_setup(vdp);
3604 	if (vdp->xdf_tgt_devid == NULL) {
3605 		if (ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0, NULL,
3606 		    &vdp->xdf_tgt_devid) != DDI_SUCCESS) {
3607 			cmn_err(CE_WARN,
3608 			    "xdf@%s: attach failed, devid_init failed",
3609 			    ddi_get_name_addr(dip));
3610 			goto errout1;
3611 		} else {
3612 			(void) ddi_devid_register(vdp->xdf_dip,
3613 			    vdp->xdf_tgt_devid);
3614 		}
3615 	}
3616 
3617 done:
3618 #ifdef XPV_HVM_DRIVER
3619 	xdf_hvm_add(dip);
3620 
3621 	/* Report our version to dom0.  */
3622 	if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3623 	    HVMPV_XDF_VERS))
3624 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
3625 
3626 #endif /* XPV_HVM_DRIVER */
3627 
3628 	/* Create kstat for iostat(1M) */
3629 	if (xdf_kstat_create(dip) != 0) {
3630 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3631 		    ddi_get_name_addr(dip));
3632 		goto errout1;
3633 	}
3634 
3635 	/*
3636 	 * Don't bother with getting real device identification
3637 	 * strings (is it even possible?); they are unlikely to
3638 	 * change often (if at all).
3639 	 */
3640 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_VENDOR_ID,
3641 	    "Xen");
3642 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_PRODUCT_ID,
3643 	    dev_iscd ? "Virtual CD" : "Virtual disk");
3644 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_REVISION_ID,
3645 	    "1.0");
3646 
3647 	ddi_report_dev(dip);
3648 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3649 	return (DDI_SUCCESS);
3650 
3651 errout1:
3652 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3653 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3654 errout0:
3655 	if (vdp->xdf_vd_lbl != NULL) {
3656 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
3657 		cmlb_free_handle(&vdp->xdf_vd_lbl);
3658 		vdp->xdf_vd_lbl = NULL;
3659 	}
3660 	if (vdp->xdf_softintr_id != NULL)
3661 		ddi_remove_softintr(vdp->xdf_softintr_id);
3662 	xvdi_remove_xb_watch_handlers(dip);
3663 	if (vdp->xdf_ready_tq != NULL)
3664 		ddi_taskq_destroy(vdp->xdf_ready_tq);
3665 	mutex_destroy(&vdp->xdf_cb_lk);
3666 	mutex_destroy(&vdp->xdf_dev_lk);
3667 	cv_destroy(&vdp->xdf_dev_cv);
3668 	cv_destroy(&vdp->xdf_hp_status_cv);
3669 	ddi_soft_state_free(xdf_ssp, instance);
3670 	ddi_set_driver_private(dip, NULL);
3671 	ddi_prop_remove_all(dip);
3672 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3673 	return (DDI_FAILURE);
3674 }
3675 
3676 static int
3677 xdf_suspend(dev_info_t *dip)
3678 {
3679 	int		instance = ddi_get_instance(dip);
3680 	xdf_t		*vdp;
3681 
3682 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3683 		return (DDI_FAILURE);
3684 
3685 	if (xdf_debug & SUSRES_DBG)
3686 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3687 
3688 	xvdi_suspend(dip);
3689 
3690 	mutex_enter(&vdp->xdf_cb_lk);
3691 	mutex_enter(&vdp->xdf_dev_lk);
3692 
3693 	vdp->xdf_suspending = B_TRUE;
3694 	xdf_ring_destroy(vdp);
3695 	xdf_set_state(vdp, XD_SUSPEND);
3696 	vdp->xdf_suspending = B_FALSE;
3697 
3698 	mutex_exit(&vdp->xdf_dev_lk);
3699 	mutex_exit(&vdp->xdf_cb_lk);
3700 
3701 	if (xdf_debug & SUSRES_DBG)
3702 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3703 
3704 	return (DDI_SUCCESS);
3705 }
3706 
3707 static int
3708 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3709 {
3710 	xdf_t *vdp;
3711 	int instance;
3712 
3713 	switch (cmd) {
3714 
3715 	case DDI_PM_SUSPEND:
3716 		break;
3717 
3718 	case DDI_SUSPEND:
3719 		return (xdf_suspend(dip));
3720 
3721 	case DDI_DETACH:
3722 		break;
3723 
3724 	default:
3725 		return (DDI_FAILURE);
3726 	}
3727 
3728 	instance = ddi_get_instance(dip);
3729 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3730 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3731 
3732 	if (vdp == NULL)
3733 		return (DDI_FAILURE);
3734 
3735 	mutex_enter(&vdp->xdf_cb_lk);
3736 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3737 	if (vdp->xdf_state != XD_CLOSED) {
3738 		mutex_exit(&vdp->xdf_cb_lk);
3739 		return (DDI_FAILURE);
3740 	}
3741 	mutex_exit(&vdp->xdf_cb_lk);
3742 
3743 	ASSERT(!ISDMACBON(vdp));
3744 
3745 #ifdef XPV_HVM_DRIVER
3746 	xdf_hvm_rm(dip);
3747 #endif /* XPV_HVM_DRIVER */
3748 
3749 	if (vdp->xdf_timeout_id != 0)
3750 		(void) untimeout(vdp->xdf_timeout_id);
3751 
3752 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3753 	ddi_taskq_destroy(vdp->xdf_ready_tq);
3754 
3755 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3756 	cmlb_free_handle(&vdp->xdf_vd_lbl);
3757 
3758 	/* we'll support a backend running in domU later */
3759 #ifdef	DOMU_BACKEND
3760 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3761 #endif
3762 
3763 	list_destroy(&vdp->xdf_vreq_act);
3764 	ddi_prop_remove_all(dip);
3765 	xdf_kstat_delete(dip);
3766 	ddi_remove_softintr(vdp->xdf_softintr_id);
3767 	xvdi_remove_xb_watch_handlers(dip);
3768 	ddi_set_driver_private(dip, NULL);
3769 	cv_destroy(&vdp->xdf_dev_cv);
3770 	mutex_destroy(&vdp->xdf_cb_lk);
3771 	mutex_destroy(&vdp->xdf_dev_lk);
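	/*
	 * xdf_cache_flush_block is presumed to point into the two-sector
	 * xdf_flush_mem buffer set up when flush support was probed, which
	 * is why the non-NULL test is on the former while the full
	 * 2 * xdf_xdev_secsize allocation is what gets freed.
	 */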
3772 	if (vdp->xdf_cache_flush_block != NULL)
3773 		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3774 	ddi_soft_state_free(xdf_ssp, instance);
3775 	return (DDI_SUCCESS);
3776 }
3777 
3778 /*
3779  * Driver linkage structures.
3780  */
3781 static struct cb_ops xdf_cbops = {
3782 	xdf_open,		/* cb_open */
3783 	xdf_close,		/* cb_close */
3784 	xdf_strategy,		/* cb_strategy */
3785 	nodev,			/* cb_print */
3786 	xdf_dump,		/* cb_dump */
3787 	xdf_read,		/* cb_read */
3788 	xdf_write,		/* cb_write */
3789 	xdf_ioctl,		/* cb_ioctl */
3790 	nodev,			/* cb_devmap */
3791 	nodev,			/* cb_mmap */
3792 	nodev,			/* cb_segmap */
3793 	nochpoll,		/* cb_chpoll */
3794 	xdf_prop_op,		/* cb_prop_op */
3795 	NULL,			/* cb_str */
3796 	D_MP | D_NEW | D_64BIT,	/* cb_flag */
3797 	CB_REV,			/* cb_rev */
3798 	xdf_aread,		/* cb_aread */
3799 	xdf_awrite		/* cb_awrite */
3800 };
3801 
3802 struct dev_ops xdf_devops = {
3803 	DEVO_REV,		/* devo_rev */
3804 	0,			/* devo_refcnt */
3805 	xdf_getinfo,		/* devo_getinfo */
3806 	nulldev,		/* devo_identify */
3807 	nulldev,		/* devo_probe */
3808 	xdf_attach,		/* devo_attach */
3809 	xdf_detach,		/* devo_detach */
3810 	nodev,			/* devo_reset */
3811 	&xdf_cbops,		/* devo_cb_ops */
3812 	NULL,			/* devo_bus_ops */
3813 	NULL,			/* devo_power */
3814 	ddi_quiesce_not_supported, /* devo_quiesce */
3815 };
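
/*
 * In the tables above, nodev is the stock DDI stub that fails a call with
 * ENXIO (used for entry points this driver does not provide), while
 * nulldev is the stub that simply succeeds (used for identify/probe).
 * ddi_quiesce_not_supported() likewise records that the driver does not
 * implement the quiesce(9E) entry point used for fast reboot.
 */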
3816 
3817 /*
3818  * Module linkage structures.
3819  */
3820 static struct modldrv modldrv = {
3821 	&mod_driverops,		/* Type of module.  This one is a driver */
3822 	"virtual block driver",	/* short description */
3823 	&xdf_devops		/* driver specific ops */
3824 };
3825 
3826 static struct modlinkage xdf_modlinkage = {
3827 	MODREV_1, (void *)&modldrv, NULL
3828 };
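
/*
 * MODREV_1 is the module linkage revision, and the linkage list in
 * struct modlinkage is NULL-terminated, so a single modldrv entry
 * followed by NULL is the usual form for a driver exporting exactly one
 * linkage structure.
 */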
3829 
3830 /*
3831  * standard module entry points
3832  */
3833 int
3834 _init(void)
3835 {
3836 	int rc;
3837 
3838 	xdf_major = ddi_name_to_major("xdf");
3839 	if (xdf_major == (major_t)-1)
3840 		return (EINVAL);
3841 
3842 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3843 		return (rc);
3844 
3845 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3846 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3847 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3848 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3849 
3850 #ifdef XPV_HVM_DRIVER
3851 	xdf_hvm_init();
3852 #endif /* XPV_HVM_DRIVER */
3853 
3854 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3855 #ifdef XPV_HVM_DRIVER
3856 		xdf_hvm_fini();
3857 #endif /* XPV_HVM_DRIVER */
3858 		kmem_cache_destroy(xdf_vreq_cache);
3859 		kmem_cache_destroy(xdf_gs_cache);
3860 		ddi_soft_state_fini(&xdf_ssp);
3861 		return (rc);
3862 	}
3863 
3864 	return (rc);
3865 }
3866 
3867 int
3868 _fini(void)
3869 {
3870 	int err;
3871 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3872 		return (err);
3873 
3874 #ifdef XPV_HVM_DRIVER
3875 	xdf_hvm_fini();
3876 #endif /* XPV_HVM_DRIVER */
3877 
3878 	kmem_cache_destroy(xdf_vreq_cache);
3879 	kmem_cache_destroy(xdf_gs_cache);
3880 	ddi_soft_state_fini(&xdf_ssp);
3881 
3882 	return (0);
3883 }
3884 
3885 int
3886 _info(struct modinfo *modinfop)
3887 {
3888 	return (mod_info(&xdf_modlinkage, modinfop));
3889 }
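
/*
 * For reference (not part of the driver itself): once the module is
 * installed, its registration can usually be confirmed from a shell
 * with something like
 *
 *	# modinfo | grep xdf
 *
 * which, if the driver is loaded, should show the "virtual block driver"
 * description registered through the modldrv structure above.
 */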
3890