xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision f17620a4f72a29025a22655ba8735ccd20ae174f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2017 Nexenta Systems, Inc.
30  */
31 
32 /*
33  * xdf.c - Xen Virtual Block Device Driver
34  * TODO:
35  *	- support alternate block size (currently only DEV_BSIZE supported)
36  *	- revalidate geometry for removable devices
37  *
38  * This driver exports disk device nodes, accepts IO requests from those
39  * nodes, and services those requests by talking to a backend device
40  * in another domain.
41  *
42  * Communication with the backend device is done via a ringbuffer (which is
43  * managed via xvdi interfaces) and dma memory (which is managed via ddi
44  * interfaces).
45  *
 * Communication with the backend device is dependent upon establishing a
47  * connection to the backend device.  This connection process involves
48  * reading device configuration information from xenbus and publishing
49  * some frontend runtime configuration parameters via the xenbus (for
50  * consumption by the backend).  Once we've published runtime configuration
51  * information via the xenbus, the backend device can enter the connected
52  * state and we'll enter the XD_CONNECTED state.  But before we can allow
53  * random IO to begin, we need to do IO to the backend device to determine
54  * the device label and if flush operations are supported.  Once this is
55  * done we enter the XD_READY state and can process any IO operations.
56  *
57  * We receive notifications of xenbus state changes for the backend device
58  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
59  * is single threaded, meaning that we can't receive new notification of
60  * other end state changes while we're processing an outstanding
 * notification of an other end state change.  Therefore we can't do any
 * blocking operations from the xdf_oe_change() callback.  This is why we
 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
67  * generated by the xdf_ready_tq_thread thread have priority over all
68  * other IO requests.
69  *
70  * We also communicate with the backend device via the xenbus "media-req"
71  * (XBP_MEDIA_REQ) property.  For more information on this see the
72  * comments in blkif.h.
73  */
74 
75 #include <io/xdf.h>
76 
77 #include <sys/conf.h>
78 #include <sys/dkio.h>
79 #include <sys/promif.h>
80 #include <sys/sysmacros.h>
81 #include <sys/kstat.h>
82 #include <sys/mach_mmu.h>
83 #ifdef XPV_HVM_DRIVER
84 #include <sys/xpv_support.h>
85 #else /* !XPV_HVM_DRIVER */
86 #include <sys/evtchn_impl.h>
87 #endif /* !XPV_HVM_DRIVER */
88 #include <sys/sunndi.h>
89 #include <public/io/xenbus.h>
90 #include <xen/sys/xenbus_impl.h>
91 #include <sys/scsi/generic/inquiry.h>
92 #include <xen/io/blkif_impl.h>
93 #include <sys/fdio.h>
94 #include <sys/cdio.h>
95 
/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements.  On non-DEBUG builds the
 * argument is discarded entirely, so it is never evaluated.
 */
#ifdef DEBUG
#define	DEBUG_EVAL(x)	(x)
#else /* !DEBUG */
#define	DEBUG_EVAL(x)
#endif /* !DEBUG */

/*
 * Ring drain polling parameters.  Despite its name, XDF_DRAIN_MSEC_DELAY
 * is a microsecond value: xdf_ring_drain_locked() hands it to
 * drv_usectohz().  XDF_DRAIN_RETRY_COUNT polls of that length give the
 * overall drain timeout noted alongside it.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */
#define	XDF_STATE_TIMEOUT		(30*1000*1000)	/* 30.00 sec */

#define	INVALID_DOMID	((domid_t)-1)
/* values stored in v_req_t's v_flush_diskcache field */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/* barrier feature present, but flush-diskcache not marked as supported */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* barrier feature present and flush-diskcache marked as supported */
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/* a non-read buf whose data pointer is the device's cached flush block */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A non-read buf with a zero byte count.
 * NOTE: this macro references 'vdp' although it is not a macro parameter;
 * a variable of that name must be in scope at every expansion site.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * A vreq is done once its current DMA window has been processed and
 * either it is a flush-diskcache request or that window was the last one.
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t associated with a buf is stashed in the buf's av_back field */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
131 
/* consulted by xdf_intr() to decide whether to restart I/O itself */
extern int		do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int		xdf_debug = 0;
volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t			xdf_major;
static void		*xdf_ssp;
static kmem_cache_t	*xdf_vreq_cache;	/* backs v_req_t allocations */
static kmem_cache_t	*xdf_gs_cache;		/* backs ge_slot_t allocations */
static int		xdf_maxphys = XB_MAXPHYS;
static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int		xdf_fbrewrites;	/* flush block re-write count */

/* misc public functions */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);
static void xdf_devid_setup(xdf_t *);

/* callbacks from common label (cmlb); passed to cmlb_attach() */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
161 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * vreq_setup() uses these attributes directly for the I/O DMA handle and
 * uses a locally modified copy for the handle that backs the bounce buffer
 * of unaligned transfers.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER,		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};

/*
 * Device access attributes handed to ddi_dma_mem_alloc() when vreq_setup()
 * allocates the bounce buffer for an unaligned transfer.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
186 
187 static void
188 xdf_timeout_handler(void *arg)
189 {
190 	xdf_t *vdp = arg;
191 
192 	mutex_enter(&vdp->xdf_dev_lk);
193 	vdp->xdf_timeout_id = 0;
194 	mutex_exit(&vdp->xdf_dev_lk);
195 
196 	/* new timeout thread could be re-scheduled */
197 	xdf_io_start(vdp);
198 }
199 
200 /*
201  * callback func when DMA/GTE resources is available
202  *
203  * Note: we only register one callback function to grant table subsystem
204  * since we only have one 'struct gnttab_free_callback' in xdf_t.
205  */
206 static void
207 xdf_gncallback(void *arg)
208 {
209 	xdf_t *vdp = arg;
210 	ASSERT(vdp != NULL);
211 
212 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
213 	    vdp->xdf_addr));
214 
215 	ddi_trigger_softintr(vdp->xdf_softintr_id);
216 }
217 
218 static int
219 xdf_dmacallback(caddr_t arg)
220 {
221 	xdf_gncallback(arg);
222 	return (DDI_DMA_CALLBACK_DONE);
223 }
224 
225 static ge_slot_t *
226 gs_get(xdf_t *vdp, int isread)
227 {
228 	grant_ref_t gh;
229 	ge_slot_t *gs;
230 
231 	/* try to alloc GTEs needed in this slot, first */
232 	if (gnttab_alloc_grant_references(
233 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
234 		if (vdp->xdf_gnt_callback.next == NULL) {
235 			SETDMACBON(vdp);
236 			gnttab_request_free_callback(
237 			    &vdp->xdf_gnt_callback,
238 			    xdf_gncallback,
239 			    (void *)vdp,
240 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
241 		}
242 		return (NULL);
243 	}
244 
245 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
246 	if (gs == NULL) {
247 		gnttab_free_grant_references(gh);
248 		if (vdp->xdf_timeout_id == 0)
249 			/* restart I/O after one second */
250 			vdp->xdf_timeout_id =
251 			    timeout(xdf_timeout_handler, vdp, hz);
252 		return (NULL);
253 	}
254 
255 	/* init gs_slot */
256 	gs->gs_oeid = vdp->xdf_peer;
257 	gs->gs_isread = isread;
258 	gs->gs_ghead = gh;
259 	gs->gs_ngrefs = 0;
260 
261 	return (gs);
262 }
263 
264 static void
265 gs_free(ge_slot_t *gs)
266 {
267 	int		i;
268 
269 	/* release all grant table entry resources used in this slot */
270 	for (i = 0; i < gs->gs_ngrefs; i++)
271 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
272 	gnttab_free_grant_references(gs->gs_ghead);
273 	list_remove(&gs->gs_vreq->v_gs, gs);
274 	kmem_cache_free(xdf_gs_cache, gs);
275 }
276 
277 static grant_ref_t
278 gs_grant(ge_slot_t *gs, mfn_t mfn)
279 {
280 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
281 
282 	ASSERT(gr != -1);
283 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
284 	gs->gs_ge[gs->gs_ngrefs++] = gr;
285 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
286 
287 	return (gr);
288 }
289 
290 /*
291  * Alloc a vreq for this bp
292  * bp->av_back contains the pointer to the vreq upon return
293  */
294 static v_req_t *
295 vreq_get(xdf_t *vdp, buf_t *bp)
296 {
297 	v_req_t *vreq = NULL;
298 
299 	ASSERT(BP_VREQ(bp) == NULL);
300 
301 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
302 	if (vreq == NULL) {
303 		if (vdp->xdf_timeout_id == 0)
304 			/* restart I/O after one second */
305 			vdp->xdf_timeout_id =
306 			    timeout(xdf_timeout_handler, vdp, hz);
307 		return (NULL);
308 	}
309 	bzero(vreq, sizeof (v_req_t));
310 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
311 	    offsetof(ge_slot_t, gs_vreq_link));
312 	vreq->v_buf = bp;
313 	vreq->v_status = VREQ_INIT;
314 	vreq->v_runq = B_FALSE;
315 	BP_VREQ_SET(bp, vreq);
316 	/* init of other fields in vreq is up to the caller */
317 
318 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
319 
320 	return (vreq);
321 }
322 
323 static void
324 vreq_free(xdf_t *vdp, v_req_t *vreq)
325 {
326 	buf_t	*bp = vreq->v_buf;
327 
328 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
329 	ASSERT(BP_VREQ(bp) == vreq);
330 
331 	list_remove(&vdp->xdf_vreq_act, vreq);
332 
333 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
334 		goto done;
335 
336 	switch (vreq->v_status) {
337 	case VREQ_DMAWIN_DONE:
338 	case VREQ_GS_ALLOCED:
339 	case VREQ_DMABUF_BOUND:
340 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
341 		/*FALLTHRU*/
342 	case VREQ_DMAMEM_ALLOCED:
343 		if (!ALIGNED_XFER(bp)) {
344 			ASSERT(vreq->v_abuf != NULL);
345 			if (!IS_ERROR(bp) && IS_READ(bp))
346 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
347 				    bp->b_bcount);
348 			ddi_dma_mem_free(&vreq->v_align);
349 		}
350 		/*FALLTHRU*/
351 	case VREQ_MEMDMAHDL_ALLOCED:
352 		if (!ALIGNED_XFER(bp))
353 			ddi_dma_free_handle(&vreq->v_memdmahdl);
354 		/*FALLTHRU*/
355 	case VREQ_DMAHDL_ALLOCED:
356 		ddi_dma_free_handle(&vreq->v_dmahdl);
357 		break;
358 	default:
359 		break;
360 	}
361 done:
362 	ASSERT(!vreq->v_runq);
363 	list_destroy(&vreq->v_gs);
364 	kmem_cache_free(xdf_vreq_cache, vreq);
365 }
366 
367 /*
368  * Snarf new data if our flush block was re-written
369  */
370 static void
371 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
372 {
373 	int nblks;
374 	boolean_t mapin;
375 
376 	if (IS_WRITE_BARRIER(vdp, bp))
377 		return; /* write was a flush write */
378 
379 	mapin = B_FALSE;
380 	nblks = bp->b_bcount >> DEV_BSHIFT;
381 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
382 		xdf_fbrewrites++;
383 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
384 			mapin = B_TRUE;
385 			bp_mapin(bp);
386 		}
387 		bcopy(bp->b_un.b_addr +
388 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
389 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
390 		if (mapin)
391 			bp_mapout(bp);
392 	}
393 }
394 
395 /*
396  * Initalize the DMA and grant table resources for the buf
397  */
398 static int
399 vreq_setup(xdf_t *vdp, v_req_t *vreq)
400 {
401 	int rc;
402 	ddi_dma_attr_t dmaattr;
403 	uint_t ndcs, ndws;
404 	ddi_dma_handle_t dh;
405 	ddi_dma_handle_t mdh;
406 	ddi_dma_cookie_t dc;
407 	ddi_acc_handle_t abh;
408 	caddr_t	aba;
409 	ge_slot_t *gs;
410 	size_t bufsz;
411 	off_t off;
412 	size_t sz;
413 	buf_t *bp = vreq->v_buf;
414 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
415 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
416 
417 	switch (vreq->v_status) {
418 	case VREQ_INIT:
419 		if (IS_FLUSH_DISKCACHE(bp)) {
420 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
421 				DPRINTF(DMA_DBG, ("xdf@%s: "
422 				    "get ge_slotfailed\n", vdp->xdf_addr));
423 				return (DDI_FAILURE);
424 			}
425 			vreq->v_blkno = 0;
426 			vreq->v_nslots = 1;
427 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
428 			vreq->v_status = VREQ_GS_ALLOCED;
429 			gs->gs_vreq = vreq;
430 			list_insert_head(&vreq->v_gs, gs);
431 			return (DDI_SUCCESS);
432 		}
433 
434 		if (IS_WRITE_BARRIER(vdp, bp))
435 			vreq->v_flush_diskcache = WRITE_BARRIER;
436 		vreq->v_blkno = bp->b_blkno +
437 		    (diskaddr_t)(uintptr_t)bp->b_private;
438 		/* See if we wrote new data to our flush block */
439 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
440 			check_fbwrite(vdp, bp, vreq->v_blkno);
441 		vreq->v_status = VREQ_INIT_DONE;
442 		/*FALLTHRU*/
443 
444 	case VREQ_INIT_DONE:
445 		/*
446 		 * alloc DMA handle
447 		 */
448 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
449 		    xdf_dmacallback, (caddr_t)vdp, &dh);
450 		if (rc != DDI_SUCCESS) {
451 			SETDMACBON(vdp);
452 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
453 			    vdp->xdf_addr));
454 			return (DDI_FAILURE);
455 		}
456 
457 		vreq->v_dmahdl = dh;
458 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
459 		/*FALLTHRU*/
460 
461 	case VREQ_DMAHDL_ALLOCED:
462 		/*
463 		 * alloc dma handle for 512-byte aligned buf
464 		 */
465 		if (!ALIGNED_XFER(bp)) {
466 			/*
467 			 * XXPV: we need to temporarily enlarge the seg
468 			 * boundary and s/g length to work round CR6381968
469 			 */
470 			dmaattr = xb_dma_attr;
471 			dmaattr.dma_attr_seg = (uint64_t)-1;
472 			dmaattr.dma_attr_sgllen = INT_MAX;
473 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
474 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
475 			if (rc != DDI_SUCCESS) {
476 				SETDMACBON(vdp);
477 				DPRINTF(DMA_DBG, ("xdf@%s: "
478 				    "unaligned buf DMAhandle alloc failed\n",
479 				    vdp->xdf_addr));
480 				return (DDI_FAILURE);
481 			}
482 			vreq->v_memdmahdl = mdh;
483 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
484 		}
485 		/*FALLTHRU*/
486 
487 	case VREQ_MEMDMAHDL_ALLOCED:
488 		/*
489 		 * alloc 512-byte aligned buf
490 		 */
491 		if (!ALIGNED_XFER(bp)) {
492 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
493 				bp_mapin(bp);
494 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
495 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
496 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
497 			    &aba, &bufsz, &abh);
498 			if (rc != DDI_SUCCESS) {
499 				SETDMACBON(vdp);
500 				DPRINTF(DMA_DBG, ("xdf@%s: "
501 				    "DMA mem allocation failed\n",
502 				    vdp->xdf_addr));
503 				return (DDI_FAILURE);
504 			}
505 
506 			vreq->v_abuf = aba;
507 			vreq->v_align = abh;
508 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
509 
510 			ASSERT(bufsz >= bp->b_bcount);
511 			if (!IS_READ(bp))
512 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
513 				    bp->b_bcount);
514 		}
515 		/*FALLTHRU*/
516 
517 	case VREQ_DMAMEM_ALLOCED:
518 		/*
519 		 * dma bind
520 		 */
521 		if (ALIGNED_XFER(bp)) {
522 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
523 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
524 			    &dc, &ndcs);
525 		} else {
526 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
527 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
528 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
529 		}
530 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
531 			/* get num of dma windows */
532 			if (rc == DDI_DMA_PARTIAL_MAP) {
533 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
534 				ASSERT(rc == DDI_SUCCESS);
535 			} else {
536 				ndws = 1;
537 			}
538 		} else {
539 			SETDMACBON(vdp);
540 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
541 			    vdp->xdf_addr));
542 			return (DDI_FAILURE);
543 		}
544 
545 		vreq->v_dmac = dc;
546 		vreq->v_dmaw = 0;
547 		vreq->v_ndmacs = ndcs;
548 		vreq->v_ndmaws = ndws;
549 		vreq->v_nslots = ndws;
550 		vreq->v_status = VREQ_DMABUF_BOUND;
551 		/*FALLTHRU*/
552 
553 	case VREQ_DMABUF_BOUND:
554 		/*
555 		 * get ge_slot, callback is set upon failure from gs_get(),
556 		 * if not set previously
557 		 */
558 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
559 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
560 			    vdp->xdf_addr));
561 			return (DDI_FAILURE);
562 		}
563 
564 		vreq->v_status = VREQ_GS_ALLOCED;
565 		gs->gs_vreq = vreq;
566 		list_insert_head(&vreq->v_gs, gs);
567 		break;
568 
569 	case VREQ_GS_ALLOCED:
570 		/* nothing need to be done */
571 		break;
572 
573 	case VREQ_DMAWIN_DONE:
574 		/*
575 		 * move to the next dma window
576 		 */
577 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
578 
579 		/* get a ge_slot for this DMA window */
580 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
581 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
582 			    vdp->xdf_addr));
583 			return (DDI_FAILURE);
584 		}
585 
586 		vreq->v_dmaw++;
587 		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
588 		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
589 		vreq->v_status = VREQ_GS_ALLOCED;
590 		gs->gs_vreq = vreq;
591 		list_insert_head(&vreq->v_gs, gs);
592 		break;
593 
594 	default:
595 		return (DDI_FAILURE);
596 	}
597 
598 	return (DDI_SUCCESS);
599 }
600 
601 static int
602 xdf_cmlb_attach(xdf_t *vdp)
603 {
604 	dev_info_t	*dip = vdp->xdf_dip;
605 
606 	return (cmlb_attach(dip, &xdf_lb_ops,
607 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
608 	    XD_IS_RM(vdp), B_TRUE,
609 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
610 	    0, vdp->xdf_vd_lbl, NULL));
611 }
612 
613 static void
614 xdf_io_err(buf_t *bp, int err, size_t resid)
615 {
616 	bioerror(bp, err);
617 	if (resid == 0)
618 		bp->b_resid = bp->b_bcount;
619 	biodone(bp);
620 }
621 
622 static void
623 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
624 {
625 	v_req_t *vreq = BP_VREQ(bp);
626 
627 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
628 
629 	if (vdp->xdf_xdev_iostat == NULL)
630 		return;
631 	if ((vreq != NULL) && vreq->v_runq) {
632 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
633 	} else {
634 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
635 	}
636 }
637 
638 static void
639 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
640 {
641 	v_req_t *vreq = BP_VREQ(bp);
642 
643 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
644 
645 	if (vdp->xdf_xdev_iostat == NULL)
646 		return;
647 
648 	if ((vreq != NULL) && vreq->v_runq) {
649 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
650 	} else {
651 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
652 	}
653 
654 	if (bp->b_flags & B_READ) {
655 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
656 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
657 	} else if (bp->b_flags & B_WRITE) {
658 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
659 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
660 	}
661 }
662 
663 static void
664 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
665 {
666 	v_req_t *vreq = BP_VREQ(bp);
667 
668 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
669 	ASSERT(!vreq->v_runq);
670 
671 	vreq->v_runq = B_TRUE;
672 	if (vdp->xdf_xdev_iostat == NULL)
673 		return;
674 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
675 }
676 
677 static void
678 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
679 {
680 	v_req_t *vreq = BP_VREQ(bp);
681 
682 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
683 	ASSERT(vreq->v_runq);
684 
685 	vreq->v_runq = B_FALSE;
686 	if (vdp->xdf_xdev_iostat == NULL)
687 		return;
688 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
689 }
690 
691 int
692 xdf_kstat_create(dev_info_t *dip)
693 {
694 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
695 	kstat_t		*kstat;
696 	buf_t		*bp;
697 
698 	if ((kstat = kstat_create("xdf", ddi_get_instance(dip), NULL, "disk",
699 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
700 		return (-1);
701 
702 	/* See comment about locking in xdf_kstat_delete(). */
703 	mutex_enter(&vdp->xdf_iostat_lk);
704 	mutex_enter(&vdp->xdf_dev_lk);
705 
706 	/* only one kstat can exist at a time */
707 	if (vdp->xdf_xdev_iostat != NULL) {
708 		mutex_exit(&vdp->xdf_dev_lk);
709 		mutex_exit(&vdp->xdf_iostat_lk);
710 		kstat_delete(kstat);
711 		return (-1);
712 	}
713 
714 	vdp->xdf_xdev_iostat = kstat;
715 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
716 	kstat_install(vdp->xdf_xdev_iostat);
717 
718 	/*
719 	 * Now that we've created a kstat, we need to update the waitq and
720 	 * runq counts for the kstat to reflect our current state.
721 	 *
722 	 * For a buf_t structure to be on the runq, it must have a ring
723 	 * buffer slot associated with it.  To get a ring buffer slot the
724 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
725 	 * Then when it is granted a ring buffer slot, v_runq will be set to
726 	 * true.
727 	 *
728 	 * For a buf_t structure to be on the waitq, it must not be on the
729 	 * runq.  So to find all the buf_t's that should be on waitq, we
730 	 * walk the active buf list and add any buf_t's which aren't on the
731 	 * runq to the waitq.
732 	 */
733 	bp = vdp->xdf_f_act;
734 	while (bp != NULL) {
735 		xdf_kstat_enter(vdp, bp);
736 		bp = bp->av_forw;
737 	}
738 	if (vdp->xdf_ready_tq_bp != NULL)
739 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
740 
741 	mutex_exit(&vdp->xdf_dev_lk);
742 	mutex_exit(&vdp->xdf_iostat_lk);
743 	return (0);
744 }
745 
746 void
747 xdf_kstat_delete(dev_info_t *dip)
748 {
749 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
750 	kstat_t		*kstat;
751 	buf_t		*bp;
752 
753 	/*
754 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
755 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
756 	 * and the contents of the our kstat.  xdf_iostat_lk is used
757 	 * to protect the allocation and freeing of the actual kstat.
758 	 * xdf_dev_lk can't be used for this purpose because kstat
759 	 * readers use it to access the contents of the kstat and
760 	 * hence it can't be held when calling kstat_delete().
761 	 */
762 	mutex_enter(&vdp->xdf_iostat_lk);
763 	mutex_enter(&vdp->xdf_dev_lk);
764 
765 	if (vdp->xdf_xdev_iostat == NULL) {
766 		mutex_exit(&vdp->xdf_dev_lk);
767 		mutex_exit(&vdp->xdf_iostat_lk);
768 		return;
769 	}
770 
771 	/*
772 	 * We're about to destroy the kstat structures, so it isn't really
773 	 * necessary to update the runq and waitq counts.  But, since this
774 	 * isn't a hot code path we can afford to be a little pedantic and
775 	 * go ahead and decrement the runq and waitq kstat counters to zero
776 	 * before free'ing them.  This helps us ensure that we've gotten all
777 	 * our accounting correct.
778 	 *
779 	 * For an explanation of how we determine which buffers go on the
780 	 * runq vs which go on the waitq, see the comments in
781 	 * xdf_kstat_create().
782 	 */
783 	bp = vdp->xdf_f_act;
784 	while (bp != NULL) {
785 		xdf_kstat_exit(vdp, bp);
786 		bp = bp->av_forw;
787 	}
788 	if (vdp->xdf_ready_tq_bp != NULL)
789 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
790 
791 	kstat = vdp->xdf_xdev_iostat;
792 	vdp->xdf_xdev_iostat = NULL;
793 	mutex_exit(&vdp->xdf_dev_lk);
794 	kstat_delete(kstat);
795 	mutex_exit(&vdp->xdf_iostat_lk);
796 }
797 
798 /*
799  * Add an IO requests onto the active queue.
800  *
801  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
802  * are used to establish a connection to the backend, so they receive
803  * priority over all other IOs.  Since xdf_ready_tq_thread only does
804  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
805  * given time and we record the buf associated with that request in
806  * xdf_ready_tq_bp.
807  */
808 static void
809 xdf_bp_push(xdf_t *vdp, buf_t *bp)
810 {
811 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
812 	ASSERT(bp->av_forw == NULL);
813 
814 	xdf_kstat_enter(vdp, bp);
815 
816 	if (curthread == vdp->xdf_ready_tq_thread) {
817 		/* new IO requests from the ready thread */
818 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
819 		vdp->xdf_ready_tq_bp = bp;
820 		return;
821 	}
822 
823 	/* this is normal IO request */
824 	ASSERT(bp != vdp->xdf_ready_tq_bp);
825 
826 	if (vdp->xdf_f_act == NULL) {
827 		/* this is only only IO on the active queue */
828 		ASSERT(vdp->xdf_l_act == NULL);
829 		ASSERT(vdp->xdf_i_act == NULL);
830 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
831 		return;
832 	}
833 
834 	/* add this IO to the tail of the active queue */
835 	vdp->xdf_l_act->av_forw = bp;
836 	vdp->xdf_l_act = bp;
837 	if (vdp->xdf_i_act == NULL)
838 		vdp->xdf_i_act = bp;
839 }
840 
841 static void
842 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
843 {
844 	buf_t	*bp_iter;
845 
846 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
847 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
848 
849 	if (vdp->xdf_ready_tq_bp == bp) {
850 		/* we're done with a ready thread IO request */
851 		ASSERT(bp->av_forw == NULL);
852 		vdp->xdf_ready_tq_bp = NULL;
853 		return;
854 	}
855 
856 	/* we're done with a normal IO request */
857 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
858 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
859 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
860 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
861 
862 	if (bp == vdp->xdf_f_act) {
863 		/* This IO was at the head of our active queue. */
864 		vdp->xdf_f_act = bp->av_forw;
865 		if (bp == vdp->xdf_l_act)
866 			vdp->xdf_l_act = NULL;
867 	} else {
868 		/* There IO finished before some other pending IOs. */
869 		bp_iter = vdp->xdf_f_act;
870 		while (bp != bp_iter->av_forw) {
871 			bp_iter = bp_iter->av_forw;
872 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
873 			ASSERT(bp_iter != vdp->xdf_i_act);
874 		}
875 		bp_iter->av_forw = bp->av_forw;
876 		if (bp == vdp->xdf_l_act)
877 			vdp->xdf_l_act = bp_iter;
878 	}
879 	bp->av_forw = NULL;
880 }
881 
882 static buf_t *
883 xdf_bp_next(xdf_t *vdp)
884 {
885 	v_req_t	*vreq;
886 	buf_t	*bp;
887 
888 	if (vdp->xdf_state == XD_CONNECTED) {
889 		/*
890 		 * If we're in the XD_CONNECTED state, we only service IOs
891 		 * from the xdf_ready_tq_thread thread.
892 		 */
893 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
894 			return (NULL);
895 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
896 			return (bp);
897 		return (NULL);
898 	}
899 
900 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
901 	if (vdp->xdf_state != XD_READY)
902 		return (NULL);
903 
904 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
905 	for (;;) {
906 		if ((bp = vdp->xdf_i_act) == NULL)
907 			return (NULL);
908 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
909 			return (bp);
910 
911 		/* advance the active buf index pointer */
912 		vdp->xdf_i_act = bp->av_forw;
913 	}
914 }
915 
916 static void
917 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
918 {
919 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
920 	v_req_t		*vreq = gs->gs_vreq;
921 	buf_t		*bp = vreq->v_buf;
922 
923 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
924 	ASSERT(BP_VREQ(bp) == vreq);
925 
926 	gs_free(gs);
927 
928 	if (bioerr != 0)
929 		bioerror(bp, bioerr);
930 	ASSERT(vreq->v_nslots > 0);
931 	if (--vreq->v_nslots > 0)
932 		return;
933 
934 	/* remove this IO from our active queue */
935 	xdf_bp_pop(vdp, bp);
936 
937 	ASSERT(vreq->v_runq);
938 	xdf_kstat_exit(vdp, bp);
939 	vreq->v_runq = B_FALSE;
940 	vreq_free(vdp, vreq);
941 
942 	if (IS_ERROR(bp)) {
943 		xdf_io_err(bp, geterror(bp), 0);
944 	} else if (bp->b_resid != 0) {
945 		/* Partial transfers are an error */
946 		xdf_io_err(bp, EIO, bp->b_resid);
947 	} else {
948 		biodone(bp);
949 	}
950 }
951 
952 /*
953  * xdf interrupt handler
954  */
955 static uint_t
956 xdf_intr_locked(xdf_t *vdp)
957 {
958 	xendev_ring_t *xbr;
959 	blkif_response_t *resp;
960 	int bioerr;
961 	uint64_t id;
962 	uint8_t op;
963 	uint16_t status;
964 	ddi_acc_handle_t acchdl;
965 
966 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
967 
968 	if ((xbr = vdp->xdf_xb_ring) == NULL)
969 		return (DDI_INTR_UNCLAIMED);
970 
971 	acchdl = vdp->xdf_xb_ring_hdl;
972 
973 	/*
974 	 * complete all requests which have a response
975 	 */
976 	while (resp = xvdi_ring_get_response(xbr)) {
977 		id = ddi_get64(acchdl, &resp->id);
978 		op = ddi_get8(acchdl, &resp->operation);
979 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
980 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
981 		    op, id, status));
982 
983 		if (status != BLKIF_RSP_OKAY) {
984 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
985 			    vdp->xdf_addr,
986 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
987 			bioerr = EIO;
988 		} else {
989 			bioerr = 0;
990 		}
991 
992 		xdf_io_fini(vdp, id, bioerr);
993 	}
994 	return (DDI_INTR_CLAIMED);
995 }
996 
997 /*
998  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
999  * block at a lower pil.
1000  */
1001 static uint_t
1002 xdf_intr(caddr_t arg)
1003 {
1004 	xdf_t *vdp = (xdf_t *)arg;
1005 	int rv;
1006 
1007 	mutex_enter(&vdp->xdf_dev_lk);
1008 	rv = xdf_intr_locked(vdp);
1009 	mutex_exit(&vdp->xdf_dev_lk);
1010 
1011 	if (!do_polled_io)
1012 		xdf_io_start(vdp);
1013 
1014 	return (rv);
1015 }
1016 
1017 static void
1018 xdf_ring_push(xdf_t *vdp)
1019 {
1020 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1021 
1022 	if (vdp->xdf_xb_ring == NULL)
1023 		return;
1024 
1025 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1026 		DPRINTF(IO_DBG, (
1027 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1028 		    vdp->xdf_addr));
1029 	}
1030 
1031 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1032 		xvdi_notify_oe(vdp->xdf_dip);
1033 }
1034 
1035 static int
1036 xdf_ring_drain_locked(xdf_t *vdp)
1037 {
1038 	int		pollc, rv = 0;
1039 
1040 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1041 
1042 	if (xdf_debug & SUSRES_DBG)
1043 		xen_printf("xdf_ring_drain: start\n");
1044 
1045 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1046 		if (vdp->xdf_xb_ring == NULL)
1047 			goto out;
1048 
1049 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1050 			(void) xdf_intr_locked(vdp);
1051 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1052 			goto out;
1053 		xdf_ring_push(vdp);
1054 
1055 		/* file-backed devices can be slow */
1056 		mutex_exit(&vdp->xdf_dev_lk);
1057 #ifdef XPV_HVM_DRIVER
1058 		(void) HYPERVISOR_yield();
1059 #endif /* XPV_HVM_DRIVER */
1060 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1061 		mutex_enter(&vdp->xdf_dev_lk);
1062 	}
1063 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1064 
1065 out:
1066 	if (vdp->xdf_xb_ring != NULL) {
1067 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1068 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1069 			rv = EIO;
1070 	}
1071 	if (xdf_debug & SUSRES_DBG)
1072 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1073 		    vdp->xdf_addr, rv);
1074 	return (rv);
1075 }
1076 
1077 static int
1078 xdf_ring_drain(xdf_t *vdp)
1079 {
1080 	int rv;
1081 	mutex_enter(&vdp->xdf_dev_lk);
1082 	rv = xdf_ring_drain_locked(vdp);
1083 	mutex_exit(&vdp->xdf_dev_lk);
1084 	return (rv);
1085 }
1086 
1087 /*
1088  * Destroy all v_req_t, grant table entries, and our ring buffer.
1089  */
1090 static void
1091 xdf_ring_destroy(xdf_t *vdp)
1092 {
1093 	v_req_t		*vreq;
1094 	buf_t		*bp;
1095 	ge_slot_t	*gs;
1096 
1097 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1098 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1099 
1100 	if ((vdp->xdf_state != XD_INIT) &&
1101 	    (vdp->xdf_state != XD_CONNECTED) &&
1102 	    (vdp->xdf_state != XD_READY)) {
1103 		ASSERT(vdp->xdf_xb_ring == NULL);
1104 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1105 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1106 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1107 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1108 		return;
1109 	}
1110 
1111 	/*
1112 	 * We don't want to receive async notifications from the backend
1113 	 * when it finishes processing ring entries.
1114 	 */
1115 #ifdef XPV_HVM_DRIVER
1116 	ec_unbind_evtchn(vdp->xdf_evtchn);
1117 #else /* !XPV_HVM_DRIVER */
1118 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1119 #endif /* !XPV_HVM_DRIVER */
1120 
1121 	/*
1122 	 * Drain any requests in the ring.  We need to do this before we
1123 	 * can free grant table entries, because if active ring entries
1124 	 * point to grants, then the backend could be trying to access
1125 	 * those grants.
1126 	 */
1127 	(void) xdf_ring_drain_locked(vdp);
1128 
1129 	/* We're done talking to the backend so free up our event channel */
1130 	xvdi_free_evtchn(vdp->xdf_dip);
1131 	vdp->xdf_evtchn = INVALID_EVTCHN;
1132 
1133 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1134 		bp = vreq->v_buf;
1135 		ASSERT(BP_VREQ(bp) == vreq);
1136 
1137 		/* Free up any grant table entries associaed with this IO */
1138 		while ((gs = list_head(&vreq->v_gs)) != NULL)
1139 			gs_free(gs);
1140 
1141 		/* If this IO was on the runq, move it back to the waitq. */
1142 		if (vreq->v_runq)
1143 			xdf_kstat_runq_to_waitq(vdp, bp);
1144 
1145 		/*
1146 		 * Reset any buf IO state since we're going to re-issue the
1147 		 * IO when we reconnect.
1148 		 */
1149 		vreq_free(vdp, vreq);
1150 		BP_VREQ_SET(bp, NULL);
1151 		bioerror(bp, 0);
1152 	}
1153 
1154 	/* reset the active queue index pointer */
1155 	vdp->xdf_i_act = vdp->xdf_f_act;
1156 
1157 	/* Destroy the ring */
1158 	xvdi_free_ring(vdp->xdf_xb_ring);
1159 	vdp->xdf_xb_ring = NULL;
1160 	vdp->xdf_xb_ring_hdl = NULL;
1161 	vdp->xdf_peer = INVALID_DOMID;
1162 }
1163 
1164 void
1165 xdfmin(struct buf *bp)
1166 {
1167 	if (bp->b_bcount > xdf_maxphys)
1168 		bp->b_bcount = xdf_maxphys;
1169 }
1170 
1171 /*
1172  * Check if we have a pending "eject" media request.
1173  */
1174 static int
1175 xdf_eject_pending(xdf_t *vdp)
1176 {
1177 	dev_info_t	*dip = vdp->xdf_dip;
1178 	char		*xsname, *str;
1179 
1180 	if (!vdp->xdf_media_req_supported)
1181 		return (B_FALSE);
1182 
1183 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1184 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1185 		return (B_FALSE);
1186 
1187 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1188 		strfree(str);
1189 		return (B_FALSE);
1190 	}
1191 	strfree(str);
1192 	return (B_TRUE);
1193 }
1194 
1195 /*
1196  * Generate a media request.
1197  */
1198 static int
1199 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1200 {
1201 	dev_info_t	*dip = vdp->xdf_dip;
1202 	char		*xsname;
1203 
1204 	/*
1205 	 * we can't be holding xdf_dev_lk because xenbus_printf() can
1206 	 * block while waiting for a PIL 1 interrupt message.  this
1207 	 * would cause a deadlock with xdf_intr() which needs to grab
1208 	 * xdf_dev_lk as well and runs at PIL 5.
1209 	 */
1210 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1211 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1212 
1213 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1214 		return (ENXIO);
1215 
1216 	/* Check if we support media requests */
1217 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1218 		return (ENOTTY);
1219 
1220 	/* If an eject is pending then don't allow any new requests */
1221 	if (xdf_eject_pending(vdp))
1222 		return (ENXIO);
1223 
1224 	/* Make sure that there is media present */
1225 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1226 		return (ENXIO);
1227 
1228 	/* We only allow operations when the device is ready and connected */
1229 	if (vdp->xdf_state != XD_READY)
1230 		return (EIO);
1231 
1232 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1233 		return (EIO);
1234 
1235 	return (0);
1236 }
1237 
1238 /*
1239  * populate a single blkif_request_t w/ a buf
1240  */
1241 static void
1242 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1243 {
1244 	grant_ref_t	gr;
1245 	uint8_t		fsect, lsect;
1246 	size_t		bcnt;
1247 	paddr_t		dma_addr;
1248 	off_t		blk_off;
1249 	dev_info_t	*dip = vdp->xdf_dip;
1250 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1251 	v_req_t		*vreq = BP_VREQ(bp);
1252 	uint64_t	blkno = vreq->v_blkno;
1253 	uint_t		ndmacs = vreq->v_ndmacs;
1254 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1255 	int		seg = 0;
1256 	int		isread = IS_READ(bp);
1257 	ge_slot_t	*gs = list_head(&vreq->v_gs);
1258 
1259 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1260 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1261 
1262 	if (isread)
1263 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1264 	else {
1265 		switch (vreq->v_flush_diskcache) {
1266 		case FLUSH_DISKCACHE:
1267 			ddi_put8(acchdl, &rreq->operation,
1268 			    BLKIF_OP_FLUSH_DISKCACHE);
1269 			ddi_put16(acchdl, &rreq->handle, vdev);
1270 			ddi_put64(acchdl, &rreq->id,
1271 			    (uint64_t)(uintptr_t)(gs));
1272 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1273 			vreq->v_status = VREQ_DMAWIN_DONE;
1274 			return;
1275 		case WRITE_BARRIER:
1276 			ddi_put8(acchdl, &rreq->operation,
1277 			    BLKIF_OP_WRITE_BARRIER);
1278 			break;
1279 		default:
1280 			if (!vdp->xdf_wce)
1281 				ddi_put8(acchdl, &rreq->operation,
1282 				    BLKIF_OP_WRITE_BARRIER);
1283 			else
1284 				ddi_put8(acchdl, &rreq->operation,
1285 				    BLKIF_OP_WRITE);
1286 			break;
1287 		}
1288 	}
1289 
1290 	ddi_put16(acchdl, &rreq->handle, vdev);
1291 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1292 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1293 
1294 	/*
1295 	 * loop until all segments are populated or no more dma cookie in buf
1296 	 */
1297 	for (;;) {
1298 		/*
1299 		 * Each segment of a blkif request can transfer up to
1300 		 * one 4K page of data.
1301 		 */
1302 		bcnt = vreq->v_dmac.dmac_size;
1303 		dma_addr = vreq->v_dmac.dmac_laddress;
1304 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1305 		fsect = blk_off >> XB_BSHIFT;
1306 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1307 
1308 		ASSERT(bcnt <= PAGESIZE);
1309 		ASSERT((bcnt % XB_BSIZE) == 0);
1310 		ASSERT((blk_off & XB_BMASK) == 0);
1311 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1312 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1313 
1314 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1315 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1316 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1317 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1318 
1319 		DPRINTF(IO_DBG, (
1320 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1321 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1322 		DPRINTF(IO_DBG, (
1323 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1324 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1325 
1326 		blkno += (bcnt >> XB_BSHIFT);
1327 		seg++;
1328 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1329 		if (--ndmacs) {
1330 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1331 			continue;
1332 		}
1333 
1334 		vreq->v_status = VREQ_DMAWIN_DONE;
1335 		vreq->v_blkno = blkno;
1336 		break;
1337 	}
1338 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1339 	DPRINTF(IO_DBG, (
1340 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1341 	    vdp->xdf_addr, rreq->id));
1342 }
1343 
1344 static void
1345 xdf_io_start(xdf_t *vdp)
1346 {
1347 	struct buf	*bp;
1348 	v_req_t		*vreq;
1349 	blkif_request_t	*rreq;
1350 	boolean_t	rreqready = B_FALSE;
1351 
1352 	mutex_enter(&vdp->xdf_dev_lk);
1353 
1354 	/*
1355 	 * Populate the ring request(s).  Loop until there is no buf to
1356 	 * transfer or no free slot available in I/O ring.
1357 	 */
1358 	for (;;) {
1359 		/* don't start any new IO if we're suspending */
1360 		if (vdp->xdf_suspending)
1361 			break;
1362 		if ((bp = xdf_bp_next(vdp)) == NULL)
1363 			break;
1364 
1365 		/* if the buf doesn't already have a vreq, allocate one */
1366 		if (((vreq = BP_VREQ(bp)) == NULL) &&
1367 		    ((vreq = vreq_get(vdp, bp)) == NULL))
1368 			break;
1369 
1370 		/* alloc DMA/GTE resources */
1371 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1372 			break;
1373 
1374 		/* get next blkif_request in the ring */
1375 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1376 			break;
1377 		bzero(rreq, sizeof (blkif_request_t));
1378 		rreqready = B_TRUE;
1379 
1380 		/* populate blkif_request with this buf */
1381 		xdf_process_rreq(vdp, bp, rreq);
1382 
1383 		/*
1384 		 * This buffer/vreq pair is has been allocated a ring buffer
1385 		 * resources, so if it isn't already in our runq, add it.
1386 		 */
1387 		if (!vreq->v_runq)
1388 			xdf_kstat_waitq_to_runq(vdp, bp);
1389 	}
1390 
1391 	/* Send the request(s) to the backend */
1392 	if (rreqready)
1393 		xdf_ring_push(vdp);
1394 
1395 	mutex_exit(&vdp->xdf_dev_lk);
1396 }
1397 
1398 
1399 /* check if partition is open, -1 - check all partitions on the disk */
1400 static boolean_t
1401 xdf_isopen(xdf_t *vdp, int partition)
1402 {
1403 	int i;
1404 	ulong_t parbit;
1405 	boolean_t rval = B_FALSE;
1406 
1407 	ASSERT((partition == -1) ||
1408 	    ((partition >= 0) || (partition < XDF_PEXT)));
1409 
1410 	if (partition == -1)
1411 		parbit = (ulong_t)-1;
1412 	else
1413 		parbit = 1 << partition;
1414 
1415 	for (i = 0; i < OTYPCNT; i++) {
1416 		if (vdp->xdf_vd_open[i] & parbit)
1417 			rval = B_TRUE;
1418 	}
1419 
1420 	return (rval);
1421 }
1422 
1423 /*
1424  * The connection should never be closed as long as someone is holding
1425  * us open, there is pending IO, or someone is waiting waiting for a
1426  * connection.
1427  */
1428 static boolean_t
1429 xdf_busy(xdf_t *vdp)
1430 {
1431 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1432 
1433 	if ((vdp->xdf_xb_ring != NULL) &&
1434 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1435 		ASSERT(vdp->xdf_state != XD_CLOSED);
1436 		return (B_TRUE);
1437 	}
1438 
1439 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1440 		ASSERT(vdp->xdf_state != XD_CLOSED);
1441 		return (B_TRUE);
1442 	}
1443 
1444 	if (xdf_isopen(vdp, -1)) {
1445 		ASSERT(vdp->xdf_state != XD_CLOSED);
1446 		return (B_TRUE);
1447 	}
1448 
1449 	if (vdp->xdf_connect_req > 0) {
1450 		ASSERT(vdp->xdf_state != XD_CLOSED);
1451 		return (B_TRUE);
1452 	}
1453 
1454 	return (B_FALSE);
1455 }
1456 
1457 static void
1458 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1459 {
1460 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1461 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1462 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1463 	    vdp->xdf_addr, vdp->xdf_state, new_state));
1464 	vdp->xdf_state = new_state;
1465 	cv_broadcast(&vdp->xdf_dev_cv);
1466 }
1467 
1468 static void
1469 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1470 {
1471 	dev_info_t	*dip = vdp->xdf_dip;
1472 	boolean_t	busy;
1473 
1474 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1475 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1476 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1477 
1478 	/* Check if we're already there. */
1479 	if (vdp->xdf_state == new_state)
1480 		return;
1481 
1482 	mutex_enter(&vdp->xdf_dev_lk);
1483 	busy = xdf_busy(vdp);
1484 
1485 	/* If we're already closed then there's nothing todo. */
1486 	if (vdp->xdf_state == XD_CLOSED) {
1487 		ASSERT(!busy);
1488 		xdf_set_state(vdp, new_state);
1489 		mutex_exit(&vdp->xdf_dev_lk);
1490 		return;
1491 	}
1492 
1493 #ifdef DEBUG
1494 	/* UhOh.  Warn the user that something bad has happened. */
1495 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1496 	    (vdp->xdf_xdev_nblocks != 0)) {
1497 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1498 		    vdp->xdf_addr);
1499 	}
1500 #endif /* DEBUG */
1501 
1502 	xdf_ring_destroy(vdp);
1503 
1504 	/* If we're busy then we can only go into the unknown state */
1505 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1506 	mutex_exit(&vdp->xdf_dev_lk);
1507 
1508 	/* if we're closed now, let the other end know */
1509 	if (vdp->xdf_state == XD_CLOSED)
1510 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1511 }
1512 
1513 
1514 /*
1515  * Kick-off connect process
1516  * Status should be XD_UNKNOWN or XD_CLOSED
1517  * On success, status will be changed to XD_INIT
1518  * On error, it will be changed to XD_UNKNOWN
1519  */
1520 static int
1521 xdf_setstate_init(xdf_t *vdp)
1522 {
1523 	dev_info_t		*dip = vdp->xdf_dip;
1524 	xenbus_transaction_t	xbt;
1525 	grant_ref_t		gref;
1526 	char			*xsname, *str;
1527 	int			rv;
1528 
1529 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1530 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1531 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1532 	    (vdp->xdf_state == XD_CLOSED));
1533 
1534 	DPRINTF(DDI_DBG,
1535 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1536 
1537 	/*
1538 	 * If an eject is pending then don't allow a new connection.
1539 	 * (Only the backend can clear media request eject request.)
1540 	 */
1541 	if (xdf_eject_pending(vdp))
1542 		return (DDI_FAILURE);
1543 
1544 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1545 		goto errout;
1546 
1547 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1548 		goto errout;
1549 
1550 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1551 
1552 	/*
1553 	 * Sanity check for the existance of the xenbus device-type property.
1554 	 * This property might not exist if our xenbus device nodes were
1555 	 * force destroyed while we were still connected to the backend.
1556 	 */
1557 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1558 		goto errout;
1559 	strfree(str);
1560 
1561 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1562 		goto errout;
1563 
1564 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1565 #ifdef XPV_HVM_DRIVER
1566 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1567 #else /* !XPV_HVM_DRIVER */
1568 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1569 	    DDI_SUCCESS) {
1570 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1571 		    "failed to add intr handler", vdp->xdf_addr);
1572 		goto errout1;
1573 	}
1574 #endif /* !XPV_HVM_DRIVER */
1575 
1576 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1577 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1578 	    DDI_SUCCESS) {
1579 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1580 		    vdp->xdf_addr);
1581 		goto errout2;
1582 	}
1583 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1584 
1585 	/*
1586 	 * Write into xenstore the info needed by backend
1587 	 */
1588 trans_retry:
1589 	if (xenbus_transaction_start(&xbt)) {
1590 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1591 		    vdp->xdf_addr);
1592 		xvdi_fatal_error(dip, EIO, "connect transaction init");
1593 		goto fail_trans;
1594 	}
1595 
1596 	/*
1597 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1598 	 * domains. However, it is not written for HVM domains, so let's
1599 	 * write it here.
1600 	 */
1601 	if (((rv = xenbus_printf(xbt, xsname,
1602 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1603 	    ((rv = xenbus_printf(xbt, xsname,
1604 	    XBP_RING_REF, "%u", gref)) != 0) ||
1605 	    ((rv = xenbus_printf(xbt, xsname,
1606 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1607 	    ((rv = xenbus_printf(xbt, xsname,
1608 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1609 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1610 		(void) xenbus_transaction_end(xbt, 1);
1611 		xvdi_fatal_error(dip, rv, "connect transaction setup");
1612 		goto fail_trans;
1613 	}
1614 
1615 	/* kick-off connect process */
1616 	if (rv = xenbus_transaction_end(xbt, 0)) {
1617 		if (rv == EAGAIN)
1618 			goto trans_retry;
1619 		xvdi_fatal_error(dip, rv, "connect transaction commit");
1620 		goto fail_trans;
1621 	}
1622 
1623 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1624 	mutex_enter(&vdp->xdf_dev_lk);
1625 	xdf_set_state(vdp, XD_INIT);
1626 	mutex_exit(&vdp->xdf_dev_lk);
1627 
1628 	return (DDI_SUCCESS);
1629 
1630 fail_trans:
1631 	xvdi_free_ring(vdp->xdf_xb_ring);
1632 errout2:
1633 #ifdef XPV_HVM_DRIVER
1634 	ec_unbind_evtchn(vdp->xdf_evtchn);
1635 #else /* !XPV_HVM_DRIVER */
1636 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1637 #endif /* !XPV_HVM_DRIVER */
1638 errout1:
1639 	xvdi_free_evtchn(dip);
1640 	vdp->xdf_evtchn = INVALID_EVTCHN;
1641 errout:
1642 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1643 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1644 	    vdp->xdf_addr);
1645 	return (DDI_FAILURE);
1646 }
1647 
1648 int
1649 xdf_get_flush_block(xdf_t *vdp)
1650 {
1651 	/*
1652 	 * Get a DEV_BSIZE aligned bufer
1653 	 */
1654 	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1655 	vdp->xdf_cache_flush_block =
1656 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1657 	    (int)vdp->xdf_xdev_secsize);
1658 
1659 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1660 	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1661 		return (DDI_FAILURE);
1662 	return (DDI_SUCCESS);
1663 }
1664 
1665 static void
1666 xdf_setstate_ready(void *arg)
1667 {
1668 	xdf_t		*vdp = (xdf_t *)arg;
1669 	dev_info_t	*dip = vdp->xdf_dip;
1670 
1671 	vdp->xdf_ready_tq_thread = curthread;
1672 
1673 	/* Create minor nodes now when we are almost connected */
1674 	mutex_enter(&vdp->xdf_dev_lk);
1675 	if (vdp->xdf_cmlb_reattach) {
1676 		vdp->xdf_cmlb_reattach = B_FALSE;
1677 		mutex_exit(&vdp->xdf_dev_lk);
1678 		if (xdf_cmlb_attach(vdp) != 0) {
1679 			cmn_err(CE_WARN,
1680 			    "xdf@%s: cmlb attach failed",
1681 			    ddi_get_name_addr(dip));
1682 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1683 			return;
1684 		}
1685 		mutex_enter(&vdp->xdf_dev_lk);
1686 	}
1687 
1688 	/* If we're not still trying to get to the ready state, then bail. */
1689 	if (vdp->xdf_state != XD_CONNECTED) {
1690 		mutex_exit(&vdp->xdf_dev_lk);
1691 		return;
1692 	}
1693 	mutex_exit(&vdp->xdf_dev_lk);
1694 
1695 	/*
1696 	 * If backend has feature-barrier, see if it supports disk
1697 	 * cache flush op.
1698 	 */
1699 	vdp->xdf_flush_supported = B_FALSE;
1700 	if (vdp->xdf_feature_barrier) {
1701 		/*
1702 		 * Pretend we already know flush is supported so probe
1703 		 * will attempt the correct op.
1704 		 */
1705 		vdp->xdf_flush_supported = B_TRUE;
1706 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1707 			vdp->xdf_flush_supported = B_TRUE;
1708 		} else {
1709 			vdp->xdf_flush_supported = B_FALSE;
1710 			/*
1711 			 * If the other end does not support the cache flush op
1712 			 * then we must use a barrier-write to force disk
1713 			 * cache flushing.  Barrier writes require that a data
1714 			 * block actually be written.
1715 			 * Cache a block to barrier-write when we are
1716 			 * asked to perform a flush.
1717 			 * XXX - would it be better to just copy 1 block
1718 			 * (512 bytes) from whatever write we did last
1719 			 * and rewrite that block?
1720 			 */
1721 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1722 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1723 				return;
1724 			}
1725 		}
1726 	}
1727 
1728 	mutex_enter(&vdp->xdf_cb_lk);
1729 	mutex_enter(&vdp->xdf_dev_lk);
1730 	if (vdp->xdf_state == XD_CONNECTED)
1731 		xdf_set_state(vdp, XD_READY);
1732 	mutex_exit(&vdp->xdf_dev_lk);
1733 
1734 	/* Restart any currently queued up io */
1735 	xdf_io_start(vdp);
1736 
1737 	mutex_exit(&vdp->xdf_cb_lk);
1738 }
1739 
1740 /*
1741  * synthetic geometry
1742  */
1743 #define	XDF_NSECTS	256
1744 #define	XDF_NHEADS	16
1745 
1746 static void
1747 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1748 {
1749 	xdf_t *vdp;
1750 	uint_t ncyl;
1751 
1752 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1753 
1754 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1755 
1756 	bzero(geomp, sizeof (*geomp));
1757 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1758 	geomp->g_acyl = 0;
1759 	geomp->g_nhead = XDF_NHEADS;
1760 	geomp->g_nsect = XDF_NSECTS;
1761 	geomp->g_secsize = vdp->xdf_xdev_secsize;
1762 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1763 	geomp->g_intrlv = 0;
1764 	geomp->g_rpm = 7200;
1765 }
1766 
1767 /*
1768  * Finish other initialization after we've connected to backend
1769  * Status should be XD_INIT before calling this routine
1770  * On success, status should be changed to XD_CONNECTED.
1771  * On error, status should stay XD_INIT
1772  */
1773 static int
1774 xdf_setstate_connected(xdf_t *vdp)
1775 {
1776 	dev_info_t	*dip = vdp->xdf_dip;
1777 	cmlb_geom_t	pgeom;
1778 	diskaddr_t	nblocks = 0;
1779 	uint_t		secsize = 0;
1780 	char		*oename, *xsname, *str;
1781 	uint_t		dinfo;
1782 
1783 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1784 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1785 	ASSERT(vdp->xdf_state == XD_INIT);
1786 
1787 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1788 	    ((oename = xvdi_get_oename(dip)) == NULL))
1789 		return (DDI_FAILURE);
1790 
1791 	/* Make sure the other end is XenbusStateConnected */
1792 	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1793 		return (DDI_FAILURE);
1794 
1795 	/* Determine if feature barrier is supported by backend */
1796 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1797 		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1798 		    vdp->xdf_addr);
1799 
1800 	/*
1801 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1802 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1803 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1804 	 * we always set VDISK_CDROM, regardless of if it's present in
1805 	 * the xenbus info parameter.
1806 	 */
1807 	if (xenbus_gather(XBT_NULL, oename,
1808 	    XBP_SECTORS, "%"SCNu64, &nblocks,
1809 	    XBP_SECTOR_SIZE, "%u", &secsize,
1810 	    XBP_INFO, "%u", &dinfo,
1811 	    NULL) != 0) {
1812 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1813 		    "cannot read backend info", vdp->xdf_addr);
1814 		return (DDI_FAILURE);
1815 	}
1816 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1817 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1818 		    vdp->xdf_addr);
1819 		return (DDI_FAILURE);
1820 	}
1821 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1822 		dinfo |= VDISK_CDROM;
1823 	strfree(str);
1824 
1825 	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1826 		secsize = DEV_BSIZE;
1827 	vdp->xdf_xdev_nblocks = nblocks;
1828 	vdp->xdf_xdev_secsize = secsize;
1829 #ifdef _ILP32
1830 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1831 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1832 		    "backend disk device too large with %llu blocks for"
1833 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1834 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1835 		return (DDI_FAILURE);
1836 	}
1837 #endif
1838 
1839 	/*
1840 	 * If the physical geometry for a fixed disk has been explicity
1841 	 * set then make sure that the specified physical geometry isn't
1842 	 * larger than the device we connected to.
1843 	 */
1844 	if (vdp->xdf_pgeom_fixed &&
1845 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1846 		cmn_err(CE_WARN,
1847 		    "xdf@%s: connect failed, fixed geometry too large",
1848 		    vdp->xdf_addr);
1849 		return (DDI_FAILURE);
1850 	}
1851 
1852 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1853 
1854 	/* mark vbd is ready for I/O */
1855 	mutex_enter(&vdp->xdf_dev_lk);
1856 	xdf_set_state(vdp, XD_CONNECTED);
1857 
1858 	/* check if the cmlb label should be updated */
1859 	xdf_synthetic_pgeom(dip, &pgeom);
1860 	if ((vdp->xdf_dinfo != dinfo) ||
1861 	    (!vdp->xdf_pgeom_fixed &&
1862 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1863 		vdp->xdf_cmlb_reattach = B_TRUE;
1864 
1865 		vdp->xdf_dinfo = dinfo;
1866 		if (!vdp->xdf_pgeom_fixed)
1867 			vdp->xdf_pgeom = pgeom;
1868 	}
1869 
1870 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1871 		if (vdp->xdf_xdev_nblocks == 0) {
1872 			vdp->xdf_mstate = DKIO_EJECTED;
1873 			cv_broadcast(&vdp->xdf_mstate_cv);
1874 		} else {
1875 			vdp->xdf_mstate = DKIO_INSERTED;
1876 			cv_broadcast(&vdp->xdf_mstate_cv);
1877 		}
1878 	} else {
1879 		if (vdp->xdf_mstate != DKIO_NONE) {
1880 			vdp->xdf_mstate = DKIO_NONE;
1881 			cv_broadcast(&vdp->xdf_mstate_cv);
1882 		}
1883 	}
1884 
1885 	mutex_exit(&vdp->xdf_dev_lk);
1886 
1887 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1888 	    (uint64_t)vdp->xdf_xdev_nblocks);
1889 
1890 	/* Restart any currently queued up io */
1891 	xdf_io_start(vdp);
1892 
1893 	/*
1894 	 * To get to the ready state we have to do IO to the backend device,
1895 	 * but we can't initiate IO from the other end change callback thread
1896 	 * (which is the current context we're executing in.)  This is because
1897 	 * if the other end disconnects while we're doing IO from the callback
1898 	 * thread, then we can't receive that disconnect event and we hang
1899 	 * waiting for an IO that can never complete.
1900 	 */
1901 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1902 	    DDI_SLEEP);
1903 
1904 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1905 	return (DDI_SUCCESS);
1906 }
1907 
1908 /*ARGSUSED*/
1909 static void
1910 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1911 {
1912 	XenbusState new_state = *(XenbusState *)impl_data;
1913 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1914 
1915 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1916 	    vdp->xdf_addr, new_state));
1917 
1918 	mutex_enter(&vdp->xdf_cb_lk);
1919 
1920 	/* We assume that this callback is single threaded */
1921 	ASSERT(vdp->xdf_oe_change_thread == NULL);
1922 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1923 
1924 	/* ignore any backend state changes if we're suspending/suspended */
1925 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1926 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1927 		mutex_exit(&vdp->xdf_cb_lk);
1928 		return;
1929 	}
1930 
1931 	switch (new_state) {
1932 	case XenbusStateUnknown:
1933 	case XenbusStateInitialising:
1934 	case XenbusStateInitWait:
1935 	case XenbusStateInitialised:
1936 		if (vdp->xdf_state == XD_INIT)
1937 			break;
1938 
1939 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1940 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1941 			break;
1942 		ASSERT(vdp->xdf_state == XD_INIT);
1943 		break;
1944 
1945 	case XenbusStateConnected:
1946 		if ((vdp->xdf_state == XD_CONNECTED) ||
1947 		    (vdp->xdf_state == XD_READY))
1948 			break;
1949 
1950 		if (vdp->xdf_state != XD_INIT) {
1951 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1952 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1953 				break;
1954 			ASSERT(vdp->xdf_state == XD_INIT);
1955 		}
1956 
1957 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1958 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1959 			break;
1960 		}
1961 		ASSERT(vdp->xdf_state == XD_CONNECTED);
1962 		break;
1963 
1964 	case XenbusStateClosing:
1965 		if (xdf_isopen(vdp, -1)) {
1966 			cmn_err(CE_NOTE,
1967 			    "xdf@%s: hot-unplug failed, still in use",
1968 			    vdp->xdf_addr);
1969 			break;
1970 		}
1971 		/*FALLTHROUGH*/
1972 	case XenbusStateClosed:
1973 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1974 		break;
1975 	}
1976 
1977 	/* notify anybody waiting for oe state change */
1978 	cv_broadcast(&vdp->xdf_dev_cv);
1979 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1980 	mutex_exit(&vdp->xdf_cb_lk);
1981 }
1982 
1983 static int
1984 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1985 {
1986 	int	rv, timeouts = 0, reset = 20;
1987 
1988 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1989 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1990 
1991 	/* we can't connect once we're in the closed state */
1992 	if (vdp->xdf_state == XD_CLOSED)
1993 		return (XD_CLOSED);
1994 
1995 	vdp->xdf_connect_req++;
1996 	while (vdp->xdf_state != XD_READY) {
1997 		mutex_exit(&vdp->xdf_dev_lk);
1998 
1999 		/* only one thread at a time can be the connection thread */
2000 		if (vdp->xdf_connect_thread == NULL)
2001 			vdp->xdf_connect_thread = curthread;
2002 
2003 		if (vdp->xdf_connect_thread == curthread) {
2004 			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
2005 				/*
2006 				 * If we haven't establised a connection
2007 				 * within the reset time, then disconnect
2008 				 * so we can try again, and double the reset
2009 				 * time.  The reset time starts at 2 sec.
2010 				 */
2011 				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2012 				reset *= 2;
2013 			}
2014 			if (vdp->xdf_state == XD_UNKNOWN)
2015 				(void) xdf_setstate_init(vdp);
2016 			if (vdp->xdf_state == XD_INIT)
2017 				(void) xdf_setstate_connected(vdp);
2018 		}
2019 
2020 		mutex_enter(&vdp->xdf_dev_lk);
2021 		if (!wait || (vdp->xdf_state == XD_READY))
2022 			goto out;
2023 
2024 		mutex_exit((&vdp->xdf_cb_lk));
2025 		if (vdp->xdf_connect_thread != curthread) {
2026 			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2027 		} else {
2028 			/* delay for 0.1 sec */
2029 			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2030 			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2031 			    TR_CLOCK_TICK);
2032 			if (rv == -1)
2033 				timeouts++;
2034 		}
2035 		mutex_exit((&vdp->xdf_dev_lk));
2036 		mutex_enter((&vdp->xdf_cb_lk));
2037 		mutex_enter((&vdp->xdf_dev_lk));
2038 		if (rv == 0)
2039 			goto out;
2040 	}
2041 
2042 out:
2043 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2044 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2045 
2046 	if (vdp->xdf_connect_thread == curthread) {
2047 		/*
2048 		 * wake up someone else so they can become the connection
2049 		 * thread.
2050 		 */
2051 		cv_signal(&vdp->xdf_dev_cv);
2052 		vdp->xdf_connect_thread = NULL;
2053 	}
2054 
2055 	/* Try to lock the media */
2056 	mutex_exit((&vdp->xdf_dev_lk));
2057 	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2058 	mutex_enter((&vdp->xdf_dev_lk));
2059 
2060 	vdp->xdf_connect_req--;
2061 	return (vdp->xdf_state);
2062 }
2063 
2064 static uint_t
2065 xdf_iorestart(caddr_t arg)
2066 {
2067 	xdf_t *vdp = (xdf_t *)arg;
2068 
2069 	ASSERT(vdp != NULL);
2070 
2071 	mutex_enter(&vdp->xdf_dev_lk);
2072 	ASSERT(ISDMACBON(vdp));
2073 	SETDMACBOFF(vdp);
2074 	mutex_exit(&vdp->xdf_dev_lk);
2075 
2076 	xdf_io_start(vdp);
2077 
2078 	return (DDI_INTR_CLAIMED);
2079 }
2080 
2081 #ifdef XPV_HVM_DRIVER
2082 
/*
 * One entry per xdf device registered on xdf_hvm_list, pairing the
 * device's pathname with its dev_info node so that either one can be
 * used to look up the other (see i_xdf_hvm_find()).
 */
typedef struct xdf_hvm_entry {
	list_node_t	xdf_he_list;	/* linkage on xdf_hvm_list */
	char		*xdf_he_path;	/* allocated copy of the device path */
	dev_info_t	*xdf_he_dip;	/* dev_info node of the device */
} xdf_hvm_entry_t;

/* List of xdf_hvm_entry_t; accessed with xdf_hvm_list_lock held. */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2091 
2092 static xdf_hvm_entry_t *
2093 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2094 {
2095 	xdf_hvm_entry_t	*i;
2096 
2097 	ASSERT((path != NULL) || (dip != NULL));
2098 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2099 
2100 	i = list_head(&xdf_hvm_list);
2101 	while (i != NULL) {
2102 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2103 			i = list_next(&xdf_hvm_list, i);
2104 			continue;
2105 		}
2106 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2107 			i = list_next(&xdf_hvm_list, i);
2108 			continue;
2109 		}
2110 		break;
2111 	}
2112 	return (i);
2113 }
2114 
2115 dev_info_t *
2116 xdf_hvm_hold(const char *path)
2117 {
2118 	xdf_hvm_entry_t	*i;
2119 	dev_info_t	*dip;
2120 
2121 	mutex_enter(&xdf_hvm_list_lock);
2122 	i = i_xdf_hvm_find(path, NULL);
2123 	if (i == NULL) {
2124 		mutex_exit(&xdf_hvm_list_lock);
2125 		return (B_FALSE);
2126 	}
2127 	ndi_hold_devi(dip = i->xdf_he_dip);
2128 	mutex_exit(&xdf_hvm_list_lock);
2129 	return (dip);
2130 }
2131 
2132 static void
2133 xdf_hvm_add(dev_info_t *dip)
2134 {
2135 	xdf_hvm_entry_t	*i;
2136 	char		*path;
2137 
2138 	/* figure out the path for the dip */
2139 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2140 	(void) ddi_pathname(dip, path);
2141 
2142 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2143 	i->xdf_he_dip = dip;
2144 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2145 
2146 	mutex_enter(&xdf_hvm_list_lock);
2147 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2148 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2149 	list_insert_head(&xdf_hvm_list, i);
2150 	mutex_exit(&xdf_hvm_list_lock);
2151 
2152 	kmem_free(path, MAXPATHLEN);
2153 }
2154 
2155 static void
2156 xdf_hvm_rm(dev_info_t *dip)
2157 {
2158 	xdf_hvm_entry_t	*i;
2159 
2160 	mutex_enter(&xdf_hvm_list_lock);
2161 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2162 	list_remove(&xdf_hvm_list, i);
2163 	mutex_exit(&xdf_hvm_list_lock);
2164 
2165 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2166 	kmem_free(i, sizeof (*i));
2167 }
2168 
2169 static void
2170 xdf_hvm_init(void)
2171 {
2172 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2173 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2174 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2175 }
2176 
2177 static void
2178 xdf_hvm_fini(void)
2179 {
2180 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2181 	list_destroy(&xdf_hvm_list);
2182 	mutex_destroy(&xdf_hvm_list_lock);
2183 }
2184 
/*
 * Wait for the backend hotplug scripts to finish and then try to
 * establish a connection to the backend.
 *
 * Returns B_TRUE only if xdf_connect_locked() reports XD_READY.  B_FALSE
 * is returned if the backend xenbus path can't be obtained, if the wait
 * is interrupted by a signal, if this is a cd device whose backend
 * doesn't advertise media request support, or if the connection doesn't
 * reach XD_READY.
 */
boolean_t
xdf_hvm_connect(dev_info_t *dip)
{
	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
	char	*oename, *str;
	int	rv;

	mutex_enter(&vdp->xdf_cb_lk);

	/*
	 * Before trying to establish a connection we need to wait for the
	 * backend hotplug scripts to have run.  Once they are run the
	 * "<oename>/hotplug-status" property will be set to "connected".
	 */
	for (;;) {
		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));

		/*
		 * Get the xenbus path to the backend device.  Note that
		 * we can't cache this path (and we look it up on each pass
		 * through this loop) because it could change during
		 * suspend, resume, and migration operations.
		 */
		if ((oename = xvdi_get_oename(dip)) == NULL) {
			mutex_exit(&vdp->xdf_cb_lk);
			return (B_FALSE);
		}

		/*
		 * On the success path we leave the loop with str still
		 * allocated; it is freed right after the loop.
		 */
		str = NULL;
		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
			break;

		/* the property was read but isn't "connected" yet */
		if (str != NULL)
			strfree(str);

		/* wait for an update to "<oename>/hotplug-status" */
		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
			/* we got interrupted by a signal */
			mutex_exit(&vdp->xdf_cb_lk);
			return (B_FALSE);
		}
	}

	/* Good news.  The backend hotplug scripts have been run. */
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
	strfree(str);

	/*
	 * If we're emulating a cd device and if the backend doesn't support
	 * media request operations, then we're not going to bother trying
	 * to establish a connection for a couple reasons.  First off, media
	 * requests support is required to support operations like eject and
	 * media locking.  Second, other backend platforms like Linux don't
	 * support hvm pv cdrom access.  They don't even have a backend pv
	 * driver for cdrom device nodes, so we don't want to block forever
	 * waiting for a connection to a backend driver that doesn't exist.
	 */
	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
		mutex_exit(&vdp->xdf_cb_lk);
		return (B_FALSE);
	}

	/* xdf_connect_locked() is called with both locks held */
	mutex_enter(&vdp->xdf_dev_lk);
	rv = xdf_connect_locked(vdp, B_TRUE);
	mutex_exit(&vdp->xdf_dev_lk);
	mutex_exit(&vdp->xdf_cb_lk);

	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
}
2256 
2257 int
2258 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2259 {
2260 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2261 
2262 	/* sanity check the requested physical geometry */
2263 	mutex_enter(&vdp->xdf_dev_lk);
2264 	if ((geomp->g_secsize != XB_BSIZE) ||
2265 	    (geomp->g_capacity == 0)) {
2266 		mutex_exit(&vdp->xdf_dev_lk);
2267 		return (EINVAL);
2268 	}
2269 
2270 	/*
2271 	 * If we've already connected to the backend device then make sure
2272 	 * we're not defining a physical geometry larger than our backend
2273 	 * device.
2274 	 */
2275 	if ((vdp->xdf_xdev_nblocks != 0) &&
2276 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2277 		mutex_exit(&vdp->xdf_dev_lk);
2278 		return (EINVAL);
2279 	}
2280 
2281 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2282 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2283 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2284 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2285 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2286 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2287 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2288 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2289 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2290 
2291 	vdp->xdf_pgeom_fixed = B_TRUE;
2292 	mutex_exit(&vdp->xdf_dev_lk);
2293 
2294 	/* force a re-validation */
2295 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2296 
2297 	return (0);
2298 }
2299 
2300 boolean_t
2301 xdf_is_cd(dev_info_t *dip)
2302 {
2303 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2304 	boolean_t	rv;
2305 
2306 	mutex_enter(&vdp->xdf_cb_lk);
2307 	rv = XD_IS_CD(vdp);
2308 	mutex_exit(&vdp->xdf_cb_lk);
2309 	return (rv);
2310 }
2311 
2312 boolean_t
2313 xdf_is_rm(dev_info_t *dip)
2314 {
2315 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2316 	boolean_t	rv;
2317 
2318 	mutex_enter(&vdp->xdf_cb_lk);
2319 	rv = XD_IS_RM(vdp);
2320 	mutex_exit(&vdp->xdf_cb_lk);
2321 	return (rv);
2322 }
2323 
2324 boolean_t
2325 xdf_media_req_supported(dev_info_t *dip)
2326 {
2327 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2328 	boolean_t	rv;
2329 
2330 	mutex_enter(&vdp->xdf_cb_lk);
2331 	rv = vdp->xdf_media_req_supported;
2332 	mutex_exit(&vdp->xdf_cb_lk);
2333 	return (rv);
2334 }
2335 
2336 #endif /* XPV_HVM_DRIVER */
2337 
2338 static int
2339 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2340 {
2341 	xdf_t *vdp;
2342 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2343 
2344 	if (vdp == NULL)
2345 		return (ENXIO);
2346 
2347 	mutex_enter(&vdp->xdf_dev_lk);
2348 	*capp = vdp->xdf_pgeom.g_capacity;
2349 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2350 	mutex_exit(&vdp->xdf_dev_lk);
2351 	return (0);
2352 }
2353 
2354 static int
2355 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2356 {
2357 	xdf_t *vdp;
2358 
2359 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2360 		return (ENXIO);
2361 	*geomp = vdp->xdf_pgeom;
2362 	return (0);
2363 }
2364 
/*
 * No real HBA, no geometry available from it.
 *
 * A virtual geometry is never provided: both arguments are ignored and
 * EINVAL is always returned.
 */
/*ARGSUSED*/
static int
xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	return (EINVAL);
}
2374 
2375 static int
2376 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2377 {
2378 	xdf_t *vdp;
2379 
2380 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2381 		return (ENXIO);
2382 
2383 	if (XD_IS_RO(vdp))
2384 		tgattributep->media_is_writable = 0;
2385 	else
2386 		tgattributep->media_is_writable = 1;
2387 	tgattributep->media_is_rotational = 0;
2388 	return (0);
2389 }
2390 
2391 /* ARGSUSED3 */
2392 int
2393 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2394 {
2395 	int instance;
2396 	xdf_t   *vdp;
2397 
2398 	instance = ddi_get_instance(dip);
2399 
2400 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2401 		return (ENXIO);
2402 
2403 	switch (cmd) {
2404 	case TG_GETPHYGEOM:
2405 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2406 	case TG_GETVIRTGEOM:
2407 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2408 	case TG_GETCAPACITY:
2409 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2410 	case TG_GETBLOCKSIZE:
2411 		mutex_enter(&vdp->xdf_cb_lk);
2412 		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
2413 		mutex_exit(&vdp->xdf_cb_lk);
2414 		return (0);
2415 	case TG_GETATTR:
2416 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2417 	default:
2418 		return (ENOTTY);
2419 	}
2420 }
2421 
2422 /* ARGSUSED5 */
2423 int
2424 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2425     diskaddr_t start, size_t reqlen, void *tg_cookie)
2426 {
2427 	xdf_t *vdp;
2428 	struct buf *bp;
2429 	int err = 0;
2430 
2431 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2432 
2433 	/* We don't allow IO from the oe_change callback thread */
2434 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2435 
2436 	/*
2437 	 * Having secsize of 0 means that device isn't connected yet.
2438 	 * FIXME This happens for CD devices, and there's nothing we
2439 	 * can do about it at the moment.
2440 	 */
2441 	if (vdp->xdf_xdev_secsize == 0)
2442 		return (EIO);
2443 
2444 	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2445 	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2446 		return (EINVAL);
2447 
2448 	bp = getrbuf(KM_SLEEP);
2449 	if (cmd == TG_READ)
2450 		bp->b_flags = B_BUSY | B_READ;
2451 	else
2452 		bp->b_flags = B_BUSY | B_WRITE;
2453 
2454 	bp->b_un.b_addr = bufp;
2455 	bp->b_bcount = reqlen;
2456 	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2457 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2458 
2459 	mutex_enter(&vdp->xdf_dev_lk);
2460 	xdf_bp_push(vdp, bp);
2461 	mutex_exit(&vdp->xdf_dev_lk);
2462 	xdf_io_start(vdp);
2463 	if (curthread == vdp->xdf_ready_tq_thread)
2464 		(void) xdf_ring_drain(vdp);
2465 	err = biowait(bp);
2466 	ASSERT(bp->b_flags & B_DONE);
2467 	freerbuf(bp);
2468 	return (err);
2469 }
2470 
2471 /*
2472  * Lock the current media.  Set the media state to "lock".
2473  * (Media locks are only respected by the backend driver.)
2474  */
2475 static int
2476 xdf_ioctl_mlock(xdf_t *vdp)
2477 {
2478 	int rv;
2479 	mutex_enter(&vdp->xdf_cb_lk);
2480 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2481 	mutex_exit(&vdp->xdf_cb_lk);
2482 	return (rv);
2483 }
2484 
2485 /*
2486  * Release a media lock.  Set the media state to "none".
2487  */
2488 static int
2489 xdf_ioctl_munlock(xdf_t *vdp)
2490 {
2491 	int rv;
2492 	mutex_enter(&vdp->xdf_cb_lk);
2493 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2494 	mutex_exit(&vdp->xdf_cb_lk);
2495 	return (rv);
2496 }
2497 
2498 /*
2499  * Eject the current media.  Ignores any media locks.  (Media locks
2500  * are only for benifit of the the backend.)
2501  */
2502 static int
2503 xdf_ioctl_eject(xdf_t *vdp)
2504 {
2505 	int rv;
2506 
2507 	mutex_enter(&vdp->xdf_cb_lk);
2508 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2509 		mutex_exit(&vdp->xdf_cb_lk);
2510 		return (rv);
2511 	}
2512 
2513 	/*
2514 	 * We've set the media requests xenbus parameter to eject, so now
2515 	 * disconnect from the backend, wait for the backend to clear
2516 	 * the media requets xenbus paramter, and then we can reconnect
2517 	 * to the backend.
2518 	 */
2519 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2520 	mutex_enter(&vdp->xdf_dev_lk);
2521 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2522 		mutex_exit(&vdp->xdf_dev_lk);
2523 		mutex_exit(&vdp->xdf_cb_lk);
2524 		return (EIO);
2525 	}
2526 	mutex_exit(&vdp->xdf_dev_lk);
2527 	mutex_exit(&vdp->xdf_cb_lk);
2528 	return (0);
2529 }
2530 
2531 /*
2532  * Watch for media state changes.  This can be an insertion of a device
2533  * (triggered by a 'xm block-configure' request in another domain) or
2534  * the ejection of a device (triggered by a local "eject" operation).
2535  * For a full description of the DKIOCSTATE ioctl behavior see dkio(4I).
2536  */
2537 static int
2538 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2539 {
2540 	enum dkio_state		prev_state;
2541 
2542 	mutex_enter(&vdp->xdf_cb_lk);
2543 	prev_state = vdp->xdf_mstate;
2544 
2545 	if (vdp->xdf_mstate == mstate) {
2546 		while (vdp->xdf_mstate == prev_state) {
2547 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2548 			    &vdp->xdf_cb_lk) == 0) {
2549 				mutex_exit(&vdp->xdf_cb_lk);
2550 				return (EINTR);
2551 			}
2552 		}
2553 	}
2554 
2555 	if ((prev_state != DKIO_INSERTED) &&
2556 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2557 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2558 		mutex_exit(&vdp->xdf_cb_lk);
2559 		return (0);
2560 	}
2561 
2562 	mutex_exit(&vdp->xdf_cb_lk);
2563 	return (0);
2564 }
2565 
2566 /*ARGSUSED*/
2567 static int
2568 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2569     int *rvalp)
2570 {
2571 	minor_t		minor = getminor(dev);
2572 	int		part = XDF_PART(minor);
2573 	xdf_t		*vdp;
2574 	int		rv;
2575 
2576 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2577 	    (!xdf_isopen(vdp, part)))
2578 		return (ENXIO);
2579 
2580 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2581 	    vdp->xdf_addr, cmd, cmd));
2582 
2583 	switch (cmd) {
2584 	default:
2585 		return (ENOTTY);
2586 	case DKIOCG_PHYGEOM:
2587 	case DKIOCG_VIRTGEOM:
2588 	case DKIOCGGEOM:
2589 	case DKIOCSGEOM:
2590 	case DKIOCGAPART:
2591 	case DKIOCSAPART:
2592 	case DKIOCGVTOC:
2593 	case DKIOCSVTOC:
2594 	case DKIOCPARTINFO:
2595 	case DKIOCGEXTVTOC:
2596 	case DKIOCSEXTVTOC:
2597 	case DKIOCEXTPARTINFO:
2598 	case DKIOCGMBOOT:
2599 	case DKIOCSMBOOT:
2600 	case DKIOCGETEFI:
2601 	case DKIOCSETEFI:
2602 	case DKIOCSETEXTPART:
2603 	case DKIOCPARTITION:
2604 		rv = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2605 		    rvalp, NULL);
2606 		if (rv != 0)
2607 			return (rv);
2608 		/*
2609 		 * If we're labelling the disk, we have to update the geometry
2610 		 * in the cmlb data structures, and we also have to write a new
2611 		 * devid to the disk.  Note that writing an EFI label currently
2612 		 * requires 4 ioctls, and devid setup will fail on all but the
2613 		 * last.
2614 		 */
2615 		if (cmd == DKIOCSEXTVTOC || cmd == DKIOCSVTOC ||
2616 		    cmd == DKIOCSETEFI) {
2617 			rv = cmlb_validate(vdp->xdf_vd_lbl, 0, 0);
2618 			if (rv == 0) {
2619 				xdf_devid_setup(vdp);
2620 			} else {
2621 				cmn_err(CE_WARN,
2622 				    "xdf@%s, labeling failed on validate",
2623 				    vdp->xdf_addr);
2624 			}
2625 		}
2626 		return (rv);
2627 	case FDEJECT:
2628 	case DKIOCEJECT:
2629 	case CDROMEJECT:
2630 		return (xdf_ioctl_eject(vdp));
2631 	case DKIOCLOCK:
2632 		return (xdf_ioctl_mlock(vdp));
2633 	case DKIOCUNLOCK:
2634 		return (xdf_ioctl_munlock(vdp));
2635 	case CDROMREADOFFSET: {
2636 		int offset = 0;
2637 		if (!XD_IS_CD(vdp))
2638 			return (ENOTTY);
2639 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2640 			return (EFAULT);
2641 		return (0);
2642 	}
2643 	case DKIOCGMEDIAINFO: {
2644 		struct dk_minfo media_info;
2645 
2646 		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2647 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2648 		if (XD_IS_CD(vdp))
2649 			media_info.dki_media_type = DK_CDROM;
2650 		else
2651 			media_info.dki_media_type = DK_FIXED_DISK;
2652 
2653 		if (ddi_copyout(&media_info, (void *)arg,
2654 		    sizeof (struct dk_minfo), mode))
2655 			return (EFAULT);
2656 		return (0);
2657 	}
2658 	case DKIOCINFO: {
2659 		struct dk_cinfo info;
2660 
2661 		/* controller information */
2662 		if (XD_IS_CD(vdp))
2663 			info.dki_ctype = DKC_CDROM;
2664 		else
2665 			info.dki_ctype = DKC_VBD;
2666 
2667 		info.dki_cnum = 0;
2668 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2669 
2670 		/* unit information */
2671 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2672 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2673 		info.dki_flags = DKI_FMTVOL;
2674 		info.dki_partition = part;
2675 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2676 		info.dki_addr = 0;
2677 		info.dki_space = 0;
2678 		info.dki_prio = 0;
2679 		info.dki_vec = 0;
2680 
2681 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2682 			return (EFAULT);
2683 		return (0);
2684 	}
2685 	case DKIOCSTATE: {
2686 		enum dkio_state mstate;
2687 
2688 		if (ddi_copyin((void *)arg, &mstate,
2689 		    sizeof (mstate), mode) != 0)
2690 			return (EFAULT);
2691 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2692 			return (rv);
2693 		mstate = vdp->xdf_mstate;
2694 		if (ddi_copyout(&mstate, (void *)arg,
2695 		    sizeof (mstate), mode) != 0)
2696 			return (EFAULT);
2697 		return (0);
2698 	}
2699 	case DKIOCREMOVABLE: {
2700 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2701 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2702 			return (EFAULT);
2703 		return (0);
2704 	}
2705 	case DKIOCGETWCE: {
2706 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2707 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2708 			return (EFAULT);
2709 		return (0);
2710 	}
2711 	case DKIOCSETWCE: {
2712 		int i;
2713 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2714 			return (EFAULT);
2715 		vdp->xdf_wce = VOID2BOOLEAN(i);
2716 		return (0);
2717 	}
2718 	case DKIOCFLUSHWRITECACHE: {
2719 		struct dk_callback *dkc = (struct dk_callback *)arg;
2720 
2721 		if (vdp->xdf_flush_supported) {
2722 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2723 			    NULL, 0, 0, (void *)dev);
2724 		} else if (vdp->xdf_feature_barrier &&
2725 		    !xdf_barrier_flush_disable) {
2726 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2727 			    vdp->xdf_cache_flush_block, xdf_flush_block,
2728 			    vdp->xdf_xdev_secsize, (void *)dev);
2729 		} else {
2730 			return (ENOTTY);
2731 		}
2732 		if ((mode & FKIOCTL) && (dkc != NULL) &&
2733 		    (dkc->dkc_callback != NULL)) {
2734 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2735 			/* need to return 0 after calling callback */
2736 			rv = 0;
2737 		}
2738 		return (rv);
2739 	}
2740 	}
2741 	/*NOTREACHED*/
2742 }
2743 
/*
 * strategy(9E) entry point: validate a buf and queue it for transfer.
 *
 * Always returns 0; failures are reported through the buf itself via
 * xdf_io_err().  b_private is XB_SLICE_NONE on entry when b_blkno is an
 * absolute offset; otherwise the offset is relative to the partition
 * named by b_edev.  On the way out b_private is overwritten with the
 * partition's starting block.
 */
static int
xdf_strategy(struct buf *bp)
{
	xdf_t	*vdp;
	minor_t minor;
	diskaddr_t p_blkct, p_blkst;
	daddr_t blkno;
	ulong_t nblks;
	int part;

	minor = getminor(bp->b_edev);
	part = XDF_PART(minor);
	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));

	mutex_enter(&vdp->xdf_dev_lk);
	if (!xdf_isopen(vdp, part)) {
		mutex_exit(&vdp->xdf_dev_lk);
		xdf_io_err(bp, ENXIO, 0);
		return (0);
	}

	/* We don't allow IO from the oe_change callback thread */
	ASSERT(curthread != vdp->xdf_oe_change_thread);

	/* Check for writes to a read only device */
	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
		mutex_exit(&vdp->xdf_dev_lk);
		xdf_io_err(bp, EROFS, 0);
		return (0);
	}

	/* Check if this I/O is accessing a partition or the entire disk */
	if ((long)bp->b_private == XB_SLICE_NONE) {
		/* This I/O is using an absolute offset */
		p_blkct = vdp->xdf_xdev_nblocks;
		p_blkst = 0;
	} else {
		/*
		 * This I/O is using a partition relative offset.  The
		 * device lock is dropped around the cmlb_partinfo() call
		 * and re-acquired only if the lookup succeeds.
		 */
		mutex_exit(&vdp->xdf_dev_lk);
		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
		    &p_blkst, NULL, NULL, NULL)) {
			xdf_io_err(bp, ENXIO, 0);
			return (0);
		}
		mutex_enter(&vdp->xdf_dev_lk);
	}

	/*
	 * Adjust the real blkno and bcount according to the underlying
	 * physical sector size.
	 *
	 * NOTE(review): this divides by (xdf_xdev_secsize / XB_BSIZE)
	 * while the code further down explicitly allows for a sector size
	 * of 0 -- confirm that the sector size can't be 0 (or smaller than
	 * XB_BSIZE) by the time we get here.
	 */
	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);

	/* check for a starting block beyond the disk or partition limit */
	if (blkno > p_blkct) {
		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
		mutex_exit(&vdp->xdf_dev_lk);
		xdf_io_err(bp, EINVAL, 0);
		return (0);
	}

	/*
	 * Legacy: don't set error flag at this case.  A transfer starting
	 * exactly at the limit completes with nothing transferred.
	 */
	if (blkno == p_blkct) {
		mutex_exit(&vdp->xdf_dev_lk);
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	/* sanitize the input buf */
	bioerror(bp, 0);
	bp->b_resid = 0;
	bp->av_back = bp->av_forw = NULL;

	/* Adjust for partial transfer, this will result in an error later */
	if (vdp->xdf_xdev_secsize != 0 &&
	    vdp->xdf_xdev_secsize != XB_BSIZE) {
		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
	} else {
		nblks = bp->b_bcount >> XB_BSHIFT;
	}

	/*
	 * Trim a transfer that runs past the limit: the excess is recorded
	 * in b_resid and removed from b_bcount.
	 */
	if ((blkno + nblks) > p_blkct) {
		if (vdp->xdf_xdev_secsize != 0 &&
		    vdp->xdf_xdev_secsize != XB_BSIZE) {
			bp->b_resid =
			    ((blkno + nblks) - p_blkct) *
			    vdp->xdf_xdev_secsize;
		} else {
			bp->b_resid =
			    ((blkno + nblks) - p_blkct) <<
			    XB_BSHIFT;
		}
		bp->b_bcount -= bp->b_resid;
	}

	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));

	/* Fix up the buf struct */
	bp->b_flags |= B_BUSY;
	bp->b_private = (void *)(uintptr_t)p_blkst;

	/* queue the buf and start I/O processing with the lock dropped */
	xdf_bp_push(vdp, bp);
	mutex_exit(&vdp->xdf_dev_lk);
	xdf_io_start(vdp);
	if (do_polled_io)
		(void) xdf_ring_drain(vdp);
	return (0);
}
2855 
2856 /*ARGSUSED*/
2857 static int
2858 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2859 {
2860 	xdf_t	*vdp;
2861 	minor_t minor;
2862 	diskaddr_t p_blkcnt;
2863 	int part;
2864 
2865 	minor = getminor(dev);
2866 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2867 		return (ENXIO);
2868 
2869 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2870 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2871 
2872 	part = XDF_PART(minor);
2873 	if (!xdf_isopen(vdp, part))
2874 		return (ENXIO);
2875 
2876 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2877 	    NULL, NULL, NULL, NULL))
2878 		return (ENXIO);
2879 
2880 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2881 		return (ENOSPC);
2882 
2883 	if (U_INVAL(uiop))
2884 		return (EINVAL);
2885 
2886 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2887 }
2888 
2889 /*ARGSUSED*/
2890 static int
2891 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2892 {
2893 	xdf_t *vdp;
2894 	minor_t minor;
2895 	diskaddr_t p_blkcnt;
2896 	int part;
2897 
2898 	minor = getminor(dev);
2899 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2900 		return (ENXIO);
2901 
2902 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2903 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2904 
2905 	part = XDF_PART(minor);
2906 	if (!xdf_isopen(vdp, part))
2907 		return (ENXIO);
2908 
2909 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2910 	    NULL, NULL, NULL, NULL))
2911 		return (ENXIO);
2912 
2913 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2914 		return (ENOSPC);
2915 
2916 	if (U_INVAL(uiop))
2917 		return (EINVAL);
2918 
2919 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2920 }
2921 
2922 /*ARGSUSED*/
2923 static int
2924 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2925 {
2926 	xdf_t	*vdp;
2927 	minor_t minor;
2928 	struct uio *uiop = aiop->aio_uio;
2929 	diskaddr_t p_blkcnt;
2930 	int part;
2931 
2932 	minor = getminor(dev);
2933 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2934 		return (ENXIO);
2935 
2936 	part = XDF_PART(minor);
2937 	if (!xdf_isopen(vdp, part))
2938 		return (ENXIO);
2939 
2940 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2941 	    NULL, NULL, NULL, NULL))
2942 		return (ENXIO);
2943 
2944 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2945 		return (ENOSPC);
2946 
2947 	if (U_INVAL(uiop))
2948 		return (EINVAL);
2949 
2950 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2951 }
2952 
2953 /*ARGSUSED*/
2954 static int
2955 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2956 {
2957 	xdf_t *vdp;
2958 	minor_t minor;
2959 	struct uio *uiop = aiop->aio_uio;
2960 	diskaddr_t p_blkcnt;
2961 	int part;
2962 
2963 	minor = getminor(dev);
2964 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2965 		return (ENXIO);
2966 
2967 	part = XDF_PART(minor);
2968 	if (!xdf_isopen(vdp, part))
2969 		return (ENXIO);
2970 
2971 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2972 	    NULL, NULL, NULL, NULL))
2973 		return (ENXIO);
2974 
2975 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2976 		return (ENOSPC);
2977 
2978 	if (U_INVAL(uiop))
2979 		return (EINVAL);
2980 
2981 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2982 }
2983 
/*
 * dump(9E) entry point: write nblk DEV_BSIZE blocks from addr, starting
 * at partition relative block blkno.
 *
 * Returns ENXIO if the instance has no soft state, the partition isn't
 * open, or its extent can't be determined; EINVAL if the request runs
 * past the end of the partition; otherwise the result of
 * xdf_ring_drain().
 */
static int
xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	/* the request is built in a buf that lives on our stack */
	struct buf dumpbuf, *dbp = &dumpbuf;
	xdf_t	*vdp;
	minor_t minor;
	int err = 0;
	int part;
	diskaddr_t p_blkcnt, p_blkst;

	minor = getminor(dev);
	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
		return (ENXIO);

	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
	    vdp->xdf_addr, (void *)addr, blkno, nblk));

	/* We don't allow IO from the oe_change callback thread */
	ASSERT(curthread != vdp->xdf_oe_change_thread);

	part = XDF_PART(minor);
	if (!xdf_isopen(vdp, part))
		return (ENXIO);

	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
	    NULL, NULL, NULL))
		return (ENXIO);

	/*
	 * The partition size is scaled by (sector size / XB_BSIZE) before
	 * it is compared against the end of the request.
	 */
	if ((blkno + nblk) >
	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
		return (EINVAL);
	}

	bioinit(dbp);
	dbp->b_flags = B_BUSY;
	dbp->b_un.b_addr = addr;
	dbp->b_bcount = nblk << DEV_BSHIFT;
	dbp->b_blkno = blkno;
	dbp->b_edev = dev;
	/* b_private carries the partition's starting block */
	dbp->b_private = (void *)(uintptr_t)p_blkst;

	/* queue the request, start I/O and drain the ring before returning */
	mutex_enter(&vdp->xdf_dev_lk);
	xdf_bp_push(vdp, dbp);
	mutex_exit(&vdp->xdf_dev_lk);
	xdf_io_start(vdp);
	err = xdf_ring_drain(vdp);
	biofini(dbp);
	return (err);
}
3036 
3037 /*ARGSUSED*/
3038 static int
3039 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3040 {
3041 	minor_t	minor;
3042 	xdf_t	*vdp;
3043 	int part;
3044 	ulong_t parbit;
3045 
3046 	minor = getminor(dev);
3047 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3048 		return (ENXIO);
3049 
3050 	mutex_enter(&vdp->xdf_dev_lk);
3051 	part = XDF_PART(minor);
3052 	if (!xdf_isopen(vdp, part)) {
3053 		mutex_exit(&vdp->xdf_dev_lk);
3054 		return (ENXIO);
3055 	}
3056 	parbit = 1 << part;
3057 
3058 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3059 	if (otyp == OTYP_LYR) {
3060 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3061 		if (--vdp->xdf_vd_lyropen[part] == 0)
3062 			vdp->xdf_vd_open[otyp] &= ~parbit;
3063 	} else {
3064 		vdp->xdf_vd_open[otyp] &= ~parbit;
3065 	}
3066 	vdp->xdf_vd_exclopen &= ~parbit;
3067 
3068 	mutex_exit(&vdp->xdf_dev_lk);
3069 	return (0);
3070 }
3071 
/*
 * open(9E) entry point.
 *
 * Unless FNDELAY or FNONBLOCK is set, the open waits for the device to
 * reach XD_READY and requires that the partition being opened has a
 * non-zero size.
 *
 * Returns ENXIO if the instance has no soft state, if a blocking open
 * can't get the device to XD_READY, or if a blocking open finds no
 * usable partition; EROFS for a write open of a read-only device; EBUSY
 * if the partition is exclusively open, or if FEXCL is requested while
 * it is already open; 0 on success.
 */
static int
xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t	minor;
	xdf_t	*vdp;
	int part;
	ulong_t parbit;
	diskaddr_t p_blkct = 0;
	boolean_t firstopen;
	boolean_t nodelay;

	minor = getminor(*devp);
	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
		return (ENXIO);

	nodelay = (flag & (FNDELAY | FNONBLOCK));

	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));

	/* do cv_wait until connected or failed */
	mutex_enter(&vdp->xdf_cb_lk);
	mutex_enter(&vdp->xdf_dev_lk);
	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
		mutex_exit(&vdp->xdf_dev_lk);
		mutex_exit(&vdp->xdf_cb_lk);
		return (ENXIO);
	}
	/* only xdf_dev_lk is needed for the open bookkeeping below */
	mutex_exit(&vdp->xdf_cb_lk);

	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
		mutex_exit(&vdp->xdf_dev_lk);
		return (EROFS);
	}

	part = XDF_PART(minor);
	parbit = 1 << part;
	/* honor existing exclusive opens and new exclusive open requests */
	if ((vdp->xdf_vd_exclopen & parbit) ||
	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
		mutex_exit(&vdp->xdf_dev_lk);
		return (EBUSY);
	}

	/* are we the first one to open this node? */
	firstopen = !xdf_isopen(vdp, -1);

	/* layered opens are counted per partition */
	if (otyp == OTYP_LYR)
		vdp->xdf_vd_lyropen[part]++;

	vdp->xdf_vd_open[otyp] |= parbit;

	if (flag & FEXCL)
		vdp->xdf_vd_exclopen |= parbit;

	mutex_exit(&vdp->xdf_dev_lk);

	/* force a re-validation */
	if (firstopen)
		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);

	/* If this is a non-blocking open then we're done */
	if (nodelay)
		return (0);

	/*
	 * This is a blocking open, so we require:
	 * - that the disk have a valid label on it
	 * - that the size of the partition that we're opening is non-zero
	 *
	 * On failure the open recorded above is undone via xdf_close().
	 */
	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
		(void) xdf_close(*devp, flag, otyp, credp);
		return (ENXIO);
	}

	return (0);
}
3148 
/*
 * Watch callback for the backend "hotplug-status" xenbus property.
 * Wakes up every thread waiting on xdf_hp_status_cv so that it can
 * re-read the property.  path and arg are unused.
 */
/*ARGSUSED*/
static void
xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
{
	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
	cv_broadcast(&vdp->xdf_hp_status_cv);
}
3156 
3157 static int
3158 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3159     char *name, caddr_t valuep, int *lengthp)
3160 {
3161 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3162 
3163 	/*
3164 	 * Sanity check that if a dev_t or dip were specified that they
3165 	 * correspond to this device driver.  On debug kernels we'll
3166 	 * panic and on non-debug kernels we'll return failure.
3167 	 */
3168 	ASSERT(ddi_driver_major(dip) == xdf_major);
3169 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3170 	if ((ddi_driver_major(dip) != xdf_major) ||
3171 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3172 		return (DDI_PROP_NOT_FOUND);
3173 
3174 	if (vdp == NULL)
3175 		return (ddi_prop_op(dev, dip, prop_op, flags,
3176 		    name, valuep, lengthp));
3177 
3178 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3179 	    dev, dip, prop_op, flags, name, valuep, lengthp,
3180 	    XDF_PART(getminor(dev)), NULL));
3181 }
3182 
3183 /*ARGSUSED*/
3184 static int
3185 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3186 {
3187 	int	instance = XDF_INST(getminor((dev_t)arg));
3188 	xdf_t	*vbdp;
3189 
3190 	switch (cmd) {
3191 	case DDI_INFO_DEVT2DEVINFO:
3192 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3193 			*rp = NULL;
3194 			return (DDI_FAILURE);
3195 		}
3196 		*rp = vbdp->xdf_dip;
3197 		return (DDI_SUCCESS);
3198 
3199 	case DDI_INFO_DEVT2INSTANCE:
3200 		*rp = (void *)(uintptr_t)instance;
3201 		return (DDI_SUCCESS);
3202 
3203 	default:
3204 		return (DDI_FAILURE);
3205 	}
3206 }
3207 
3208 /*ARGSUSED*/
3209 static int
3210 xdf_resume(dev_info_t *dip)
3211 {
3212 	xdf_t	*vdp;
3213 	char	*oename;
3214 
3215 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3216 		goto err;
3217 
3218 	if (xdf_debug & SUSRES_DBG)
3219 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3220 
3221 	mutex_enter(&vdp->xdf_cb_lk);
3222 
3223 	if (xvdi_resume(dip) != DDI_SUCCESS) {
3224 		mutex_exit(&vdp->xdf_cb_lk);
3225 		goto err;
3226 	}
3227 
3228 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3229 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3230 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3231 		mutex_exit(&vdp->xdf_cb_lk);
3232 		goto err;
3233 	}
3234 
3235 	mutex_enter(&vdp->xdf_dev_lk);
3236 	ASSERT(vdp->xdf_state != XD_READY);
3237 	xdf_set_state(vdp, XD_UNKNOWN);
3238 	mutex_exit(&vdp->xdf_dev_lk);
3239 
3240 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3241 		mutex_exit(&vdp->xdf_cb_lk);
3242 		goto err;
3243 	}
3244 
3245 	mutex_exit(&vdp->xdf_cb_lk);
3246 
3247 	if (xdf_debug & SUSRES_DBG)
3248 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3249 	return (DDI_SUCCESS);
3250 err:
3251 	if (xdf_debug & SUSRES_DBG)
3252 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3253 	return (DDI_FAILURE);
3254 }
3255 
3256 /*
3257  * Uses the in-memory devid if one exists.
3258  *
3259  * Create a devid and write it on the first block of the last track of
3260  * the last cylinder.
3261  * Return DDI_SUCCESS or DDI_FAILURE.
3262  */
3263 static int
3264 xdf_devid_fabricate(xdf_t *vdp)
3265 {
3266 	ddi_devid_t	devid = vdp->xdf_tgt_devid; /* null if no devid */
3267 	struct dk_devid *dkdevidp = NULL; /* devid struct stored on disk */
3268 	diskaddr_t	blk;
3269 	uint_t		*ip, chksum;
3270 	int		i, devid_size;
3271 
3272 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3273 		goto err;
3274 
3275 	if (devid == NULL && ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0,
3276 	    NULL, &devid) != DDI_SUCCESS)
3277 		goto err;
3278 
3279 	/* allocate a buffer */
3280 	dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
3281 
3282 	/* Fill in the revision */
3283 	dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
3284 	dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
3285 
3286 	/* Copy in the device id */
3287 	devid_size = ddi_devid_sizeof(devid);
3288 	if (devid_size > DK_DEVID_SIZE)
3289 		goto err;
3290 	bcopy(devid, dkdevidp->dkd_devid, devid_size);
3291 
3292 	/* Calculate the chksum */
3293 	chksum = 0;
3294 	ip = (uint_t *)dkdevidp;
3295 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3296 		chksum ^= ip[i];
3297 
3298 	/* Fill in the checksum */
3299 	DKD_FORMCHKSUM(chksum, dkdevidp);
3300 
3301 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, dkdevidp, blk,
3302 	    NBPSCTR, NULL) != 0)
3303 		goto err;
3304 
3305 	kmem_free(dkdevidp, NBPSCTR);
3306 
3307 	vdp->xdf_tgt_devid = devid;
3308 	return (DDI_SUCCESS);
3309 
3310 err:
3311 	if (dkdevidp != NULL)
3312 		kmem_free(dkdevidp, NBPSCTR);
3313 	if (devid != NULL && vdp->xdf_tgt_devid == NULL)
3314 		ddi_devid_free(devid);
3315 	return (DDI_FAILURE);
3316 }
3317 
3318 /*
3319  * xdf_devid_read() is a local copy of xdfs_devid_read(), modified to use xdf
3320  * functions.
3321  *
3322  * Read a devid from on the first block of the last track of
3323  * the last cylinder.  Make sure what we read is a valid devid.
3324  * Return DDI_SUCCESS or DDI_FAILURE.
3325  */
3326 static int
3327 xdf_devid_read(xdf_t *vdp)
3328 {
3329 	diskaddr_t	blk;
3330 	struct dk_devid *dkdevidp;
3331 	uint_t		*ip, chksum;
3332 	int		i;
3333 
3334 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3335 		return (DDI_FAILURE);
3336 
3337 	dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
3338 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk,
3339 	    NBPSCTR, NULL) != 0)
3340 		goto err;
3341 
3342 	/* Validate the revision */
3343 	if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
3344 	    (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
3345 		goto err;
3346 
3347 	/* Calculate the checksum */
3348 	chksum = 0;
3349 	ip = (uint_t *)dkdevidp;
3350 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3351 		chksum ^= ip[i];
3352 	if (DKD_GETCHKSUM(dkdevidp) != chksum)
3353 		goto err;
3354 
3355 	/* Validate the device id */
3356 	if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
3357 		goto err;
3358 
3359 	/* keep a copy of the device id */
3360 	i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
3361 	vdp->xdf_tgt_devid = kmem_alloc(i, KM_SLEEP);
3362 	bcopy(dkdevidp->dkd_devid, vdp->xdf_tgt_devid, i);
3363 	kmem_free(dkdevidp, NBPSCTR);
3364 	return (DDI_SUCCESS);
3365 
3366 err:
3367 	kmem_free(dkdevidp, NBPSCTR);
3368 	return (DDI_FAILURE);
3369 }
3370 
3371 /*
3372  * xdf_devid_setup() is a modified copy of cmdk_devid_setup().
3373  *
3374  * This function creates a devid if we don't already have one, and
3375  * registers it.  If we already have one, we make sure that it can be
3376  * read from the disk, otherwise we write it to the disk ourselves.  If
3377  * we didn't already have a devid, and we create one, we also need to
3378  * register it.
3379  */
3380 void
3381 xdf_devid_setup(xdf_t *vdp)
3382 {
3383 	int rc;
3384 	boolean_t existed = vdp->xdf_tgt_devid != NULL;
3385 
3386 	/* Read devid from the disk, if present */
3387 	rc = xdf_devid_read(vdp);
3388 
3389 	/* Otherwise write a devid (which we create if necessary) on the disk */
3390 	if (rc != DDI_SUCCESS)
3391 		rc = xdf_devid_fabricate(vdp);
3392 
3393 	/* If we created a devid or found it on the disk, register it */
3394 	if (rc == DDI_SUCCESS && !existed)
3395 		(void) ddi_devid_register(vdp->xdf_dip, vdp->xdf_tgt_devid);
3396 }
3397 
3398 static int
3399 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3400 {
3401 	int			n, instance = ddi_get_instance(dip);
3402 	ddi_iblock_cookie_t	ibc, softibc;
3403 	boolean_t		dev_iscd = B_FALSE;
3404 	xdf_t			*vdp;
3405 	char			*oename, *xsname, *str;
3406 	clock_t			timeout;
3407 	int			err = 0;
3408 
3409 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3410 	    "xdf_debug", 0)) != 0)
3411 		xdf_debug = n;
3412 
3413 	switch (cmd) {
3414 	case DDI_RESUME:
3415 		return (xdf_resume(dip));
3416 	case DDI_ATTACH:
3417 		break;
3418 	default:
3419 		return (DDI_FAILURE);
3420 	}
3421 	/* DDI_ATTACH */
3422 
3423 	if ((xsname = xvdi_get_xsname(dip)) == NULL ||
3424 	    (oename = xvdi_get_oename(dip)) == NULL)
3425 		return (DDI_FAILURE);
3426 
3427 	/*
3428 	 * Disable auto-detach.  This is necessary so that we don't get
3429 	 * detached while we're disconnected from the back end.
3430 	 */
3431 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3432 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3433 		return (DDI_FAILURE);
3434 
3435 	/* driver handles kernel-issued IOCTLs */
3436 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3437 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3438 		return (DDI_FAILURE);
3439 
3440 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3441 		return (DDI_FAILURE);
3442 
3443 	if (ddi_get_soft_iblock_cookie(dip,
3444 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3445 		return (DDI_FAILURE);
3446 
3447 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3448 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3449 		    ddi_get_name_addr(dip));
3450 		return (DDI_FAILURE);
3451 	}
3452 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3453 		dev_iscd = B_TRUE;
3454 	strfree(str);
3455 
3456 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3457 		return (DDI_FAILURE);
3458 
3459 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3460 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3461 	ddi_set_driver_private(dip, vdp);
3462 	vdp->xdf_dip = dip;
3463 	vdp->xdf_addr = ddi_get_name_addr(dip);
3464 	vdp->xdf_suspending = B_FALSE;
3465 	vdp->xdf_media_req_supported = B_FALSE;
3466 	vdp->xdf_peer = INVALID_DOMID;
3467 	vdp->xdf_evtchn = INVALID_EVTCHN;
3468 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3469 	    offsetof(v_req_t, v_link));
3470 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3471 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3472 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3473 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3474 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3475 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3476 	vdp->xdf_cmlb_reattach = B_TRUE;
3477 	if (dev_iscd) {
3478 		vdp->xdf_dinfo |= VDISK_CDROM;
3479 		vdp->xdf_mstate = DKIO_EJECTED;
3480 	} else {
3481 		vdp->xdf_mstate = DKIO_NONE;
3482 	}
3483 
3484 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3485 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3486 		goto errout0;
3487 
3488 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3489 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3490 		goto errout0;
3491 
3492 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3493 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3494 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3495 		    ddi_get_name_addr(dip));
3496 		goto errout0;
3497 	}
3498 
3499 	/*
3500 	 * Initialize the physical geometry stucture.  Note that currently
3501 	 * we don't know the size of the backend device so the number
3502 	 * of blocks on the device will be initialized to zero.  Once
3503 	 * we connect to the backend device we'll update the physical
3504 	 * geometry to reflect the real size of the device.
3505 	 */
3506 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3507 	vdp->xdf_pgeom_fixed = B_FALSE;
3508 
3509 	/*
3510 	 * Allocate the cmlb handle, minor nodes will be created once
3511 	 * the device is connected with backend.
3512 	 */
3513 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3514 
3515 	/* We ship with cache-enabled disks */
3516 	vdp->xdf_wce = B_TRUE;
3517 
3518 	mutex_enter(&vdp->xdf_cb_lk);
3519 	/* Watch backend XenbusState change */
3520 	if (xvdi_add_event_handler(dip,
3521 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3522 		mutex_exit(&vdp->xdf_cb_lk);
3523 		goto errout0;
3524 	}
3525 
3526 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3527 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3528 		    ddi_get_name_addr(dip));
3529 		mutex_exit(&vdp->xdf_cb_lk);
3530 		goto errout1;
3531 	}
3532 
3533 	/* Nothing else to do for CD devices */
3534 	if (dev_iscd) {
3535 		mutex_exit(&vdp->xdf_cb_lk);
3536 		goto done;
3537 	}
3538 
3539 	/*
3540 	 * In order to do cmlb_validate, we have to wait for the disk to
3541 	 * acknowledge the attach, so we can query the backend for the disk
3542 	 * geometry (see xdf_setstate_connected).
3543 	 *
3544 	 * We only wait 30 seconds; if this is the root disk, the boot
3545 	 * will fail, but it would fail anyway if the device never
3546 	 * connected.  If this is a non-boot disk, that disk will fail
3547 	 * to connect, but again, it would fail anyway.
3548 	 */
3549 	timeout = ddi_get_lbolt() + drv_usectohz(XDF_STATE_TIMEOUT);
3550 	while (vdp->xdf_state != XD_CONNECTED && vdp->xdf_state != XD_READY) {
3551 		if (cv_timedwait(&vdp->xdf_dev_cv, &vdp->xdf_cb_lk,
3552 		    timeout) < 0) {
3553 			cmn_err(CE_WARN, "xdf@%s: disk failed to connect",
3554 			    ddi_get_name_addr(dip));
3555 			mutex_exit(&vdp->xdf_cb_lk);
3556 			goto errout1;
3557 		}
3558 	}
3559 	mutex_exit(&vdp->xdf_cb_lk);
3560 
3561 	/*
3562 	 * We call cmlb_validate so that the geometry information in
3563 	 * vdp->xdf_vd_lbl is correct; this fills out the number of
3564 	 * alternate cylinders so that we have a place to write the
3565 	 * devid.
3566 	 */
3567 	if ((err = cmlb_validate(vdp->xdf_vd_lbl, 0, NULL)) != 0) {
3568 		cmn_err(CE_NOTE,
3569 		    "xdf@%s: cmlb_validate failed: %d",
3570 		    ddi_get_name_addr(dip), err);
3571 		/*
3572 		 * We can carry on even if cmlb_validate() returns EINVAL here,
3573 		 * as we'll rewrite the disk label anyway.
3574 		 */
3575 		if (err != EINVAL)
3576 			goto errout1;
3577 	}
3578 
3579 	/*
3580 	 * xdf_devid_setup will only write a devid if one isn't
3581 	 * already present.  If it fails to find or create one, we
3582 	 * create one in-memory so that when we label the disk later,
3583 	 * it will have a devid to use.  This is helpful to deal with
3584 	 * cases where people use the devids of their disks before
3585 	 * labelling them; note that this does cause problems if
3586 	 * people rely on the devids of unlabelled disks to persist
3587 	 * across reboot.
3588 	 */
3589 	xdf_devid_setup(vdp);
3590 	if (vdp->xdf_tgt_devid == NULL) {
3591 		if (ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0, NULL,
3592 		    &vdp->xdf_tgt_devid) != DDI_SUCCESS) {
3593 			cmn_err(CE_WARN,
3594 			    "xdf@%s_ attach failed, devid_init failed",
3595 			    ddi_get_name_addr(dip));
3596 			goto errout1;
3597 		} else {
3598 			(void) ddi_devid_register(vdp->xdf_dip,
3599 			    vdp->xdf_tgt_devid);
3600 		}
3601 	}
3602 
3603 done:
3604 #ifdef XPV_HVM_DRIVER
3605 	xdf_hvm_add(dip);
3606 
3607 	/* Report our version to dom0 */
3608 	(void) xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3609 	    HVMPV_XDF_VERS);
3610 #endif /* XPV_HVM_DRIVER */
3611 
3612 	/* Create kstat for iostat(8) */
3613 	if (xdf_kstat_create(dip) != 0) {
3614 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3615 		    ddi_get_name_addr(dip));
3616 		goto errout1;
3617 	}
3618 
3619 	/*
3620 	 * Don't bother with getting real device identification
3621 	 * strings (is it even possible?), they are unlikely to
3622 	 * change often (if at all).
3623 	 */
3624 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_VENDOR_ID,
3625 	    "Xen");
3626 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_PRODUCT_ID,
3627 	    dev_iscd ? "Virtual CD" : "Virtual disk");
3628 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_REVISION_ID,
3629 	    "1.0");
3630 
3631 	ddi_report_dev(dip);
3632 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3633 	return (DDI_SUCCESS);
3634 
3635 errout1:
3636 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3637 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3638 errout0:
3639 	if (vdp->xdf_vd_lbl != NULL) {
3640 		cmlb_free_handle(&vdp->xdf_vd_lbl);
3641 		vdp->xdf_vd_lbl = NULL;
3642 	}
3643 	if (vdp->xdf_softintr_id != NULL)
3644 		ddi_remove_softintr(vdp->xdf_softintr_id);
3645 	xvdi_remove_xb_watch_handlers(dip);
3646 	if (vdp->xdf_ready_tq != NULL)
3647 		ddi_taskq_destroy(vdp->xdf_ready_tq);
3648 	mutex_destroy(&vdp->xdf_cb_lk);
3649 	mutex_destroy(&vdp->xdf_dev_lk);
3650 	cv_destroy(&vdp->xdf_dev_cv);
3651 	cv_destroy(&vdp->xdf_hp_status_cv);
3652 	ddi_soft_state_free(xdf_ssp, instance);
3653 	ddi_set_driver_private(dip, NULL);
3654 	ddi_prop_remove_all(dip);
3655 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3656 	return (DDI_FAILURE);
3657 }
3658 
3659 static int
3660 xdf_suspend(dev_info_t *dip)
3661 {
3662 	int		instance = ddi_get_instance(dip);
3663 	xdf_t		*vdp;
3664 
3665 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3666 		return (DDI_FAILURE);
3667 
3668 	if (xdf_debug & SUSRES_DBG)
3669 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3670 
3671 	xvdi_suspend(dip);
3672 
3673 	mutex_enter(&vdp->xdf_cb_lk);
3674 	mutex_enter(&vdp->xdf_dev_lk);
3675 
3676 	vdp->xdf_suspending = B_TRUE;
3677 	xdf_ring_destroy(vdp);
3678 	xdf_set_state(vdp, XD_SUSPEND);
3679 	vdp->xdf_suspending = B_FALSE;
3680 
3681 	mutex_exit(&vdp->xdf_dev_lk);
3682 	mutex_exit(&vdp->xdf_cb_lk);
3683 
3684 	if (xdf_debug & SUSRES_DBG)
3685 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3686 
3687 	return (DDI_SUCCESS);
3688 }
3689 
3690 static int
3691 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3692 {
3693 	xdf_t *vdp;
3694 	int instance;
3695 
3696 	switch (cmd) {
3697 
3698 	case DDI_PM_SUSPEND:
3699 		break;
3700 
3701 	case DDI_SUSPEND:
3702 		return (xdf_suspend(dip));
3703 
3704 	case DDI_DETACH:
3705 		break;
3706 
3707 	default:
3708 		return (DDI_FAILURE);
3709 	}
3710 
3711 	instance = ddi_get_instance(dip);
3712 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3713 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3714 
3715 	if (vdp == NULL)
3716 		return (DDI_FAILURE);
3717 
3718 	mutex_enter(&vdp->xdf_cb_lk);
3719 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3720 	if (vdp->xdf_state != XD_CLOSED) {
3721 		mutex_exit(&vdp->xdf_cb_lk);
3722 		return (DDI_FAILURE);
3723 	}
3724 	mutex_exit(&vdp->xdf_cb_lk);
3725 
3726 	ASSERT(!ISDMACBON(vdp));
3727 
3728 #ifdef XPV_HVM_DRIVER
3729 	xdf_hvm_rm(dip);
3730 #endif /* XPV_HVM_DRIVER */
3731 
3732 	if (vdp->xdf_timeout_id != 0)
3733 		(void) untimeout(vdp->xdf_timeout_id);
3734 
3735 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3736 	ddi_taskq_destroy(vdp->xdf_ready_tq);
3737 
3738 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3739 	cmlb_free_handle(&vdp->xdf_vd_lbl);
3740 
3741 	/* we'll support backend running in domU later */
3742 #ifdef	DOMU_BACKEND
3743 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3744 #endif
3745 
3746 	list_destroy(&vdp->xdf_vreq_act);
3747 	ddi_prop_remove_all(dip);
3748 	xdf_kstat_delete(dip);
3749 	ddi_remove_softintr(vdp->xdf_softintr_id);
3750 	xvdi_remove_xb_watch_handlers(dip);
3751 	ddi_set_driver_private(dip, NULL);
3752 	cv_destroy(&vdp->xdf_dev_cv);
3753 	mutex_destroy(&vdp->xdf_cb_lk);
3754 	mutex_destroy(&vdp->xdf_dev_lk);
3755 	if (vdp->xdf_cache_flush_block != NULL)
3756 		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3757 	ddi_soft_state_free(xdf_ssp, instance);
3758 	return (DDI_SUCCESS);
3759 }
3760 
/*
 * Driver linkage structures.
 */

/*
 * Character/block entry points.  The initializer is positional; the field
 * names in the comments follow the cb_ops(9S) layout.
 */
static struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
3784 
/*
 * Device operations for the driver: autoconfiguration is handled by
 * xdf_getinfo(), xdf_attach() and xdf_detach(), and the character/block
 * entry points are supplied through xdf_cbops.  Referenced by modldrv.
 */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported, /* devo_quiesce */
};
3799 
/*
 * Module linkage structures.
 */

/* Describes this module as a device driver and points at its dev_ops. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver",	/* short description */
	&xdf_devops		/* driver specific ops */
};
3808 
/*
 * Linkage handed to mod_install(), mod_remove() and mod_info() by the
 * module entry points; its single, NULL-terminated entry is modldrv.
 */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
3812 
3813 /*
3814  * standard module entry points
3815  */
3816 int
3817 _init(void)
3818 {
3819 	int rc;
3820 
3821 	xdf_major = ddi_name_to_major("xdf");
3822 	if (xdf_major == (major_t)-1)
3823 		return (EINVAL);
3824 
3825 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3826 		return (rc);
3827 
3828 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3829 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3830 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3831 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3832 
3833 #ifdef XPV_HVM_DRIVER
3834 	xdf_hvm_init();
3835 #endif /* XPV_HVM_DRIVER */
3836 
3837 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3838 #ifdef XPV_HVM_DRIVER
3839 		xdf_hvm_fini();
3840 #endif /* XPV_HVM_DRIVER */
3841 		kmem_cache_destroy(xdf_vreq_cache);
3842 		kmem_cache_destroy(xdf_gs_cache);
3843 		ddi_soft_state_fini(&xdf_ssp);
3844 		return (rc);
3845 	}
3846 
3847 	return (rc);
3848 }
3849 
3850 int
3851 _fini(void)
3852 {
3853 	int err;
3854 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3855 		return (err);
3856 
3857 #ifdef XPV_HVM_DRIVER
3858 	xdf_hvm_fini();
3859 #endif /* XPV_HVM_DRIVER */
3860 
3861 	kmem_cache_destroy(xdf_vreq_cache);
3862 	kmem_cache_destroy(xdf_gs_cache);
3863 	ddi_soft_state_fini(&xdf_ssp);
3864 
3865 	return (0);
3866 }
3867 
3868 int
3869 _info(struct modinfo *modinfop)
3870 {
3871 	return (mod_info(&xdf_modlinkage, modinfop));
3872 }
3873