xref: /illumos-gate/usr/src/uts/common/xen/io/xdb.c (revision 71e32251703c729dbbebef2101770135584fd8d4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Note: This is the backend part of the split PV disk driver. This driver
29  * is not a nexus driver, nor is it a leaf driver(block/char/stream driver).
30  * Currently, it does not create any minor node. So, although, it runs in
31  * backend domain, it will not be used directly from within dom0.
32  * It simply gets block I/O requests issued by frontend from a shared page
33  * (blkif ring buffer - defined by Xen) between backend and frontend domain,
34  * generates a buf, and push it down to underlying disk target driver via
35  * ldi interface. When buf is done, this driver will generate a response
36  * and put it into ring buffer to inform frontend of the status of the I/O
 * request issued by it. When a new virtual device entry is added in xenstore,
 * a watch event is sent from Xen to the xvdi framework, which will,
 * in turn, create the devinfo node and try to attach this driver
 * (see xvdi_create_dev). When the frontend peer changes its state to
 * XenbusStateClose, an event is also sent from Xen to the xvdi framework,
 * which will detach and remove this devinfo node (see i_xvdi_oestate_handler).
 * I/O requests read from the ring buffer and events coming from xenstore
 * cannot be trusted. We verify them in xdb_get_buf() and
 * xdb_check_state_transition().
45  *
 * Virtual device configuration is read from and written to the database via
 * the xenbus_* interfaces. The driver also uses xvdi_* to interact with the
 * hypervisor. There is an ongoing effort to make xvdi_* cover all xenbus_*.
49  */
50 
51 #pragma ident	"%Z%%M%	%I%	%E% SMI"
52 
53 #include "xdb.h"
54 #include <sys/lofi.h>
55 #include <vm/hat_i86.h>
56 
57 static xdb_t *xdb_statep;
58 static int xdb_debug = 0;
59 
60 #ifdef DEBUG
61 /*
62  * debug aid functions
63  */
64 
65 static void
66 logva(xdb_t *vdp, uint64_t va)
67 {
68 	uint64_t *page_addrs;
69 	int i;
70 
71 	page_addrs = vdp->page_addrs;
72 	for (i = 0; i < XDB_MAX_IO_PAGES; i++) {
73 		if (page_addrs[i] == va)
74 			debug_enter("VA remapping found!");
75 	}
76 
77 	for (i = 0; i < XDB_MAX_IO_PAGES; i++) {
78 		if (page_addrs[i] == 0) {
79 			page_addrs[i] = va;
80 			break;
81 		}
82 	}
83 	ASSERT(i < XDB_MAX_IO_PAGES);
84 }
85 
86 static void
87 unlogva(xdb_t *vdp, uint64_t va)
88 {
89 	uint64_t *page_addrs;
90 	int i;
91 
92 	page_addrs = vdp->page_addrs;
93 	for (i = 0; i < XDB_MAX_IO_PAGES; i++) {
94 		if (page_addrs[i] == va) {
95 			page_addrs[i] = 0;
96 			break;
97 		}
98 	}
99 	ASSERT(i < XDB_MAX_IO_PAGES);
100 }
101 
102 static void
103 xdb_dump_request_oe(blkif_request_t *req)
104 {
105 	int i;
106 
107 	/*
108 	 * Exploit the public interface definitions for BLKIF_OP_READ
109 	 * etc..
110 	 */
111 	char *op_name[] = { "read", "write", "barrier", "flush" };
112 
113 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
114 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
115 	    req->nr_segments));
116 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
117 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
118 	    (unsigned long long)req->id));
119 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
120 	    (unsigned long long)req->sector_number));
121 	for (i = 0; i < req->nr_segments; i++) {
122 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
123 		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
124 		    req->seg[i].last_sect));
125 	}
126 }
127 #endif /* DEBUG */
128 
/*
 * Names of the named kstats exported by this driver.  xdb_kstat_update()
 * fills in the values positionally, so the order here must match the
 * order of the assignments there.
 */
static char *xdb_stats[] = {
	"rd_reqs",	/* read requests */
	"wr_reqs",	/* write requests */
	"br_reqs",	/* write-barrier requests */
	"fl_reqs",	/* cache-flush requests */
	"oo_reqs"	/* always reported as 0 by xdb_kstat_update() */
};
139 
140 static int
141 xdb_kstat_update(kstat_t *ksp, int flag)
142 {
143 	xdb_t *vdp;
144 	kstat_named_t *knp;
145 
146 	if (flag != KSTAT_READ)
147 		return (EACCES);
148 
149 	vdp = ksp->ks_private;
150 	knp = ksp->ks_data;
151 
152 	/*
153 	 * Assignment order should match that of the names in
154 	 * xdb_stats.
155 	 */
156 	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
157 	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
158 	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
159 	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
160 	(knp++)->value.ui64 = 0; /* oo_req */
161 
162 	return (0);
163 }
164 
165 static boolean_t
166 xdb_kstat_init(xdb_t *vdp)
167 {
168 	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
169 	char **cp = xdb_stats;
170 	kstat_named_t *knp;
171 
172 	if ((vdp->xs_kstats = kstat_create("xdb",
173 	    ddi_get_instance(vdp->xs_dip),
174 	    "req_statistics", "block", KSTAT_TYPE_NAMED,
175 	    nstat, 0)) == NULL)
176 		return (B_FALSE);
177 
178 	vdp->xs_kstats->ks_private = vdp;
179 	vdp->xs_kstats->ks_update = xdb_kstat_update;
180 
181 	knp = vdp->xs_kstats->ks_data;
182 	while (nstat > 0) {
183 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
184 		knp++;
185 		cp++;
186 		nstat--;
187 	}
188 
189 	kstat_install(vdp->xs_kstats);
190 
191 	return (B_TRUE);
192 }
193 
194 static int xdb_biodone(buf_t *);
195 
196 static buf_t *
197 xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
198 {
199 	buf_t *bp;
200 	uint8_t segs, curseg;
201 	int sectors;
202 	int i, err;
203 	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
204 	ddi_acc_handle_t acchdl;
205 
206 	acchdl = vdp->xs_ring_hdl;
207 	bp = XDB_XREQ2BP(xreq);
208 	curseg = xreq->xr_curseg;
209 	/* init a new xdb request */
210 	if (req != NULL) {
211 		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
212 		boolean_t pagemapok = B_TRUE;
213 		uint8_t op = ddi_get8(acchdl, &req->operation);
214 
215 		xreq->xr_vdp = vdp;
216 		xreq->xr_op = op;
217 		xreq->xr_id = ddi_get64(acchdl, &req->id);
218 		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
219 		if (segs == 0) {
220 			if (op != BLKIF_OP_FLUSH_DISKCACHE)
221 				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
222 				    " is seen from domain %d with zero "
223 				    "length data buffer!", vdp->xs_peer);
224 			bioinit(bp);
225 			bp->b_bcount = 0;
226 			bp->b_lblkno = 0;
227 			bp->b_un.b_addr = NULL;
228 			return (bp);
229 		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
230 			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
231 			    " is seen from domain %d with non-zero "
232 			    "length data buffer!", vdp->xs_peer);
233 		}
234 
235 		/*
236 		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
237 		 * according to the definition of blk interface by Xen
238 		 * we do sanity check here
239 		 */
240 		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
241 			segs = xreq->xr_buf_pages =
242 			    BLKIF_MAX_SEGMENTS_PER_REQUEST;
243 
244 		for (i = 0; i < segs; i++) {
245 			uint8_t fs, ls;
246 
247 			mapops[i].host_addr =
248 			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
249 			    vdp->xs_iopage_va, xreq->xr_idx, i);
250 			mapops[i].dom = vdp->xs_peer;
251 			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
252 			mapops[i].flags = GNTMAP_host_map;
253 			if (op != BLKIF_OP_READ)
254 				mapops[i].flags |= GNTMAP_readonly;
255 
256 			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
257 			ls = ddi_get8(acchdl, &req->seg[i].last_sect);
258 
259 			/*
260 			 * first_sect should be no bigger than last_sect and
261 			 * both of them should be no bigger than
262 			 * (PAGESIZE / XB_BSIZE - 1) according to definition
263 			 * of blk interface by Xen, so sanity check again
264 			 */
265 			if (fs > (PAGESIZE / XB_BSIZE - 1))
266 				fs = PAGESIZE / XB_BSIZE - 1;
267 			if (ls > (PAGESIZE / XB_BSIZE - 1))
268 				ls = PAGESIZE / XB_BSIZE - 1;
269 			if (fs > ls)
270 				fs = ls;
271 
272 			xreq->xr_segs[i].fs = fs;
273 			xreq->xr_segs[i].ls = ls;
274 		}
275 
276 		/* map in io pages */
277 		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
278 		    mapops, i);
279 		if (err != 0)
280 			return (NULL);
281 		for (i = 0; i < segs; i++) {
282 			/*
283 			 * Although HYPERVISOR_grant_table_op() returned no
284 			 * error, mapping of each single page can fail. So,
285 			 * we have to do the check here and handle the error
286 			 * if needed
287 			 */
288 			if (mapops[i].status != GNTST_okay) {
289 				int j;
290 				for (j = 0; j < i; j++) {
291 #ifdef DEBUG
292 					unlogva(vdp, mapops[j].host_addr);
293 #endif
294 					xen_release_pfn(
295 					    xreq->xr_plist[j].p_pagenum);
296 				}
297 				pagemapok = B_FALSE;
298 				break;
299 			}
300 			/* record page mapping handle for unmapping later */
301 			xreq->xr_page_hdls[i] = mapops[i].handle;
302 #ifdef DEBUG
303 			logva(vdp, mapops[i].host_addr);
304 #endif
305 			/*
306 			 * Pass the MFNs down using the shadow list (xr_pplist)
307 			 *
308 			 * This is pretty ugly since we have implict knowledge
309 			 * of how the rootnex binds buffers.
310 			 * The GNTTABOP_map_grant_ref op makes us do some ugly
311 			 * stuff since we're not allowed to touch these PTEs
312 			 * from the VM.
313 			 *
314 			 * Obviously, these aren't real page_t's. The rootnex
315 			 * only needs p_pagenum.
316 			 * Also, don't use btop() here or 32 bit PAE breaks.
317 			 */
318 			xreq->xr_pplist[i] = &xreq->xr_plist[i];
319 			xreq->xr_plist[i].p_pagenum =
320 			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
321 		}
322 
323 		/*
324 		 * not all pages mapped in successfully, unmap those mapped-in
325 		 * page and return failure
326 		 */
327 		if (!pagemapok) {
328 			gnttab_unmap_grant_ref_t unmapop;
329 
330 			for (i = 0; i < segs; i++) {
331 				if (mapops[i].status != GNTST_okay)
332 					continue;
333 				unmapop.host_addr =
334 				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
335 				    vdp->xs_iopage_va, xreq->xr_idx, i);
336 				unmapop.dev_bus_addr = NULL;
337 				unmapop.handle = mapops[i].handle;
338 				(void) HYPERVISOR_grant_table_op(
339 				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
340 			}
341 
342 			return (NULL);
343 		}
344 		bioinit(bp);
345 		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
346 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
347 		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
348 		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
349 	} else {
350 		uint64_t blkst;
351 		int isread;
352 
353 		/* reuse this buf */
354 		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
355 		isread = bp->b_flags & B_READ;
356 		bioreset(bp);
357 		bp->b_lblkno = blkst;
358 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
359 		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
360 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
361 		    xreq->xr_idx));
362 	}
363 
364 	/* form a buf */
365 	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
366 	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
367 	bp->b_shadow = &xreq->xr_pplist[curseg];
368 	bp->b_iodone = xdb_biodone;
369 	sectors = 0;
370 	for (i = curseg; i < xreq->xr_buf_pages; i++) {
371 		/*
372 		 * The xreq->xr_segs[i].fs of the first seg can be non-zero
373 		 * otherwise, we'll break it into multiple bufs
374 		 */
375 		if ((i != curseg) && (xreq->xr_segs[i].fs != 0)) {
376 			break;
377 		}
378 		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
379 	}
380 	xreq->xr_curseg = i;
381 	bp->b_bcount = sectors * DEV_BSIZE;
382 	bp->b_bufsize = bp->b_bcount;
383 
384 	return (bp);
385 }
386 
387 static xdb_request_t *
388 xdb_get_req(xdb_t *vdp)
389 {
390 	xdb_request_t *req;
391 	int idx;
392 
393 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
394 	ASSERT(vdp->xs_free_req != -1);
395 	req = &vdp->xs_req[vdp->xs_free_req];
396 	vdp->xs_free_req = req->xr_next;
397 	idx = req->xr_idx;
398 	bzero(req, sizeof (xdb_request_t));
399 	req->xr_idx = idx;
400 	return (req);
401 }
402 
403 static void
404 xdb_free_req(xdb_request_t *req)
405 {
406 	xdb_t *vdp = req->xr_vdp;
407 
408 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
409 	req->xr_next = vdp->xs_free_req;
410 	vdp->xs_free_req = req->xr_idx;
411 }
412 
413 static void
414 xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
415 {
416 	xendev_ring_t *ringp = vdp->xs_ring;
417 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
418 	blkif_response_t *resp;
419 
420 	resp = xvdi_ring_get_response(ringp);
421 	ASSERT(resp);
422 
423 	ddi_put64(acchdl, &resp->id, ddi_get64(acchdl, &req->id));
424 	ddi_put8(acchdl, &resp->operation, ddi_get8(acchdl, &req->operation));
425 	ddi_put16(acchdl, (uint16_t *)&resp->status,
426 	    ok ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
427 	if (xvdi_ring_push_response(ringp))
428 		xvdi_notify_oe(vdp->xs_dip);
429 }
430 
431 static void
432 xdb_init_ioreqs(xdb_t *vdp)
433 {
434 	int i;
435 
436 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
437 		vdp->xs_req[i].xr_idx = i;
438 		vdp->xs_req[i].xr_next = i + 1;
439 	}
440 	vdp->xs_req[BLKIF_RING_SIZE - 1].xr_next = -1;
441 	vdp->xs_free_req = 0;
442 
443 	/* alloc va in host dom for io page mapping */
444 	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
445 	    XDB_MAX_IO_PAGES * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
446 	    VM_SLEEP);
447 	for (i = 0; i < XDB_MAX_IO_PAGES; i++)
448 		hat_prepare_mapping(kas.a_hat,
449 		    vdp->xs_iopage_va + i * PAGESIZE);
450 }
451 
452 static void
453 xdb_uninit_ioreqs(xdb_t *vdp)
454 {
455 	int i;
456 
457 	for (i = 0; i < XDB_MAX_IO_PAGES; i++)
458 		hat_release_mapping(kas.a_hat,
459 		    vdp->xs_iopage_va + i * PAGESIZE);
460 	vmem_xfree(heap_arena, vdp->xs_iopage_va,
461 	    XDB_MAX_IO_PAGES * PAGESIZE);
462 }
463 
464 static uint_t
465 xdb_intr(caddr_t arg)
466 {
467 	xendev_ring_t *ringp;
468 	blkif_request_t *req;
469 	xdb_request_t *xreq;
470 	buf_t *bp;
471 	uint8_t op;
472 	xdb_t *vdp = (xdb_t *)arg;
473 	int ret = DDI_INTR_UNCLAIMED;
474 	dev_info_t *dip = vdp->xs_dip;
475 
476 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
477 	    "xdb@%s: I/O request received from dom %d",
478 	    ddi_get_name_addr(dip), vdp->xs_peer));
479 
480 	mutex_enter(&vdp->xs_iomutex);
481 
482 	/* shouldn't touch ring buffer if not in connected state */
483 	if (vdp->xs_if_status != XDB_CONNECTED) {
484 		mutex_exit(&vdp->xs_iomutex);
485 		return (DDI_INTR_UNCLAIMED);
486 	}
487 
488 	ringp = vdp->xs_ring;
489 
490 	/*
491 	 * We'll loop till there is no more request in the ring
492 	 * We won't stuck in this loop for ever since the size of ring buffer
493 	 * is limited, and frontend will stop pushing requests into it when
494 	 * the ring buffer is full
495 	 */
496 
497 	/* req_event will be increased in xvdi_ring_get_request() */
498 	while ((req = xvdi_ring_get_request(ringp)) != NULL) {
499 		ret = DDI_INTR_CLAIMED;
500 
501 		op = ddi_get8(vdp->xs_ring_hdl, &req->operation);
502 		if (op == BLKIF_OP_READ			||
503 		    op == BLKIF_OP_WRITE		||
504 		    op == BLKIF_OP_WRITE_BARRIER	||
505 		    op == BLKIF_OP_FLUSH_DISKCACHE) {
506 #ifdef DEBUG
507 			xdb_dump_request_oe(req);
508 #endif
509 			xreq = xdb_get_req(vdp);
510 			ASSERT(xreq);
511 			switch (op) {
512 			case BLKIF_OP_READ:
513 				vdp->xs_stat_req_reads++;
514 				break;
515 			case BLKIF_OP_WRITE_BARRIER:
516 				vdp->xs_stat_req_barriers++;
517 				/* FALLTHRU */
518 			case BLKIF_OP_WRITE:
519 				vdp->xs_stat_req_writes++;
520 				break;
521 			case BLKIF_OP_FLUSH_DISKCACHE:
522 				vdp->xs_stat_req_flushes++;
523 				break;
524 			}
525 
526 			xreq->xr_curseg = 0; /* start from first segment */
527 			bp = xdb_get_buf(vdp, req, xreq);
528 			if (bp == NULL) {
529 				/* failed to form a buf */
530 				xdb_free_req(xreq);
531 				xdb_response(vdp, req, B_FALSE);
532 				continue;
533 			}
534 			bp->av_forw = NULL;
535 
536 			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
537 			    " buf %p, blkno %lld, size %lu, addr %p",
538 			    (void *)bp, (longlong_t)bp->b_blkno,
539 			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));
540 
541 			/* send bp to underlying blk driver */
542 			if (vdp->xs_f_iobuf == NULL) {
543 				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
544 			} else {
545 				vdp->xs_l_iobuf->av_forw = bp;
546 				vdp->xs_l_iobuf = bp;
547 			}
548 			vdp->xs_ionum++;
549 		} else {
550 			xdb_response(vdp, req, B_FALSE);
551 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
552 			    "Unsupported cmd received from dom %d",
553 			    ddi_get_name_addr(dip), vdp->xs_peer));
554 		}
555 	}
556 	/* notify our taskq to push buf to underlying blk driver */
557 	if (ret == DDI_INTR_CLAIMED)
558 		cv_broadcast(&vdp->xs_iocv);
559 
560 	mutex_exit(&vdp->xs_iomutex);
561 
562 	return (ret);
563 }
564 
565 static int
566 xdb_biodone(buf_t *bp)
567 {
568 	blkif_response_t *resp;
569 	int i, err, bioerr;
570 	uint8_t segs;
571 	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
572 	xdb_request_t *xreq = XDB_BP2XREQ(bp);
573 	xdb_t *vdp = xreq->xr_vdp;
574 	xendev_ring_t *ringp = vdp->xs_ring;
575 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
576 	buf_t *nbp;
577 
578 	bioerr = geterror(bp);
579 	if (bioerr)
580 		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
581 		    ddi_get_name_addr(vdp->xs_dip), bioerr));
582 
583 	/* check if we are done w/ this I/O request */
584 	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
585 		nbp = xdb_get_buf(vdp, NULL, xreq);
586 		if (nbp) {
587 			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
588 			if (err == 0) {
589 				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
590 				    "sent buf to backend ok"));
591 				return (DDI_SUCCESS);
592 			}
593 			bioerr = EIO;
594 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
595 			    "sent buf to backend dev failed, err=%d",
596 			    ddi_get_name_addr(vdp->xs_dip), err));
597 		} else {
598 			bioerr = EIO;
599 		}
600 	}
601 
602 	/* unmap io pages */
603 	segs = xreq->xr_buf_pages;
604 	/*
605 	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
606 	 * according to the definition of blk interface by Xen
607 	 */
608 	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
609 	for (i = 0; i < segs; i++) {
610 		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
611 		    vdp->xs_iopage_va, xreq->xr_idx, i);
612 #ifdef DEBUG
613 		mutex_enter(&vdp->xs_iomutex);
614 		unlogva(vdp, unmapops[i].host_addr);
615 		mutex_exit(&vdp->xs_iomutex);
616 #endif
617 		unmapops[i].dev_bus_addr = NULL;
618 		unmapops[i].handle = xreq->xr_page_hdls[i];
619 	}
620 	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
621 	    unmapops, segs);
622 	ASSERT(!err);
623 
624 	/*
625 	 * If we have reached a barrier write or a cache flush , then we must
626 	 * flush all our I/Os.
627 	 */
628 	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
629 	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
630 		/*
631 		 * XXX At this point the write did succeed, so I don't
632 		 * believe we should report an error because the flush
633 		 * failed. However, this is a debatable point, so
634 		 * maybe we need to think more carefully about this.
635 		 * For now, just cast to void.
636 		 */
637 		(void) ldi_ioctl(vdp->xs_ldi_hdl,
638 		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
639 	}
640 
641 	mutex_enter(&vdp->xs_iomutex);
642 
643 	/* send response back to frontend */
644 	if (vdp->xs_if_status == XDB_CONNECTED) {
645 		resp = xvdi_ring_get_response(ringp);
646 		ASSERT(resp);
647 		ddi_put64(acchdl, &resp->id, xreq->xr_id);
648 		ddi_put8(acchdl, &resp->operation, xreq->xr_op);
649 		ddi_put16(acchdl, (uint16_t *)&resp->status,
650 		    bioerr ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY);
651 		if (xvdi_ring_push_response(ringp))
652 			xvdi_notify_oe(vdp->xs_dip);
653 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
654 		    "sent resp back to frontend, id=%llu",
655 		    (unsigned long long)xreq->xr_id));
656 	}
657 	/* free io resources */
658 	biofini(bp);
659 	xdb_free_req(xreq);
660 
661 	vdp->xs_ionum--;
662 	if ((vdp->xs_if_status != XDB_CONNECTED) && (vdp->xs_ionum == 0))
663 		/* we're closing, someone is waiting for I/O clean-up */
664 		cv_signal(&vdp->xs_ionumcv);
665 
666 	mutex_exit(&vdp->xs_iomutex);
667 
668 	return (DDI_SUCCESS);
669 }
670 
671 static int
672 xdb_bindto_frontend(xdb_t *vdp)
673 {
674 	int err;
675 	char *oename;
676 	grant_ref_t gref;
677 	evtchn_port_t evtchn;
678 	dev_info_t *dip = vdp->xs_dip;
679 
680 	/*
681 	 * Gather info from frontend
682 	 */
683 	oename = xvdi_get_oename(dip);
684 	if (oename == NULL)
685 		return (DDI_FAILURE);
686 
687 	err = xenbus_gather(XBT_NULL, oename,
688 	    "ring-ref", "%lu", &gref, "event-channel", "%u", &evtchn, NULL);
689 	if (err != 0) {
690 		xvdi_fatal_error(dip, err,
691 		    "Getting ring-ref and evtchn from frontend");
692 		return (DDI_FAILURE);
693 	}
694 
695 	/*
696 	 * map and init ring
697 	 */
698 	err = xvdi_map_ring(dip, BLKIF_RING_SIZE,
699 	    sizeof (union blkif_sring_entry), gref, &vdp->xs_ring);
700 	if (err != DDI_SUCCESS)
701 		return (DDI_FAILURE);
702 	/*
703 	 * This will be removed after we use shadow I/O ring request since
704 	 * we don't need to access the ring itself directly, thus the access
705 	 * handle is not needed
706 	 */
707 	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;
708 
709 	/*
710 	 * bind event channel
711 	 */
712 	err = xvdi_bind_evtchn(dip, evtchn);
713 	if (err != DDI_SUCCESS) {
714 		xvdi_unmap_ring(vdp->xs_ring);
715 		return (DDI_FAILURE);
716 	}
717 
718 	return (DDI_SUCCESS);
719 }
720 
721 static void
722 xdb_unbindfrom_frontend(xdb_t *vdp)
723 {
724 	xvdi_free_evtchn(vdp->xs_dip);
725 	xvdi_unmap_ring(vdp->xs_ring);
726 }
727 
/* lofi control node, /devices prefix of lofi minor nodes, and open mode */
#define	LOFI_CTRL_NODE	"/dev/lofictl"
#define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
/* parenthesized so the expansion is safe inside any larger expression */
#define	LOFI_MODE	(FREAD | FWRITE | FEXCL)
731 
732 static int
733 xdb_setup_node(xdb_t *vdp, char *path)
734 {
735 	dev_info_t *dip;
736 	char *xsnode, *node;
737 	ldi_handle_t ldi_hdl;
738 	struct lofi_ioctl *li;
739 	int minor;
740 	int err;
741 	unsigned int len;
742 
743 	dip = vdp->xs_dip;
744 	xsnode = xvdi_get_xsname(dip);
745 	if (xsnode == NULL)
746 		return (DDI_FAILURE);
747 
748 	err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node, &len);
749 	if (err != 0) {
750 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
751 		return (DDI_FAILURE);
752 	}
753 
754 	if (!XDB_IS_LOFI(vdp)) {
755 		(void) strlcpy(path, node, MAXPATHLEN + 1);
756 		kmem_free(node, len);
757 		return (DDI_SUCCESS);
758 	}
759 
760 	do {
761 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
762 		    &ldi_hdl, vdp->xs_ldi_li);
763 	} while (err == EBUSY);
764 	if (err != 0) {
765 		kmem_free(node, len);
766 		return (DDI_FAILURE);
767 	}
768 
769 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
770 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
771 	kmem_free(node, len);
772 	if (ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
773 	    LOFI_MODE | FKIOCTL, kcred, &minor) != 0) {
774 		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
775 		    ddi_get_name_addr(dip), li->li_filename);
776 		(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
777 		kmem_free(li, sizeof (*li));
778 		return (DDI_FAILURE);
779 	}
780 	/*
781 	 * return '/devices/...' instead of '/dev/lofi/...' since the
782 	 * former is available immediately after calling ldi_ioctl
783 	 */
784 	(void) snprintf(path, MAXPATHLEN + 1, LOFI_DEV_NODE "%d", minor);
785 	(void) xenbus_printf(XBT_NULL, xsnode, "node", "%s", path);
786 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
787 	kmem_free(li, sizeof (*li));
788 	return (DDI_SUCCESS);
789 }
790 
791 static void
792 xdb_teardown_node(xdb_t *vdp)
793 {
794 	dev_info_t *dip;
795 	char *xsnode, *node;
796 	ldi_handle_t ldi_hdl;
797 	struct lofi_ioctl *li;
798 	int err;
799 	unsigned int len;
800 
801 	if (!XDB_IS_LOFI(vdp))
802 		return;
803 
804 	dip = vdp->xs_dip;
805 	xsnode = xvdi_get_xsname(dip);
806 	if (xsnode == NULL)
807 		return;
808 
809 	err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node, &len);
810 	if (err != 0) {
811 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
812 		return;
813 	}
814 
815 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
816 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
817 	kmem_free(node, len);
818 
819 	do {
820 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
821 		    &ldi_hdl, vdp->xs_ldi_li);
822 	} while (err == EBUSY);
823 
824 	if (err != 0) {
825 		kmem_free(li, sizeof (*li));
826 		return;
827 	}
828 
829 	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
830 	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
831 		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
832 		    ddi_get_name_addr(dip), li->li_filename);
833 	}
834 
835 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
836 	kmem_free(li, sizeof (*li));
837 }
838 
839 static int
840 xdb_open_device(xdb_t *vdp)
841 {
842 	uint64_t devsize;
843 	dev_info_t *dip;
844 	char *xsnode;
845 	char *nodepath;
846 	char *mode = NULL;
847 	char *type = NULL;
848 	int err;
849 
850 	dip = vdp->xs_dip;
851 	xsnode = xvdi_get_xsname(dip);
852 	if (xsnode == NULL)
853 		return (DDI_FAILURE);
854 
855 	err = xenbus_gather(XBT_NULL, xsnode,
856 	    "mode", NULL, &mode, "type", NULL, &type, NULL);
857 	if (err != 0) {
858 		if (mode)
859 			kmem_free(mode, strlen(mode) + 1);
860 		if (type)
861 			kmem_free(type, strlen(type) + 1);
862 		xvdi_fatal_error(dip, err,
863 		    "Getting mode and type from backend device");
864 		return (DDI_FAILURE);
865 	}
866 	if (strcmp(type, "file") == 0) {
867 		vdp->xs_type |= XDB_DEV_LOFI;
868 	}
869 	kmem_free(type, strlen(type) + 1);
870 	if ((strcmp(mode, "r") == NULL) || (strcmp(mode, "ro") == NULL)) {
871 		vdp->xs_type |= XDB_DEV_RO;
872 	}
873 	kmem_free(mode, strlen(mode) + 1);
874 
875 	/*
876 	 * try to open backend device
877 	 */
878 	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
879 		return (DDI_FAILURE);
880 
881 	nodepath = kmem_zalloc(MAXPATHLEN + 1, KM_SLEEP);
882 	err = xdb_setup_node(vdp, nodepath);
883 	if (err != DDI_SUCCESS) {
884 		xvdi_fatal_error(dip, err,
885 		    "Getting device path of backend device");
886 		ldi_ident_release(vdp->xs_ldi_li);
887 		kmem_free(nodepath, MAXPATHLEN + 1);
888 		return (DDI_FAILURE);
889 	}
890 
891 	if (ldi_open_by_name(nodepath,
892 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
893 	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
894 		xdb_teardown_node(vdp);
895 		ldi_ident_release(vdp->xs_ldi_li);
896 		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
897 		    ddi_get_name_addr(dip), nodepath);
898 		kmem_free(nodepath, MAXPATHLEN + 1);
899 		return (DDI_FAILURE);
900 	}
901 
902 	/* check if it's a CD/DVD disc */
903 	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
904 	    "inquiry-device-type", DTYPE_DIRECT) == DTYPE_RODIRECT)
905 		vdp->xs_type |= XDB_DEV_CD;
906 	/* check if it's a removable disk */
907 	if (ldi_prop_exists(vdp->xs_ldi_hdl,
908 	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
909 	    "removable-media"))
910 		vdp->xs_type |= XDB_DEV_RMB;
911 
912 	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
913 		(void) ldi_close(vdp->xs_ldi_hdl,
914 		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
915 		xdb_teardown_node(vdp);
916 		ldi_ident_release(vdp->xs_ldi_li);
917 		kmem_free(nodepath, MAXPATHLEN + 1);
918 		return (DDI_FAILURE);
919 	}
920 	vdp->xs_sectors = devsize / XB_BSIZE;
921 
922 	kmem_free(nodepath, MAXPATHLEN + 1);
923 	return (DDI_SUCCESS);
924 }
925 
926 static void
927 xdb_close_device(xdb_t *vdp)
928 {
929 	(void) ldi_close(vdp->xs_ldi_hdl,
930 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
931 	xdb_teardown_node(vdp);
932 	ldi_ident_release(vdp->xs_ldi_li);
933 	vdp->xs_ldi_li = NULL;
934 	vdp->xs_ldi_hdl = NULL;
935 }
936 
937 /*
938  * Kick-off connect process
939  * If xs_fe_status == XDB_FE_READY and xs_dev_status == XDB_DEV_READY
940  * the xs_if_status will be changed to XDB_CONNECTED on success,
941  * otherwise, xs_if_status will not be changed
942  */
943 static int
944 xdb_start_connect(xdb_t *vdp)
945 {
946 	uint32_t dinfo;
947 	xenbus_transaction_t xbt;
948 	int err, svdst;
949 	char *xsnode;
950 	dev_info_t *dip = vdp->xs_dip;
951 	char *barrier;
952 	uint_t len;
953 
954 	/*
955 	 * Start connect to frontend only when backend device are ready
956 	 * and frontend has moved to XenbusStateInitialised, which means
957 	 * ready to connect
958 	 */
959 	ASSERT((vdp->xs_fe_status == XDB_FE_READY) &&
960 	    (vdp->xs_dev_status == XDB_DEV_READY));
961 
962 	if (((xsnode = xvdi_get_xsname(dip)) == NULL)		 ||
963 	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1) ||
964 	    (xdb_open_device(vdp) != DDI_SUCCESS))
965 		return (DDI_FAILURE);
966 
967 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);
968 
969 	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS)
970 		goto errout1;
971 
972 	/* init i/o requests */
973 	xdb_init_ioreqs(vdp);
974 
975 	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
976 	    != DDI_SUCCESS)
977 		goto errout2;
978 
979 	/*
980 	 * we can recieve intr any time from now on
981 	 * mark that we're ready to take intr
982 	 */
983 	mutex_enter(&vdp->xs_iomutex);
984 	/*
985 	 * save it in case we need to restore when we
986 	 * fail to write xenstore later
987 	 */
988 	svdst = vdp->xs_if_status;
989 	vdp->xs_if_status = XDB_CONNECTED;
990 	mutex_exit(&vdp->xs_iomutex);
991 
992 	/* write into xenstore the info needed by frontend */
993 trans_retry:
994 	if (xenbus_transaction_start(&xbt)) {
995 		xvdi_fatal_error(dip, EIO, "transaction start");
996 		goto errout3;
997 	}
998 
999 	/*
1000 	 * If feature-barrier isn't present in xenstore, add it.
1001 	 */
1002 	if (xenbus_read(xbt, xsnode, "feature-barrier",
1003 	    (void **)&barrier, &len) != 0) {
1004 		if ((err = xenbus_printf(xbt, xsnode, "feature-barrier",
1005 		    "%d", 1)) != 0) {
1006 			cmn_err(CE_WARN, "xdb@%s: failed to write "
1007 			    "'feature-barrier'", ddi_get_name_addr(dip));
1008 			xvdi_fatal_error(dip, err, "writing 'feature-barrier'");
1009 			goto abort_trans;
1010 		}
1011 	} else
1012 		kmem_free(barrier, len);
1013 
1014 	dinfo = 0;
1015 	if (XDB_IS_RO(vdp))
1016 		dinfo |= VDISK_READONLY;
1017 	if (XDB_IS_CD(vdp))
1018 		dinfo |= VDISK_CDROM;
1019 	if (XDB_IS_RMB(vdp))
1020 		dinfo |= VDISK_REMOVABLE;
1021 	if (err = xenbus_printf(xbt, xsnode, "info", "%u", dinfo)) {
1022 		xvdi_fatal_error(dip, err, "writing 'info'");
1023 		goto abort_trans;
1024 	}
1025 
1026 	/* hard-coded 512-byte sector size */
1027 	if (err = xenbus_printf(xbt, xsnode, "sector-size", "%u", DEV_BSIZE)) {
1028 		xvdi_fatal_error(dip, err, "writing 'sector-size'");
1029 		goto abort_trans;
1030 	}
1031 
1032 	if (err = xenbus_printf(xbt, xsnode, "sectors", "%"PRIu64,
1033 	    vdp->xs_sectors)) {
1034 		xvdi_fatal_error(dip, err, "writing 'sectors'");
1035 		goto abort_trans;
1036 	}
1037 
1038 	if (err = xenbus_printf(xbt, xsnode, "instance", "%d",
1039 	    ddi_get_instance(dip))) {
1040 		xvdi_fatal_error(dip, err, "writing 'instance'");
1041 		goto abort_trans;
1042 	}
1043 
1044 	if ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0) {
1045 		xvdi_fatal_error(dip, err, "writing 'state'");
1046 		goto abort_trans;
1047 	}
1048 
1049 	if (err = xenbus_transaction_end(xbt, 0)) {
1050 		if (err == EAGAIN)
1051 			/* transaction is ended, don't need to abort it */
1052 			goto trans_retry;
1053 		xvdi_fatal_error(dip, err, "completing transaction");
1054 		goto errout3;
1055 	}
1056 
1057 	return (DDI_SUCCESS);
1058 
1059 abort_trans:
1060 	(void) xenbus_transaction_end(xbt, 1);
1061 errout3:
1062 	mutex_enter(&vdp->xs_iomutex);
1063 	vdp->xs_if_status = svdst;
1064 	mutex_exit(&vdp->xs_iomutex);
1065 	ddi_remove_intr(dip, 0, NULL);
1066 errout2:
1067 	xdb_uninit_ioreqs(vdp);
1068 	xdb_unbindfrom_frontend(vdp);
1069 errout1:
1070 	xdb_close_device(vdp);
1071 	return (DDI_FAILURE);
1072 }
1073 
1074 /*
1075  * Kick-off disconnect process
1076  * xs_if_status will not be changed
1077  */
1078 static int
1079 xdb_start_disconnect(xdb_t *vdp)
1080 {
1081 	/*
1082 	 * Kick-off disconnect process
1083 	 */
1084 	if (xvdi_switch_state(vdp->xs_dip, XBT_NULL, XenbusStateClosing) > 0)
1085 		return (DDI_FAILURE);
1086 
1087 	return (DDI_SUCCESS);
1088 }
1089 
1090 /*
1091  * Disconnect from frontend and close backend device
1092  * ifstatus will be changed to XDB_DISCONNECTED
1093  * Xenbus state will be changed to XenbusStateClosed
1094  */
1095 static void
1096 xdb_close(dev_info_t *dip)
1097 {
1098 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1099 
1100 	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
1101 
1102 	mutex_enter(&vdp->xs_iomutex);
1103 
1104 	if (vdp->xs_if_status != XDB_CONNECTED) {
1105 		vdp->xs_if_status = XDB_DISCONNECTED;
1106 		cv_broadcast(&vdp->xs_iocv);
1107 		mutex_exit(&vdp->xs_iomutex);
1108 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1109 		return;
1110 	}
1111 	vdp->xs_if_status = XDB_DISCONNECTED;
1112 	cv_broadcast(&vdp->xs_iocv);
1113 
1114 	mutex_exit(&vdp->xs_iomutex);
1115 
1116 	/* stop accepting I/O request from frontend */
1117 	ddi_remove_intr(dip, 0, NULL);
1118 	/* clear all on-going I/Os, if any */
1119 	mutex_enter(&vdp->xs_iomutex);
1120 	while (vdp->xs_ionum > 0)
1121 		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
1122 	mutex_exit(&vdp->xs_iomutex);
1123 
1124 	/* clean up resources and close this interface */
1125 	xdb_uninit_ioreqs(vdp);
1126 	xdb_unbindfrom_frontend(vdp);
1127 	xdb_close_device(vdp);
1128 	vdp->xs_peer = (domid_t)-1;
1129 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1130 }
1131 
1132 /*
1133  * Xdb_check_state_transition will check the XenbusState change to see
1134  * if the change is a valid transition or not.
1135  * The new state is written by frontend domain, or by running xenstore-write
1136  * to change it manually in dom0
1137  */
1138 static int
1139 xdb_check_state_transition(xdb_t *vdp, XenbusState oestate)
1140 {
1141 	enum xdb_state status;
1142 	int stcheck;
1143 #define	STOK	0 /* need further process */
1144 #define	STNOP	1 /* no action need taking */
1145 #define	STBUG	2 /* unexpected state change, could be a bug */
1146 
1147 	status = vdp->xs_if_status;
1148 	stcheck = STOK;
1149 
1150 	switch (status) {
1151 	case XDB_UNKNOWN:
1152 		if (vdp->xs_fe_status == XDB_FE_UNKNOWN) {
1153 			if ((oestate == XenbusStateUnknown)		||
1154 			    (oestate == XenbusStateConnected))
1155 				stcheck = STBUG;
1156 			else if ((oestate == XenbusStateInitialising)	||
1157 			    (oestate == XenbusStateInitWait))
1158 				stcheck = STNOP;
1159 		} else {
1160 			if ((oestate == XenbusStateUnknown)		||
1161 			    (oestate == XenbusStateInitialising)	||
1162 			    (oestate == XenbusStateInitWait)		||
1163 			    (oestate == XenbusStateConnected))
1164 				stcheck = STBUG;
1165 			else if (oestate == XenbusStateInitialised)
1166 				stcheck = STNOP;
1167 		}
1168 		break;
1169 	case XDB_CONNECTED:
1170 		if ((oestate == XenbusStateUnknown)		||
1171 		    (oestate == XenbusStateInitialising)	||
1172 		    (oestate == XenbusStateInitWait)		||
1173 		    (oestate == XenbusStateInitialised))
1174 			stcheck = STBUG;
1175 		else if (oestate == XenbusStateConnected)
1176 			stcheck = STNOP;
1177 		break;
1178 	case XDB_DISCONNECTED:
1179 	default:
1180 			stcheck = STBUG;
1181 	}
1182 
1183 	if (stcheck == STOK)
1184 		return (DDI_SUCCESS);
1185 
1186 	if (stcheck == STBUG)
1187 		cmn_err(CE_NOTE, "xdb@%s: unexpected otherend "
1188 		    "state change to %d!, when status is %d",
1189 		    ddi_get_name_addr(vdp->xs_dip), oestate, status);
1190 
1191 	return (DDI_FAILURE);
1192 }
1193 
1194 static void
1195 xdb_send_buf(void *arg)
1196 {
1197 	buf_t *bp;
1198 	xdb_t *vdp = (xdb_t *)arg;
1199 
1200 	mutex_enter(&vdp->xs_iomutex);
1201 
1202 	while (vdp->xs_if_status != XDB_DISCONNECTED) {
1203 		while ((bp = vdp->xs_f_iobuf) != NULL) {
1204 			vdp->xs_f_iobuf = bp->av_forw;
1205 			bp->av_forw = NULL;
1206 			mutex_exit(&vdp->xs_iomutex);
1207 			if (bp->b_bcount != 0) {
1208 				int err = ldi_strategy(vdp->xs_ldi_hdl, bp);
1209 				if (err != 0) {
1210 					bp->b_flags |= B_ERROR;
1211 					(void) xdb_biodone(bp);
1212 					XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
1213 					    "xdb@%s: sent buf to backend dev"
1214 					    "failed, err=%d",
1215 					    ddi_get_name_addr(vdp->xs_dip),
1216 					    err));
1217 				} else {
1218 					XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
1219 					    "sent buf to backend ok"));
1220 				}
1221 			} else /* no I/O need to be done */
1222 				(void) xdb_biodone(bp);
1223 
1224 			mutex_enter(&vdp->xs_iomutex);
1225 		}
1226 
1227 		if (vdp->xs_if_status != XDB_DISCONNECTED)
1228 			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
1229 	}
1230 
1231 	mutex_exit(&vdp->xs_iomutex);
1232 }
1233 
1234 /*ARGSUSED*/
1235 static void
1236 xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1237     void *impl_data)
1238 {
1239 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1240 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1241 
1242 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1243 	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));
1244 
1245 	mutex_enter(&vdp->xs_cbmutex);
1246 	if (state == Connected) {
1247 		/* Hotplug script has completed successfully */
1248 		if (vdp->xs_dev_status == XDB_DEV_UNKNOWN) {
1249 			vdp->xs_dev_status = XDB_DEV_READY;
1250 			if (vdp->xs_fe_status == XDB_FE_READY)
1251 				/* try to connect to frontend */
1252 				if (xdb_start_connect(vdp) != DDI_SUCCESS)
1253 					(void) xdb_start_disconnect(vdp);
1254 		}
1255 	}
1256 	mutex_exit(&vdp->xs_cbmutex);
1257 }
1258 
1259 /*ARGSUSED*/
1260 static void
1261 xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1262     void *impl_data)
1263 {
1264 	XenbusState new_state = *(XenbusState *)impl_data;
1265 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1266 
1267 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1268 	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));
1269 
1270 	mutex_enter(&vdp->xs_cbmutex);
1271 
1272 	if (xdb_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1273 		mutex_exit(&vdp->xs_cbmutex);
1274 		return;
1275 	}
1276 
1277 	switch (new_state) {
1278 	case XenbusStateInitialised:
1279 		ASSERT(vdp->xs_if_status == XDB_UNKNOWN);
1280 
1281 		/* frontend is ready for connecting */
1282 		vdp->xs_fe_status = XDB_FE_READY;
1283 
1284 		if (vdp->xs_dev_status == XDB_DEV_READY)
1285 			if (xdb_start_connect(vdp) != DDI_SUCCESS)
1286 				(void) xdb_start_disconnect(vdp);
1287 		break;
1288 	case XenbusStateClosing:
1289 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1290 		break;
1291 	case XenbusStateClosed:
1292 		/* clean up */
1293 		xdb_close(dip);
1294 	}
1295 
1296 	mutex_exit(&vdp->xs_cbmutex);
1297 }
1298 
1299 static int
1300 xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1301 {
1302 	xdb_t *vdp;
1303 	ddi_iblock_cookie_t ibc;
1304 	int instance;
1305 
1306 	switch (cmd) {
1307 	case DDI_RESUME:
1308 		return (DDI_FAILURE);
1309 	case DDI_ATTACH:
1310 		break;
1311 	default:
1312 		return (DDI_FAILURE);
1313 	}
1314 
1315 	/* DDI_ATTACH */
1316 	instance = ddi_get_instance(dip);
1317 	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
1318 		return (DDI_FAILURE);
1319 
1320 	vdp = ddi_get_soft_state(xdb_statep, instance);
1321 	vdp->xs_dip = dip;
1322 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
1323 		goto errout1;
1324 
1325 	if (!xdb_kstat_init(vdp))
1326 		goto errout1;
1327 
1328 	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
1329 	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
1330 	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
1331 	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);
1332 
1333 	ddi_set_driver_private(dip, vdp);
1334 
1335 	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
1336 	    TASKQ_DEFAULTPRI, 0);
1337 	if (vdp->xs_iotaskq == NULL)
1338 		goto errout2;
1339 	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
1340 	    DDI_SLEEP);
1341 
1342 	/* Watch frontend and hotplug state change */
1343 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change) !=
1344 	    DDI_SUCCESS)
1345 		goto errout3;
1346 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change) !=
1347 	    DDI_SUCCESS) {
1348 		goto errout4;
1349 	}
1350 
1351 	/*
1352 	 * Kick-off hotplug script
1353 	 */
1354 	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
1355 		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
1356 		    ddi_get_name_addr(dip));
1357 		goto errout4;
1358 	}
1359 
1360 	/*
1361 	 * start waiting for hotplug event and otherend state event
1362 	 * mainly for debugging, frontend will not take any op seeing this
1363 	 */
1364 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
1365 
1366 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
1367 	    ddi_get_name_addr(dip)));
1368 	return (DDI_SUCCESS);
1369 
1370 errout4:
1371 	xvdi_remove_event_handler(dip, NULL);
1372 errout3:
1373 	mutex_enter(&vdp->xs_cbmutex);
1374 	mutex_enter(&vdp->xs_iomutex);
1375 	vdp->xs_if_status = XDB_DISCONNECTED;
1376 	cv_broadcast(&vdp->xs_iocv);
1377 	mutex_exit(&vdp->xs_iomutex);
1378 	mutex_exit(&vdp->xs_cbmutex);
1379 	ddi_taskq_destroy(vdp->xs_iotaskq);
1380 errout2:
1381 	ddi_set_driver_private(dip, NULL);
1382 	cv_destroy(&vdp->xs_iocv);
1383 	cv_destroy(&vdp->xs_ionumcv);
1384 	mutex_destroy(&vdp->xs_cbmutex);
1385 	mutex_destroy(&vdp->xs_iomutex);
1386 	kstat_delete(vdp->xs_kstats);
1387 errout1:
1388 	ddi_soft_state_free(xdb_statep, instance);
1389 	return (DDI_FAILURE);
1390 }
1391 
1392 /*ARGSUSED*/
1393 static int
1394 xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1395 {
1396 	int instance = ddi_get_instance(dip);
1397 	xdb_t *vdp = XDB_INST2SOFTS(instance);
1398 
1399 	switch (cmd) {
1400 	case DDI_SUSPEND:
1401 		return (DDI_FAILURE);
1402 	case DDI_DETACH:
1403 		break;
1404 	default:
1405 		return (DDI_FAILURE);
1406 	}
1407 
1408 	/* DDI_DETACH handling */
1409 
1410 	/* shouldn't detach, if still used by frontend */
1411 	mutex_enter(&vdp->xs_iomutex);
1412 	if (vdp->xs_if_status != XDB_DISCONNECTED) {
1413 		mutex_exit(&vdp->xs_iomutex);
1414 		return (DDI_FAILURE);
1415 	}
1416 	mutex_exit(&vdp->xs_iomutex);
1417 
1418 	xvdi_remove_event_handler(dip, NULL);
1419 	/* can do nothing about it, if it fails */
1420 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1421 
1422 	ddi_taskq_destroy(vdp->xs_iotaskq);
1423 	cv_destroy(&vdp->xs_iocv);
1424 	cv_destroy(&vdp->xs_ionumcv);
1425 	mutex_destroy(&vdp->xs_cbmutex);
1426 	mutex_destroy(&vdp->xs_iomutex);
1427 	kstat_delete(vdp->xs_kstats);
1428 	ddi_set_driver_private(dip, NULL);
1429 	ddi_soft_state_free(xdb_statep, instance);
1430 
1431 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
1432 	    ddi_get_name_addr(dip)));
1433 	return (DDI_SUCCESS);
1434 }
1435 
1436 static struct dev_ops xdb_dev_ops = {
1437 	DEVO_REV,	/* devo_rev */
1438 	0,		/* devo_refcnt */
1439 	ddi_getinfo_1to1, /* devo_getinfo */
1440 	nulldev,	/* devo_identify */
1441 	nulldev,	/* devo_probe */
1442 	xdb_attach,	/* devo_attach */
1443 	xdb_detach,	/* devo_detach */
1444 	nodev,		/* devo_reset */
1445 	NULL,		/* devo_cb_ops */
1446 	NULL,		/* devo_bus_ops */
1447 	NULL		/* power */
1448 };
1449 
1450 /*
1451  * Module linkage information for the kernel.
1452  */
1453 static struct modldrv modldrv = {
1454 	&mod_driverops,			/* Type of module. */
1455 	"vbd backend driver %I%",	/* Name of the module */
1456 	&xdb_dev_ops			/* driver ops */
1457 };
1458 
1459 static struct modlinkage xdb_modlinkage = {
1460 	MODREV_1,
1461 	&modldrv,
1462 	NULL
1463 };
1464 
1465 int
1466 _init(void)
1467 {
1468 	int rv;
1469 
1470 	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
1471 	    sizeof (xdb_t), 0)) == 0)
1472 		if ((rv = mod_install(&xdb_modlinkage)) != 0)
1473 			ddi_soft_state_fini((void **)&xdb_statep);
1474 	return (rv);
1475 }
1476 
1477 int
1478 _fini(void)
1479 {
1480 	int rv;
1481 
1482 	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
1483 		return (rv);
1484 	ddi_soft_state_fini((void **)&xdb_statep);
1485 	return (rv);
1486 }
1487 
1488 int
1489 _info(struct modinfo *modinfop)
1490 {
1491 	return (mod_info(&xdb_modlinkage, modinfop));
1492 }
1493