xref: /illumos-gate/usr/src/uts/common/xen/io/xdb.c (revision 3afe87ebb25691cb6d158edaa34a6fb9b703a691)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Note: This is the backend part of the split PV disk driver. This driver
29  * is not a nexus driver, nor is it a leaf driver(block/char/stream driver).
30  * Currently, it does not create any minor node. So, although, it runs in
31  * backend domain, it will not be used directly from within dom0.
32  * It simply gets block I/O requests issued by frontend from a shared page
33  * (blkif ring buffer - defined by Xen) between backend and frontend domain,
34  * generates a buf, and push it down to underlying disk target driver via
35  * ldi interface. When buf is done, this driver will generate a response
36  * and put it into ring buffer to inform frontend of the status of the I/O
37  * request issued by it. When a new virtual device entry is added in xenstore,
 * there will be a watch event sent from Xen to xvdi framework, who will,
39  * in turn, create the devinfo node and try to attach this driver
40  * (see xvdi_create_dev). When frontend peer changes its state to
41  * XenbusStateClose, an event will also be sent from Xen to xvdi framework,
42  * who will detach and remove this devinfo node (see i_xvdi_oestate_handler).
43  * I/O requests get from ring buffer and event coming from xenstore cannot be
44  * trusted. We verify them in xdb_get_buf() and xdb_check_state_transition().
45  *
46  * Virtual device configuration is read/written from/to the database via
47  * xenbus_* interfaces. Driver also use xvdi_* to interact with hypervisor.
48  * There is an on-going effort to make xvdi_* cover all xenbus_*.
49  */
50 
51 #include <sys/types.h>
52 #include <sys/conf.h>
53 #include <sys/ddi.h>
54 #include <sys/dditypes.h>
55 #include <sys/sunddi.h>
56 #include <sys/list.h>
57 #include <sys/dkio.h>
58 #include <sys/cmlb.h>
59 #include <sys/vtoc.h>
60 #include <sys/modctl.h>
61 #include <sys/bootconf.h>
62 #include <sys/promif.h>
63 #include <sys/sysmacros.h>
64 #include <public/io/xenbus.h>
65 #include <xen/sys/xenbus_impl.h>
66 #include <xen/sys/xendev.h>
67 #include <sys/gnttab.h>
68 #include <sys/scsi/generic/inquiry.h>
69 #include <vm/seg_kmem.h>
70 #include <vm/hat_i86.h>
71 #include <sys/gnttab.h>
72 #include <sys/lofi.h>
73 #include <io/xdf.h>
74 #include <xen/io/blkif_impl.h>
75 #include <io/xdb.h>
76 
77 static xdb_t *xdb_statep;
78 static int xdb_debug = 0;
79 
80 static int xdb_push_response(xdb_t *, uint64_t, uint8_t, uint16_t);
81 static int xdb_get_request(xdb_t *, blkif_request_t *);
82 static void blkif_get_x86_32_req(blkif_request_t *, blkif_x86_32_request_t *);
83 static void blkif_get_x86_64_req(blkif_request_t *, blkif_x86_64_request_t *);
84 
85 #ifdef DEBUG
86 /*
87  * debug aid functions
88  */
89 
90 static void
91 logva(xdb_t *vdp, uint64_t va)
92 {
93 	uint64_t *page_addrs;
94 	int i;
95 
96 	page_addrs = vdp->page_addrs;
97 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
98 		if (page_addrs[i] == va)
99 			debug_enter("VA remapping found!");
100 	}
101 
102 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
103 		if (page_addrs[i] == 0) {
104 			page_addrs[i] = va;
105 			break;
106 		}
107 	}
108 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
109 }
110 
111 static void
112 unlogva(xdb_t *vdp, uint64_t va)
113 {
114 	uint64_t *page_addrs;
115 	int i;
116 
117 	page_addrs = vdp->page_addrs;
118 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
119 		if (page_addrs[i] == va) {
120 			page_addrs[i] = 0;
121 			break;
122 		}
123 	}
124 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
125 }
126 
127 static void
128 xdb_dump_request_oe(blkif_request_t *req)
129 {
130 	int i;
131 
132 	/*
133 	 * Exploit the public interface definitions for BLKIF_OP_READ
134 	 * etc..
135 	 */
136 	char *op_name[] = { "read", "write", "barrier", "flush" };
137 
138 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
139 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
140 	    req->nr_segments));
141 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
142 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
143 	    (unsigned long long)req->id));
144 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
145 	    (unsigned long long)req->sector_number));
146 	for (i = 0; i < req->nr_segments; i++) {
147 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
148 		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
149 		    req->seg[i].last_sect));
150 	}
151 }
152 #endif /* DEBUG */
153 
154 /*
155  * Statistics.
156  */
static char *xdb_stats[] = {
	"rd_reqs",	/* filled from xs_stat_req_reads */
	"wr_reqs",	/* filled from xs_stat_req_writes */
	"br_reqs",	/* filled from xs_stat_req_barriers */
	"fl_reqs",	/* filled from xs_stat_req_flushes */
	"oo_reqs",	/* xdb_kstat_update() always reports 0 */
};
164 
165 static int
166 xdb_kstat_update(kstat_t *ksp, int flag)
167 {
168 	xdb_t *vdp;
169 	kstat_named_t *knp;
170 
171 	if (flag != KSTAT_READ)
172 		return (EACCES);
173 
174 	vdp = ksp->ks_private;
175 	knp = ksp->ks_data;
176 
177 	/*
178 	 * Assignment order should match that of the names in
179 	 * xdb_stats.
180 	 */
181 	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
182 	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
183 	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
184 	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
185 	(knp++)->value.ui64 = 0; /* oo_req */
186 
187 	return (0);
188 }
189 
190 static boolean_t
191 xdb_kstat_init(xdb_t *vdp)
192 {
193 	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
194 	char **cp = xdb_stats;
195 	kstat_named_t *knp;
196 
197 	if ((vdp->xs_kstats = kstat_create("xdb",
198 	    ddi_get_instance(vdp->xs_dip),
199 	    "req_statistics", "block", KSTAT_TYPE_NAMED,
200 	    nstat, 0)) == NULL)
201 		return (B_FALSE);
202 
203 	vdp->xs_kstats->ks_private = vdp;
204 	vdp->xs_kstats->ks_update = xdb_kstat_update;
205 
206 	knp = vdp->xs_kstats->ks_data;
207 	while (nstat > 0) {
208 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
209 		knp++;
210 		cp++;
211 		nstat--;
212 	}
213 
214 	kstat_install(vdp->xs_kstats);
215 
216 	return (B_TRUE);
217 }
218 
219 static int xdb_biodone(buf_t *);
220 
221 static buf_t *
222 xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
223 {
224 	buf_t *bp;
225 	uint8_t segs, curseg;
226 	int sectors;
227 	int i, err;
228 	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
229 	ddi_acc_handle_t acchdl;
230 
231 	acchdl = vdp->xs_ring_hdl;
232 	bp = XDB_XREQ2BP(xreq);
233 	curseg = xreq->xr_curseg;
234 	/* init a new xdb request */
235 	if (req != NULL) {
236 		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
237 		boolean_t pagemapok = B_TRUE;
238 		uint8_t op = ddi_get8(acchdl, &req->operation);
239 
240 		xreq->xr_vdp = vdp;
241 		xreq->xr_op = op;
242 		xreq->xr_id = ddi_get64(acchdl, &req->id);
243 		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
244 		if (segs == 0) {
245 			if (op != BLKIF_OP_FLUSH_DISKCACHE)
246 				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
247 				    " is seen from domain %d with zero "
248 				    "length data buffer!", vdp->xs_peer);
249 			bioinit(bp);
250 			bp->b_bcount = 0;
251 			bp->b_lblkno = 0;
252 			bp->b_un.b_addr = NULL;
253 			return (bp);
254 		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
255 			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
256 			    " is seen from domain %d with non-zero "
257 			    "length data buffer!", vdp->xs_peer);
258 		}
259 
260 		/*
261 		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
262 		 * according to the definition of blk interface by Xen
263 		 * we do sanity check here
264 		 */
265 		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
266 			segs = xreq->xr_buf_pages =
267 			    BLKIF_MAX_SEGMENTS_PER_REQUEST;
268 
269 		for (i = 0; i < segs; i++) {
270 			uint8_t fs, ls;
271 
272 			mapops[i].host_addr =
273 			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
274 			    vdp->xs_iopage_va, xreq->xr_idx, i);
275 			mapops[i].dom = vdp->xs_peer;
276 			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
277 			mapops[i].flags = GNTMAP_host_map;
278 			if (op != BLKIF_OP_READ)
279 				mapops[i].flags |= GNTMAP_readonly;
280 
281 			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
282 			ls = ddi_get8(acchdl, &req->seg[i].last_sect);
283 
284 			/*
285 			 * first_sect should be no bigger than last_sect and
286 			 * both of them should be no bigger than
287 			 * (PAGESIZE / XB_BSIZE - 1) according to definition
288 			 * of blk interface by Xen, so sanity check again
289 			 */
290 			if (fs > (PAGESIZE / XB_BSIZE - 1))
291 				fs = PAGESIZE / XB_BSIZE - 1;
292 			if (ls > (PAGESIZE / XB_BSIZE - 1))
293 				ls = PAGESIZE / XB_BSIZE - 1;
294 			if (fs > ls)
295 				fs = ls;
296 
297 			xreq->xr_segs[i].fs = fs;
298 			xreq->xr_segs[i].ls = ls;
299 		}
300 
301 		/* map in io pages */
302 		err = xen_map_gref(GNTTABOP_map_grant_ref, mapops, i, B_FALSE);
303 		if (err != 0)
304 			return (NULL);
305 		for (i = 0; i < segs; i++) {
306 			/*
307 			 * Although HYPERVISOR_grant_table_op() returned no
308 			 * error, mapping of each single page can fail. So,
309 			 * we have to do the check here and handle the error
310 			 * if needed
311 			 */
312 			if (mapops[i].status != GNTST_okay) {
313 				int j;
314 				for (j = 0; j < i; j++) {
315 #ifdef DEBUG
316 					unlogva(vdp, mapops[j].host_addr);
317 #endif
318 					xen_release_pfn(
319 					    xreq->xr_plist[j].p_pagenum);
320 				}
321 				pagemapok = B_FALSE;
322 				break;
323 			}
324 			/* record page mapping handle for unmapping later */
325 			xreq->xr_page_hdls[i] = mapops[i].handle;
326 #ifdef DEBUG
327 			logva(vdp, mapops[i].host_addr);
328 #endif
329 			/*
330 			 * Pass the MFNs down using the shadow list (xr_pplist)
331 			 *
332 			 * This is pretty ugly since we have implict knowledge
333 			 * of how the rootnex binds buffers.
334 			 * The GNTTABOP_map_grant_ref op makes us do some ugly
335 			 * stuff since we're not allowed to touch these PTEs
336 			 * from the VM.
337 			 *
338 			 * Obviously, these aren't real page_t's. The rootnex
339 			 * only needs p_pagenum.
340 			 * Also, don't use btop() here or 32 bit PAE breaks.
341 			 */
342 			xreq->xr_pplist[i] = &xreq->xr_plist[i];
343 			xreq->xr_plist[i].p_pagenum =
344 			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
345 		}
346 
347 		/*
348 		 * not all pages mapped in successfully, unmap those mapped-in
349 		 * page and return failure
350 		 */
351 		if (!pagemapok) {
352 			gnttab_unmap_grant_ref_t unmapop;
353 
354 			for (i = 0; i < segs; i++) {
355 				if (mapops[i].status != GNTST_okay)
356 					continue;
357 				unmapop.host_addr =
358 				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
359 				    vdp->xs_iopage_va, xreq->xr_idx, i);
360 				unmapop.dev_bus_addr = NULL;
361 				unmapop.handle = mapops[i].handle;
362 				(void) HYPERVISOR_grant_table_op(
363 				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
364 			}
365 
366 			return (NULL);
367 		}
368 		bioinit(bp);
369 		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
370 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
371 		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
372 		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
373 	} else {
374 		uint64_t blkst;
375 		int isread;
376 
377 		/* reuse this buf */
378 		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
379 		isread = bp->b_flags & B_READ;
380 		bioreset(bp);
381 		bp->b_lblkno = blkst;
382 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
383 		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
384 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
385 		    xreq->xr_idx));
386 	}
387 
388 	/* form a buf */
389 	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
390 	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
391 	bp->b_shadow = &xreq->xr_pplist[curseg];
392 	bp->b_iodone = xdb_biodone;
393 	sectors = 0;
394 	for (i = curseg; i < xreq->xr_buf_pages; i++) {
395 		/*
396 		 * The xreq->xr_segs[i].fs of the first seg can be non-zero
397 		 * otherwise, we'll break it into multiple bufs
398 		 */
399 		if ((i != curseg) && (xreq->xr_segs[i].fs != 0)) {
400 			break;
401 		}
402 		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
403 	}
404 	xreq->xr_curseg = i;
405 	bp->b_bcount = sectors * DEV_BSIZE;
406 	bp->b_bufsize = bp->b_bcount;
407 
408 	return (bp);
409 }
410 
411 static xdb_request_t *
412 xdb_get_req(xdb_t *vdp)
413 {
414 	xdb_request_t *req;
415 	int idx;
416 
417 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
418 	ASSERT(vdp->xs_free_req != -1);
419 	req = &vdp->xs_req[vdp->xs_free_req];
420 	vdp->xs_free_req = req->xr_next;
421 	idx = req->xr_idx;
422 	bzero(req, sizeof (xdb_request_t));
423 	req->xr_idx = idx;
424 	return (req);
425 }
426 
427 static void
428 xdb_free_req(xdb_request_t *req)
429 {
430 	xdb_t *vdp = req->xr_vdp;
431 
432 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
433 	req->xr_next = vdp->xs_free_req;
434 	vdp->xs_free_req = req->xr_idx;
435 }
436 
437 static void
438 xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
439 {
440 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
441 
442 	if (xdb_push_response(vdp, ddi_get64(acchdl, &req->id),
443 	    ddi_get8(acchdl, &req->operation), ok))
444 		xvdi_notify_oe(vdp->xs_dip);
445 }
446 
447 static void
448 xdb_init_ioreqs(xdb_t *vdp)
449 {
450 	int i;
451 
452 	ASSERT(vdp->xs_nentry);
453 
454 	if (vdp->xs_req == NULL)
455 		vdp->xs_req = kmem_alloc(vdp->xs_nentry *
456 		    sizeof (xdb_request_t), KM_SLEEP);
457 #ifdef DEBUG
458 	if (vdp->page_addrs == NULL)
459 		vdp->page_addrs = kmem_zalloc(XDB_MAX_IO_PAGES(vdp) *
460 		    sizeof (uint64_t), KM_SLEEP);
461 #endif
462 	for (i = 0; i < vdp->xs_nentry; i++) {
463 		vdp->xs_req[i].xr_idx = i;
464 		vdp->xs_req[i].xr_next = i + 1;
465 	}
466 	vdp->xs_req[vdp->xs_nentry - 1].xr_next = -1;
467 	vdp->xs_free_req = 0;
468 
469 	/* alloc va in host dom for io page mapping */
470 	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
471 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
472 	    VM_SLEEP);
473 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
474 		hat_prepare_mapping(kas.a_hat,
475 		    vdp->xs_iopage_va + i * PAGESIZE, NULL);
476 }
477 
478 static void
479 xdb_uninit_ioreqs(xdb_t *vdp)
480 {
481 	int i;
482 
483 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
484 		hat_release_mapping(kas.a_hat,
485 		    vdp->xs_iopage_va + i * PAGESIZE);
486 	vmem_xfree(heap_arena, vdp->xs_iopage_va,
487 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE);
488 	if (vdp->xs_req != NULL) {
489 		kmem_free(vdp->xs_req, vdp->xs_nentry * sizeof (xdb_request_t));
490 		vdp->xs_req = NULL;
491 	}
492 #ifdef DEBUG
493 	if (vdp->page_addrs != NULL) {
494 		kmem_free(vdp->page_addrs, XDB_MAX_IO_PAGES(vdp) *
495 		    sizeof (uint64_t));
496 		vdp->page_addrs = NULL;
497 	}
498 #endif
499 }
500 
501 static uint_t
502 xdb_intr(caddr_t arg)
503 {
504 	blkif_request_t req;
505 	blkif_request_t *reqp = &req;
506 	xdb_request_t *xreq;
507 	buf_t *bp;
508 	uint8_t op;
509 	xdb_t *vdp = (xdb_t *)arg;
510 	int ret = DDI_INTR_UNCLAIMED;
511 	dev_info_t *dip = vdp->xs_dip;
512 
513 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
514 	    "xdb@%s: I/O request received from dom %d",
515 	    ddi_get_name_addr(dip), vdp->xs_peer));
516 
517 	mutex_enter(&vdp->xs_iomutex);
518 
519 	/* shouldn't touch ring buffer if not in connected state */
520 	if (vdp->xs_if_status != XDB_CONNECTED) {
521 		mutex_exit(&vdp->xs_iomutex);
522 		return (DDI_INTR_UNCLAIMED);
523 	}
524 
525 	/*
526 	 * We'll loop till there is no more request in the ring
527 	 * We won't stuck in this loop for ever since the size of ring buffer
528 	 * is limited, and frontend will stop pushing requests into it when
529 	 * the ring buffer is full
530 	 */
531 
532 	/* req_event will be increased in xvdi_ring_get_request() */
533 	while (xdb_get_request(vdp, reqp)) {
534 		ret = DDI_INTR_CLAIMED;
535 
536 		op = ddi_get8(vdp->xs_ring_hdl, &reqp->operation);
537 		if (op == BLKIF_OP_READ			||
538 		    op == BLKIF_OP_WRITE		||
539 		    op == BLKIF_OP_WRITE_BARRIER	||
540 		    op == BLKIF_OP_FLUSH_DISKCACHE) {
541 #ifdef DEBUG
542 			xdb_dump_request_oe(reqp);
543 #endif
544 			xreq = xdb_get_req(vdp);
545 			ASSERT(xreq);
546 			switch (op) {
547 			case BLKIF_OP_READ:
548 				vdp->xs_stat_req_reads++;
549 				break;
550 			case BLKIF_OP_WRITE_BARRIER:
551 				vdp->xs_stat_req_barriers++;
552 				/* FALLTHRU */
553 			case BLKIF_OP_WRITE:
554 				vdp->xs_stat_req_writes++;
555 				break;
556 			case BLKIF_OP_FLUSH_DISKCACHE:
557 				vdp->xs_stat_req_flushes++;
558 				break;
559 			}
560 
561 			xreq->xr_curseg = 0; /* start from first segment */
562 			bp = xdb_get_buf(vdp, reqp, xreq);
563 			if (bp == NULL) {
564 				/* failed to form a buf */
565 				xdb_free_req(xreq);
566 				xdb_response(vdp, reqp, B_FALSE);
567 				continue;
568 			}
569 			bp->av_forw = NULL;
570 
571 			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
572 			    " buf %p, blkno %lld, size %lu, addr %p",
573 			    (void *)bp, (longlong_t)bp->b_blkno,
574 			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));
575 
576 			/* send bp to underlying blk driver */
577 			if (vdp->xs_f_iobuf == NULL) {
578 				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
579 			} else {
580 				vdp->xs_l_iobuf->av_forw = bp;
581 				vdp->xs_l_iobuf = bp;
582 			}
583 		} else {
584 			xdb_response(vdp, reqp, B_FALSE);
585 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
586 			    "Unsupported cmd received from dom %d",
587 			    ddi_get_name_addr(dip), vdp->xs_peer));
588 		}
589 	}
590 	/* notify our taskq to push buf to underlying blk driver */
591 	if (ret == DDI_INTR_CLAIMED)
592 		cv_broadcast(&vdp->xs_iocv);
593 
594 	mutex_exit(&vdp->xs_iomutex);
595 
596 	return (ret);
597 }
598 
599 static int
600 xdb_biodone(buf_t *bp)
601 {
602 	int i, err, bioerr;
603 	uint8_t segs;
604 	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
605 	xdb_request_t *xreq = XDB_BP2XREQ(bp);
606 	xdb_t *vdp = xreq->xr_vdp;
607 	buf_t *nbp;
608 
609 	bioerr = geterror(bp);
610 	if (bioerr)
611 		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
612 		    ddi_get_name_addr(vdp->xs_dip), bioerr));
613 
614 	/* check if we are done w/ this I/O request */
615 	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
616 		nbp = xdb_get_buf(vdp, NULL, xreq);
617 		if (nbp) {
618 			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
619 			if (err == 0) {
620 				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
621 				    "sent buf to backend ok"));
622 				return (DDI_SUCCESS);
623 			}
624 			bioerr = EIO;
625 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
626 			    "sent buf to backend dev failed, err=%d",
627 			    ddi_get_name_addr(vdp->xs_dip), err));
628 		} else {
629 			bioerr = EIO;
630 		}
631 	}
632 
633 	/* unmap io pages */
634 	segs = xreq->xr_buf_pages;
635 	/*
636 	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
637 	 * according to the definition of blk interface by Xen
638 	 */
639 	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
640 	for (i = 0; i < segs; i++) {
641 		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
642 		    vdp->xs_iopage_va, xreq->xr_idx, i);
643 #ifdef DEBUG
644 		mutex_enter(&vdp->xs_iomutex);
645 		unlogva(vdp, unmapops[i].host_addr);
646 		mutex_exit(&vdp->xs_iomutex);
647 #endif
648 		unmapops[i].dev_bus_addr = NULL;
649 		unmapops[i].handle = xreq->xr_page_hdls[i];
650 	}
651 	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
652 	    unmapops, segs);
653 	ASSERT(!err);
654 
655 	/*
656 	 * If we have reached a barrier write or a cache flush , then we must
657 	 * flush all our I/Os.
658 	 */
659 	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
660 	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
661 		/*
662 		 * XXX At this point the write did succeed, so I don't
663 		 * believe we should report an error because the flush
664 		 * failed. However, this is a debatable point, so
665 		 * maybe we need to think more carefully about this.
666 		 * For now, just cast to void.
667 		 */
668 		(void) ldi_ioctl(vdp->xs_ldi_hdl,
669 		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
670 	}
671 
672 	mutex_enter(&vdp->xs_iomutex);
673 
674 	/* send response back to frontend */
675 	if (vdp->xs_if_status == XDB_CONNECTED) {
676 		if (xdb_push_response(vdp, xreq->xr_id, xreq->xr_op, bioerr))
677 			xvdi_notify_oe(vdp->xs_dip);
678 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
679 		    "sent resp back to frontend, id=%llu",
680 		    (unsigned long long)xreq->xr_id));
681 	}
682 	/* free io resources */
683 	biofini(bp);
684 	xdb_free_req(xreq);
685 
686 	vdp->xs_ionum--;
687 	if ((vdp->xs_if_status != XDB_CONNECTED) && (vdp->xs_ionum == 0)) {
688 		/* we're closing, someone is waiting for I/O clean-up */
689 		cv_signal(&vdp->xs_ionumcv);
690 	}
691 
692 	mutex_exit(&vdp->xs_iomutex);
693 
694 	return (DDI_SUCCESS);
695 }
696 
697 static int
698 xdb_bindto_frontend(xdb_t *vdp)
699 {
700 	int err;
701 	char *oename;
702 	grant_ref_t gref;
703 	evtchn_port_t evtchn;
704 	dev_info_t *dip = vdp->xs_dip;
705 	char protocol[64] = "";
706 
707 	/*
708 	 * Gather info from frontend
709 	 */
710 	oename = xvdi_get_oename(dip);
711 	if (oename == NULL)
712 		return (DDI_FAILURE);
713 
714 	err = xenbus_gather(XBT_NULL, oename,
715 	    "ring-ref", "%lu", &gref, "event-channel", "%u", &evtchn, NULL);
716 	if (err != 0) {
717 		xvdi_fatal_error(dip, err,
718 		    "Getting ring-ref and evtchn from frontend");
719 		return (DDI_FAILURE);
720 	}
721 
722 	vdp->xs_blk_protocol = BLKIF_PROTOCOL_NATIVE;
723 	vdp->xs_nentry = BLKIF_RING_SIZE;
724 	vdp->xs_entrysize = sizeof (union blkif_sring_entry);
725 
726 	err = xenbus_gather(XBT_NULL, oename,
727 	    "protocol", "%63s", protocol, NULL);
728 	if (err)
729 		(void) strcpy(protocol, "unspecified, assuming native");
730 	else {
731 		/*
732 		 * We must check for NATIVE first, so that the fast path
733 		 * is taken for copying data from the guest to the host.
734 		 */
735 		if (strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE) != 0) {
736 			if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
737 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_32;
738 				vdp->xs_nentry = BLKIF_X86_32_RING_SIZE;
739 				vdp->xs_entrysize =
740 				    sizeof (union blkif_x86_32_sring_entry);
741 			} else if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_64) ==
742 			    0) {
743 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_64;
744 				vdp->xs_nentry = BLKIF_X86_64_RING_SIZE;
745 				vdp->xs_entrysize =
746 				    sizeof (union blkif_x86_64_sring_entry);
747 			} else {
748 				xvdi_fatal_error(dip, err, "unknown protocol");
749 				return (DDI_FAILURE);
750 			}
751 		}
752 	}
753 #ifdef DEBUG
754 	cmn_err(CE_NOTE, "!xdb@%s: blkif protocol '%s' ",
755 	    ddi_get_name_addr(dip), protocol);
756 #endif
757 
758 	/*
759 	 * map and init ring
760 	 *
761 	 * The ring parameters must match those which have been allocated
762 	 * in the front end.
763 	 */
764 	err = xvdi_map_ring(dip, vdp->xs_nentry, vdp->xs_entrysize,
765 	    gref, &vdp->xs_ring);
766 	if (err != DDI_SUCCESS)
767 		return (DDI_FAILURE);
768 	/*
769 	 * This will be removed after we use shadow I/O ring request since
770 	 * we don't need to access the ring itself directly, thus the access
771 	 * handle is not needed
772 	 */
773 	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;
774 
775 	/*
776 	 * bind event channel
777 	 */
778 	err = xvdi_bind_evtchn(dip, evtchn);
779 	if (err != DDI_SUCCESS) {
780 		xvdi_unmap_ring(vdp->xs_ring);
781 		return (DDI_FAILURE);
782 	}
783 
784 	return (DDI_SUCCESS);
785 }
786 
787 static void
788 xdb_unbindfrom_frontend(xdb_t *vdp)
789 {
790 	xvdi_free_evtchn(vdp->xs_dip);
791 	xvdi_unmap_ring(vdp->xs_ring);
792 }
793 
/* lofi control node, the prefix of lofi device nodes, and the open mode */
#define	LOFI_CTRL_NODE	"/dev/lofictl"
#define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
/* parenthesized so the expansion is safe inside any larger expression */
#define	LOFI_MODE	(FREAD | FWRITE | FEXCL)
797 
798 static int
799 xdb_setup_node(xdb_t *vdp, char *path)
800 {
801 	dev_info_t *dip;
802 	char *xsnode, *node;
803 	ldi_handle_t ldi_hdl;
804 	struct lofi_ioctl *li;
805 	int minor;
806 	int err;
807 	unsigned int len;
808 
809 	dip = vdp->xs_dip;
810 	xsnode = xvdi_get_xsname(dip);
811 	if (xsnode == NULL)
812 		return (DDI_FAILURE);
813 
814 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
815 	    (void **)&node, &len);
816 	if (err == ENOENT)
817 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
818 		    &len);
819 	if (err != 0) {
820 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
821 		return (DDI_FAILURE);
822 	}
823 
824 	if (!XDB_IS_LOFI(vdp)) {
825 		(void) strlcpy(path, node, MAXPATHLEN);
826 		kmem_free(node, len);
827 		return (DDI_SUCCESS);
828 	}
829 
830 	do {
831 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
832 		    &ldi_hdl, vdp->xs_ldi_li);
833 	} while (err == EBUSY);
834 	if (err != 0) {
835 		kmem_free(node, len);
836 		return (DDI_FAILURE);
837 	}
838 
839 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
840 	(void) strlcpy(li->li_filename, node, MAXPATHLEN);
841 	kmem_free(node, len);
842 	if (ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
843 	    LOFI_MODE | FKIOCTL, kcred, &minor) != 0) {
844 		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
845 		    ddi_get_name_addr(dip), li->li_filename);
846 		(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
847 		kmem_free(li, sizeof (*li));
848 		return (DDI_FAILURE);
849 	}
850 	/*
851 	 * return '/devices/...' instead of '/dev/lofi/...' since the
852 	 * former is available immediately after calling ldi_ioctl
853 	 */
854 	(void) snprintf(path, MAXPATHLEN, LOFI_DEV_NODE "%d", minor);
855 	(void) xenbus_printf(XBT_NULL, xsnode, "node", "%s", path);
856 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
857 	kmem_free(li, sizeof (*li));
858 	return (DDI_SUCCESS);
859 }
860 
861 static void
862 xdb_teardown_node(xdb_t *vdp)
863 {
864 	dev_info_t *dip;
865 	char *xsnode, *node;
866 	ldi_handle_t ldi_hdl;
867 	struct lofi_ioctl *li;
868 	int err;
869 	unsigned int len;
870 
871 	if (!XDB_IS_LOFI(vdp))
872 		return;
873 
874 	dip = vdp->xs_dip;
875 	xsnode = xvdi_get_xsname(dip);
876 	if (xsnode == NULL)
877 		return;
878 
879 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
880 	    (void **)&node, &len);
881 	if (err == ENOENT)
882 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
883 		    &len);
884 	if (err != 0) {
885 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
886 		return;
887 	}
888 
889 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
890 	(void) strlcpy(li->li_filename, node, MAXPATHLEN);
891 	kmem_free(node, len);
892 
893 	do {
894 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
895 		    &ldi_hdl, vdp->xs_ldi_li);
896 	} while (err == EBUSY);
897 
898 	if (err != 0) {
899 		kmem_free(li, sizeof (*li));
900 		return;
901 	}
902 
903 	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
904 	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
905 		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
906 		    ddi_get_name_addr(dip), li->li_filename);
907 	}
908 
909 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
910 	kmem_free(li, sizeof (*li));
911 }
912 
913 static int
914 xdb_open_device(xdb_t *vdp)
915 {
916 	uint64_t devsize;
917 	dev_info_t *dip;
918 	char *xsnode;
919 	char *nodepath;
920 	char *mode = NULL;
921 	char *type = NULL;
922 	int err;
923 
924 	dip = vdp->xs_dip;
925 	xsnode = xvdi_get_xsname(dip);
926 	if (xsnode == NULL)
927 		return (DDI_FAILURE);
928 
929 	err = xenbus_gather(XBT_NULL, xsnode,
930 	    "mode", NULL, &mode, "type", NULL, &type, NULL);
931 	if (err != 0) {
932 		if (mode)
933 			kmem_free(mode, strlen(mode) + 1);
934 		if (type)
935 			kmem_free(type, strlen(type) + 1);
936 		xvdi_fatal_error(dip, err,
937 		    "Getting mode and type from backend device");
938 		return (DDI_FAILURE);
939 	}
940 	if (strcmp(type, "file") == 0) {
941 		vdp->xs_type |= XDB_DEV_LOFI;
942 	}
943 	kmem_free(type, strlen(type) + 1);
944 	if ((strcmp(mode, "r") == NULL) || (strcmp(mode, "ro") == NULL)) {
945 		vdp->xs_type |= XDB_DEV_RO;
946 	}
947 	kmem_free(mode, strlen(mode) + 1);
948 
949 	/*
950 	 * try to open backend device
951 	 */
952 	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
953 		return (DDI_FAILURE);
954 
955 	nodepath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
956 	err = xdb_setup_node(vdp, nodepath);
957 	if (err != DDI_SUCCESS) {
958 		xvdi_fatal_error(dip, err,
959 		    "Getting device path of backend device");
960 		ldi_ident_release(vdp->xs_ldi_li);
961 		kmem_free(nodepath, MAXPATHLEN);
962 		return (DDI_FAILURE);
963 	}
964 
965 	if (*nodepath == '\0') {
966 		/* Allow a CD-ROM device with an empty backend. */
967 		vdp->xs_sectors = 0;
968 		kmem_free(nodepath, MAXPATHLEN);
969 		return (DDI_SUCCESS);
970 	}
971 
972 	if (ldi_open_by_name(nodepath,
973 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
974 	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
975 		xdb_teardown_node(vdp);
976 		ldi_ident_release(vdp->xs_ldi_li);
977 		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
978 		    ddi_get_name_addr(dip), nodepath);
979 		kmem_free(nodepath, MAXPATHLEN);
980 		return (DDI_FAILURE);
981 	}
982 
983 	/* check if it's a CD/DVD disc */
984 	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
985 	    "inquiry-device-type", DTYPE_DIRECT) == DTYPE_RODIRECT)
986 		vdp->xs_type |= XDB_DEV_CD;
987 	/* check if it's a removable disk */
988 	if (ldi_prop_exists(vdp->xs_ldi_hdl,
989 	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
990 	    "removable-media"))
991 		vdp->xs_type |= XDB_DEV_RMB;
992 
993 	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
994 		(void) ldi_close(vdp->xs_ldi_hdl,
995 		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
996 		xdb_teardown_node(vdp);
997 		ldi_ident_release(vdp->xs_ldi_li);
998 		kmem_free(nodepath, MAXPATHLEN);
999 		return (DDI_FAILURE);
1000 	}
1001 	vdp->xs_sectors = devsize / XB_BSIZE;
1002 
1003 	kmem_free(nodepath, MAXPATHLEN);
1004 	return (DDI_SUCCESS);
1005 }
1006 
1007 static void
1008 xdb_close_device(xdb_t *vdp)
1009 {
1010 	(void) ldi_close(vdp->xs_ldi_hdl,
1011 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
1012 	xdb_teardown_node(vdp);
1013 	ldi_ident_release(vdp->xs_ldi_li);
1014 	vdp->xs_ldi_li = NULL;
1015 	vdp->xs_ldi_hdl = NULL;
1016 }
1017 
1018 /*
1019  * Kick-off connect process
1020  * If xs_fe_status == XDB_FE_READY and xs_dev_status == XDB_DEV_READY
1021  * the xs_if_status will be changed to XDB_CONNECTED on success,
1022  * otherwise, xs_if_status will not be changed
1023  */
1024 static int
1025 xdb_start_connect(xdb_t *vdp)
1026 {
1027 	uint32_t dinfo;
1028 	xenbus_transaction_t xbt;
1029 	int err, svdst;
1030 	char *xsnode;
1031 	dev_info_t *dip = vdp->xs_dip;
1032 	char *barrier;
1033 	uint_t len;
1034 
1035 	/*
1036 	 * Start connect to frontend only when backend device are ready
1037 	 * and frontend has moved to XenbusStateInitialised, which means
1038 	 * ready to connect
1039 	 */
1040 	ASSERT((vdp->xs_fe_status == XDB_FE_READY) &&
1041 	    (vdp->xs_dev_status == XDB_DEV_READY));
1042 
1043 	if (((xsnode = xvdi_get_xsname(dip)) == NULL)		 ||
1044 	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1) ||
1045 	    (xdb_open_device(vdp) != DDI_SUCCESS))
1046 		return (DDI_FAILURE);
1047 
1048 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);
1049 
1050 	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS)
1051 		goto errout1;
1052 
1053 	/* init i/o requests */
1054 	xdb_init_ioreqs(vdp);
1055 
1056 	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
1057 	    != DDI_SUCCESS)
1058 		goto errout2;
1059 
1060 	/*
1061 	 * we can recieve intr any time from now on
1062 	 * mark that we're ready to take intr
1063 	 */
1064 	mutex_enter(&vdp->xs_iomutex);
1065 	/*
1066 	 * save it in case we need to restore when we
1067 	 * fail to write xenstore later
1068 	 */
1069 	svdst = vdp->xs_if_status;
1070 	vdp->xs_if_status = XDB_CONNECTED;
1071 	mutex_exit(&vdp->xs_iomutex);
1072 
1073 	/* write into xenstore the info needed by frontend */
1074 trans_retry:
1075 	if (xenbus_transaction_start(&xbt)) {
1076 		xvdi_fatal_error(dip, EIO, "transaction start");
1077 		goto errout3;
1078 	}
1079 
1080 	/*
1081 	 * If feature-barrier isn't present in xenstore, add it.
1082 	 */
1083 	if (xenbus_read(xbt, xsnode, "feature-barrier",
1084 	    (void **)&barrier, &len) != 0) {
1085 		if ((err = xenbus_printf(xbt, xsnode, "feature-barrier",
1086 		    "%d", 1)) != 0) {
1087 			cmn_err(CE_WARN, "xdb@%s: failed to write "
1088 			    "'feature-barrier'", ddi_get_name_addr(dip));
1089 			xvdi_fatal_error(dip, err, "writing 'feature-barrier'");
1090 			goto abort_trans;
1091 		}
1092 	} else
1093 		kmem_free(barrier, len);
1094 
1095 	dinfo = 0;
1096 	if (XDB_IS_RO(vdp))
1097 		dinfo |= VDISK_READONLY;
1098 	if (XDB_IS_CD(vdp))
1099 		dinfo |= VDISK_CDROM;
1100 	if (XDB_IS_RMB(vdp))
1101 		dinfo |= VDISK_REMOVABLE;
1102 	if (err = xenbus_printf(xbt, xsnode, "info", "%u", dinfo)) {
1103 		xvdi_fatal_error(dip, err, "writing 'info'");
1104 		goto abort_trans;
1105 	}
1106 
1107 	/* hard-coded 512-byte sector size */
1108 	if (err = xenbus_printf(xbt, xsnode, "sector-size", "%u", DEV_BSIZE)) {
1109 		xvdi_fatal_error(dip, err, "writing 'sector-size'");
1110 		goto abort_trans;
1111 	}
1112 
1113 	if (err = xenbus_printf(xbt, xsnode, "sectors", "%"PRIu64,
1114 	    vdp->xs_sectors)) {
1115 		xvdi_fatal_error(dip, err, "writing 'sectors'");
1116 		goto abort_trans;
1117 	}
1118 
1119 	if (err = xenbus_printf(xbt, xsnode, "instance", "%d",
1120 	    ddi_get_instance(dip))) {
1121 		xvdi_fatal_error(dip, err, "writing 'instance'");
1122 		goto abort_trans;
1123 	}
1124 
1125 	if ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0) {
1126 		xvdi_fatal_error(dip, err, "writing 'state'");
1127 		goto abort_trans;
1128 	}
1129 
1130 	if (err = xenbus_transaction_end(xbt, 0)) {
1131 		if (err == EAGAIN)
1132 			/* transaction is ended, don't need to abort it */
1133 			goto trans_retry;
1134 		xvdi_fatal_error(dip, err, "completing transaction");
1135 		goto errout3;
1136 	}
1137 
1138 	return (DDI_SUCCESS);
1139 
1140 abort_trans:
1141 	(void) xenbus_transaction_end(xbt, 1);
1142 errout3:
1143 	mutex_enter(&vdp->xs_iomutex);
1144 	vdp->xs_if_status = svdst;
1145 	mutex_exit(&vdp->xs_iomutex);
1146 	ddi_remove_intr(dip, 0, NULL);
1147 errout2:
1148 	xdb_uninit_ioreqs(vdp);
1149 	xdb_unbindfrom_frontend(vdp);
1150 errout1:
1151 	xdb_close_device(vdp);
1152 	return (DDI_FAILURE);
1153 }
1154 
1155 /*
1156  * Kick-off disconnect process
1157  * xs_if_status will not be changed
1158  */
1159 static int
1160 xdb_start_disconnect(xdb_t *vdp)
1161 {
1162 	/*
1163 	 * Kick-off disconnect process
1164 	 */
1165 	if (xvdi_switch_state(vdp->xs_dip, XBT_NULL, XenbusStateClosing) > 0)
1166 		return (DDI_FAILURE);
1167 
1168 	return (DDI_SUCCESS);
1169 }
1170 
1171 /*
1172  * Disconnect from frontend and close backend device
1173  * ifstatus will be changed to XDB_DISCONNECTED
1174  * Xenbus state will be changed to XenbusStateClosed
1175  */
1176 static void
1177 xdb_close(dev_info_t *dip)
1178 {
1179 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1180 
1181 	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
1182 
1183 	mutex_enter(&vdp->xs_iomutex);
1184 
1185 	if (vdp->xs_if_status != XDB_CONNECTED) {
1186 		vdp->xs_if_status = XDB_DISCONNECTED;
1187 		cv_broadcast(&vdp->xs_iocv);
1188 		mutex_exit(&vdp->xs_iomutex);
1189 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1190 		return;
1191 	}
1192 	vdp->xs_if_status = XDB_DISCONNECTED;
1193 	cv_broadcast(&vdp->xs_iocv);
1194 
1195 	mutex_exit(&vdp->xs_iomutex);
1196 
1197 	/* stop accepting I/O request from frontend */
1198 	ddi_remove_intr(dip, 0, NULL);
1199 	/* clear all on-going I/Os, if any */
1200 	mutex_enter(&vdp->xs_iomutex);
1201 	while (vdp->xs_ionum > 0)
1202 		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
1203 	mutex_exit(&vdp->xs_iomutex);
1204 
1205 	/* clean up resources and close this interface */
1206 	xdb_uninit_ioreqs(vdp);
1207 	xdb_unbindfrom_frontend(vdp);
1208 	xdb_close_device(vdp);
1209 	vdp->xs_peer = (domid_t)-1;
1210 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1211 }
1212 
1213 /*
1214  * Xdb_check_state_transition will check the XenbusState change to see
1215  * if the change is a valid transition or not.
1216  * The new state is written by frontend domain, or by running xenstore-write
1217  * to change it manually in dom0
1218  */
1219 static int
1220 xdb_check_state_transition(xdb_t *vdp, XenbusState oestate)
1221 {
1222 	enum xdb_state status;
1223 	int stcheck;
1224 #define	STOK	0 /* need further process */
1225 #define	STNOP	1 /* no action need taking */
1226 #define	STBUG	2 /* unexpected state change, could be a bug */
1227 
1228 	status = vdp->xs_if_status;
1229 	stcheck = STOK;
1230 
1231 	switch (status) {
1232 	case XDB_UNKNOWN:
1233 		if (vdp->xs_fe_status == XDB_FE_UNKNOWN) {
1234 			if ((oestate == XenbusStateUnknown)		||
1235 			    (oestate == XenbusStateConnected))
1236 				stcheck = STBUG;
1237 			else if ((oestate == XenbusStateInitialising)	||
1238 			    (oestate == XenbusStateInitWait))
1239 				stcheck = STNOP;
1240 		} else {
1241 			if ((oestate == XenbusStateUnknown)		||
1242 			    (oestate == XenbusStateInitialising)	||
1243 			    (oestate == XenbusStateInitWait)		||
1244 			    (oestate == XenbusStateConnected))
1245 				stcheck = STBUG;
1246 			else if (oestate == XenbusStateInitialised)
1247 				stcheck = STNOP;
1248 		}
1249 		break;
1250 	case XDB_CONNECTED:
1251 		if ((oestate == XenbusStateUnknown)		||
1252 		    (oestate == XenbusStateInitialising)	||
1253 		    (oestate == XenbusStateInitWait)		||
1254 		    (oestate == XenbusStateInitialised))
1255 			stcheck = STBUG;
1256 		else if (oestate == XenbusStateConnected)
1257 			stcheck = STNOP;
1258 		break;
1259 	case XDB_DISCONNECTED:
1260 	default:
1261 			stcheck = STBUG;
1262 	}
1263 
1264 	if (stcheck == STOK)
1265 		return (DDI_SUCCESS);
1266 
1267 	if (stcheck == STBUG)
1268 		cmn_err(CE_NOTE, "xdb@%s: unexpected otherend "
1269 		    "state change to %d!, when status is %d",
1270 		    ddi_get_name_addr(vdp->xs_dip), oestate, status);
1271 
1272 	return (DDI_FAILURE);
1273 }
1274 
1275 static void
1276 xdb_send_buf(void *arg)
1277 {
1278 	buf_t *bp;
1279 	xdb_t *vdp = (xdb_t *)arg;
1280 
1281 	mutex_enter(&vdp->xs_iomutex);
1282 
1283 	while (vdp->xs_if_status != XDB_DISCONNECTED) {
1284 		while ((bp = vdp->xs_f_iobuf) != NULL) {
1285 			vdp->xs_f_iobuf = bp->av_forw;
1286 			bp->av_forw = NULL;
1287 			vdp->xs_ionum++;
1288 			mutex_exit(&vdp->xs_iomutex);
1289 			if (bp->b_bcount != 0) {
1290 				int err = ldi_strategy(vdp->xs_ldi_hdl, bp);
1291 				if (err != 0) {
1292 					bp->b_flags |= B_ERROR;
1293 					(void) xdb_biodone(bp);
1294 					XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
1295 					    "xdb@%s: sent buf to backend dev"
1296 					    "failed, err=%d",
1297 					    ddi_get_name_addr(vdp->xs_dip),
1298 					    err));
1299 				} else {
1300 					XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
1301 					    "sent buf to backend ok"));
1302 				}
1303 			} else /* no I/O need to be done */
1304 				(void) xdb_biodone(bp);
1305 
1306 			mutex_enter(&vdp->xs_iomutex);
1307 		}
1308 
1309 		if (vdp->xs_if_status != XDB_DISCONNECTED)
1310 			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
1311 	}
1312 
1313 	mutex_exit(&vdp->xs_iomutex);
1314 }
1315 
1316 /*ARGSUSED*/
1317 static void
1318 xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1319     void *impl_data)
1320 {
1321 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1322 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1323 
1324 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1325 	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));
1326 
1327 	mutex_enter(&vdp->xs_cbmutex);
1328 	if (state == Connected) {
1329 		/* Hotplug script has completed successfully */
1330 		if (vdp->xs_dev_status == XDB_DEV_UNKNOWN) {
1331 			vdp->xs_dev_status = XDB_DEV_READY;
1332 			if (vdp->xs_fe_status == XDB_FE_READY)
1333 				/* try to connect to frontend */
1334 				if (xdb_start_connect(vdp) != DDI_SUCCESS)
1335 					(void) xdb_start_disconnect(vdp);
1336 		}
1337 	}
1338 	mutex_exit(&vdp->xs_cbmutex);
1339 }
1340 
1341 /*ARGSUSED*/
1342 static void
1343 xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1344     void *impl_data)
1345 {
1346 	XenbusState new_state = *(XenbusState *)impl_data;
1347 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1348 
1349 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1350 	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));
1351 
1352 	mutex_enter(&vdp->xs_cbmutex);
1353 
1354 	if (xdb_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1355 		mutex_exit(&vdp->xs_cbmutex);
1356 		return;
1357 	}
1358 
1359 	switch (new_state) {
1360 	case XenbusStateInitialised:
1361 		ASSERT(vdp->xs_if_status == XDB_UNKNOWN);
1362 
1363 		/* frontend is ready for connecting */
1364 		vdp->xs_fe_status = XDB_FE_READY;
1365 
1366 		if (vdp->xs_dev_status == XDB_DEV_READY)
1367 			if (xdb_start_connect(vdp) != DDI_SUCCESS)
1368 				(void) xdb_start_disconnect(vdp);
1369 		break;
1370 	case XenbusStateClosing:
1371 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1372 		break;
1373 	case XenbusStateClosed:
1374 		/* clean up */
1375 		xdb_close(dip);
1376 
1377 	}
1378 
1379 	mutex_exit(&vdp->xs_cbmutex);
1380 }
1381 
1382 static int
1383 xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1384 {
1385 	xdb_t *vdp;
1386 	ddi_iblock_cookie_t ibc;
1387 	int instance;
1388 
1389 	switch (cmd) {
1390 	case DDI_RESUME:
1391 		return (DDI_FAILURE);
1392 	case DDI_ATTACH:
1393 		break;
1394 	default:
1395 		return (DDI_FAILURE);
1396 	}
1397 
1398 	/* DDI_ATTACH */
1399 	instance = ddi_get_instance(dip);
1400 	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
1401 		return (DDI_FAILURE);
1402 
1403 	vdp = ddi_get_soft_state(xdb_statep, instance);
1404 	vdp->xs_dip = dip;
1405 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
1406 		goto errout1;
1407 
1408 	if (!xdb_kstat_init(vdp))
1409 		goto errout1;
1410 
1411 	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
1412 	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
1413 	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
1414 	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);
1415 
1416 	ddi_set_driver_private(dip, vdp);
1417 
1418 	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
1419 	    TASKQ_DEFAULTPRI, 0);
1420 	if (vdp->xs_iotaskq == NULL)
1421 		goto errout2;
1422 	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
1423 	    DDI_SLEEP);
1424 
1425 	/* Watch frontend and hotplug state change */
1426 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change,
1427 	    NULL) != DDI_SUCCESS)
1428 		goto errout3;
1429 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change,
1430 	    NULL) != DDI_SUCCESS) {
1431 		goto errout4;
1432 	}
1433 
1434 	/*
1435 	 * Kick-off hotplug script
1436 	 */
1437 	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
1438 		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
1439 		    ddi_get_name_addr(dip));
1440 		goto errout4;
1441 	}
1442 
1443 	/*
1444 	 * start waiting for hotplug event and otherend state event
1445 	 * mainly for debugging, frontend will not take any op seeing this
1446 	 */
1447 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
1448 
1449 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
1450 	    ddi_get_name_addr(dip)));
1451 	return (DDI_SUCCESS);
1452 
1453 errout4:
1454 	xvdi_remove_event_handler(dip, NULL);
1455 errout3:
1456 	mutex_enter(&vdp->xs_cbmutex);
1457 	mutex_enter(&vdp->xs_iomutex);
1458 	vdp->xs_if_status = XDB_DISCONNECTED;
1459 	cv_broadcast(&vdp->xs_iocv);
1460 	mutex_exit(&vdp->xs_iomutex);
1461 	mutex_exit(&vdp->xs_cbmutex);
1462 	ddi_taskq_destroy(vdp->xs_iotaskq);
1463 errout2:
1464 	ddi_set_driver_private(dip, NULL);
1465 	cv_destroy(&vdp->xs_iocv);
1466 	cv_destroy(&vdp->xs_ionumcv);
1467 	mutex_destroy(&vdp->xs_cbmutex);
1468 	mutex_destroy(&vdp->xs_iomutex);
1469 	kstat_delete(vdp->xs_kstats);
1470 errout1:
1471 	ddi_soft_state_free(xdb_statep, instance);
1472 	return (DDI_FAILURE);
1473 }
1474 
1475 /*ARGSUSED*/
1476 static int
1477 xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1478 {
1479 	int instance = ddi_get_instance(dip);
1480 	xdb_t *vdp = XDB_INST2SOFTS(instance);
1481 
1482 	switch (cmd) {
1483 	case DDI_SUSPEND:
1484 		return (DDI_FAILURE);
1485 	case DDI_DETACH:
1486 		break;
1487 	default:
1488 		return (DDI_FAILURE);
1489 	}
1490 
1491 	/* DDI_DETACH handling */
1492 
1493 	/* shouldn't detach, if still used by frontend */
1494 	mutex_enter(&vdp->xs_iomutex);
1495 	if (vdp->xs_if_status != XDB_DISCONNECTED) {
1496 		mutex_exit(&vdp->xs_iomutex);
1497 		return (DDI_FAILURE);
1498 	}
1499 	mutex_exit(&vdp->xs_iomutex);
1500 
1501 	xvdi_remove_event_handler(dip, NULL);
1502 	/* can do nothing about it, if it fails */
1503 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1504 
1505 	ddi_taskq_destroy(vdp->xs_iotaskq);
1506 	cv_destroy(&vdp->xs_iocv);
1507 	cv_destroy(&vdp->xs_ionumcv);
1508 	mutex_destroy(&vdp->xs_cbmutex);
1509 	mutex_destroy(&vdp->xs_iomutex);
1510 	kstat_delete(vdp->xs_kstats);
1511 	ddi_set_driver_private(dip, NULL);
1512 	ddi_soft_state_free(xdb_statep, instance);
1513 
1514 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
1515 	    ddi_get_name_addr(dip)));
1516 	return (DDI_SUCCESS);
1517 }
1518 
/*
 * Device operations vector. Only attach and detach are implemented by
 * this driver; no cb_ops or bus_ops are supplied.
 */
static struct dev_ops xdb_dev_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_getinfo_1to1, /* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	xdb_attach,	/* devo_attach */
	xdb_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops: none provided */
	NULL,		/* devo_bus_ops: none provided */
	NULL,		/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
1533 
/*
 * Module linkage information for the kernel: describes this module as a
 * device driver and points at its dev_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,			/* Type of module: a driver */
	"vbd backend driver",	/* Name of the module */
	&xdb_dev_ops			/* driver ops */
};
1542 
/*
 * Linkage structure handed to mod_install(), mod_remove() and mod_info();
 * it lists modldrv as its single, NULL-terminated entry.
 */
static struct modlinkage xdb_modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
1548 
1549 int
1550 _init(void)
1551 {
1552 	int rv;
1553 
1554 	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
1555 	    sizeof (xdb_t), 0)) == 0)
1556 		if ((rv = mod_install(&xdb_modlinkage)) != 0)
1557 			ddi_soft_state_fini((void **)&xdb_statep);
1558 	return (rv);
1559 }
1560 
1561 int
1562 _fini(void)
1563 {
1564 	int rv;
1565 
1566 	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
1567 		return (rv);
1568 	ddi_soft_state_fini((void **)&xdb_statep);
1569 	return (rv);
1570 }
1571 
1572 int
1573 _info(struct modinfo *modinfop)
1574 {
1575 	return (mod_info(&xdb_modlinkage, modinfop));
1576 }
1577 
1578 static int
1579 xdb_get_request(xdb_t *vdp, blkif_request_t *req)
1580 {
1581 	void *src = xvdi_ring_get_request(vdp->xs_ring);
1582 
1583 	if (src == NULL)
1584 		return (0);
1585 
1586 	switch (vdp->xs_blk_protocol) {
1587 	case BLKIF_PROTOCOL_NATIVE:
1588 		(void) memcpy(req, src, sizeof (*req));
1589 		break;
1590 	case BLKIF_PROTOCOL_X86_32:
1591 		blkif_get_x86_32_req(req, src);
1592 		break;
1593 	case BLKIF_PROTOCOL_X86_64:
1594 		blkif_get_x86_64_req(req, src);
1595 		break;
1596 	default:
1597 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1598 		    ddi_get_name_addr(vdp->xs_dip),
1599 		    vdp->xs_blk_protocol);
1600 	}
1601 	return (1);
1602 }
1603 
1604 static int
1605 xdb_push_response(xdb_t *vdp, uint64_t id, uint8_t op, uint16_t status)
1606 {
1607 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
1608 	blkif_response_t *rsp = xvdi_ring_get_response(vdp->xs_ring);
1609 	blkif_x86_32_response_t *rsp_32 = (blkif_x86_32_response_t *)rsp;
1610 	blkif_x86_64_response_t *rsp_64 = (blkif_x86_64_response_t *)rsp;
1611 
1612 	ASSERT(rsp);
1613 
1614 	switch (vdp->xs_blk_protocol) {
1615 	case BLKIF_PROTOCOL_NATIVE:
1616 		ddi_put64(acchdl, &rsp->id, id);
1617 		ddi_put8(acchdl, &rsp->operation, op);
1618 		ddi_put16(acchdl, (uint16_t *)&rsp->status,
1619 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1620 		break;
1621 	case BLKIF_PROTOCOL_X86_32:
1622 		ddi_put64(acchdl, &rsp_32->id, id);
1623 		ddi_put8(acchdl, &rsp_32->operation, op);
1624 		ddi_put16(acchdl, (uint16_t *)&rsp_32->status,
1625 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1626 		break;
1627 	case BLKIF_PROTOCOL_X86_64:
1628 		ddi_put64(acchdl, &rsp_64->id, id);
1629 		ddi_put8(acchdl, &rsp_64->operation, op);
1630 		ddi_put16(acchdl, (uint16_t *)&rsp_64->status,
1631 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1632 		break;
1633 	default:
1634 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1635 		    ddi_get_name_addr(vdp->xs_dip),
1636 		    vdp->xs_blk_protocol);
1637 	}
1638 
1639 	return (xvdi_ring_push_response(vdp->xs_ring));
1640 }
1641 
1642 static void
1643 blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
1644 {
1645 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1646 	dst->operation = src->operation;
1647 	dst->nr_segments = src->nr_segments;
1648 	dst->handle = src->handle;
1649 	dst->id = src->id;
1650 	dst->sector_number = src->sector_number;
1651 	if (n > src->nr_segments)
1652 		n = src->nr_segments;
1653 	for (i = 0; i < n; i++)
1654 		dst->seg[i] = src->seg[i];
1655 }
1656 
1657 static void
1658 blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
1659 {
1660 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1661 	dst->operation = src->operation;
1662 	dst->nr_segments = src->nr_segments;
1663 	dst->handle = src->handle;
1664 	dst->id = src->id;
1665 	dst->sector_number = src->sector_number;
1666 	if (n > src->nr_segments)
1667 		n = src->nr_segments;
1668 	for (i = 0; i < n; i++)
1669 		dst->seg[i] = src->seg[i];
1670 }
1671