xref: /titanic_50/usr/src/uts/common/xen/io/xdb.c (revision 6392794b28bef963aa5ad05c3da79435fd0a5a0b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Note: This is the backend part of the split PV disk driver. This driver
29  * is not a nexus driver, nor is it a leaf driver(block/char/stream driver).
30  * Currently, it does not create any minor node. So, although, it runs in
31  * backend domain, it will not be used directly from within dom0.
32  * It simply gets block I/O requests issued by frontend from a shared page
33  * (blkif ring buffer - defined by Xen) between backend and frontend domain,
34  * generates a buf, and push it down to underlying disk target driver via
35  * ldi interface. When buf is done, this driver will generate a response
36  * and put it into ring buffer to inform frontend of the status of the I/O
 * request issued by it. When a new virtual device entry is added in xenstore,
 * a watch event is sent from Xen to the xvdi framework, which will,
 * in turn, create the devinfo node and try to attach this driver
 * (see xvdi_create_dev). When the frontend peer changes its state to
 * XenbusStateClose, an event is also sent from Xen to the xvdi framework,
 * which will detach and remove this devinfo node (see i_xvdi_oestate_handler).
 * I/O requests read from the ring buffer and events coming from xenstore
 * cannot be trusted. We verify them in xdb_get_buf() and
 * xdb_check_state_transition().
 *
 * Virtual device configuration is read/written from/to the database via
 * xenbus_* interfaces. The driver also uses xvdi_* to interact with the
 * hypervisor. There is an on-going effort to make xvdi_* cover all xenbus_*.
49  */
50 
51 #include <sys/types.h>
52 #include <sys/conf.h>
53 #include <sys/ddi.h>
54 #include <sys/dditypes.h>
55 #include <sys/sunddi.h>
56 #include <sys/list.h>
57 #include <sys/dkio.h>
58 #include <sys/cmlb.h>
59 #include <sys/vtoc.h>
60 #include <sys/modctl.h>
61 #include <sys/bootconf.h>
62 #include <sys/promif.h>
63 #include <sys/sysmacros.h>
64 #include <public/io/xenbus.h>
65 #include <xen/sys/xenbus_impl.h>
66 #include <xen/sys/xendev.h>
67 #include <sys/gnttab.h>
68 #include <sys/scsi/generic/inquiry.h>
69 #include <vm/seg_kmem.h>
70 #include <vm/hat_i86.h>
71 #include <sys/gnttab.h>
72 #include <sys/lofi.h>
73 #include <io/xdf.h>
74 #include <xen/io/blkif_impl.h>
75 #include <io/xdb.h>
76 
77 static xdb_t *xdb_statep;
78 static int xdb_debug = 0;
79 
80 static int xdb_push_response(xdb_t *, uint64_t, uint8_t, uint16_t);
81 static int xdb_get_request(xdb_t *, blkif_request_t *);
82 static void blkif_get_x86_32_req(blkif_request_t *, blkif_x86_32_request_t *);
83 static void blkif_get_x86_64_req(blkif_request_t *, blkif_x86_64_request_t *);
84 
85 #ifdef DEBUG
86 /*
87  * debug aid functions
88  */
89 
90 static void
91 logva(xdb_t *vdp, uint64_t va)
92 {
93 	uint64_t *page_addrs;
94 	int i;
95 
96 	page_addrs = vdp->page_addrs;
97 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
98 		if (page_addrs[i] == va)
99 			debug_enter("VA remapping found!");
100 	}
101 
102 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
103 		if (page_addrs[i] == 0) {
104 			page_addrs[i] = va;
105 			break;
106 		}
107 	}
108 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
109 }
110 
111 static void
112 unlogva(xdb_t *vdp, uint64_t va)
113 {
114 	uint64_t *page_addrs;
115 	int i;
116 
117 	page_addrs = vdp->page_addrs;
118 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
119 		if (page_addrs[i] == va) {
120 			page_addrs[i] = 0;
121 			break;
122 		}
123 	}
124 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
125 }
126 
127 static void
128 xdb_dump_request_oe(blkif_request_t *req)
129 {
130 	int i;
131 
132 	/*
133 	 * Exploit the public interface definitions for BLKIF_OP_READ
134 	 * etc..
135 	 */
136 	char *op_name[] = { "read", "write", "barrier", "flush" };
137 
138 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
139 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
140 	    req->nr_segments));
141 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
142 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
143 	    (unsigned long long)req->id));
144 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
145 	    (unsigned long long)req->sector_number));
146 	for (i = 0; i < req->nr_segments; i++) {
147 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
148 		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
149 		    req->seg[i].last_sect));
150 	}
151 }
152 #endif /* DEBUG */
153 
/*
 * Statistics.
 *
 * Names of the named kstats created by xdb_kstat_init().  The values are
 * filled in positionally by xdb_kstat_update(), so the order here must
 * stay in step with the assignments there.
 */
static char *xdb_stats[] = {
	"rd_reqs", "wr_reqs", "br_reqs", "fl_reqs", "oo_reqs"
};
164 
165 static int
166 xdb_kstat_update(kstat_t *ksp, int flag)
167 {
168 	xdb_t *vdp;
169 	kstat_named_t *knp;
170 
171 	if (flag != KSTAT_READ)
172 		return (EACCES);
173 
174 	vdp = ksp->ks_private;
175 	knp = ksp->ks_data;
176 
177 	/*
178 	 * Assignment order should match that of the names in
179 	 * xdb_stats.
180 	 */
181 	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
182 	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
183 	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
184 	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
185 	(knp++)->value.ui64 = 0; /* oo_req */
186 
187 	return (0);
188 }
189 
190 static boolean_t
191 xdb_kstat_init(xdb_t *vdp)
192 {
193 	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
194 	char **cp = xdb_stats;
195 	kstat_named_t *knp;
196 
197 	if ((vdp->xs_kstats = kstat_create("xdb",
198 	    ddi_get_instance(vdp->xs_dip),
199 	    "req_statistics", "block", KSTAT_TYPE_NAMED,
200 	    nstat, 0)) == NULL)
201 		return (B_FALSE);
202 
203 	vdp->xs_kstats->ks_private = vdp;
204 	vdp->xs_kstats->ks_update = xdb_kstat_update;
205 
206 	knp = vdp->xs_kstats->ks_data;
207 	while (nstat > 0) {
208 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
209 		knp++;
210 		cp++;
211 		nstat--;
212 	}
213 
214 	kstat_install(vdp->xs_kstats);
215 
216 	return (B_TRUE);
217 }
218 
219 static int xdb_biodone(buf_t *);
220 
221 static buf_t *
222 xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
223 {
224 	buf_t *bp;
225 	uint8_t segs, curseg;
226 	int sectors;
227 	int i, err;
228 	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
229 	ddi_acc_handle_t acchdl;
230 
231 	acchdl = vdp->xs_ring_hdl;
232 	bp = XDB_XREQ2BP(xreq);
233 	curseg = xreq->xr_curseg;
234 	/* init a new xdb request */
235 	if (req != NULL) {
236 		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
237 		boolean_t pagemapok = B_TRUE;
238 		uint8_t op = ddi_get8(acchdl, &req->operation);
239 
240 		xreq->xr_vdp = vdp;
241 		xreq->xr_op = op;
242 		xreq->xr_id = ddi_get64(acchdl, &req->id);
243 		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
244 		if (segs == 0) {
245 			if (op != BLKIF_OP_FLUSH_DISKCACHE)
246 				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
247 				    " is seen from domain %d with zero "
248 				    "length data buffer!", vdp->xs_peer);
249 			bioinit(bp);
250 			bp->b_bcount = 0;
251 			bp->b_lblkno = 0;
252 			bp->b_un.b_addr = NULL;
253 			return (bp);
254 		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
255 			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
256 			    " is seen from domain %d with non-zero "
257 			    "length data buffer!", vdp->xs_peer);
258 		}
259 
260 		/*
261 		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
262 		 * according to the definition of blk interface by Xen
263 		 * we do sanity check here
264 		 */
265 		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
266 			segs = xreq->xr_buf_pages =
267 			    BLKIF_MAX_SEGMENTS_PER_REQUEST;
268 
269 		for (i = 0; i < segs; i++) {
270 			uint8_t fs, ls;
271 
272 			mapops[i].host_addr =
273 			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
274 			    vdp->xs_iopage_va, xreq->xr_idx, i);
275 			mapops[i].dom = vdp->xs_peer;
276 			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
277 			mapops[i].flags = GNTMAP_host_map;
278 			if (op != BLKIF_OP_READ)
279 				mapops[i].flags |= GNTMAP_readonly;
280 
281 			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
282 			ls = ddi_get8(acchdl, &req->seg[i].last_sect);
283 
284 			/*
285 			 * first_sect should be no bigger than last_sect and
286 			 * both of them should be no bigger than
287 			 * (PAGESIZE / XB_BSIZE - 1) according to definition
288 			 * of blk interface by Xen, so sanity check again
289 			 */
290 			if (fs > (PAGESIZE / XB_BSIZE - 1))
291 				fs = PAGESIZE / XB_BSIZE - 1;
292 			if (ls > (PAGESIZE / XB_BSIZE - 1))
293 				ls = PAGESIZE / XB_BSIZE - 1;
294 			if (fs > ls)
295 				fs = ls;
296 
297 			xreq->xr_segs[i].fs = fs;
298 			xreq->xr_segs[i].ls = ls;
299 		}
300 
301 		/* map in io pages */
302 		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
303 		    mapops, i);
304 		if (err != 0)
305 			return (NULL);
306 		for (i = 0; i < segs; i++) {
307 			/*
308 			 * Although HYPERVISOR_grant_table_op() returned no
309 			 * error, mapping of each single page can fail. So,
310 			 * we have to do the check here and handle the error
311 			 * if needed
312 			 */
313 			if (mapops[i].status != GNTST_okay) {
314 				int j;
315 				for (j = 0; j < i; j++) {
316 #ifdef DEBUG
317 					unlogva(vdp, mapops[j].host_addr);
318 #endif
319 					xen_release_pfn(
320 					    xreq->xr_plist[j].p_pagenum);
321 				}
322 				pagemapok = B_FALSE;
323 				break;
324 			}
325 			/* record page mapping handle for unmapping later */
326 			xreq->xr_page_hdls[i] = mapops[i].handle;
327 #ifdef DEBUG
328 			logva(vdp, mapops[i].host_addr);
329 #endif
330 			/*
331 			 * Pass the MFNs down using the shadow list (xr_pplist)
332 			 *
333 			 * This is pretty ugly since we have implict knowledge
334 			 * of how the rootnex binds buffers.
335 			 * The GNTTABOP_map_grant_ref op makes us do some ugly
336 			 * stuff since we're not allowed to touch these PTEs
337 			 * from the VM.
338 			 *
339 			 * Obviously, these aren't real page_t's. The rootnex
340 			 * only needs p_pagenum.
341 			 * Also, don't use btop() here or 32 bit PAE breaks.
342 			 */
343 			xreq->xr_pplist[i] = &xreq->xr_plist[i];
344 			xreq->xr_plist[i].p_pagenum =
345 			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
346 		}
347 
348 		/*
349 		 * not all pages mapped in successfully, unmap those mapped-in
350 		 * page and return failure
351 		 */
352 		if (!pagemapok) {
353 			gnttab_unmap_grant_ref_t unmapop;
354 
355 			for (i = 0; i < segs; i++) {
356 				if (mapops[i].status != GNTST_okay)
357 					continue;
358 				unmapop.host_addr =
359 				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
360 				    vdp->xs_iopage_va, xreq->xr_idx, i);
361 				unmapop.dev_bus_addr = NULL;
362 				unmapop.handle = mapops[i].handle;
363 				(void) HYPERVISOR_grant_table_op(
364 				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
365 			}
366 
367 			return (NULL);
368 		}
369 		bioinit(bp);
370 		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
371 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
372 		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
373 		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
374 	} else {
375 		uint64_t blkst;
376 		int isread;
377 
378 		/* reuse this buf */
379 		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
380 		isread = bp->b_flags & B_READ;
381 		bioreset(bp);
382 		bp->b_lblkno = blkst;
383 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
384 		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
385 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
386 		    xreq->xr_idx));
387 	}
388 
389 	/* form a buf */
390 	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
391 	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
392 	bp->b_shadow = &xreq->xr_pplist[curseg];
393 	bp->b_iodone = xdb_biodone;
394 	sectors = 0;
395 	for (i = curseg; i < xreq->xr_buf_pages; i++) {
396 		/*
397 		 * The xreq->xr_segs[i].fs of the first seg can be non-zero
398 		 * otherwise, we'll break it into multiple bufs
399 		 */
400 		if ((i != curseg) && (xreq->xr_segs[i].fs != 0)) {
401 			break;
402 		}
403 		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
404 	}
405 	xreq->xr_curseg = i;
406 	bp->b_bcount = sectors * DEV_BSIZE;
407 	bp->b_bufsize = bp->b_bcount;
408 
409 	return (bp);
410 }
411 
412 static xdb_request_t *
413 xdb_get_req(xdb_t *vdp)
414 {
415 	xdb_request_t *req;
416 	int idx;
417 
418 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
419 	ASSERT(vdp->xs_free_req != -1);
420 	req = &vdp->xs_req[vdp->xs_free_req];
421 	vdp->xs_free_req = req->xr_next;
422 	idx = req->xr_idx;
423 	bzero(req, sizeof (xdb_request_t));
424 	req->xr_idx = idx;
425 	return (req);
426 }
427 
428 static void
429 xdb_free_req(xdb_request_t *req)
430 {
431 	xdb_t *vdp = req->xr_vdp;
432 
433 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
434 	req->xr_next = vdp->xs_free_req;
435 	vdp->xs_free_req = req->xr_idx;
436 }
437 
438 static void
439 xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
440 {
441 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
442 
443 	if (xdb_push_response(vdp, ddi_get64(acchdl, &req->id),
444 	    ddi_get8(acchdl, &req->operation), ok))
445 		xvdi_notify_oe(vdp->xs_dip);
446 }
447 
448 static void
449 xdb_init_ioreqs(xdb_t *vdp)
450 {
451 	int i;
452 
453 	ASSERT(vdp->xs_nentry);
454 
455 	if (vdp->xs_req == NULL)
456 		vdp->xs_req = kmem_alloc(vdp->xs_nentry *
457 		    sizeof (xdb_request_t), KM_SLEEP);
458 #ifdef DEBUG
459 	if (vdp->page_addrs == NULL)
460 		vdp->page_addrs = kmem_zalloc(XDB_MAX_IO_PAGES(vdp) *
461 		    sizeof (uint64_t), KM_SLEEP);
462 #endif
463 	for (i = 0; i < vdp->xs_nentry; i++) {
464 		vdp->xs_req[i].xr_idx = i;
465 		vdp->xs_req[i].xr_next = i + 1;
466 	}
467 	vdp->xs_req[vdp->xs_nentry - 1].xr_next = -1;
468 	vdp->xs_free_req = 0;
469 
470 	/* alloc va in host dom for io page mapping */
471 	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
472 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
473 	    VM_SLEEP);
474 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
475 		hat_prepare_mapping(kas.a_hat,
476 		    vdp->xs_iopage_va + i * PAGESIZE);
477 }
478 
479 static void
480 xdb_uninit_ioreqs(xdb_t *vdp)
481 {
482 	int i;
483 
484 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
485 		hat_release_mapping(kas.a_hat,
486 		    vdp->xs_iopage_va + i * PAGESIZE);
487 	vmem_xfree(heap_arena, vdp->xs_iopage_va,
488 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE);
489 	if (vdp->xs_req != NULL) {
490 		kmem_free(vdp->xs_req, vdp->xs_nentry * sizeof (xdb_request_t));
491 		vdp->xs_req = NULL;
492 	}
493 #ifdef DEBUG
494 	if (vdp->page_addrs != NULL) {
495 		kmem_free(vdp->page_addrs, XDB_MAX_IO_PAGES(vdp) *
496 		    sizeof (uint64_t));
497 		vdp->page_addrs = NULL;
498 	}
499 #endif
500 }
501 
502 static uint_t
503 xdb_intr(caddr_t arg)
504 {
505 	blkif_request_t req;
506 	blkif_request_t *reqp = &req;
507 	xdb_request_t *xreq;
508 	buf_t *bp;
509 	uint8_t op;
510 	xdb_t *vdp = (xdb_t *)arg;
511 	int ret = DDI_INTR_UNCLAIMED;
512 	dev_info_t *dip = vdp->xs_dip;
513 
514 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
515 	    "xdb@%s: I/O request received from dom %d",
516 	    ddi_get_name_addr(dip), vdp->xs_peer));
517 
518 	mutex_enter(&vdp->xs_iomutex);
519 
520 	/* shouldn't touch ring buffer if not in connected state */
521 	if (vdp->xs_if_status != XDB_CONNECTED) {
522 		mutex_exit(&vdp->xs_iomutex);
523 		return (DDI_INTR_UNCLAIMED);
524 	}
525 
526 	/*
527 	 * We'll loop till there is no more request in the ring
528 	 * We won't stuck in this loop for ever since the size of ring buffer
529 	 * is limited, and frontend will stop pushing requests into it when
530 	 * the ring buffer is full
531 	 */
532 
533 	/* req_event will be increased in xvdi_ring_get_request() */
534 	while (xdb_get_request(vdp, reqp)) {
535 		ret = DDI_INTR_CLAIMED;
536 
537 		op = ddi_get8(vdp->xs_ring_hdl, &reqp->operation);
538 		if (op == BLKIF_OP_READ			||
539 		    op == BLKIF_OP_WRITE		||
540 		    op == BLKIF_OP_WRITE_BARRIER	||
541 		    op == BLKIF_OP_FLUSH_DISKCACHE) {
542 #ifdef DEBUG
543 			xdb_dump_request_oe(reqp);
544 #endif
545 			xreq = xdb_get_req(vdp);
546 			ASSERT(xreq);
547 			switch (op) {
548 			case BLKIF_OP_READ:
549 				vdp->xs_stat_req_reads++;
550 				break;
551 			case BLKIF_OP_WRITE_BARRIER:
552 				vdp->xs_stat_req_barriers++;
553 				/* FALLTHRU */
554 			case BLKIF_OP_WRITE:
555 				vdp->xs_stat_req_writes++;
556 				break;
557 			case BLKIF_OP_FLUSH_DISKCACHE:
558 				vdp->xs_stat_req_flushes++;
559 				break;
560 			}
561 
562 			xreq->xr_curseg = 0; /* start from first segment */
563 			bp = xdb_get_buf(vdp, reqp, xreq);
564 			if (bp == NULL) {
565 				/* failed to form a buf */
566 				xdb_free_req(xreq);
567 				xdb_response(vdp, reqp, B_FALSE);
568 				continue;
569 			}
570 			bp->av_forw = NULL;
571 
572 			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
573 			    " buf %p, blkno %lld, size %lu, addr %p",
574 			    (void *)bp, (longlong_t)bp->b_blkno,
575 			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));
576 
577 			/* send bp to underlying blk driver */
578 			if (vdp->xs_f_iobuf == NULL) {
579 				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
580 			} else {
581 				vdp->xs_l_iobuf->av_forw = bp;
582 				vdp->xs_l_iobuf = bp;
583 			}
584 		} else {
585 			xdb_response(vdp, reqp, B_FALSE);
586 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
587 			    "Unsupported cmd received from dom %d",
588 			    ddi_get_name_addr(dip), vdp->xs_peer));
589 		}
590 	}
591 	/* notify our taskq to push buf to underlying blk driver */
592 	if (ret == DDI_INTR_CLAIMED)
593 		cv_broadcast(&vdp->xs_iocv);
594 
595 	mutex_exit(&vdp->xs_iomutex);
596 
597 	return (ret);
598 }
599 
600 static int
601 xdb_biodone(buf_t *bp)
602 {
603 	int i, err, bioerr;
604 	uint8_t segs;
605 	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
606 	xdb_request_t *xreq = XDB_BP2XREQ(bp);
607 	xdb_t *vdp = xreq->xr_vdp;
608 	buf_t *nbp;
609 
610 	bioerr = geterror(bp);
611 	if (bioerr)
612 		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
613 		    ddi_get_name_addr(vdp->xs_dip), bioerr));
614 
615 	/* check if we are done w/ this I/O request */
616 	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
617 		nbp = xdb_get_buf(vdp, NULL, xreq);
618 		if (nbp) {
619 			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
620 			if (err == 0) {
621 				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
622 				    "sent buf to backend ok"));
623 				return (DDI_SUCCESS);
624 			}
625 			bioerr = EIO;
626 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
627 			    "sent buf to backend dev failed, err=%d",
628 			    ddi_get_name_addr(vdp->xs_dip), err));
629 		} else {
630 			bioerr = EIO;
631 		}
632 	}
633 
634 	/* unmap io pages */
635 	segs = xreq->xr_buf_pages;
636 	/*
637 	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
638 	 * according to the definition of blk interface by Xen
639 	 */
640 	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
641 	for (i = 0; i < segs; i++) {
642 		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
643 		    vdp->xs_iopage_va, xreq->xr_idx, i);
644 #ifdef DEBUG
645 		mutex_enter(&vdp->xs_iomutex);
646 		unlogva(vdp, unmapops[i].host_addr);
647 		mutex_exit(&vdp->xs_iomutex);
648 #endif
649 		unmapops[i].dev_bus_addr = NULL;
650 		unmapops[i].handle = xreq->xr_page_hdls[i];
651 	}
652 	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
653 	    unmapops, segs);
654 	ASSERT(!err);
655 
656 	/*
657 	 * If we have reached a barrier write or a cache flush , then we must
658 	 * flush all our I/Os.
659 	 */
660 	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
661 	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
662 		/*
663 		 * XXX At this point the write did succeed, so I don't
664 		 * believe we should report an error because the flush
665 		 * failed. However, this is a debatable point, so
666 		 * maybe we need to think more carefully about this.
667 		 * For now, just cast to void.
668 		 */
669 		(void) ldi_ioctl(vdp->xs_ldi_hdl,
670 		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
671 	}
672 
673 	mutex_enter(&vdp->xs_iomutex);
674 
675 	/* send response back to frontend */
676 	if (vdp->xs_if_status == XDB_CONNECTED) {
677 		if (xdb_push_response(vdp, xreq->xr_id, xreq->xr_op, bioerr))
678 			xvdi_notify_oe(vdp->xs_dip);
679 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
680 		    "sent resp back to frontend, id=%llu",
681 		    (unsigned long long)xreq->xr_id));
682 	}
683 	/* free io resources */
684 	biofini(bp);
685 	xdb_free_req(xreq);
686 
687 	vdp->xs_ionum--;
688 	if ((vdp->xs_if_status != XDB_CONNECTED) && (vdp->xs_ionum == 0)) {
689 		/* we're closing, someone is waiting for I/O clean-up */
690 		cv_signal(&vdp->xs_ionumcv);
691 	}
692 
693 	mutex_exit(&vdp->xs_iomutex);
694 
695 	return (DDI_SUCCESS);
696 }
697 
698 static int
699 xdb_bindto_frontend(xdb_t *vdp)
700 {
701 	int err;
702 	char *oename;
703 	grant_ref_t gref;
704 	evtchn_port_t evtchn;
705 	dev_info_t *dip = vdp->xs_dip;
706 	char protocol[64] = "";
707 
708 	/*
709 	 * Gather info from frontend
710 	 */
711 	oename = xvdi_get_oename(dip);
712 	if (oename == NULL)
713 		return (DDI_FAILURE);
714 
715 	err = xenbus_gather(XBT_NULL, oename,
716 	    "ring-ref", "%lu", &gref, "event-channel", "%u", &evtchn, NULL);
717 	if (err != 0) {
718 		xvdi_fatal_error(dip, err,
719 		    "Getting ring-ref and evtchn from frontend");
720 		return (DDI_FAILURE);
721 	}
722 
723 	vdp->xs_blk_protocol = BLKIF_PROTOCOL_NATIVE;
724 	vdp->xs_nentry = BLKIF_RING_SIZE;
725 	vdp->xs_entrysize = sizeof (union blkif_sring_entry);
726 
727 	err = xenbus_gather(XBT_NULL, oename,
728 	    "protocol", "%63s", protocol, NULL);
729 	if (err)
730 		(void) strcpy(protocol, "unspecified, assuming native");
731 	else {
732 		/*
733 		 * We must check for NATIVE first, so that the fast path
734 		 * is taken for copying data from the guest to the host.
735 		 */
736 		if (strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE) != 0) {
737 			if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
738 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_32;
739 				vdp->xs_nentry = BLKIF_X86_32_RING_SIZE;
740 				vdp->xs_entrysize =
741 				    sizeof (union blkif_x86_32_sring_entry);
742 			} else if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_64) ==
743 			    0) {
744 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_64;
745 				vdp->xs_nentry = BLKIF_X86_64_RING_SIZE;
746 				vdp->xs_entrysize =
747 				    sizeof (union blkif_x86_64_sring_entry);
748 			} else {
749 				xvdi_fatal_error(dip, err, "unknown protocol");
750 				return (DDI_FAILURE);
751 			}
752 		}
753 	}
754 #ifdef DEBUG
755 	cmn_err(CE_NOTE, "!xdb@%s: blkif protocol '%s' ",
756 	    ddi_get_name_addr(dip), protocol);
757 #endif
758 
759 	/*
760 	 * map and init ring
761 	 *
762 	 * The ring parameters must match those which have been allocated
763 	 * in the front end.
764 	 */
765 	err = xvdi_map_ring(dip, vdp->xs_nentry, vdp->xs_entrysize,
766 	    gref, &vdp->xs_ring);
767 	if (err != DDI_SUCCESS)
768 		return (DDI_FAILURE);
769 	/*
770 	 * This will be removed after we use shadow I/O ring request since
771 	 * we don't need to access the ring itself directly, thus the access
772 	 * handle is not needed
773 	 */
774 	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;
775 
776 	/*
777 	 * bind event channel
778 	 */
779 	err = xvdi_bind_evtchn(dip, evtchn);
780 	if (err != DDI_SUCCESS) {
781 		xvdi_unmap_ring(vdp->xs_ring);
782 		return (DDI_FAILURE);
783 	}
784 
785 	return (DDI_SUCCESS);
786 }
787 
788 static void
789 xdb_unbindfrom_frontend(xdb_t *vdp)
790 {
791 	xvdi_free_evtchn(vdp->xs_dip);
792 	xvdi_unmap_ring(vdp->xs_ring);
793 }
794 
/*
 * Paths and open mode used to talk to the lofi driver.  LOFI_MODE is
 * parenthesized so that it can be combined with other operators (e.g.
 * "LOFI_MODE & FEXCL") without precedence surprises.
 */
#define	LOFI_CTRL_NODE	"/dev/lofictl"
#define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
#define	LOFI_MODE	(FREAD | FWRITE | FEXCL)
798 
799 static int
800 xdb_setup_node(xdb_t *vdp, char *path)
801 {
802 	dev_info_t *dip;
803 	char *xsnode, *node;
804 	ldi_handle_t ldi_hdl;
805 	struct lofi_ioctl *li;
806 	int minor;
807 	int err;
808 	unsigned int len;
809 
810 	dip = vdp->xs_dip;
811 	xsnode = xvdi_get_xsname(dip);
812 	if (xsnode == NULL)
813 		return (DDI_FAILURE);
814 
815 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
816 	    (void **)&node, &len);
817 	if (err == ENOENT)
818 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
819 		    &len);
820 	if (err != 0) {
821 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
822 		return (DDI_FAILURE);
823 	}
824 
825 	if (!XDB_IS_LOFI(vdp)) {
826 		(void) strlcpy(path, node, MAXPATHLEN + 1);
827 		kmem_free(node, len);
828 		return (DDI_SUCCESS);
829 	}
830 
831 	do {
832 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
833 		    &ldi_hdl, vdp->xs_ldi_li);
834 	} while (err == EBUSY);
835 	if (err != 0) {
836 		kmem_free(node, len);
837 		return (DDI_FAILURE);
838 	}
839 
840 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
841 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
842 	kmem_free(node, len);
843 	if (ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
844 	    LOFI_MODE | FKIOCTL, kcred, &minor) != 0) {
845 		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
846 		    ddi_get_name_addr(dip), li->li_filename);
847 		(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
848 		kmem_free(li, sizeof (*li));
849 		return (DDI_FAILURE);
850 	}
851 	/*
852 	 * return '/devices/...' instead of '/dev/lofi/...' since the
853 	 * former is available immediately after calling ldi_ioctl
854 	 */
855 	(void) snprintf(path, MAXPATHLEN + 1, LOFI_DEV_NODE "%d", minor);
856 	(void) xenbus_printf(XBT_NULL, xsnode, "node", "%s", path);
857 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
858 	kmem_free(li, sizeof (*li));
859 	return (DDI_SUCCESS);
860 }
861 
862 static void
863 xdb_teardown_node(xdb_t *vdp)
864 {
865 	dev_info_t *dip;
866 	char *xsnode, *node;
867 	ldi_handle_t ldi_hdl;
868 	struct lofi_ioctl *li;
869 	int err;
870 	unsigned int len;
871 
872 	if (!XDB_IS_LOFI(vdp))
873 		return;
874 
875 	dip = vdp->xs_dip;
876 	xsnode = xvdi_get_xsname(dip);
877 	if (xsnode == NULL)
878 		return;
879 
880 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
881 	    (void **)&node, &len);
882 	if (err == ENOENT)
883 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
884 		    &len);
885 	if (err != 0) {
886 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
887 		return;
888 	}
889 
890 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
891 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
892 	kmem_free(node, len);
893 
894 	do {
895 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
896 		    &ldi_hdl, vdp->xs_ldi_li);
897 	} while (err == EBUSY);
898 
899 	if (err != 0) {
900 		kmem_free(li, sizeof (*li));
901 		return;
902 	}
903 
904 	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
905 	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
906 		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
907 		    ddi_get_name_addr(dip), li->li_filename);
908 	}
909 
910 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
911 	kmem_free(li, sizeof (*li));
912 }
913 
914 static int
915 xdb_open_device(xdb_t *vdp)
916 {
917 	uint64_t devsize;
918 	dev_info_t *dip;
919 	char *xsnode;
920 	char *nodepath;
921 	char *mode = NULL;
922 	char *type = NULL;
923 	int err;
924 
925 	dip = vdp->xs_dip;
926 	xsnode = xvdi_get_xsname(dip);
927 	if (xsnode == NULL)
928 		return (DDI_FAILURE);
929 
930 	err = xenbus_gather(XBT_NULL, xsnode,
931 	    "mode", NULL, &mode, "type", NULL, &type, NULL);
932 	if (err != 0) {
933 		if (mode)
934 			kmem_free(mode, strlen(mode) + 1);
935 		if (type)
936 			kmem_free(type, strlen(type) + 1);
937 		xvdi_fatal_error(dip, err,
938 		    "Getting mode and type from backend device");
939 		return (DDI_FAILURE);
940 	}
941 	if (strcmp(type, "file") == 0) {
942 		vdp->xs_type |= XDB_DEV_LOFI;
943 	}
944 	kmem_free(type, strlen(type) + 1);
945 	if ((strcmp(mode, "r") == NULL) || (strcmp(mode, "ro") == NULL)) {
946 		vdp->xs_type |= XDB_DEV_RO;
947 	}
948 	kmem_free(mode, strlen(mode) + 1);
949 
950 	/*
951 	 * try to open backend device
952 	 */
953 	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
954 		return (DDI_FAILURE);
955 
956 	nodepath = kmem_zalloc(MAXPATHLEN + 1, KM_SLEEP);
957 	err = xdb_setup_node(vdp, nodepath);
958 	if (err != DDI_SUCCESS) {
959 		xvdi_fatal_error(dip, err,
960 		    "Getting device path of backend device");
961 		ldi_ident_release(vdp->xs_ldi_li);
962 		kmem_free(nodepath, MAXPATHLEN + 1);
963 		return (DDI_FAILURE);
964 	}
965 
966 	if (ldi_open_by_name(nodepath,
967 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
968 	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
969 		xdb_teardown_node(vdp);
970 		ldi_ident_release(vdp->xs_ldi_li);
971 		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
972 		    ddi_get_name_addr(dip), nodepath);
973 		kmem_free(nodepath, MAXPATHLEN + 1);
974 		return (DDI_FAILURE);
975 	}
976 
977 	/* check if it's a CD/DVD disc */
978 	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
979 	    "inquiry-device-type", DTYPE_DIRECT) == DTYPE_RODIRECT)
980 		vdp->xs_type |= XDB_DEV_CD;
981 	/* check if it's a removable disk */
982 	if (ldi_prop_exists(vdp->xs_ldi_hdl,
983 	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
984 	    "removable-media"))
985 		vdp->xs_type |= XDB_DEV_RMB;
986 
987 	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
988 		(void) ldi_close(vdp->xs_ldi_hdl,
989 		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
990 		xdb_teardown_node(vdp);
991 		ldi_ident_release(vdp->xs_ldi_li);
992 		kmem_free(nodepath, MAXPATHLEN + 1);
993 		return (DDI_FAILURE);
994 	}
995 	vdp->xs_sectors = devsize / XB_BSIZE;
996 
997 	kmem_free(nodepath, MAXPATHLEN + 1);
998 	return (DDI_SUCCESS);
999 }
1000 
1001 static void
1002 xdb_close_device(xdb_t *vdp)
1003 {
1004 	(void) ldi_close(vdp->xs_ldi_hdl,
1005 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
1006 	xdb_teardown_node(vdp);
1007 	ldi_ident_release(vdp->xs_ldi_li);
1008 	vdp->xs_ldi_li = NULL;
1009 	vdp->xs_ldi_hdl = NULL;
1010 }
1011 
1012 /*
1013  * Kick-off connect process
1014  * If xs_fe_status == XDB_FE_READY and xs_dev_status == XDB_DEV_READY
1015  * the xs_if_status will be changed to XDB_CONNECTED on success,
1016  * otherwise, xs_if_status will not be changed
1017  */
1018 static int
1019 xdb_start_connect(xdb_t *vdp)
1020 {
1021 	uint32_t dinfo;
1022 	xenbus_transaction_t xbt;
1023 	int err, svdst;
1024 	char *xsnode;
1025 	dev_info_t *dip = vdp->xs_dip;
1026 	char *barrier;
1027 	uint_t len;
1028 
1029 	/*
1030 	 * Start connect to frontend only when backend device are ready
1031 	 * and frontend has moved to XenbusStateInitialised, which means
1032 	 * ready to connect
1033 	 */
1034 	ASSERT((vdp->xs_fe_status == XDB_FE_READY) &&
1035 	    (vdp->xs_dev_status == XDB_DEV_READY));
1036 
1037 	if (((xsnode = xvdi_get_xsname(dip)) == NULL)		 ||
1038 	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1) ||
1039 	    (xdb_open_device(vdp) != DDI_SUCCESS))
1040 		return (DDI_FAILURE);
1041 
1042 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);
1043 
1044 	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS)
1045 		goto errout1;
1046 
1047 	/* init i/o requests */
1048 	xdb_init_ioreqs(vdp);
1049 
1050 	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
1051 	    != DDI_SUCCESS)
1052 		goto errout2;
1053 
1054 	/*
1055 	 * we can recieve intr any time from now on
1056 	 * mark that we're ready to take intr
1057 	 */
1058 	mutex_enter(&vdp->xs_iomutex);
1059 	/*
1060 	 * save it in case we need to restore when we
1061 	 * fail to write xenstore later
1062 	 */
1063 	svdst = vdp->xs_if_status;
1064 	vdp->xs_if_status = XDB_CONNECTED;
1065 	mutex_exit(&vdp->xs_iomutex);
1066 
1067 	/* write into xenstore the info needed by frontend */
1068 trans_retry:
1069 	if (xenbus_transaction_start(&xbt)) {
1070 		xvdi_fatal_error(dip, EIO, "transaction start");
1071 		goto errout3;
1072 	}
1073 
1074 	/*
1075 	 * If feature-barrier isn't present in xenstore, add it.
1076 	 */
1077 	if (xenbus_read(xbt, xsnode, "feature-barrier",
1078 	    (void **)&barrier, &len) != 0) {
1079 		if ((err = xenbus_printf(xbt, xsnode, "feature-barrier",
1080 		    "%d", 1)) != 0) {
1081 			cmn_err(CE_WARN, "xdb@%s: failed to write "
1082 			    "'feature-barrier'", ddi_get_name_addr(dip));
1083 			xvdi_fatal_error(dip, err, "writing 'feature-barrier'");
1084 			goto abort_trans;
1085 		}
1086 	} else
1087 		kmem_free(barrier, len);
1088 
1089 	dinfo = 0;
1090 	if (XDB_IS_RO(vdp))
1091 		dinfo |= VDISK_READONLY;
1092 	if (XDB_IS_CD(vdp))
1093 		dinfo |= VDISK_CDROM;
1094 	if (XDB_IS_RMB(vdp))
1095 		dinfo |= VDISK_REMOVABLE;
1096 	if (err = xenbus_printf(xbt, xsnode, "info", "%u", dinfo)) {
1097 		xvdi_fatal_error(dip, err, "writing 'info'");
1098 		goto abort_trans;
1099 	}
1100 
1101 	/* hard-coded 512-byte sector size */
1102 	if (err = xenbus_printf(xbt, xsnode, "sector-size", "%u", DEV_BSIZE)) {
1103 		xvdi_fatal_error(dip, err, "writing 'sector-size'");
1104 		goto abort_trans;
1105 	}
1106 
1107 	if (err = xenbus_printf(xbt, xsnode, "sectors", "%"PRIu64,
1108 	    vdp->xs_sectors)) {
1109 		xvdi_fatal_error(dip, err, "writing 'sectors'");
1110 		goto abort_trans;
1111 	}
1112 
1113 	if (err = xenbus_printf(xbt, xsnode, "instance", "%d",
1114 	    ddi_get_instance(dip))) {
1115 		xvdi_fatal_error(dip, err, "writing 'instance'");
1116 		goto abort_trans;
1117 	}
1118 
1119 	if ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0) {
1120 		xvdi_fatal_error(dip, err, "writing 'state'");
1121 		goto abort_trans;
1122 	}
1123 
1124 	if (err = xenbus_transaction_end(xbt, 0)) {
1125 		if (err == EAGAIN)
1126 			/* transaction is ended, don't need to abort it */
1127 			goto trans_retry;
1128 		xvdi_fatal_error(dip, err, "completing transaction");
1129 		goto errout3;
1130 	}
1131 
1132 	return (DDI_SUCCESS);
1133 
1134 abort_trans:
1135 	(void) xenbus_transaction_end(xbt, 1);
1136 errout3:
1137 	mutex_enter(&vdp->xs_iomutex);
1138 	vdp->xs_if_status = svdst;
1139 	mutex_exit(&vdp->xs_iomutex);
1140 	ddi_remove_intr(dip, 0, NULL);
1141 errout2:
1142 	xdb_uninit_ioreqs(vdp);
1143 	xdb_unbindfrom_frontend(vdp);
1144 errout1:
1145 	xdb_close_device(vdp);
1146 	return (DDI_FAILURE);
1147 }
1148 
1149 /*
1150  * Kick-off disconnect process
1151  * xs_if_status will not be changed
1152  */
1153 static int
1154 xdb_start_disconnect(xdb_t *vdp)
1155 {
1156 	/*
1157 	 * Kick-off disconnect process
1158 	 */
1159 	if (xvdi_switch_state(vdp->xs_dip, XBT_NULL, XenbusStateClosing) > 0)
1160 		return (DDI_FAILURE);
1161 
1162 	return (DDI_SUCCESS);
1163 }
1164 
1165 /*
1166  * Disconnect from frontend and close backend device
1167  * ifstatus will be changed to XDB_DISCONNECTED
1168  * Xenbus state will be changed to XenbusStateClosed
1169  */
1170 static void
1171 xdb_close(dev_info_t *dip)
1172 {
1173 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1174 
1175 	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
1176 
1177 	mutex_enter(&vdp->xs_iomutex);
1178 
1179 	if (vdp->xs_if_status != XDB_CONNECTED) {
1180 		vdp->xs_if_status = XDB_DISCONNECTED;
1181 		cv_broadcast(&vdp->xs_iocv);
1182 		mutex_exit(&vdp->xs_iomutex);
1183 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1184 		return;
1185 	}
1186 	vdp->xs_if_status = XDB_DISCONNECTED;
1187 	cv_broadcast(&vdp->xs_iocv);
1188 
1189 	mutex_exit(&vdp->xs_iomutex);
1190 
1191 	/* stop accepting I/O request from frontend */
1192 	ddi_remove_intr(dip, 0, NULL);
1193 	/* clear all on-going I/Os, if any */
1194 	mutex_enter(&vdp->xs_iomutex);
1195 	while (vdp->xs_ionum > 0)
1196 		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
1197 	mutex_exit(&vdp->xs_iomutex);
1198 
1199 	/* clean up resources and close this interface */
1200 	xdb_uninit_ioreqs(vdp);
1201 	xdb_unbindfrom_frontend(vdp);
1202 	xdb_close_device(vdp);
1203 	vdp->xs_peer = (domid_t)-1;
1204 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1205 }
1206 
1207 /*
1208  * Xdb_check_state_transition will check the XenbusState change to see
1209  * if the change is a valid transition or not.
1210  * The new state is written by frontend domain, or by running xenstore-write
1211  * to change it manually in dom0
1212  */
1213 static int
1214 xdb_check_state_transition(xdb_t *vdp, XenbusState oestate)
1215 {
1216 	enum xdb_state status;
1217 	int stcheck;
1218 #define	STOK	0 /* need further process */
1219 #define	STNOP	1 /* no action need taking */
1220 #define	STBUG	2 /* unexpected state change, could be a bug */
1221 
1222 	status = vdp->xs_if_status;
1223 	stcheck = STOK;
1224 
1225 	switch (status) {
1226 	case XDB_UNKNOWN:
1227 		if (vdp->xs_fe_status == XDB_FE_UNKNOWN) {
1228 			if ((oestate == XenbusStateUnknown)		||
1229 			    (oestate == XenbusStateConnected))
1230 				stcheck = STBUG;
1231 			else if ((oestate == XenbusStateInitialising)	||
1232 			    (oestate == XenbusStateInitWait))
1233 				stcheck = STNOP;
1234 		} else {
1235 			if ((oestate == XenbusStateUnknown)		||
1236 			    (oestate == XenbusStateInitialising)	||
1237 			    (oestate == XenbusStateInitWait)		||
1238 			    (oestate == XenbusStateConnected))
1239 				stcheck = STBUG;
1240 			else if (oestate == XenbusStateInitialised)
1241 				stcheck = STNOP;
1242 		}
1243 		break;
1244 	case XDB_CONNECTED:
1245 		if ((oestate == XenbusStateUnknown)		||
1246 		    (oestate == XenbusStateInitialising)	||
1247 		    (oestate == XenbusStateInitWait)		||
1248 		    (oestate == XenbusStateInitialised))
1249 			stcheck = STBUG;
1250 		else if (oestate == XenbusStateConnected)
1251 			stcheck = STNOP;
1252 		break;
1253 	case XDB_DISCONNECTED:
1254 	default:
1255 			stcheck = STBUG;
1256 	}
1257 
1258 	if (stcheck == STOK)
1259 		return (DDI_SUCCESS);
1260 
1261 	if (stcheck == STBUG)
1262 		cmn_err(CE_NOTE, "xdb@%s: unexpected otherend "
1263 		    "state change to %d!, when status is %d",
1264 		    ddi_get_name_addr(vdp->xs_dip), oestate, status);
1265 
1266 	return (DDI_FAILURE);
1267 }
1268 
1269 static void
1270 xdb_send_buf(void *arg)
1271 {
1272 	buf_t *bp;
1273 	xdb_t *vdp = (xdb_t *)arg;
1274 
1275 	mutex_enter(&vdp->xs_iomutex);
1276 
1277 	while (vdp->xs_if_status != XDB_DISCONNECTED) {
1278 		while ((bp = vdp->xs_f_iobuf) != NULL) {
1279 			vdp->xs_f_iobuf = bp->av_forw;
1280 			bp->av_forw = NULL;
1281 			vdp->xs_ionum++;
1282 			mutex_exit(&vdp->xs_iomutex);
1283 			if (bp->b_bcount != 0) {
1284 				int err = ldi_strategy(vdp->xs_ldi_hdl, bp);
1285 				if (err != 0) {
1286 					bp->b_flags |= B_ERROR;
1287 					(void) xdb_biodone(bp);
1288 					XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
1289 					    "xdb@%s: sent buf to backend dev"
1290 					    "failed, err=%d",
1291 					    ddi_get_name_addr(vdp->xs_dip),
1292 					    err));
1293 				} else {
1294 					XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
1295 					    "sent buf to backend ok"));
1296 				}
1297 			} else /* no I/O need to be done */
1298 				(void) xdb_biodone(bp);
1299 
1300 			mutex_enter(&vdp->xs_iomutex);
1301 		}
1302 
1303 		if (vdp->xs_if_status != XDB_DISCONNECTED)
1304 			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
1305 	}
1306 
1307 	mutex_exit(&vdp->xs_iomutex);
1308 }
1309 
1310 /*ARGSUSED*/
1311 static void
1312 xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1313     void *impl_data)
1314 {
1315 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1316 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1317 
1318 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1319 	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));
1320 
1321 	mutex_enter(&vdp->xs_cbmutex);
1322 	if (state == Connected) {
1323 		/* Hotplug script has completed successfully */
1324 		if (vdp->xs_dev_status == XDB_DEV_UNKNOWN) {
1325 			vdp->xs_dev_status = XDB_DEV_READY;
1326 			if (vdp->xs_fe_status == XDB_FE_READY)
1327 				/* try to connect to frontend */
1328 				if (xdb_start_connect(vdp) != DDI_SUCCESS)
1329 					(void) xdb_start_disconnect(vdp);
1330 		}
1331 	}
1332 	mutex_exit(&vdp->xs_cbmutex);
1333 }
1334 
1335 /*ARGSUSED*/
1336 static void
1337 xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1338     void *impl_data)
1339 {
1340 	XenbusState new_state = *(XenbusState *)impl_data;
1341 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1342 
1343 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1344 	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));
1345 
1346 	mutex_enter(&vdp->xs_cbmutex);
1347 
1348 	if (xdb_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1349 		mutex_exit(&vdp->xs_cbmutex);
1350 		return;
1351 	}
1352 
1353 	switch (new_state) {
1354 	case XenbusStateInitialised:
1355 		ASSERT(vdp->xs_if_status == XDB_UNKNOWN);
1356 
1357 		/* frontend is ready for connecting */
1358 		vdp->xs_fe_status = XDB_FE_READY;
1359 
1360 		if (vdp->xs_dev_status == XDB_DEV_READY)
1361 			if (xdb_start_connect(vdp) != DDI_SUCCESS)
1362 				(void) xdb_start_disconnect(vdp);
1363 		break;
1364 	case XenbusStateClosing:
1365 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1366 		break;
1367 	case XenbusStateClosed:
1368 		/* clean up */
1369 		xdb_close(dip);
1370 	}
1371 
1372 	mutex_exit(&vdp->xs_cbmutex);
1373 }
1374 
1375 static int
1376 xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1377 {
1378 	xdb_t *vdp;
1379 	ddi_iblock_cookie_t ibc;
1380 	int instance;
1381 
1382 	switch (cmd) {
1383 	case DDI_RESUME:
1384 		return (DDI_FAILURE);
1385 	case DDI_ATTACH:
1386 		break;
1387 	default:
1388 		return (DDI_FAILURE);
1389 	}
1390 
1391 	/* DDI_ATTACH */
1392 	instance = ddi_get_instance(dip);
1393 	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
1394 		return (DDI_FAILURE);
1395 
1396 	vdp = ddi_get_soft_state(xdb_statep, instance);
1397 	vdp->xs_dip = dip;
1398 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
1399 		goto errout1;
1400 
1401 	if (!xdb_kstat_init(vdp))
1402 		goto errout1;
1403 
1404 	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
1405 	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
1406 	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
1407 	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);
1408 
1409 	ddi_set_driver_private(dip, vdp);
1410 
1411 	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
1412 	    TASKQ_DEFAULTPRI, 0);
1413 	if (vdp->xs_iotaskq == NULL)
1414 		goto errout2;
1415 	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
1416 	    DDI_SLEEP);
1417 
1418 	/* Watch frontend and hotplug state change */
1419 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change) !=
1420 	    DDI_SUCCESS)
1421 		goto errout3;
1422 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change) !=
1423 	    DDI_SUCCESS) {
1424 		goto errout4;
1425 	}
1426 
1427 	/*
1428 	 * Kick-off hotplug script
1429 	 */
1430 	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
1431 		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
1432 		    ddi_get_name_addr(dip));
1433 		goto errout4;
1434 	}
1435 
1436 	/*
1437 	 * start waiting for hotplug event and otherend state event
1438 	 * mainly for debugging, frontend will not take any op seeing this
1439 	 */
1440 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
1441 
1442 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
1443 	    ddi_get_name_addr(dip)));
1444 	return (DDI_SUCCESS);
1445 
1446 errout4:
1447 	xvdi_remove_event_handler(dip, NULL);
1448 errout3:
1449 	mutex_enter(&vdp->xs_cbmutex);
1450 	mutex_enter(&vdp->xs_iomutex);
1451 	vdp->xs_if_status = XDB_DISCONNECTED;
1452 	cv_broadcast(&vdp->xs_iocv);
1453 	mutex_exit(&vdp->xs_iomutex);
1454 	mutex_exit(&vdp->xs_cbmutex);
1455 	ddi_taskq_destroy(vdp->xs_iotaskq);
1456 errout2:
1457 	ddi_set_driver_private(dip, NULL);
1458 	cv_destroy(&vdp->xs_iocv);
1459 	cv_destroy(&vdp->xs_ionumcv);
1460 	mutex_destroy(&vdp->xs_cbmutex);
1461 	mutex_destroy(&vdp->xs_iomutex);
1462 	kstat_delete(vdp->xs_kstats);
1463 errout1:
1464 	ddi_soft_state_free(xdb_statep, instance);
1465 	return (DDI_FAILURE);
1466 }
1467 
1468 /*ARGSUSED*/
1469 static int
1470 xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1471 {
1472 	int instance = ddi_get_instance(dip);
1473 	xdb_t *vdp = XDB_INST2SOFTS(instance);
1474 
1475 	switch (cmd) {
1476 	case DDI_SUSPEND:
1477 		return (DDI_FAILURE);
1478 	case DDI_DETACH:
1479 		break;
1480 	default:
1481 		return (DDI_FAILURE);
1482 	}
1483 
1484 	/* DDI_DETACH handling */
1485 
1486 	/* shouldn't detach, if still used by frontend */
1487 	mutex_enter(&vdp->xs_iomutex);
1488 	if (vdp->xs_if_status != XDB_DISCONNECTED) {
1489 		mutex_exit(&vdp->xs_iomutex);
1490 		return (DDI_FAILURE);
1491 	}
1492 	mutex_exit(&vdp->xs_iomutex);
1493 
1494 	xvdi_remove_event_handler(dip, NULL);
1495 	/* can do nothing about it, if it fails */
1496 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1497 
1498 	ddi_taskq_destroy(vdp->xs_iotaskq);
1499 	cv_destroy(&vdp->xs_iocv);
1500 	cv_destroy(&vdp->xs_ionumcv);
1501 	mutex_destroy(&vdp->xs_cbmutex);
1502 	mutex_destroy(&vdp->xs_iomutex);
1503 	kstat_delete(vdp->xs_kstats);
1504 	ddi_set_driver_private(dip, NULL);
1505 	ddi_soft_state_free(xdb_statep, instance);
1506 
1507 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
1508 	    ddi_get_name_addr(dip)));
1509 	return (DDI_SUCCESS);
1510 }
1511 
1512 static struct dev_ops xdb_dev_ops = {
1513 	DEVO_REV,	/* devo_rev */
1514 	0,		/* devo_refcnt */
1515 	ddi_getinfo_1to1, /* devo_getinfo */
1516 	nulldev,	/* devo_identify */
1517 	nulldev,	/* devo_probe */
1518 	xdb_attach,	/* devo_attach */
1519 	xdb_detach,	/* devo_detach */
1520 	nodev,		/* devo_reset */
1521 	NULL,		/* devo_cb_ops */
1522 	NULL,		/* devo_bus_ops */
1523 	NULL,		/* power */
1524 	ddi_quiesce_not_needed,	/* quiesce */
1525 };
1526 
1527 /*
1528  * Module linkage information for the kernel.
1529  */
1530 static struct modldrv modldrv = {
1531 	&mod_driverops,			/* Type of module. */
1532 	"vbd backend driver",	/* Name of the module */
1533 	&xdb_dev_ops			/* driver ops */
1534 };
1535 
1536 static struct modlinkage xdb_modlinkage = {
1537 	MODREV_1,
1538 	&modldrv,
1539 	NULL
1540 };
1541 
1542 int
1543 _init(void)
1544 {
1545 	int rv;
1546 
1547 	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
1548 	    sizeof (xdb_t), 0)) == 0)
1549 		if ((rv = mod_install(&xdb_modlinkage)) != 0)
1550 			ddi_soft_state_fini((void **)&xdb_statep);
1551 	return (rv);
1552 }
1553 
1554 int
1555 _fini(void)
1556 {
1557 	int rv;
1558 
1559 	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
1560 		return (rv);
1561 	ddi_soft_state_fini((void **)&xdb_statep);
1562 	return (rv);
1563 }
1564 
1565 int
1566 _info(struct modinfo *modinfop)
1567 {
1568 	return (mod_info(&xdb_modlinkage, modinfop));
1569 }
1570 
1571 static int
1572 xdb_get_request(xdb_t *vdp, blkif_request_t *req)
1573 {
1574 	void *src = xvdi_ring_get_request(vdp->xs_ring);
1575 
1576 	if (src == NULL)
1577 		return (0);
1578 
1579 	switch (vdp->xs_blk_protocol) {
1580 	case BLKIF_PROTOCOL_NATIVE:
1581 		(void) memcpy(req, src, sizeof (*req));
1582 		break;
1583 	case BLKIF_PROTOCOL_X86_32:
1584 		blkif_get_x86_32_req(req, src);
1585 		break;
1586 	case BLKIF_PROTOCOL_X86_64:
1587 		blkif_get_x86_64_req(req, src);
1588 		break;
1589 	default:
1590 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1591 		    ddi_get_name_addr(vdp->xs_dip),
1592 		    vdp->xs_blk_protocol);
1593 	}
1594 	return (1);
1595 }
1596 
1597 static int
1598 xdb_push_response(xdb_t *vdp, uint64_t id, uint8_t op, uint16_t status)
1599 {
1600 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
1601 	blkif_response_t *rsp = xvdi_ring_get_response(vdp->xs_ring);
1602 	blkif_x86_32_response_t *rsp_32 = (blkif_x86_32_response_t *)rsp;
1603 	blkif_x86_64_response_t *rsp_64 = (blkif_x86_64_response_t *)rsp;
1604 
1605 	ASSERT(rsp);
1606 
1607 	switch (vdp->xs_blk_protocol) {
1608 	case BLKIF_PROTOCOL_NATIVE:
1609 		ddi_put64(acchdl, &rsp->id, id);
1610 		ddi_put8(acchdl, &rsp->operation, op);
1611 		ddi_put16(acchdl, (uint16_t *)&rsp->status,
1612 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1613 		break;
1614 	case BLKIF_PROTOCOL_X86_32:
1615 		ddi_put64(acchdl, &rsp_32->id, id);
1616 		ddi_put8(acchdl, &rsp_32->operation, op);
1617 		ddi_put16(acchdl, (uint16_t *)&rsp_32->status,
1618 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1619 		break;
1620 	case BLKIF_PROTOCOL_X86_64:
1621 		ddi_put64(acchdl, &rsp_64->id, id);
1622 		ddi_put8(acchdl, &rsp_64->operation, op);
1623 		ddi_put16(acchdl, (uint16_t *)&rsp_64->status,
1624 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1625 		break;
1626 	default:
1627 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1628 		    ddi_get_name_addr(vdp->xs_dip),
1629 		    vdp->xs_blk_protocol);
1630 	}
1631 
1632 	return (xvdi_ring_push_response(vdp->xs_ring));
1633 }
1634 
1635 static void
1636 blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
1637 {
1638 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1639 	dst->operation = src->operation;
1640 	dst->nr_segments = src->nr_segments;
1641 	dst->handle = src->handle;
1642 	dst->id = src->id;
1643 	dst->sector_number = src->sector_number;
1644 	if (n > src->nr_segments)
1645 		n = src->nr_segments;
1646 	for (i = 0; i < n; i++)
1647 		dst->seg[i] = src->seg[i];
1648 }
1649 
1650 static void
1651 blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
1652 {
1653 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1654 	dst->operation = src->operation;
1655 	dst->nr_segments = src->nr_segments;
1656 	dst->handle = src->handle;
1657 	dst->id = src->id;
1658 	dst->sector_number = src->sector_number;
1659 	if (n > src->nr_segments)
1660 		n = src->nr_segments;
1661 	for (i = 0; i < n; i++)
1662 		dst->seg[i] = src->seg[i];
1663 }
1664