xref: /titanic_52/usr/src/uts/common/xen/io/xdb.c (revision 7eea693d6b672899726e75993fddc4e95b52647f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
/*
 * Note: This is the backend part of the split PV disk driver. This driver
 * is not a nexus driver, nor is it a leaf driver (block/char/stream driver).
 * Currently, it does not create any minor node. So, although it runs in the
 * backend domain, it will not be used directly from within dom0.
 * It simply gets block I/O requests issued by the frontend from a shared page
 * (blkif ring buffer - defined by Xen) between the backend and frontend
 * domains, generates a buf, and pushes it down to the underlying disk target
 * driver via the ldi interface. When the buf is done, this driver generates
 * a response and puts it into the ring buffer to inform the frontend of the
 * status of the I/O request it issued. When a new virtual device entry is
 * added in xenstore, a watch event is sent from Xen to the xvdi framework,
 * which will, in turn, create the devinfo node and try to attach this driver
 * (see xvdi_create_dev). When the frontend peer changes its state to
 * XenbusStateClose, an event will also be sent from Xen to the xvdi
 * framework, which will detach and remove this devinfo node (see
 * i_xvdi_oestate_handler). I/O requests taken from the ring buffer and
 * events coming from xenstore cannot be trusted. We verify them in
 * xdb_get_buf() and xdb_check_state_transition().
 *
 * Virtual device configuration is read/written from/to the database via
 * xenbus_* interfaces. The driver also uses xvdi_* to interact with the
 * hypervisor. There is an on-going effort to make xvdi_* cover all xenbus_*.
 */
50 
51 #include <sys/types.h>
52 #include <sys/conf.h>
53 #include <sys/ddi.h>
54 #include <sys/dditypes.h>
55 #include <sys/sunddi.h>
56 #include <sys/list.h>
57 #include <sys/dkio.h>
58 #include <sys/cmlb.h>
59 #include <sys/vtoc.h>
60 #include <sys/modctl.h>
61 #include <sys/bootconf.h>
62 #include <sys/promif.h>
63 #include <sys/sysmacros.h>
64 #include <public/io/xenbus.h>
65 #include <xen/sys/xenbus_impl.h>
66 #include <xen/sys/xendev.h>
67 #include <sys/gnttab.h>
68 #include <sys/scsi/generic/inquiry.h>
69 #include <vm/seg_kmem.h>
70 #include <vm/hat_i86.h>
71 #include <sys/gnttab.h>
72 #include <sys/lofi.h>
73 #include <io/xdf.h>
74 #include <xen/io/blkif_impl.h>
75 #include <io/xdb.h>
76 
/* NOTE(review): not referenced in the part of the file reviewed here */
static xdb_t *xdb_statep;
/*
 * Debug setting, 0 by default.  Presumably consulted by XDB_DBPRINT() --
 * TODO confirm against io/xdb.h.
 */
static int xdb_debug = 0;

/* forward declarations of static helper routines */
static int xdb_push_response(xdb_t *, uint64_t, uint8_t, uint16_t);
static int xdb_get_request(xdb_t *, blkif_request_t *);
static void blkif_get_x86_32_req(blkif_request_t *, blkif_x86_32_request_t *);
static void blkif_get_x86_64_req(blkif_request_t *, blkif_x86_64_request_t *);
84 
85 #ifdef DEBUG
86 /*
87  * debug aid functions
88  */
89 
90 static void
91 logva(xdb_t *vdp, uint64_t va)
92 {
93 	uint64_t *page_addrs;
94 	int i;
95 
96 	page_addrs = vdp->page_addrs;
97 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
98 		if (page_addrs[i] == va)
99 			debug_enter("VA remapping found!");
100 	}
101 
102 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
103 		if (page_addrs[i] == 0) {
104 			page_addrs[i] = va;
105 			break;
106 		}
107 	}
108 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
109 }
110 
111 static void
112 unlogva(xdb_t *vdp, uint64_t va)
113 {
114 	uint64_t *page_addrs;
115 	int i;
116 
117 	page_addrs = vdp->page_addrs;
118 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
119 		if (page_addrs[i] == va) {
120 			page_addrs[i] = 0;
121 			break;
122 		}
123 	}
124 	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
125 }
126 
127 static void
128 xdb_dump_request_oe(blkif_request_t *req)
129 {
130 	int i;
131 
132 	/*
133 	 * Exploit the public interface definitions for BLKIF_OP_READ
134 	 * etc..
135 	 */
136 	char *op_name[] = { "read", "write", "barrier", "flush" };
137 
138 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
139 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
140 	    req->nr_segments));
141 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
142 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
143 	    (unsigned long long)req->id));
144 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
145 	    (unsigned long long)req->sector_number));
146 	for (i = 0; i < req->nr_segments; i++) {
147 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
148 		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
149 		    req->seg[i].last_sect));
150 	}
151 }
152 #endif /* DEBUG */
153 
/*
 * Statistics.
 *
 * Names of the named kstats created by xdb_kstat_init().  The order of
 * the entries must match the order in which xdb_kstat_update() stores
 * the corresponding values.
 */
static char *xdb_stats[] = {
	"rd_reqs",
	"wr_reqs",
	"br_reqs",
	"fl_reqs",
	"oo_reqs"
};
164 
165 static int
166 xdb_kstat_update(kstat_t *ksp, int flag)
167 {
168 	xdb_t *vdp;
169 	kstat_named_t *knp;
170 
171 	if (flag != KSTAT_READ)
172 		return (EACCES);
173 
174 	vdp = ksp->ks_private;
175 	knp = ksp->ks_data;
176 
177 	/*
178 	 * Assignment order should match that of the names in
179 	 * xdb_stats.
180 	 */
181 	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
182 	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
183 	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
184 	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
185 	(knp++)->value.ui64 = 0; /* oo_req */
186 
187 	return (0);
188 }
189 
190 static boolean_t
191 xdb_kstat_init(xdb_t *vdp)
192 {
193 	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
194 	char **cp = xdb_stats;
195 	kstat_named_t *knp;
196 
197 	if ((vdp->xs_kstats = kstat_create("xdb",
198 	    ddi_get_instance(vdp->xs_dip),
199 	    "req_statistics", "block", KSTAT_TYPE_NAMED,
200 	    nstat, 0)) == NULL)
201 		return (B_FALSE);
202 
203 	vdp->xs_kstats->ks_private = vdp;
204 	vdp->xs_kstats->ks_update = xdb_kstat_update;
205 
206 	knp = vdp->xs_kstats->ks_data;
207 	while (nstat > 0) {
208 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
209 		knp++;
210 		cp++;
211 		nstat--;
212 	}
213 
214 	kstat_install(vdp->xs_kstats);
215 
216 	return (B_TRUE);
217 }
218 
219 static int xdb_biodone(buf_t *);
220 
221 static buf_t *
222 xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
223 {
224 	buf_t *bp;
225 	uint8_t segs, curseg;
226 	int sectors;
227 	int i, err;
228 	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
229 	ddi_acc_handle_t acchdl;
230 
231 	acchdl = vdp->xs_ring_hdl;
232 	bp = XDB_XREQ2BP(xreq);
233 	curseg = xreq->xr_curseg;
234 	/* init a new xdb request */
235 	if (req != NULL) {
236 		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
237 		boolean_t pagemapok = B_TRUE;
238 		uint8_t op = ddi_get8(acchdl, &req->operation);
239 
240 		xreq->xr_vdp = vdp;
241 		xreq->xr_op = op;
242 		xreq->xr_id = ddi_get64(acchdl, &req->id);
243 		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
244 		if (segs == 0) {
245 			if (op != BLKIF_OP_FLUSH_DISKCACHE)
246 				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
247 				    " is seen from domain %d with zero "
248 				    "length data buffer!", vdp->xs_peer);
249 			bioinit(bp);
250 			bp->b_bcount = 0;
251 			bp->b_lblkno = 0;
252 			bp->b_un.b_addr = NULL;
253 			return (bp);
254 		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
255 			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
256 			    " is seen from domain %d with non-zero "
257 			    "length data buffer!", vdp->xs_peer);
258 		}
259 
260 		/*
261 		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
262 		 * according to the definition of blk interface by Xen
263 		 * we do sanity check here
264 		 */
265 		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
266 			segs = xreq->xr_buf_pages =
267 			    BLKIF_MAX_SEGMENTS_PER_REQUEST;
268 
269 		for (i = 0; i < segs; i++) {
270 			uint8_t fs, ls;
271 
272 			mapops[i].host_addr =
273 			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
274 			    vdp->xs_iopage_va, xreq->xr_idx, i);
275 			mapops[i].dom = vdp->xs_peer;
276 			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
277 			mapops[i].flags = GNTMAP_host_map;
278 			if (op != BLKIF_OP_READ)
279 				mapops[i].flags |= GNTMAP_readonly;
280 
281 			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
282 			ls = ddi_get8(acchdl, &req->seg[i].last_sect);
283 
284 			/*
285 			 * first_sect should be no bigger than last_sect and
286 			 * both of them should be no bigger than
287 			 * (PAGESIZE / XB_BSIZE - 1) according to definition
288 			 * of blk interface by Xen, so sanity check again
289 			 */
290 			if (fs > (PAGESIZE / XB_BSIZE - 1))
291 				fs = PAGESIZE / XB_BSIZE - 1;
292 			if (ls > (PAGESIZE / XB_BSIZE - 1))
293 				ls = PAGESIZE / XB_BSIZE - 1;
294 			if (fs > ls)
295 				fs = ls;
296 
297 			xreq->xr_segs[i].fs = fs;
298 			xreq->xr_segs[i].ls = ls;
299 		}
300 
301 		/* map in io pages */
302 		err = xen_map_gref(GNTTABOP_map_grant_ref, mapops, i, B_FALSE);
303 		if (err != 0)
304 			return (NULL);
305 		for (i = 0; i < segs; i++) {
306 			/*
307 			 * Although HYPERVISOR_grant_table_op() returned no
308 			 * error, mapping of each single page can fail. So,
309 			 * we have to do the check here and handle the error
310 			 * if needed
311 			 */
312 			if (mapops[i].status != GNTST_okay) {
313 				int j;
314 				for (j = 0; j < i; j++) {
315 #ifdef DEBUG
316 					unlogva(vdp, mapops[j].host_addr);
317 #endif
318 					xen_release_pfn(
319 					    xreq->xr_plist[j].p_pagenum);
320 				}
321 				pagemapok = B_FALSE;
322 				break;
323 			}
324 			/* record page mapping handle for unmapping later */
325 			xreq->xr_page_hdls[i] = mapops[i].handle;
326 #ifdef DEBUG
327 			logva(vdp, mapops[i].host_addr);
328 #endif
329 			/*
330 			 * Pass the MFNs down using the shadow list (xr_pplist)
331 			 *
332 			 * This is pretty ugly since we have implict knowledge
333 			 * of how the rootnex binds buffers.
334 			 * The GNTTABOP_map_grant_ref op makes us do some ugly
335 			 * stuff since we're not allowed to touch these PTEs
336 			 * from the VM.
337 			 *
338 			 * Obviously, these aren't real page_t's. The rootnex
339 			 * only needs p_pagenum.
340 			 * Also, don't use btop() here or 32 bit PAE breaks.
341 			 */
342 			xreq->xr_pplist[i] = &xreq->xr_plist[i];
343 			xreq->xr_plist[i].p_pagenum =
344 			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
345 		}
346 
347 		/*
348 		 * not all pages mapped in successfully, unmap those mapped-in
349 		 * page and return failure
350 		 */
351 		if (!pagemapok) {
352 			gnttab_unmap_grant_ref_t unmapop;
353 
354 			for (i = 0; i < segs; i++) {
355 				if (mapops[i].status != GNTST_okay)
356 					continue;
357 				unmapop.host_addr =
358 				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
359 				    vdp->xs_iopage_va, xreq->xr_idx, i);
360 				unmapop.dev_bus_addr = NULL;
361 				unmapop.handle = mapops[i].handle;
362 				(void) HYPERVISOR_grant_table_op(
363 				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
364 			}
365 
366 			return (NULL);
367 		}
368 		bioinit(bp);
369 		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
370 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
371 		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
372 		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
373 	} else {
374 		uint64_t blkst;
375 		int isread;
376 
377 		/* reuse this buf */
378 		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
379 		isread = bp->b_flags & B_READ;
380 		bioreset(bp);
381 		bp->b_lblkno = blkst;
382 		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
383 		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
384 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
385 		    xreq->xr_idx));
386 	}
387 
388 	/* form a buf */
389 	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
390 	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
391 	bp->b_shadow = &xreq->xr_pplist[curseg];
392 	bp->b_iodone = xdb_biodone;
393 	sectors = 0;
394 	for (i = curseg; i < xreq->xr_buf_pages; i++) {
395 		/*
396 		 * The xreq->xr_segs[i].fs of the first seg can be non-zero
397 		 * otherwise, we'll break it into multiple bufs
398 		 */
399 		if ((i != curseg) && (xreq->xr_segs[i].fs != 0)) {
400 			break;
401 		}
402 		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
403 	}
404 	xreq->xr_curseg = i;
405 	bp->b_bcount = sectors * DEV_BSIZE;
406 	bp->b_bufsize = bp->b_bcount;
407 
408 	return (bp);
409 }
410 
411 static xdb_request_t *
412 xdb_get_req(xdb_t *vdp)
413 {
414 	xdb_request_t *req;
415 	int idx;
416 
417 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
418 	ASSERT(vdp->xs_free_req != -1);
419 	req = &vdp->xs_req[vdp->xs_free_req];
420 	vdp->xs_free_req = req->xr_next;
421 	idx = req->xr_idx;
422 	bzero(req, sizeof (xdb_request_t));
423 	req->xr_idx = idx;
424 	return (req);
425 }
426 
427 static void
428 xdb_free_req(xdb_request_t *req)
429 {
430 	xdb_t *vdp = req->xr_vdp;
431 
432 	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
433 	req->xr_next = vdp->xs_free_req;
434 	vdp->xs_free_req = req->xr_idx;
435 }
436 
437 static void
438 xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
439 {
440 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
441 
442 	if (xdb_push_response(vdp, ddi_get64(acchdl, &req->id),
443 	    ddi_get8(acchdl, &req->operation), ok))
444 		xvdi_notify_oe(vdp->xs_dip);
445 }
446 
447 static void
448 xdb_init_ioreqs(xdb_t *vdp)
449 {
450 	int i;
451 
452 	ASSERT(vdp->xs_nentry);
453 
454 	if (vdp->xs_req == NULL)
455 		vdp->xs_req = kmem_alloc(vdp->xs_nentry *
456 		    sizeof (xdb_request_t), KM_SLEEP);
457 #ifdef DEBUG
458 	if (vdp->page_addrs == NULL)
459 		vdp->page_addrs = kmem_zalloc(XDB_MAX_IO_PAGES(vdp) *
460 		    sizeof (uint64_t), KM_SLEEP);
461 #endif
462 	for (i = 0; i < vdp->xs_nentry; i++) {
463 		vdp->xs_req[i].xr_idx = i;
464 		vdp->xs_req[i].xr_next = i + 1;
465 	}
466 	vdp->xs_req[vdp->xs_nentry - 1].xr_next = -1;
467 	vdp->xs_free_req = 0;
468 
469 	/* alloc va in host dom for io page mapping */
470 	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
471 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
472 	    VM_SLEEP);
473 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
474 		hat_prepare_mapping(kas.a_hat,
475 		    vdp->xs_iopage_va + i * PAGESIZE, NULL);
476 }
477 
478 static void
479 xdb_uninit_ioreqs(xdb_t *vdp)
480 {
481 	int i;
482 
483 	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
484 		hat_release_mapping(kas.a_hat,
485 		    vdp->xs_iopage_va + i * PAGESIZE);
486 	vmem_xfree(heap_arena, vdp->xs_iopage_va,
487 	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE);
488 	if (vdp->xs_req != NULL) {
489 		kmem_free(vdp->xs_req, vdp->xs_nentry * sizeof (xdb_request_t));
490 		vdp->xs_req = NULL;
491 	}
492 #ifdef DEBUG
493 	if (vdp->page_addrs != NULL) {
494 		kmem_free(vdp->page_addrs, XDB_MAX_IO_PAGES(vdp) *
495 		    sizeof (uint64_t));
496 		vdp->page_addrs = NULL;
497 	}
498 #endif
499 }
500 
501 static uint_t
502 xdb_intr(caddr_t arg)
503 {
504 	blkif_request_t req;
505 	blkif_request_t *reqp = &req;
506 	xdb_request_t *xreq;
507 	buf_t *bp;
508 	uint8_t op;
509 	xdb_t *vdp = (xdb_t *)arg;
510 	int ret = DDI_INTR_UNCLAIMED;
511 	dev_info_t *dip = vdp->xs_dip;
512 
513 	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
514 	    "xdb@%s: I/O request received from dom %d",
515 	    ddi_get_name_addr(dip), vdp->xs_peer));
516 
517 	mutex_enter(&vdp->xs_iomutex);
518 
519 	/* shouldn't touch ring buffer if not in connected state */
520 	if (vdp->xs_if_status != XDB_CONNECTED) {
521 		mutex_exit(&vdp->xs_iomutex);
522 		return (DDI_INTR_UNCLAIMED);
523 	}
524 
525 	/*
526 	 * We'll loop till there is no more request in the ring
527 	 * We won't stuck in this loop for ever since the size of ring buffer
528 	 * is limited, and frontend will stop pushing requests into it when
529 	 * the ring buffer is full
530 	 */
531 
532 	/* req_event will be increased in xvdi_ring_get_request() */
533 	while (xdb_get_request(vdp, reqp)) {
534 		ret = DDI_INTR_CLAIMED;
535 
536 		op = ddi_get8(vdp->xs_ring_hdl, &reqp->operation);
537 		if (op == BLKIF_OP_READ			||
538 		    op == BLKIF_OP_WRITE		||
539 		    op == BLKIF_OP_WRITE_BARRIER	||
540 		    op == BLKIF_OP_FLUSH_DISKCACHE) {
541 #ifdef DEBUG
542 			xdb_dump_request_oe(reqp);
543 #endif
544 			xreq = xdb_get_req(vdp);
545 			ASSERT(xreq);
546 			switch (op) {
547 			case BLKIF_OP_READ:
548 				vdp->xs_stat_req_reads++;
549 				break;
550 			case BLKIF_OP_WRITE_BARRIER:
551 				vdp->xs_stat_req_barriers++;
552 				/* FALLTHRU */
553 			case BLKIF_OP_WRITE:
554 				vdp->xs_stat_req_writes++;
555 				break;
556 			case BLKIF_OP_FLUSH_DISKCACHE:
557 				vdp->xs_stat_req_flushes++;
558 				break;
559 			}
560 
561 			xreq->xr_curseg = 0; /* start from first segment */
562 			bp = xdb_get_buf(vdp, reqp, xreq);
563 			if (bp == NULL) {
564 				/* failed to form a buf */
565 				xdb_free_req(xreq);
566 				xdb_response(vdp, reqp, B_FALSE);
567 				continue;
568 			}
569 			bp->av_forw = NULL;
570 
571 			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
572 			    " buf %p, blkno %lld, size %lu, addr %p",
573 			    (void *)bp, (longlong_t)bp->b_blkno,
574 			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));
575 
576 			/* send bp to underlying blk driver */
577 			if (vdp->xs_f_iobuf == NULL) {
578 				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
579 			} else {
580 				vdp->xs_l_iobuf->av_forw = bp;
581 				vdp->xs_l_iobuf = bp;
582 			}
583 		} else {
584 			xdb_response(vdp, reqp, B_FALSE);
585 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
586 			    "Unsupported cmd received from dom %d",
587 			    ddi_get_name_addr(dip), vdp->xs_peer));
588 		}
589 	}
590 	/* notify our taskq to push buf to underlying blk driver */
591 	if (ret == DDI_INTR_CLAIMED)
592 		cv_broadcast(&vdp->xs_iocv);
593 
594 	mutex_exit(&vdp->xs_iomutex);
595 
596 	return (ret);
597 }
598 
599 static int
600 xdb_biodone(buf_t *bp)
601 {
602 	int i, err, bioerr;
603 	uint8_t segs;
604 	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
605 	xdb_request_t *xreq = XDB_BP2XREQ(bp);
606 	xdb_t *vdp = xreq->xr_vdp;
607 	buf_t *nbp;
608 
609 	bioerr = geterror(bp);
610 	if (bioerr)
611 		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
612 		    ddi_get_name_addr(vdp->xs_dip), bioerr));
613 
614 	/* check if we are done w/ this I/O request */
615 	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
616 		nbp = xdb_get_buf(vdp, NULL, xreq);
617 		if (nbp) {
618 			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
619 			if (err == 0) {
620 				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
621 				    "sent buf to backend ok"));
622 				return (DDI_SUCCESS);
623 			}
624 			bioerr = EIO;
625 			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
626 			    "sent buf to backend dev failed, err=%d",
627 			    ddi_get_name_addr(vdp->xs_dip), err));
628 		} else {
629 			bioerr = EIO;
630 		}
631 	}
632 
633 	/* unmap io pages */
634 	segs = xreq->xr_buf_pages;
635 	/*
636 	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
637 	 * according to the definition of blk interface by Xen
638 	 */
639 	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
640 	for (i = 0; i < segs; i++) {
641 		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
642 		    vdp->xs_iopage_va, xreq->xr_idx, i);
643 #ifdef DEBUG
644 		mutex_enter(&vdp->xs_iomutex);
645 		unlogva(vdp, unmapops[i].host_addr);
646 		mutex_exit(&vdp->xs_iomutex);
647 #endif
648 		unmapops[i].dev_bus_addr = NULL;
649 		unmapops[i].handle = xreq->xr_page_hdls[i];
650 	}
651 	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
652 	    unmapops, segs);
653 	ASSERT(!err);
654 
655 	/*
656 	 * If we have reached a barrier write or a cache flush , then we must
657 	 * flush all our I/Os.
658 	 */
659 	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
660 	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
661 		/*
662 		 * XXX At this point the write did succeed, so I don't
663 		 * believe we should report an error because the flush
664 		 * failed. However, this is a debatable point, so
665 		 * maybe we need to think more carefully about this.
666 		 * For now, just cast to void.
667 		 */
668 		(void) ldi_ioctl(vdp->xs_ldi_hdl,
669 		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
670 	}
671 
672 	mutex_enter(&vdp->xs_iomutex);
673 
674 	/* send response back to frontend */
675 	if (vdp->xs_if_status == XDB_CONNECTED) {
676 		if (xdb_push_response(vdp, xreq->xr_id, xreq->xr_op, bioerr))
677 			xvdi_notify_oe(vdp->xs_dip);
678 		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
679 		    "sent resp back to frontend, id=%llu",
680 		    (unsigned long long)xreq->xr_id));
681 	}
682 	/* free io resources */
683 	biofini(bp);
684 	xdb_free_req(xreq);
685 
686 	vdp->xs_ionum--;
687 	if ((vdp->xs_if_status != XDB_CONNECTED) && (vdp->xs_ionum == 0)) {
688 		/* we're closing, someone is waiting for I/O clean-up */
689 		cv_signal(&vdp->xs_ionumcv);
690 	}
691 
692 	mutex_exit(&vdp->xs_iomutex);
693 
694 	return (DDI_SUCCESS);
695 }
696 
697 static int
698 xdb_bindto_frontend(xdb_t *vdp)
699 {
700 	int err;
701 	char *oename;
702 	grant_ref_t gref;
703 	evtchn_port_t evtchn;
704 	dev_info_t *dip = vdp->xs_dip;
705 	char protocol[64] = "";
706 
707 	/*
708 	 * Gather info from frontend
709 	 */
710 	oename = xvdi_get_oename(dip);
711 	if (oename == NULL)
712 		return (DDI_FAILURE);
713 
714 	err = xenbus_gather(XBT_NULL, oename,
715 	    "ring-ref", "%lu", &gref, "event-channel", "%u", &evtchn, NULL);
716 	if (err != 0) {
717 		xvdi_fatal_error(dip, err,
718 		    "Getting ring-ref and evtchn from frontend");
719 		return (DDI_FAILURE);
720 	}
721 
722 	vdp->xs_blk_protocol = BLKIF_PROTOCOL_NATIVE;
723 	vdp->xs_nentry = BLKIF_RING_SIZE;
724 	vdp->xs_entrysize = sizeof (union blkif_sring_entry);
725 
726 	err = xenbus_gather(XBT_NULL, oename,
727 	    "protocol", "%63s", protocol, NULL);
728 	if (err)
729 		(void) strcpy(protocol, "unspecified, assuming native");
730 	else {
731 		/*
732 		 * We must check for NATIVE first, so that the fast path
733 		 * is taken for copying data from the guest to the host.
734 		 */
735 		if (strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE) != 0) {
736 			if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
737 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_32;
738 				vdp->xs_nentry = BLKIF_X86_32_RING_SIZE;
739 				vdp->xs_entrysize =
740 				    sizeof (union blkif_x86_32_sring_entry);
741 			} else if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_64) ==
742 			    0) {
743 				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_64;
744 				vdp->xs_nentry = BLKIF_X86_64_RING_SIZE;
745 				vdp->xs_entrysize =
746 				    sizeof (union blkif_x86_64_sring_entry);
747 			} else {
748 				xvdi_fatal_error(dip, err, "unknown protocol");
749 				return (DDI_FAILURE);
750 			}
751 		}
752 	}
753 #ifdef DEBUG
754 	cmn_err(CE_NOTE, "!xdb@%s: blkif protocol '%s' ",
755 	    ddi_get_name_addr(dip), protocol);
756 #endif
757 
758 	/*
759 	 * map and init ring
760 	 *
761 	 * The ring parameters must match those which have been allocated
762 	 * in the front end.
763 	 */
764 	err = xvdi_map_ring(dip, vdp->xs_nentry, vdp->xs_entrysize,
765 	    gref, &vdp->xs_ring);
766 	if (err != DDI_SUCCESS)
767 		return (DDI_FAILURE);
768 	/*
769 	 * This will be removed after we use shadow I/O ring request since
770 	 * we don't need to access the ring itself directly, thus the access
771 	 * handle is not needed
772 	 */
773 	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;
774 
775 	/*
776 	 * bind event channel
777 	 */
778 	err = xvdi_bind_evtchn(dip, evtchn);
779 	if (err != DDI_SUCCESS) {
780 		xvdi_unmap_ring(vdp->xs_ring);
781 		return (DDI_FAILURE);
782 	}
783 
784 	return (DDI_SUCCESS);
785 }
786 
787 static void
788 xdb_unbindfrom_frontend(xdb_t *vdp)
789 {
790 	xvdi_free_evtchn(vdp->xs_dip);
791 	xvdi_unmap_ring(vdp->xs_ring);
792 }
793 
/* lofi control node, /devices prefix of lofi minor nodes, and open mode */
#define	LOFI_CTRL_NODE	"/dev/lofictl"
#define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
/* parenthesized so the macro is safe inside any larger expression */
#define	LOFI_MODE	(FREAD | FWRITE | FEXCL)
797 
798 static int
799 xdb_setup_node(xdb_t *vdp, char *path)
800 {
801 	dev_info_t *dip;
802 	char *xsnode, *node;
803 	ldi_handle_t ldi_hdl;
804 	struct lofi_ioctl *li;
805 	int minor;
806 	int err;
807 	unsigned int len;
808 
809 	dip = vdp->xs_dip;
810 	xsnode = xvdi_get_xsname(dip);
811 	if (xsnode == NULL)
812 		return (DDI_FAILURE);
813 
814 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
815 	    (void **)&node, &len);
816 	if (err == ENOENT)
817 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
818 		    &len);
819 	if (err != 0) {
820 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
821 		return (DDI_FAILURE);
822 	}
823 
824 	if (!XDB_IS_LOFI(vdp)) {
825 		(void) strlcpy(path, node, MAXPATHLEN + 1);
826 		kmem_free(node, len);
827 		return (DDI_SUCCESS);
828 	}
829 
830 	do {
831 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
832 		    &ldi_hdl, vdp->xs_ldi_li);
833 	} while (err == EBUSY);
834 	if (err != 0) {
835 		kmem_free(node, len);
836 		return (DDI_FAILURE);
837 	}
838 
839 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
840 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
841 	kmem_free(node, len);
842 	if (ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
843 	    LOFI_MODE | FKIOCTL, kcred, &minor) != 0) {
844 		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
845 		    ddi_get_name_addr(dip), li->li_filename);
846 		(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
847 		kmem_free(li, sizeof (*li));
848 		return (DDI_FAILURE);
849 	}
850 	/*
851 	 * return '/devices/...' instead of '/dev/lofi/...' since the
852 	 * former is available immediately after calling ldi_ioctl
853 	 */
854 	(void) snprintf(path, MAXPATHLEN + 1, LOFI_DEV_NODE "%d", minor);
855 	(void) xenbus_printf(XBT_NULL, xsnode, "node", "%s", path);
856 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
857 	kmem_free(li, sizeof (*li));
858 	return (DDI_SUCCESS);
859 }
860 
861 static void
862 xdb_teardown_node(xdb_t *vdp)
863 {
864 	dev_info_t *dip;
865 	char *xsnode, *node;
866 	ldi_handle_t ldi_hdl;
867 	struct lofi_ioctl *li;
868 	int err;
869 	unsigned int len;
870 
871 	if (!XDB_IS_LOFI(vdp))
872 		return;
873 
874 	dip = vdp->xs_dip;
875 	xsnode = xvdi_get_xsname(dip);
876 	if (xsnode == NULL)
877 		return;
878 
879 	err = xenbus_read(XBT_NULL, xsnode, "dynamic-device-path",
880 	    (void **)&node, &len);
881 	if (err == ENOENT)
882 		err = xenbus_read(XBT_NULL, xsnode, "params", (void **)&node,
883 		    &len);
884 	if (err != 0) {
885 		xvdi_fatal_error(vdp->xs_dip, err, "reading 'params'");
886 		return;
887 	}
888 
889 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
890 	(void) strlcpy(li->li_filename, node, MAXPATHLEN + 1);
891 	kmem_free(node, len);
892 
893 	do {
894 		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
895 		    &ldi_hdl, vdp->xs_ldi_li);
896 	} while (err == EBUSY);
897 
898 	if (err != 0) {
899 		kmem_free(li, sizeof (*li));
900 		return;
901 	}
902 
903 	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
904 	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
905 		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
906 		    ddi_get_name_addr(dip), li->li_filename);
907 	}
908 
909 	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
910 	kmem_free(li, sizeof (*li));
911 }
912 
913 static int
914 xdb_open_device(xdb_t *vdp)
915 {
916 	uint64_t devsize;
917 	dev_info_t *dip;
918 	char *xsnode;
919 	char *nodepath;
920 	char *mode = NULL;
921 	char *type = NULL;
922 	int err;
923 
924 	dip = vdp->xs_dip;
925 	xsnode = xvdi_get_xsname(dip);
926 	if (xsnode == NULL)
927 		return (DDI_FAILURE);
928 
929 	err = xenbus_gather(XBT_NULL, xsnode,
930 	    "mode", NULL, &mode, "type", NULL, &type, NULL);
931 	if (err != 0) {
932 		if (mode)
933 			kmem_free(mode, strlen(mode) + 1);
934 		if (type)
935 			kmem_free(type, strlen(type) + 1);
936 		xvdi_fatal_error(dip, err,
937 		    "Getting mode and type from backend device");
938 		return (DDI_FAILURE);
939 	}
940 	if (strcmp(type, "file") == 0) {
941 		vdp->xs_type |= XDB_DEV_LOFI;
942 	}
943 	kmem_free(type, strlen(type) + 1);
944 	if ((strcmp(mode, "r") == NULL) || (strcmp(mode, "ro") == NULL)) {
945 		vdp->xs_type |= XDB_DEV_RO;
946 	}
947 	kmem_free(mode, strlen(mode) + 1);
948 
949 	/*
950 	 * try to open backend device
951 	 */
952 	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
953 		return (DDI_FAILURE);
954 
955 	nodepath = kmem_zalloc(MAXPATHLEN + 1, KM_SLEEP);
956 	err = xdb_setup_node(vdp, nodepath);
957 	if (err != DDI_SUCCESS) {
958 		xvdi_fatal_error(dip, err,
959 		    "Getting device path of backend device");
960 		ldi_ident_release(vdp->xs_ldi_li);
961 		kmem_free(nodepath, MAXPATHLEN + 1);
962 		return (DDI_FAILURE);
963 	}
964 
965 	if (ldi_open_by_name(nodepath,
966 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
967 	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
968 		xdb_teardown_node(vdp);
969 		ldi_ident_release(vdp->xs_ldi_li);
970 		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
971 		    ddi_get_name_addr(dip), nodepath);
972 		kmem_free(nodepath, MAXPATHLEN + 1);
973 		return (DDI_FAILURE);
974 	}
975 
976 	/* check if it's a CD/DVD disc */
977 	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
978 	    "inquiry-device-type", DTYPE_DIRECT) == DTYPE_RODIRECT)
979 		vdp->xs_type |= XDB_DEV_CD;
980 	/* check if it's a removable disk */
981 	if (ldi_prop_exists(vdp->xs_ldi_hdl,
982 	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
983 	    "removable-media"))
984 		vdp->xs_type |= XDB_DEV_RMB;
985 
986 	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
987 		(void) ldi_close(vdp->xs_ldi_hdl,
988 		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
989 		xdb_teardown_node(vdp);
990 		ldi_ident_release(vdp->xs_ldi_li);
991 		kmem_free(nodepath, MAXPATHLEN + 1);
992 		return (DDI_FAILURE);
993 	}
994 	vdp->xs_sectors = devsize / XB_BSIZE;
995 
996 	kmem_free(nodepath, MAXPATHLEN + 1);
997 	return (DDI_SUCCESS);
998 }
999 
1000 static void
1001 xdb_close_device(xdb_t *vdp)
1002 {
1003 	(void) ldi_close(vdp->xs_ldi_hdl,
1004 	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
1005 	xdb_teardown_node(vdp);
1006 	ldi_ident_release(vdp->xs_ldi_li);
1007 	vdp->xs_ldi_li = NULL;
1008 	vdp->xs_ldi_hdl = NULL;
1009 }
1010 
1011 /*
1012  * Kick-off connect process
1013  * If xs_fe_status == XDB_FE_READY and xs_dev_status == XDB_DEV_READY
1014  * the xs_if_status will be changed to XDB_CONNECTED on success,
1015  * otherwise, xs_if_status will not be changed
1016  */
1017 static int
1018 xdb_start_connect(xdb_t *vdp)
1019 {
1020 	uint32_t dinfo;
1021 	xenbus_transaction_t xbt;
1022 	int err, svdst;
1023 	char *xsnode;
1024 	dev_info_t *dip = vdp->xs_dip;
1025 	char *barrier;
1026 	uint_t len;
1027 
1028 	/*
1029 	 * Start connect to frontend only when backend device are ready
1030 	 * and frontend has moved to XenbusStateInitialised, which means
1031 	 * ready to connect
1032 	 */
1033 	ASSERT((vdp->xs_fe_status == XDB_FE_READY) &&
1034 	    (vdp->xs_dev_status == XDB_DEV_READY));
1035 
1036 	if (((xsnode = xvdi_get_xsname(dip)) == NULL)		 ||
1037 	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1) ||
1038 	    (xdb_open_device(vdp) != DDI_SUCCESS))
1039 		return (DDI_FAILURE);
1040 
1041 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);
1042 
1043 	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS)
1044 		goto errout1;
1045 
1046 	/* init i/o requests */
1047 	xdb_init_ioreqs(vdp);
1048 
1049 	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
1050 	    != DDI_SUCCESS)
1051 		goto errout2;
1052 
1053 	/*
1054 	 * we can recieve intr any time from now on
1055 	 * mark that we're ready to take intr
1056 	 */
1057 	mutex_enter(&vdp->xs_iomutex);
1058 	/*
1059 	 * save it in case we need to restore when we
1060 	 * fail to write xenstore later
1061 	 */
1062 	svdst = vdp->xs_if_status;
1063 	vdp->xs_if_status = XDB_CONNECTED;
1064 	mutex_exit(&vdp->xs_iomutex);
1065 
1066 	/* write into xenstore the info needed by frontend */
1067 trans_retry:
1068 	if (xenbus_transaction_start(&xbt)) {
1069 		xvdi_fatal_error(dip, EIO, "transaction start");
1070 		goto errout3;
1071 	}
1072 
1073 	/*
1074 	 * If feature-barrier isn't present in xenstore, add it.
1075 	 */
1076 	if (xenbus_read(xbt, xsnode, "feature-barrier",
1077 	    (void **)&barrier, &len) != 0) {
1078 		if ((err = xenbus_printf(xbt, xsnode, "feature-barrier",
1079 		    "%d", 1)) != 0) {
1080 			cmn_err(CE_WARN, "xdb@%s: failed to write "
1081 			    "'feature-barrier'", ddi_get_name_addr(dip));
1082 			xvdi_fatal_error(dip, err, "writing 'feature-barrier'");
1083 			goto abort_trans;
1084 		}
1085 	} else
1086 		kmem_free(barrier, len);
1087 
1088 	dinfo = 0;
1089 	if (XDB_IS_RO(vdp))
1090 		dinfo |= VDISK_READONLY;
1091 	if (XDB_IS_CD(vdp))
1092 		dinfo |= VDISK_CDROM;
1093 	if (XDB_IS_RMB(vdp))
1094 		dinfo |= VDISK_REMOVABLE;
1095 	if (err = xenbus_printf(xbt, xsnode, "info", "%u", dinfo)) {
1096 		xvdi_fatal_error(dip, err, "writing 'info'");
1097 		goto abort_trans;
1098 	}
1099 
1100 	/* hard-coded 512-byte sector size */
1101 	if (err = xenbus_printf(xbt, xsnode, "sector-size", "%u", DEV_BSIZE)) {
1102 		xvdi_fatal_error(dip, err, "writing 'sector-size'");
1103 		goto abort_trans;
1104 	}
1105 
1106 	if (err = xenbus_printf(xbt, xsnode, "sectors", "%"PRIu64,
1107 	    vdp->xs_sectors)) {
1108 		xvdi_fatal_error(dip, err, "writing 'sectors'");
1109 		goto abort_trans;
1110 	}
1111 
1112 	if (err = xenbus_printf(xbt, xsnode, "instance", "%d",
1113 	    ddi_get_instance(dip))) {
1114 		xvdi_fatal_error(dip, err, "writing 'instance'");
1115 		goto abort_trans;
1116 	}
1117 
1118 	if ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0) {
1119 		xvdi_fatal_error(dip, err, "writing 'state'");
1120 		goto abort_trans;
1121 	}
1122 
1123 	if (err = xenbus_transaction_end(xbt, 0)) {
1124 		if (err == EAGAIN)
1125 			/* transaction is ended, don't need to abort it */
1126 			goto trans_retry;
1127 		xvdi_fatal_error(dip, err, "completing transaction");
1128 		goto errout3;
1129 	}
1130 
1131 	return (DDI_SUCCESS);
1132 
1133 abort_trans:
1134 	(void) xenbus_transaction_end(xbt, 1);
1135 errout3:
1136 	mutex_enter(&vdp->xs_iomutex);
1137 	vdp->xs_if_status = svdst;
1138 	mutex_exit(&vdp->xs_iomutex);
1139 	ddi_remove_intr(dip, 0, NULL);
1140 errout2:
1141 	xdb_uninit_ioreqs(vdp);
1142 	xdb_unbindfrom_frontend(vdp);
1143 errout1:
1144 	xdb_close_device(vdp);
1145 	return (DDI_FAILURE);
1146 }
1147 
1148 /*
1149  * Kick-off disconnect process
1150  * xs_if_status will not be changed
1151  */
1152 static int
1153 xdb_start_disconnect(xdb_t *vdp)
1154 {
1155 	/*
1156 	 * Kick-off disconnect process
1157 	 */
1158 	if (xvdi_switch_state(vdp->xs_dip, XBT_NULL, XenbusStateClosing) > 0)
1159 		return (DDI_FAILURE);
1160 
1161 	return (DDI_SUCCESS);
1162 }
1163 
1164 /*
1165  * Disconnect from frontend and close backend device
1166  * ifstatus will be changed to XDB_DISCONNECTED
1167  * Xenbus state will be changed to XenbusStateClosed
1168  */
1169 static void
1170 xdb_close(dev_info_t *dip)
1171 {
1172 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1173 
1174 	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
1175 
1176 	mutex_enter(&vdp->xs_iomutex);
1177 
1178 	if (vdp->xs_if_status != XDB_CONNECTED) {
1179 		vdp->xs_if_status = XDB_DISCONNECTED;
1180 		cv_broadcast(&vdp->xs_iocv);
1181 		mutex_exit(&vdp->xs_iomutex);
1182 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1183 		return;
1184 	}
1185 	vdp->xs_if_status = XDB_DISCONNECTED;
1186 	cv_broadcast(&vdp->xs_iocv);
1187 
1188 	mutex_exit(&vdp->xs_iomutex);
1189 
1190 	/* stop accepting I/O request from frontend */
1191 	ddi_remove_intr(dip, 0, NULL);
1192 	/* clear all on-going I/Os, if any */
1193 	mutex_enter(&vdp->xs_iomutex);
1194 	while (vdp->xs_ionum > 0)
1195 		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
1196 	mutex_exit(&vdp->xs_iomutex);
1197 
1198 	/* clean up resources and close this interface */
1199 	xdb_uninit_ioreqs(vdp);
1200 	xdb_unbindfrom_frontend(vdp);
1201 	xdb_close_device(vdp);
1202 	vdp->xs_peer = (domid_t)-1;
1203 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1204 }
1205 
1206 /*
1207  * Xdb_check_state_transition will check the XenbusState change to see
1208  * if the change is a valid transition or not.
1209  * The new state is written by frontend domain, or by running xenstore-write
1210  * to change it manually in dom0
1211  */
1212 static int
1213 xdb_check_state_transition(xdb_t *vdp, XenbusState oestate)
1214 {
1215 	enum xdb_state status;
1216 	int stcheck;
1217 #define	STOK	0 /* need further process */
1218 #define	STNOP	1 /* no action need taking */
1219 #define	STBUG	2 /* unexpected state change, could be a bug */
1220 
1221 	status = vdp->xs_if_status;
1222 	stcheck = STOK;
1223 
1224 	switch (status) {
1225 	case XDB_UNKNOWN:
1226 		if (vdp->xs_fe_status == XDB_FE_UNKNOWN) {
1227 			if ((oestate == XenbusStateUnknown)		||
1228 			    (oestate == XenbusStateConnected))
1229 				stcheck = STBUG;
1230 			else if ((oestate == XenbusStateInitialising)	||
1231 			    (oestate == XenbusStateInitWait))
1232 				stcheck = STNOP;
1233 		} else {
1234 			if ((oestate == XenbusStateUnknown)		||
1235 			    (oestate == XenbusStateInitialising)	||
1236 			    (oestate == XenbusStateInitWait)		||
1237 			    (oestate == XenbusStateConnected))
1238 				stcheck = STBUG;
1239 			else if (oestate == XenbusStateInitialised)
1240 				stcheck = STNOP;
1241 		}
1242 		break;
1243 	case XDB_CONNECTED:
1244 		if ((oestate == XenbusStateUnknown)		||
1245 		    (oestate == XenbusStateInitialising)	||
1246 		    (oestate == XenbusStateInitWait)		||
1247 		    (oestate == XenbusStateInitialised))
1248 			stcheck = STBUG;
1249 		else if (oestate == XenbusStateConnected)
1250 			stcheck = STNOP;
1251 		break;
1252 	case XDB_DISCONNECTED:
1253 	default:
1254 			stcheck = STBUG;
1255 	}
1256 
1257 	if (stcheck == STOK)
1258 		return (DDI_SUCCESS);
1259 
1260 	if (stcheck == STBUG)
1261 		cmn_err(CE_NOTE, "xdb@%s: unexpected otherend "
1262 		    "state change to %d!, when status is %d",
1263 		    ddi_get_name_addr(vdp->xs_dip), oestate, status);
1264 
1265 	return (DDI_FAILURE);
1266 }
1267 
1268 static void
1269 xdb_send_buf(void *arg)
1270 {
1271 	buf_t *bp;
1272 	xdb_t *vdp = (xdb_t *)arg;
1273 
1274 	mutex_enter(&vdp->xs_iomutex);
1275 
1276 	while (vdp->xs_if_status != XDB_DISCONNECTED) {
1277 		while ((bp = vdp->xs_f_iobuf) != NULL) {
1278 			vdp->xs_f_iobuf = bp->av_forw;
1279 			bp->av_forw = NULL;
1280 			vdp->xs_ionum++;
1281 			mutex_exit(&vdp->xs_iomutex);
1282 			if (bp->b_bcount != 0) {
1283 				int err = ldi_strategy(vdp->xs_ldi_hdl, bp);
1284 				if (err != 0) {
1285 					bp->b_flags |= B_ERROR;
1286 					(void) xdb_biodone(bp);
1287 					XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
1288 					    "xdb@%s: sent buf to backend dev"
1289 					    "failed, err=%d",
1290 					    ddi_get_name_addr(vdp->xs_dip),
1291 					    err));
1292 				} else {
1293 					XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
1294 					    "sent buf to backend ok"));
1295 				}
1296 			} else /* no I/O need to be done */
1297 				(void) xdb_biodone(bp);
1298 
1299 			mutex_enter(&vdp->xs_iomutex);
1300 		}
1301 
1302 		if (vdp->xs_if_status != XDB_DISCONNECTED)
1303 			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
1304 	}
1305 
1306 	mutex_exit(&vdp->xs_iomutex);
1307 }
1308 
1309 /*ARGSUSED*/
1310 static void
1311 xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1312     void *impl_data)
1313 {
1314 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1315 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1316 
1317 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1318 	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));
1319 
1320 	mutex_enter(&vdp->xs_cbmutex);
1321 	if (state == Connected) {
1322 		/* Hotplug script has completed successfully */
1323 		if (vdp->xs_dev_status == XDB_DEV_UNKNOWN) {
1324 			vdp->xs_dev_status = XDB_DEV_READY;
1325 			if (vdp->xs_fe_status == XDB_FE_READY)
1326 				/* try to connect to frontend */
1327 				if (xdb_start_connect(vdp) != DDI_SUCCESS)
1328 					(void) xdb_start_disconnect(vdp);
1329 		}
1330 	}
1331 	mutex_exit(&vdp->xs_cbmutex);
1332 }
1333 
1334 /*ARGSUSED*/
1335 static void
1336 xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
1337     void *impl_data)
1338 {
1339 	XenbusState new_state = *(XenbusState *)impl_data;
1340 	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);
1341 
1342 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
1343 	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));
1344 
1345 	mutex_enter(&vdp->xs_cbmutex);
1346 
1347 	if (xdb_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1348 		mutex_exit(&vdp->xs_cbmutex);
1349 		return;
1350 	}
1351 
1352 	switch (new_state) {
1353 	case XenbusStateInitialised:
1354 		ASSERT(vdp->xs_if_status == XDB_UNKNOWN);
1355 
1356 		/* frontend is ready for connecting */
1357 		vdp->xs_fe_status = XDB_FE_READY;
1358 
1359 		if (vdp->xs_dev_status == XDB_DEV_READY)
1360 			if (xdb_start_connect(vdp) != DDI_SUCCESS)
1361 				(void) xdb_start_disconnect(vdp);
1362 		break;
1363 	case XenbusStateClosing:
1364 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1365 		break;
1366 	case XenbusStateClosed:
1367 		/* clean up */
1368 		xdb_close(dip);
1369 
1370 	}
1371 
1372 	mutex_exit(&vdp->xs_cbmutex);
1373 }
1374 
1375 static int
1376 xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1377 {
1378 	xdb_t *vdp;
1379 	ddi_iblock_cookie_t ibc;
1380 	int instance;
1381 
1382 	switch (cmd) {
1383 	case DDI_RESUME:
1384 		return (DDI_FAILURE);
1385 	case DDI_ATTACH:
1386 		break;
1387 	default:
1388 		return (DDI_FAILURE);
1389 	}
1390 
1391 	/* DDI_ATTACH */
1392 	instance = ddi_get_instance(dip);
1393 	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
1394 		return (DDI_FAILURE);
1395 
1396 	vdp = ddi_get_soft_state(xdb_statep, instance);
1397 	vdp->xs_dip = dip;
1398 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
1399 		goto errout1;
1400 
1401 	if (!xdb_kstat_init(vdp))
1402 		goto errout1;
1403 
1404 	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
1405 	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
1406 	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
1407 	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);
1408 
1409 	ddi_set_driver_private(dip, vdp);
1410 
1411 	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
1412 	    TASKQ_DEFAULTPRI, 0);
1413 	if (vdp->xs_iotaskq == NULL)
1414 		goto errout2;
1415 	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
1416 	    DDI_SLEEP);
1417 
1418 	/* Watch frontend and hotplug state change */
1419 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change,
1420 	    NULL) != DDI_SUCCESS)
1421 		goto errout3;
1422 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change,
1423 	    NULL) != DDI_SUCCESS) {
1424 		goto errout4;
1425 	}
1426 
1427 	/*
1428 	 * Kick-off hotplug script
1429 	 */
1430 	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
1431 		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
1432 		    ddi_get_name_addr(dip));
1433 		goto errout4;
1434 	}
1435 
1436 	/*
1437 	 * start waiting for hotplug event and otherend state event
1438 	 * mainly for debugging, frontend will not take any op seeing this
1439 	 */
1440 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
1441 
1442 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
1443 	    ddi_get_name_addr(dip)));
1444 	return (DDI_SUCCESS);
1445 
1446 errout4:
1447 	xvdi_remove_event_handler(dip, NULL);
1448 errout3:
1449 	mutex_enter(&vdp->xs_cbmutex);
1450 	mutex_enter(&vdp->xs_iomutex);
1451 	vdp->xs_if_status = XDB_DISCONNECTED;
1452 	cv_broadcast(&vdp->xs_iocv);
1453 	mutex_exit(&vdp->xs_iomutex);
1454 	mutex_exit(&vdp->xs_cbmutex);
1455 	ddi_taskq_destroy(vdp->xs_iotaskq);
1456 errout2:
1457 	ddi_set_driver_private(dip, NULL);
1458 	cv_destroy(&vdp->xs_iocv);
1459 	cv_destroy(&vdp->xs_ionumcv);
1460 	mutex_destroy(&vdp->xs_cbmutex);
1461 	mutex_destroy(&vdp->xs_iomutex);
1462 	kstat_delete(vdp->xs_kstats);
1463 errout1:
1464 	ddi_soft_state_free(xdb_statep, instance);
1465 	return (DDI_FAILURE);
1466 }
1467 
1468 /*ARGSUSED*/
1469 static int
1470 xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1471 {
1472 	int instance = ddi_get_instance(dip);
1473 	xdb_t *vdp = XDB_INST2SOFTS(instance);
1474 
1475 	switch (cmd) {
1476 	case DDI_SUSPEND:
1477 		return (DDI_FAILURE);
1478 	case DDI_DETACH:
1479 		break;
1480 	default:
1481 		return (DDI_FAILURE);
1482 	}
1483 
1484 	/* DDI_DETACH handling */
1485 
1486 	/* shouldn't detach, if still used by frontend */
1487 	mutex_enter(&vdp->xs_iomutex);
1488 	if (vdp->xs_if_status != XDB_DISCONNECTED) {
1489 		mutex_exit(&vdp->xs_iomutex);
1490 		return (DDI_FAILURE);
1491 	}
1492 	mutex_exit(&vdp->xs_iomutex);
1493 
1494 	xvdi_remove_event_handler(dip, NULL);
1495 	/* can do nothing about it, if it fails */
1496 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1497 
1498 	ddi_taskq_destroy(vdp->xs_iotaskq);
1499 	cv_destroy(&vdp->xs_iocv);
1500 	cv_destroy(&vdp->xs_ionumcv);
1501 	mutex_destroy(&vdp->xs_cbmutex);
1502 	mutex_destroy(&vdp->xs_iomutex);
1503 	kstat_delete(vdp->xs_kstats);
1504 	ddi_set_driver_private(dip, NULL);
1505 	ddi_soft_state_free(xdb_statep, instance);
1506 
1507 	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
1508 	    ddi_get_name_addr(dip)));
1509 	return (DDI_SUCCESS);
1510 }
1511 
1512 static struct dev_ops xdb_dev_ops = {
1513 	DEVO_REV,	/* devo_rev */
1514 	0,		/* devo_refcnt */
1515 	ddi_getinfo_1to1, /* devo_getinfo */
1516 	nulldev,	/* devo_identify */
1517 	nulldev,	/* devo_probe */
1518 	xdb_attach,	/* devo_attach */
1519 	xdb_detach,	/* devo_detach */
1520 	nodev,		/* devo_reset */
1521 	NULL,		/* devo_cb_ops */
1522 	NULL,		/* devo_bus_ops */
1523 	NULL,		/* power */
1524 	ddi_quiesce_not_needed,	/* quiesce */
1525 };
1526 
1527 /*
1528  * Module linkage information for the kernel.
1529  */
1530 static struct modldrv modldrv = {
1531 	&mod_driverops,			/* Type of module. */
1532 	"vbd backend driver",	/* Name of the module */
1533 	&xdb_dev_ops			/* driver ops */
1534 };
1535 
1536 static struct modlinkage xdb_modlinkage = {
1537 	MODREV_1,
1538 	&modldrv,
1539 	NULL
1540 };
1541 
1542 int
1543 _init(void)
1544 {
1545 	int rv;
1546 
1547 	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
1548 	    sizeof (xdb_t), 0)) == 0)
1549 		if ((rv = mod_install(&xdb_modlinkage)) != 0)
1550 			ddi_soft_state_fini((void **)&xdb_statep);
1551 	return (rv);
1552 }
1553 
1554 int
1555 _fini(void)
1556 {
1557 	int rv;
1558 
1559 	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
1560 		return (rv);
1561 	ddi_soft_state_fini((void **)&xdb_statep);
1562 	return (rv);
1563 }
1564 
1565 int
1566 _info(struct modinfo *modinfop)
1567 {
1568 	return (mod_info(&xdb_modlinkage, modinfop));
1569 }
1570 
1571 static int
1572 xdb_get_request(xdb_t *vdp, blkif_request_t *req)
1573 {
1574 	void *src = xvdi_ring_get_request(vdp->xs_ring);
1575 
1576 	if (src == NULL)
1577 		return (0);
1578 
1579 	switch (vdp->xs_blk_protocol) {
1580 	case BLKIF_PROTOCOL_NATIVE:
1581 		(void) memcpy(req, src, sizeof (*req));
1582 		break;
1583 	case BLKIF_PROTOCOL_X86_32:
1584 		blkif_get_x86_32_req(req, src);
1585 		break;
1586 	case BLKIF_PROTOCOL_X86_64:
1587 		blkif_get_x86_64_req(req, src);
1588 		break;
1589 	default:
1590 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1591 		    ddi_get_name_addr(vdp->xs_dip),
1592 		    vdp->xs_blk_protocol);
1593 	}
1594 	return (1);
1595 }
1596 
1597 static int
1598 xdb_push_response(xdb_t *vdp, uint64_t id, uint8_t op, uint16_t status)
1599 {
1600 	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
1601 	blkif_response_t *rsp = xvdi_ring_get_response(vdp->xs_ring);
1602 	blkif_x86_32_response_t *rsp_32 = (blkif_x86_32_response_t *)rsp;
1603 	blkif_x86_64_response_t *rsp_64 = (blkif_x86_64_response_t *)rsp;
1604 
1605 	ASSERT(rsp);
1606 
1607 	switch (vdp->xs_blk_protocol) {
1608 	case BLKIF_PROTOCOL_NATIVE:
1609 		ddi_put64(acchdl, &rsp->id, id);
1610 		ddi_put8(acchdl, &rsp->operation, op);
1611 		ddi_put16(acchdl, (uint16_t *)&rsp->status,
1612 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1613 		break;
1614 	case BLKIF_PROTOCOL_X86_32:
1615 		ddi_put64(acchdl, &rsp_32->id, id);
1616 		ddi_put8(acchdl, &rsp_32->operation, op);
1617 		ddi_put16(acchdl, (uint16_t *)&rsp_32->status,
1618 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1619 		break;
1620 	case BLKIF_PROTOCOL_X86_64:
1621 		ddi_put64(acchdl, &rsp_64->id, id);
1622 		ddi_put8(acchdl, &rsp_64->operation, op);
1623 		ddi_put16(acchdl, (uint16_t *)&rsp_64->status,
1624 		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
1625 		break;
1626 	default:
1627 		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
1628 		    ddi_get_name_addr(vdp->xs_dip),
1629 		    vdp->xs_blk_protocol);
1630 	}
1631 
1632 	return (xvdi_ring_push_response(vdp->xs_ring));
1633 }
1634 
1635 static void
1636 blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
1637 {
1638 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1639 	dst->operation = src->operation;
1640 	dst->nr_segments = src->nr_segments;
1641 	dst->handle = src->handle;
1642 	dst->id = src->id;
1643 	dst->sector_number = src->sector_number;
1644 	if (n > src->nr_segments)
1645 		n = src->nr_segments;
1646 	for (i = 0; i < n; i++)
1647 		dst->seg[i] = src->seg[i];
1648 }
1649 
1650 static void
1651 blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
1652 {
1653 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1654 	dst->operation = src->operation;
1655 	dst->nr_segments = src->nr_segments;
1656 	dst->handle = src->handle;
1657 	dst->id = src->id;
1658 	dst->sector_number = src->sector_number;
1659 	if (n > src->nr_segments)
1660 		n = src->nr_segments;
1661 	for (i = 0; i < n; i++)
1662 		dst->seg[i] = src->seg[i];
1663 }
1664