xref: /freebsd/sys/dev/xen/blkfront/blkfront.c (revision 830940567b49bb0c08dfaed40418999e76616909)
/*-
 * All rights reserved.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * XenoBSD block device driver
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/module.h>

#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/intr_machdep.h>
#include <machine/vmparam.h>

#include <machine/xen/xen-os.h>
#include <machine/xen/xenfunc.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
#include <xen/gnttab.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus/xenbusvar.h>

#include <geom/geom_disk.h>

#include <dev/xen/blkfront/block.h>

#include "xenbus_if.h"

#define    ASSERT(S)       KASSERT(S, (#S))
/* prototypes */
struct xb_softc;
static void xb_startio(struct xb_softc *sc);
static void connect(device_t, struct blkfront_info *);
static void blkfront_closing(device_t);
static int blkfront_detach(device_t);
static int talk_to_backend(device_t, struct blkfront_info *);
static int setup_blkring(device_t, struct blkfront_info *);
static void blkif_int(void *);
#if 0
static void blkif_restart_queue(void *arg);
#endif
static void blkif_recover(struct blkfront_info *);
static void blkif_completion(struct blk_shadow *);
static void blkif_free(struct blkfront_info *, int);

#define GRANT_INVALID_REF 0
#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)

LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head;

/* Control whether runtime update of vbds is enabled. */
#define ENABLE_VBD_UPDATE 0

#if ENABLE_VBD_UPDATE
static void vbd_update(void);
#endif


#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2

#ifdef notyet
static char *blkif_state_name[] = {
	[BLKIF_STATE_DISCONNECTED] = "disconnected",
	[BLKIF_STATE_CONNECTED]    = "connected",
	[BLKIF_STATE_SUSPENDED]    = "closed",
};

static char * blkif_status_name[] = {
	[BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
	[BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
	[BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
	[BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
};
#endif
#define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args)
#if 0
#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif

static grant_ref_t gref_head;
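/*
 * Upper bound on requests in flight: every ring slot can carry up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST segment grants.  With the usual 4 KB
 * page, __RING_SIZE() works out to 32 slots for the blkif ring and the
 * classic blkif ABI allows 11 segments per request, but only the
 * product below is relied upon here.
 */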
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)

static void kick_pending_request_queues(struct blkfront_info *);
static int blkif_open(struct disk *dp);
static int blkif_close(struct disk *dp);
static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
static int blkif_queue_request(struct bio *bp);
static void xb_strategy(struct bio *bp);



/* XXX move to xb_vbd.c when VBD update support is added */
#define MAX_VBDS 64

#define XBD_SECTOR_SIZE		512	/* XXX: assume for now */
#define XBD_SECTOR_SHFT		9

static struct mtx blkif_io_lock;

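/*
 * Translate a guest pseudo-physical frame number into the machine frame
 * number that the hypervisor and the backend domain actually understand;
 * grant table entries and ring addresses must be expressed in machine
 * frames.
 */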
static vm_paddr_t
pfn_to_mfn(vm_paddr_t pfn)
{
	return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT);
}

/*
 * Translate Linux major/minor to an appropriate name and unit
 * number. For HVM guests, this allows us to use the same drive names
 * with blkfront as the emulated drives, easing transition slightly.
 */
static void
blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
{
	static struct vdev_info {
		int major;
		int shift;
		int base;
		const char *name;
	} info[] = {
		{3,	6,	0,	"ad"},	/* ide0 */
		{22,	6,	2,	"ad"},	/* ide1 */
		{33,	6,	4,	"ad"},	/* ide2 */
		{34,	6,	6,	"ad"},	/* ide3 */
		{56,	6,	8,	"ad"},	/* ide4 */
		{57,	6,	10,	"ad"},	/* ide5 */
		{88,	6,	12,	"ad"},	/* ide6 */
		{89,	6,	14,	"ad"},	/* ide7 */
		{90,	6,	16,	"ad"},	/* ide8 */
		{91,	6,	18,	"ad"},	/* ide9 */

		{8,	4,	0,	"da"},	/* scsi disk0 */
		{65,	4,	16,	"da"},	/* scsi disk1 */
		{66,	4,	32,	"da"},	/* scsi disk2 */
		{67,	4,	48,	"da"},	/* scsi disk3 */
		{68,	4,	64,	"da"},	/* scsi disk4 */
		{69,	4,	80,	"da"},	/* scsi disk5 */
		{70,	4,	96,	"da"},	/* scsi disk6 */
		{71,	4,	112,	"da"},	/* scsi disk7 */
		{128,	4,	128,	"da"},	/* scsi disk8 */
		{129,	4,	144,	"da"},	/* scsi disk9 */
		{130,	4,	160,	"da"},	/* scsi disk10 */
		{131,	4,	176,	"da"},	/* scsi disk11 */
		{132,	4,	192,	"da"},	/* scsi disk12 */
		{133,	4,	208,	"da"},	/* scsi disk13 */
		{134,	4,	224,	"da"},	/* scsi disk14 */
		{135,	4,	240,	"da"},	/* scsi disk15 */

		{202,	4,	0,	"xbd"},	/* xbd */

		{0,	0,	0,	NULL},
	};
	int major = vdevice >> 8;
	int minor = vdevice & 0xff;
	int i;

	if (vdevice & (1 << 28)) {
		/* Extended device number: no major/minor table lookup. */
		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
		*name = "xbd";
		return;
	}

	for (i = 0; info[i].major; i++) {
		if (info[i].major == major) {
			*unit = info[i].base + (minor >> info[i].shift);
			*name = info[i].name;
			return;
		}
	}

	*unit = minor >> 4;
	*name = "xbd";
}
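/*
 * Worked example (illustrative): vdevice 0x801 decodes to major 8,
 * minor 1, which matches the "scsi disk0" row above, giving unit
 * 0 + (1 >> 4) = 0, i.e. da0.  A vdevice with bit 28 set, say
 * (1 << 28) | (5 << 8), bypasses the table entirely and yields xbd5.
 */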

int
xlvbd_add(device_t dev, blkif_sector_t capacity,
    int vdevice, uint16_t vdisk_info, uint16_t sector_size,
    struct blkfront_info *info)
{
	struct xb_softc	*sc;
	int	unit, error = 0;
	const char *name;

	blkfront_vdevice_to_unit(vdevice, &unit, &name);

	sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	sc->xb_unit = unit;
	sc->xb_info = info;
	info->sc = sc;

	if (strcmp(name, "xbd"))
		device_printf(dev, "attaching as %s%d\n", name, unit);

	sc->xb_disk = disk_alloc();
	sc->xb_disk->d_unit = sc->xb_unit;
	sc->xb_disk->d_open = blkif_open;
	sc->xb_disk->d_close = blkif_close;
	sc->xb_disk->d_ioctl = blkif_ioctl;
	sc->xb_disk->d_strategy = xb_strategy;
	sc->xb_disk->d_name = name;
	sc->xb_disk->d_drv1 = sc;
	sc->xb_disk->d_sectorsize = sector_size;

	/* XXX */
	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
#if 0
	sc->xb_disk->d_maxsize = DFLTPHYS;
#else /* XXX: xen can't handle large single i/o requests */
	sc->xb_disk->d_maxsize = 4096;
#endif
#ifdef notyet
	XENPRINTF("attaching device 0x%x unit %d capacity %llu\n",
		  xb_diskinfo[sc->xb_unit].device, sc->xb_unit,
		  sc->xb_disk->d_mediasize);
#endif
	sc->xb_disk->d_flags = 0;
	disk_create(sc->xb_disk, DISK_VERSION_00);
	bioq_init(&sc->xb_bioq);

	return error;
}

void
xlvbd_del(struct blkfront_info *info)
{
	struct xb_softc	*sc;

	sc = info->sc;
	disk_destroy(sc->xb_disk);
}
/************************ end VBD support *****************/

/*
 * Read/write routine for a buffer.  Finds the proper unit, places it on
 * the sort queue and kicks the controller.
 */
static void
xb_strategy(struct bio *bp)
{
	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;

	/* bogus disk? */
	if (sc == NULL) {
		bp->bio_error = EINVAL;
		bp->bio_flags |= BIO_ERROR;
		goto bad;
	}

	DPRINTK("");

	/*
	 * Place it in the queue of disk activities for this disk
	 */
	mtx_lock(&blkif_io_lock);
	bioq_disksort(&sc->xb_bioq, bp);

	xb_startio(sc);
	mtx_unlock(&blkif_io_lock);
	return;

 bad:
	/*
	 * Correctly set the bio to indicate a failed transfer.
	 */
	bp->bio_resid = bp->bio_bcount;
	biodone(bp);
	return;
}

static int
blkfront_probe(device_t dev)
{

	if (!strcmp(xenbus_get_type(dev), "vbd")) {
		device_set_desc(dev, "Virtual Block Device");
		device_quiet(dev);
		return (0);
	}

	return (ENXIO);
}

/*
 * Setup supplies the backend directory and the virtual device number.
 * We place an event channel and a shared-ring grant entry in the
 * store, then watch the backend to learn when it is ready.
 */
static int
blkfront_attach(device_t dev)
{
	int error, vdevice, i, unit;
	struct blkfront_info *info;
	const char *name;

	/* FIXME: Use dynamic device id if this is not set. */
	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
	    "virtual-device", NULL, "%i", &vdevice);
	if (error) {
		xenbus_dev_fatal(dev, error, "reading virtual-device");
		printf("couldn't find virtual device\n");
		return (error);
	}

	blkfront_vdevice_to_unit(vdevice, &unit, &name);
	if (!strcmp(name, "xbd"))
		device_set_unit(dev, unit);

	info = device_get_softc(dev);

	/*
	 * XXX debug only
	 */
	for (i = 0; i < sizeof(*info); i++)
		if (((uint8_t *)info)[i] != 0)
			panic("non-null memory");

	info->shadow_free = 0;
	info->xbdev = dev;
	info->vdevice = vdevice;
	info->connected = BLKIF_STATE_DISCONNECTED;

	/* work queue needed ? */
	for (i = 0; i < BLK_RING_SIZE; i++)
		info->shadow[i].req.id = i+1;
	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

	/* Front end dir is a number, which is used as the id. */
	info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);

	error = talk_to_backend(dev, info);
	if (error)
		return (error);

	return (0);
}

static int
blkfront_suspend(device_t dev)
{
	struct blkfront_info *info = device_get_softc(dev);

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&blkif_io_lock);
	info->connected = BLKIF_STATE_SUSPENDED;
	mtx_unlock(&blkif_io_lock);

	return (0);
}

static int
blkfront_resume(device_t dev)
{
	struct blkfront_info *info = device_get_softc(dev);
	int err;

	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));

	blkif_free(info, 1);
	err = talk_to_backend(dev, info);
	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
		blkif_recover(info);

	return (err);
}

/* Common code used when first setting up, and when resuming. */
static int
talk_to_backend(device_t dev, struct blkfront_info *info)
{
	const char *message = NULL;
	struct xenbus_transaction xbt;
	int err;

	/* Create shared ring, alloc event channel. */
	err = setup_blkring(dev, info);
	if (err)
		goto out;

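	/*
	 * Xenstore transactions are optimistic: if another update raced
	 * with ours, xenbus_transaction_end() fails with EAGAIN and the
	 * whole read-modify-write sequence is simply retried from here.
	 */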
 again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		xenbus_dev_fatal(dev, err, "starting transaction");
		goto destroy_blkring;
	}

	err = xenbus_printf(xbt, xenbus_get_node(dev),
			    "ring-ref","%u", info->ring_ref);
	if (err) {
		message = "writing ring-ref";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, xenbus_get_node(dev),
		"event-channel", "%u", irq_to_evtchn_port(info->irq));
	if (err) {
		message = "writing event-channel";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, xenbus_get_node(dev),
		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
	if (err) {
		message = "writing protocol";
		goto abort_transaction;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err) {
		if (err == EAGAIN)
			goto again;
		xenbus_dev_fatal(dev, err, "completing transaction");
		goto destroy_blkring;
	}
	xenbus_set_state(dev, XenbusStateInitialised);

	return 0;

 abort_transaction:
	xenbus_transaction_end(xbt, 1);
	if (message)
		xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
	blkif_free(info, 0);
 out:
	return err;
}

static int
setup_blkring(device_t dev, struct blkfront_info *info)
{
	blkif_sring_t *sring;
	int error;

	info->ring_ref = GRANT_INVALID_REF;

	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sring == NULL) {
		xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring");
		return ENOMEM;
	}
	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

	error = xenbus_grant_ring(dev,
	    (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
	if (error) {
		free(sring, M_DEVBUF);
		info->ring.sring = NULL;
		goto fail;
	}

	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
	    "xbd", (driver_intr_t *)blkif_int, info,
	    INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
	if (error) {
		xenbus_dev_fatal(dev, error,
		    "bind_listening_port_to_irqhandler failed");
		goto fail;
	}

	return (0);
 fail:
	blkif_free(info, 0);
	return (error);
}


/**
 * Callback received when the backend's state changes.
 */
static int
blkfront_backend_changed(device_t dev, XenbusState backend_state)
{
	struct blkfront_info *info = device_get_softc(dev);

	DPRINTK("backend_state=%d\n", backend_state);

	switch (backend_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
	case XenbusStateClosed:
	case XenbusStateReconfigured:
	case XenbusStateReconfiguring:
		break;

	case XenbusStateConnected:
		connect(dev, info);
		break;

	case XenbusStateClosing:
		if (info->users > 0)
			xenbus_dev_error(dev, -EBUSY,
					 "Device in use; refusing to close");
		else
			blkfront_closing(dev);
#ifdef notyet
		bd = bdget(info->dev);
		if (bd == NULL)
			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");

		down(&bd->bd_sem);
		if (info->users > 0)
			xenbus_dev_error(dev, -EBUSY,
					 "Device in use; refusing to close");
		else
			blkfront_closing(dev);
		up(&bd->bd_sem);
		bdput(bd);
#endif
	}

	return (0);
}

/*
** Invoked when the backend is finally 'ready' (and has produced
** the details about the physical device - #sectors, size, etc).
*/
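/*
 * Typical backend nodes read below (illustrative layout; exact paths
 * vary by toolstack):
 *
 *	backend/vbd/<domid>/<handle>/sectors         = "20971520"
 *	backend/vbd/<domid>/<handle>/info            = "0"
 *	backend/vbd/<domid>/<handle>/sector-size     = "512"
 *	backend/vbd/<domid>/<handle>/feature-barrier = "1"
 */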
static void
connect(device_t dev, struct blkfront_info *info)
{
	unsigned long sectors, sector_size;
	unsigned int binfo;
	int err;

	if ((info->connected == BLKIF_STATE_CONNECTED) ||
	    (info->connected == BLKIF_STATE_SUSPENDED))
		return;

	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));

	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
			    "sectors", "%lu", &sectors,
			    "info", "%u", &binfo,
			    "sector-size", "%lu", &sector_size,
			    NULL);
	if (err) {
		xenbus_dev_fatal(dev, err,
		    "reading backend fields at %s",
		    xenbus_get_otherend_path(dev));
		return;
	}
	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
			    "feature-barrier", "%lu", &info->feature_barrier,
			    NULL);
	if (err)
		info->feature_barrier = 0;

	device_printf(dev, "%juMB <%s> at %s",
	    (uintmax_t) sectors / (1048576 / sector_size),
	    device_get_desc(dev),
	    xenbus_get_node(dev));
	bus_print_child_footer(device_get_parent(dev), dev);

	xlvbd_add(dev, sectors, info->vdevice, binfo, sector_size, info);

	(void)xenbus_set_state(dev, XenbusStateConnected);

	/* Kick pending requests. */
	mtx_lock(&blkif_io_lock);
	info->connected = BLKIF_STATE_CONNECTED;
	kick_pending_request_queues(info);
	mtx_unlock(&blkif_io_lock);
	info->is_ready = 1;

#if 0
	add_disk(info->gd);
#endif
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void
blkfront_closing(device_t dev)
{
	struct blkfront_info *info = device_get_softc(dev);

	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));

	if (info->mi) {
		DPRINTK("Calling xlvbd_del\n");
		xlvbd_del(info);
		info->mi = NULL;
	}

	xenbus_set_state(dev, XenbusStateClosed);
}


static int
blkfront_detach(device_t dev)
{
	struct blkfront_info *info = device_get_softc(dev);

	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));

	blkif_free(info, 0);

	return 0;
}


static inline int
GET_ID_FROM_FREELIST(struct blkfront_info *info)
{
	unsigned long nfree = info->shadow_free;

	KASSERT(nfree <= BLK_RING_SIZE, ("free %lu > RING_SIZE", nfree));
	info->shadow_free = info->shadow[nfree].req.id;
	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
	return nfree;
}

static inline void
ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
{
	info->shadow[id].req.id  = info->shadow_free;
	info->shadow[id].request = 0;
	info->shadow_free = id;
}
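
/*
 * The shadow free list is threaded through the otherwise unused req.id
 * fields: shadow_free names the first free slot, each free slot's
 * req.id names the next one, and 0x0fffffff terminates the chain (see
 * blkfront_attach()).  0x0fffffee marks a slot that is in use, so a
 * stale id is easy to spot in a crash dump.
 */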

static inline void
flush_requests(struct blkfront_info *info)
{
	int notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

	if (notify)
		notify_remote_via_irq(info->irq);
}
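
/*
 * RING_PUSH_REQUESTS_AND_CHECK_NOTIFY publishes req_prod_pvt to the
 * shared ring (with the write barrier the ring protocol requires) and
 * sets 'notify' only when the backend has asked to be woken via its
 * req_event index, so redundant event-channel kicks are avoided.
 */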

static void
kick_pending_request_queues(struct blkfront_info *info)
{
	/* XXX check if we can't simplify */
#if 0
	if (!RING_FULL(&info->ring)) {
		/* Re-enable calldowns. */
		blk_start_queue(info->rq);
		/* Kick things off immediately. */
		do_blkif_request(info->rq);
	}
#endif
	if (!RING_FULL(&info->ring)) {
#if 0
		sc = LIST_FIRST(&xbsl_head);
		LIST_REMOVE(sc, entry);
		/* Re-enable calldowns. */
		blk_start_queue(di->rq);
#endif
		/* Kick things off immediately. */
		xb_startio(info->sc);
	}
}

#if 0
/* XXX */
static void blkif_restart_queue(void *arg)
{
	struct blkfront_info *info = (struct blkfront_info *)arg;

	mtx_lock(&blkif_io_lock);
	kick_pending_request_queues(info);
	mtx_unlock(&blkif_io_lock);
}
#endif

static void blkif_restart_queue_callback(void *arg)
{
#if 0
	struct blkfront_info *info = (struct blkfront_info *)arg;
	/* XXX BSD equiv ? */

	schedule_work(&info->work);
#endif
}

static int
blkif_open(struct disk *dp)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL) {
		printf("xbd%d: not found\n", dp->d_unit);
		return (ENXIO);
	}

	sc->xb_flags |= XB_OPEN;
	sc->xb_info->users++;
	return (0);
}

static int
blkif_close(struct disk *dp)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL)
		return (ENXIO);
	sc->xb_flags &= ~XB_OPEN;
	if (--(sc->xb_info->users) == 0) {
		/* Check whether we have been instructed to close.  We will
		   have ignored this request initially, as the device was
		   still mounted. */
		device_t dev = sc->xb_info->xbdev;
		XenbusState state =
			xenbus_read_driver_state(xenbus_get_otherend_path(dev));

		if (state == XenbusStateClosing)
			blkfront_closing(dev);
	}
	return (0);
}

static int
blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL)
		return (ENXIO);

	return (ENOTTY);
}


/*
 * blkif_queue_request
 *
 * request block io
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(struct bio *bp)
{
	caddr_t alignbuf;
	vm_paddr_t buffer_ma;
	blkif_request_t *ring_req;
	unsigned long id;
	uint64_t fsect, lsect;
	struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
	struct blkfront_info *info = sc->xb_info;
	int ref;

	if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED))
		return 1;

	if (gnttab_alloc_grant_references(
		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
		gnttab_request_free_callback(
			&info->callback,
			blkif_restart_queue_callback,
			info,
			BLKIF_MAX_SEGMENTS_PER_REQUEST);
		return 1;
	}

	/* Check if the buffer is properly aligned */
	if ((vm_offset_t)bp->bio_data & PAGE_MASK) {
		int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE :
			PAGE_SIZE;
		caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF,
					M_NOWAIT);

		/*
		 * Fail the request if no bounce buffer is available; the
		 * caller will requeue the bio and retry later.
		 */
		if (newbuf == NULL) {
			gnttab_free_grant_references(gref_head);
			return 1;
		}
		alignbuf = (char *)roundup2((u_long)newbuf, align);

		/* save a copy of the current buffer */
		bp->bio_driver1 = newbuf;
		bp->bio_driver2 = alignbuf;

		/* Copy the data for a write */
		if (bp->bio_cmd == BIO_WRITE)
			bcopy(bp->bio_data, alignbuf, bp->bio_bcount);
	} else
		alignbuf = bp->bio_data;

	/* Fill out a communications ring structure. */
	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
	id = GET_ID_FROM_FREELIST(info);
	info->shadow[id].request = (unsigned long)bp;

	ring_req->id = id;
	ring_req->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
		BLKIF_OP_WRITE;

	ring_req->sector_number = (blkif_sector_t)bp->bio_pblkno;
	ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;

	ring_req->nr_segments = 0;	/* XXX not doing scatter/gather since buffer
					 * chaining is not supported.
					 */

	buffer_ma = vtomach(alignbuf);
	fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
	lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1;
	/* install a grant reference. */
	ref = gnttab_claim_grant_reference(&gref_head);
	KASSERT(ref != -ENOSPC, ("grant_reference failed"));

	gnttab_grant_foreign_access_ref(
		ref,
		xenbus_get_otherend_id(info->xbdev),
		buffer_ma >> PAGE_SHIFT,
		ring_req->operation & 1); /* read-only for a write request:
					   * the backend only reads the buffer */
	info->shadow[id].frame[ring_req->nr_segments] =
		buffer_ma >> PAGE_SHIFT;

	ring_req->seg[ring_req->nr_segments] =
		(struct blkif_request_segment) {
			.gref       = ref,
			.first_sect = fsect,
			.last_sect  = lsect };

	ring_req->nr_segments++;
	KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0,
		("XEN buffer must be sector aligned"));
	KASSERT(lsect <= 7,
		("XEN disk driver data cannot cross a page boundary"));

	buffer_ma &= ~PAGE_MASK;

	info->ring.req_prod_pvt++;

	/* Keep a private copy so we can reissue requests when recovering. */
	info->shadow[id].req = *ring_req;

	gnttab_free_grant_references(gref_head);

	return 0;
}
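
/*
 * Note on the bounce buffer above: bio_driver1 holds the raw allocation
 * and bio_driver2 the aligned pointer within it; blkif_int() copies read
 * data back and frees bio_driver1 once the response arrives.  Only one
 * page-sized segment is queued per request, which is why xlvbd_add()
 * caps d_maxsize at 4096.
 */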


/*
 * Dequeue buffers and place them in the shared communication ring.
 * Return when no more requests can be accepted or all buffers have
 * been queued.
 *
 * Signal XEN once the ring has been filled out.
 */
static void
xb_startio(struct xb_softc *sc)
{
	struct bio		*bp;
	int			queued = 0;
	struct blkfront_info *info = sc->xb_info;
	DPRINTK("");

	mtx_assert(&blkif_io_lock, MA_OWNED);

	while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) {

		if (RING_FULL(&info->ring))
			goto wait;

		if (blkif_queue_request(bp)) {
		wait:
			bioq_insert_head(&sc->xb_bioq, bp);
			break;
		}
		queued++;
	}

	if (queued != 0)
		flush_requests(sc->xb_info);
}

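/*
 * Interrupt handler for the event channel: harvest completed responses,
 * recycle their shadow slots, and re-arm rsp_event so the backend knows
 * when to interrupt us again.  The final-check step at the bottom closes
 * the race where a response arrives between draining the ring and
 * updating rsp_event.
 */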
static void
blkif_int(void *xsc)
{
	struct xb_softc *sc = NULL;
	struct bio *bp;
	blkif_response_t *bret;
	RING_IDX i, rp;
	struct blkfront_info *info = xsc;
	DPRINTK("");

	TRACE_ENTER;

	mtx_lock(&blkif_io_lock);

	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
		mtx_unlock(&blkif_io_lock);
		return;
	}

 again:
	rp = info->ring.sring->rsp_prod;
	rmb(); /* Ensure we see queued responses up to 'rp'. */

	for (i = info->ring.rsp_cons; i != rp; i++) {
		unsigned long id;

		bret = RING_GET_RESPONSE(&info->ring, i);
		id   = bret->id;
		bp   = (struct bio *)info->shadow[id].request;

		blkif_completion(&info->shadow[id]);

		ADD_ID_TO_FREELIST(info, id);

		switch (bret->operation) {
		case BLKIF_OP_READ:
			/* had an unaligned buffer that needs to be copied */
			if (bp->bio_driver1)
				bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount);
			/* FALLTHROUGH */
		case BLKIF_OP_WRITE:

			/* free the copy buffer */
			if (bp->bio_driver1) {
				free(bp->bio_driver1, M_DEVBUF);
				bp->bio_driver1 = NULL;
			}

			if (unlikely(bret->status != BLKIF_RSP_OKAY)) {
				printf("Bad return from blkdev data request: %x\n",
				    bret->status);
				bp->bio_flags |= BIO_ERROR;
			}

			sc = (struct xb_softc *)bp->bio_disk->d_drv1;

			if (bp->bio_flags & BIO_ERROR)
				bp->bio_error = EIO;
			else
				bp->bio_resid = 0;

			biodone(bp);
			break;
		default:
			panic("received invalid operation");
			break;
		}
	}

	info->ring.rsp_cons = i;

	if (i != info->ring.req_prod_pvt) {
		int more_to_do;
		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
		if (more_to_do)
			goto again;
	} else {
		info->ring.sring->rsp_event = i + 1;
	}

	kick_pending_request_queues(info);

	mtx_unlock(&blkif_io_lock);
}

static void
blkif_free(struct blkfront_info *info, int suspend)
{
	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&blkif_io_lock);
	info->connected = suspend ?
		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
	mtx_unlock(&blkif_io_lock);

	/* Free resources associated with old device channel. */
	if (info->ring_ref != GRANT_INVALID_REF) {
		gnttab_end_foreign_access(info->ring_ref,
					  info->ring.sring);
		info->ring_ref = GRANT_INVALID_REF;
		info->ring.sring = NULL;
	}
	if (info->irq)
		unbind_from_irqhandler(info->irq);
	info->irq = 0;
}

static void
blkif_completion(struct blk_shadow *s)
{
	int i;

	for (i = 0; i < s->req.nr_segments; i++)
		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
}

static void
blkif_recover(struct blkfront_info *info)
{
	int i, j;
	blkif_request_t *req;
	struct blk_shadow *copy;

	if (!info->sc)
		return;

	/*
	 * Stage 1: Make a safe copy of the shadow state.  We are called
	 * from the resume path, where sleeping is allowed, so wait for
	 * memory rather than silently abandoning the recovery.
	 */
	copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF,
					   M_WAITOK|M_ZERO);
	memcpy(copy, info->shadow, sizeof(info->shadow));

	/* Stage 2: Set up free list. */
	memset(&info->shadow, 0, sizeof(info->shadow));
	for (i = 0; i < BLK_RING_SIZE; i++)
		info->shadow[i].req.id = i+1;
	info->shadow_free = info->ring.req_prod_pvt;
	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

	/* Stage 3: Find pending requests and requeue them. */
	for (i = 0; i < BLK_RING_SIZE; i++) {
		/* Not in use? */
		if (copy[i].request == 0)
			continue;

		/* Grab a request slot and copy shadow state into it. */
		req = RING_GET_REQUEST(
			&info->ring, info->ring.req_prod_pvt);
		*req = copy[i].req;

		/* We get a new request id, and must reset the shadow state. */
		req->id = GET_ID_FROM_FREELIST(info);
		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

		/* Rewrite any grant references invalidated by suspend/resume. */
		for (j = 0; j < req->nr_segments; j++)
			gnttab_grant_foreign_access_ref(
				req->seg[j].gref,
				xenbus_get_otherend_id(info->xbdev),
				pfn_to_mfn(info->shadow[req->id].frame[j]),
				0 /* assume not readonly */);

		info->shadow[req->id].req = *req;

		info->ring.req_prod_pvt++;
	}

	free(copy, M_DEVBUF);

	xenbus_set_state(info->xbdev, XenbusStateConnected);

	/* Now safe for us to use the shared ring */
	mtx_lock(&blkif_io_lock);
	info->connected = BLKIF_STATE_CONNECTED;
	mtx_unlock(&blkif_io_lock);

	/* Send off requeued requests */
	mtx_lock(&blkif_io_lock);
	flush_requests(info);

	/* Kick any other new requests queued since we resumed */
	kick_pending_request_queues(info);
	mtx_unlock(&blkif_io_lock);
}

/* ** Driver registration ** */
static device_method_t blkfront_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,         blkfront_probe),
	DEVMETHOD(device_attach,        blkfront_attach),
	DEVMETHOD(device_detach,        blkfront_detach),
	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
	DEVMETHOD(device_suspend,       blkfront_suspend),
	DEVMETHOD(device_resume,        blkfront_resume),

	/* Xenbus interface */
	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),

	{ 0, 0 }
};

static driver_t blkfront_driver = {
	"xbd",
	blkfront_methods,
	sizeof(struct blkfront_info),
};
devclass_t blkfront_devclass;

DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);

MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock? */
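/*
 * Answering the XXX above: MTX_SYSINIT registers a SYSINIT hook that
 * runs mtx_init() on blkif_io_lock during boot (or at module load), so
 * the lock is initialized before any attach routine can take it.
 */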