xref: /freebsd/sys/dev/xen/blkfront/blkfront.c (revision aa64588d28258aef88cc33b8043112e8856948d0)
/*
 * XenBSD block device driver
 *
 * Copyright (c) 2009 Scott Long, Yahoo!
 * Copyright (c) 2009 Frank Suchomel, Citrix
 * Copyright (c) 2009 Doug F. Rabson, Citrix
 * Copyright (c) 2005 Kip Macy
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/module.h>

#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/intr_machdep.h>
#include <machine/vmparam.h>
#include <sys/bus_dma.h>

#include <machine/xen/xen-os.h>
#include <machine/xen/xenfunc.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
#include <xen/gnttab.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus/xenbusvar.h>

#include <geom/geom_disk.h>

#include <dev/xen/blkfront/block.h>

#include "xenbus_if.h"

/* prototypes */
static void xb_free_command(struct xb_command *cm);
static void xb_startio(struct xb_softc *sc);
static void connect(struct xb_softc *);
static void blkfront_closing(device_t);
static int blkfront_detach(device_t);
static int talk_to_backend(struct xb_softc *);
static int setup_blkring(struct xb_softc *);
static void blkif_int(void *);
static void blkif_recover(struct xb_softc *);
static void blkif_completion(struct xb_command *);
static void blkif_free(struct xb_softc *, int);
static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int);

#define GRANT_INVALID_REF 0

/* Control whether runtime update of vbds is enabled. */
#define ENABLE_VBD_UPDATE 0

#if ENABLE_VBD_UPDATE
static void vbd_update(void);
#endif

#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2

#ifdef notyet
static char *blkif_state_name[] = {
	[BLKIF_STATE_DISCONNECTED] = "disconnected",
	[BLKIF_STATE_CONNECTED]    = "connected",
	[BLKIF_STATE_SUSPENDED]    = "closed",
};

static char *blkif_status_name[] = {
	[BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
	[BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
	[BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
	[BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
};
#endif

#if 0
#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif

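/*
 * A single page of ring space admits at most BLK_RING_SIZE requests in
 * flight, each of which may reference up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST data pages; their product bounds the
 * segments the backend may hold for us at any one time.
 */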
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)

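/*
 * Cap individual transfers at 32KB so that, at one page per segment, a
 * request stays within BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
 */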
#define BLKIF_MAXIO	(32 * 1024)

static int blkif_open(struct disk *dp);
static int blkif_close(struct disk *dp);
static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
static int blkif_queue_request(struct xb_softc *sc, struct xb_command *cm);
static void xb_strategy(struct bio *bp);

/*
 * In order to quiesce the device during kernel dumps, outstanding
 * requests to DOM0 for disk reads/writes need to be accounted for.
 */
static	int	xb_dump(void *, void *, vm_offset_t, off_t, size_t);

/* XXX move to xb_vbd.c when VBD update support is added */
#define MAX_VBDS 64

#define XBD_SECTOR_SIZE		512	/* XXX: assume for now */
#define XBD_SECTOR_SHFT		9

/*
 * Translate Linux major/minor to an appropriate name and unit
 * number. For HVM guests, this allows us to use the same drive names
 * with blkfront as the emulated drives, easing transition slightly.
 */
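/*
 * A virtual device ID packs the Linux major number in bits 15:8 and the
 * minor number in bits 7:0.  IDs with bit 28 set use the extended
 * encoding, where bits 27:8 hold the unit number directly.
 */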
static void
blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
{
	static struct vdev_info {
		int major;
		int shift;
		int base;
		const char *name;
	} info[] = {
		{3,	6,	0,	"ad"},	/* ide0 */
		{22,	6,	2,	"ad"},	/* ide1 */
		{33,	6,	4,	"ad"},	/* ide2 */
		{34,	6,	6,	"ad"},	/* ide3 */
		{56,	6,	8,	"ad"},	/* ide4 */
		{57,	6,	10,	"ad"},	/* ide5 */
		{88,	6,	12,	"ad"},	/* ide6 */
		{89,	6,	14,	"ad"},	/* ide7 */
		{90,	6,	16,	"ad"},	/* ide8 */
		{91,	6,	18,	"ad"},	/* ide9 */

		{8,	4,	0,	"da"},	/* scsi disk0 */
		{65,	4,	16,	"da"},	/* scsi disk1 */
		{66,	4,	32,	"da"},	/* scsi disk2 */
		{67,	4,	48,	"da"},	/* scsi disk3 */
		{68,	4,	64,	"da"},	/* scsi disk4 */
		{69,	4,	80,	"da"},	/* scsi disk5 */
		{70,	4,	96,	"da"},	/* scsi disk6 */
		{71,	4,	112,	"da"},	/* scsi disk7 */
		{128,	4,	128,	"da"},	/* scsi disk8 */
		{129,	4,	144,	"da"},	/* scsi disk9 */
		{130,	4,	160,	"da"},	/* scsi disk10 */
		{131,	4,	176,	"da"},	/* scsi disk11 */
		{132,	4,	192,	"da"},	/* scsi disk12 */
		{133,	4,	208,	"da"},	/* scsi disk13 */
		{134,	4,	224,	"da"},	/* scsi disk14 */
		{135,	4,	240,	"da"},	/* scsi disk15 */

		{202,	4,	0,	"xbd"},	/* xbd */

		{0,	0,	0,	NULL},
	};
	int major = vdevice >> 8;
	int minor = vdevice & 0xff;
	int i;

	if (vdevice & (1 << 28)) {
		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
		*name = "xbd";
		return;
	}

	for (i = 0; info[i].major; i++) {
		if (info[i].major == major) {
			*unit = info[i].base + (minor >> info[i].shift);
			*name = info[i].name;
			return;
		}
	}

	*unit = minor >> 4;
	*name = "xbd";
}

int
xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
    int vdevice, uint16_t vdisk_info, uint16_t sector_size)
{
	int	unit, error = 0;
	const char *name;

	blkfront_vdevice_to_unit(vdevice, &unit, &name);

	sc->xb_unit = unit;

	if (strcmp(name, "xbd"))
		device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit);

	sc->xb_disk = disk_alloc();
	sc->xb_disk->d_unit = sc->xb_unit;
	sc->xb_disk->d_open = blkif_open;
	sc->xb_disk->d_close = blkif_close;
	sc->xb_disk->d_ioctl = blkif_ioctl;
	sc->xb_disk->d_strategy = xb_strategy;
	sc->xb_disk->d_dump = xb_dump;
	sc->xb_disk->d_name = name;
	sc->xb_disk->d_drv1 = sc;
	sc->xb_disk->d_sectorsize = sector_size;

	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
	sc->xb_disk->d_maxsize = BLKIF_MAXIO;
	sc->xb_disk->d_flags = 0;
	disk_create(sc->xb_disk, DISK_VERSION_00);

	return (error);
}

void
xlvbd_del(struct xb_softc *sc)
{

	disk_destroy(sc->xb_disk);
}
/************************ end VBD support *****************/

/*
 * Read/write routine for a buffer.  Finds the proper unit, places it on
 * the sortq, and kicks the controller.
 */
static void
xb_strategy(struct bio *bp)
{
	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;

	/* bogus disk? */
	if (sc == NULL) {
		bp->bio_error = EINVAL;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
		return;
	}

	/*
	 * Place it in the queue of disk activities for this disk
	 */
	mtx_lock(&sc->xb_io_lock);

	xb_enqueue_bio(sc, bp);
	xb_startio(sc);

	mtx_unlock(&sc->xb_io_lock);
	return;
}

static void
xb_bio_complete(struct xb_softc *sc, struct xb_command *cm)
{
	struct bio *bp;

	bp = cm->bp;

	if (unlikely(cm->status != BLKIF_RSP_OKAY)) {
		disk_err(bp, "disk error", -1, 0);
		printf(" status: %x\n", cm->status);
		bp->bio_flags |= BIO_ERROR;
	}

	if (bp->bio_flags & BIO_ERROR)
		bp->bio_error = EIO;
	else
		bp->bio_resid = 0;

	xb_free_command(cm);
	biodone(bp);
}

/* Quiesce the disk writes for a dump file before allowing the next buffer. */
static void
xb_quiesce(struct xb_softc *sc)
{
	int		mtd;

	/* While there are outstanding requests... */
	while (!TAILQ_EMPTY(&sc->cm_busy)) {
		RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, mtd);
		if (mtd) {
			/* Received request completions, update queue. */
			blkif_int(sc);
		}
		if (!TAILQ_EMPTY(&sc->cm_busy)) {
			/*
			 * Still pending requests, wait for the disk i/o
			 * to complete.
			 */
			HYPERVISOR_yield();
		}
	}
}

/* Kernel dump function for a paravirtualized disk device */
static void
xb_dump_complete(struct xb_command *cm)
{

	xb_enqueue_complete(cm);
}

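/*
 * The dump path may run with interrupts disabled and the rest of the
 * kernel suspect, so everything is done by polling: quiesce any I/O
 * already in flight, issue the dump data as polled writes in
 * BLKIF_MAXIO-sized chunks, then spin until the backend has answered
 * every request.
 */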
static int
xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
        size_t length)
{
	struct disk	*dp = arg;
	struct xb_softc	*sc = (struct xb_softc *) dp->d_drv1;
	struct xb_command *cm;
	size_t		chunk;
	int		sbp;
	int		rc = 0;

	if (length <= 0)
		return (rc);

	xb_quiesce(sc);	/* All quiet on the western front. */

	/*
	 * If this lock is held, then this module is failing, and a
	 * successful kernel dump is highly unlikely anyway.
	 */
	mtx_lock(&sc->xb_io_lock);

	/* Split the 64KB block as needed */
	for (sbp = 0; length > 0; sbp++) {
		cm = xb_dequeue_free(sc);
		if (cm == NULL) {
			mtx_unlock(&sc->xb_io_lock);
			device_printf(sc->xb_dev, "dump: no more commands?\n");
			return (EBUSY);
		}

		if (gnttab_alloc_grant_references(
		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &cm->gref_head) < 0) {
			xb_free_command(cm);
			mtx_unlock(&sc->xb_io_lock);
			device_printf(sc->xb_dev, "no more grant allocs?\n");
			return (EBUSY);
		}

		chunk = length > BLKIF_MAXIO ? BLKIF_MAXIO : length;
		cm->data = virtual;
		cm->datalen = chunk;
		cm->operation = BLKIF_OP_WRITE;
		cm->sector_number = offset / dp->d_sectorsize;
		cm->cm_complete = xb_dump_complete;

		xb_enqueue_ready(cm);

		length -= chunk;
		offset += chunk;
		virtual = (char *) virtual + chunk;
	}

	/* Tell DOM0 to do the I/O */
	xb_startio(sc);
	mtx_unlock(&sc->xb_io_lock);

	/* Poll for the completion. */
	xb_quiesce(sc);	/* All quiet on the eastern front. */

	/* If there were any errors, bail out... */
	while ((cm = xb_dequeue_complete(sc)) != NULL) {
		if (cm->status != BLKIF_RSP_OKAY) {
			device_printf(sc->xb_dev,
			    "Dump I/O failed at sector %jd\n",
			    cm->sector_number);
			rc = EIO;
		}
		xb_free_command(cm);
	}

	return (rc);
}

static int
blkfront_probe(device_t dev)
{

	if (!strcmp(xenbus_get_type(dev), "vbd")) {
		device_set_desc(dev, "Virtual Block Device");
		device_quiet(dev);
		return (0);
	}

	return (ENXIO);
}

/*
 * Setup supplies the backend directory and virtual device.  We place an
 * event channel and shared frame entries, and watch the backend state
 * to see when it is ready.
 */
static int
blkfront_attach(device_t dev)
{
	struct xb_softc *sc;
	struct xb_command *cm;
	const char *name;
	int error, vdevice, i, unit;

	/* FIXME: Use dynamic device id if this is not set. */
	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
	    "virtual-device", NULL, "%i", &vdevice);
	if (error) {
		xenbus_dev_fatal(dev, error, "reading virtual-device");
		printf("couldn't find virtual device\n");
		return (error);
	}

	blkfront_vdevice_to_unit(vdevice, &unit, &name);
	if (!strcmp(name, "xbd"))
		device_set_unit(dev, unit);

	sc = device_get_softc(dev);
	mtx_init(&sc->xb_io_lock, "blkfront i/o lock", NULL, MTX_DEF);
	xb_initq_free(sc);
	xb_initq_busy(sc);
	xb_initq_ready(sc);
	xb_initq_complete(sc);
	xb_initq_bio(sc);

	/* Allocate parent DMA tag */
	if (bus_dma_tag_create(NULL,			/* parent */
				512, 4096,		/* algnmnt, boundary */
				BUS_SPACE_MAXADDR,	/* lowaddr */
				BUS_SPACE_MAXADDR,	/* highaddr */
				NULL, NULL,		/* filter, filterarg */
				BLKIF_MAXIO,		/* maxsize */
				BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* nsegments */
				PAGE_SIZE,		/* maxsegsize */
				BUS_DMA_ALLOCNOW,	/* flags */
				busdma_lock_mutex,	/* lockfunc */
				&sc->xb_io_lock,	/* lockarg */
				&sc->xb_io_dmat)) {
		device_printf(dev, "Cannot allocate parent DMA tag\n");
		return (ENOMEM);
	}
#ifdef notyet
	if (bus_dma_tag_set(sc->xb_io_dmat, BUS_DMA_SET_MINSEGSZ,
		XBD_SECTOR_SIZE)) {
		device_printf(dev, "Cannot set sector size\n");
		return (EINVAL);
	}
#endif

	sc->xb_dev = dev;
	sc->vdevice = vdevice;
	sc->connected = BLKIF_STATE_DISCONNECTED;

	/* work queue needed ? */
	for (i = 0; i < BLK_RING_SIZE; i++) {
		cm = &sc->shadow[i];
		cm->req.id = i;
		cm->cm_sc = sc;
		if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
			break;
		xb_free_command(cm);
	}

	/* Front end dir is a number, which is used as the id. */
	sc->handle = strtoul(strrchr(xenbus_get_node(dev), '/') + 1, NULL, 0);

	error = talk_to_backend(sc);
	if (error)
		return (error);

	return (0);
}

static int
blkfront_suspend(device_t dev)
{
	struct xb_softc *sc = device_get_softc(dev);

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = BLKIF_STATE_SUSPENDED;
	mtx_unlock(&sc->xb_io_lock);

	return (0);
}

static int
blkfront_resume(device_t dev)
{
	struct xb_softc *sc = device_get_softc(dev);
	int err;

	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));

	blkif_free(sc, 1);
	err = talk_to_backend(sc);
	if (sc->connected == BLKIF_STATE_SUSPENDED && !err)
		blkif_recover(sc);

	return (err);
}

/* Common code used when first setting up, and when resuming. */
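/*
 * Publish the grant reference for the shared ring and the port of our
 * event channel in the XenStore, all within one transaction so the
 * backend sees a consistent snapshot, then move to
 * XenbusStateInitialised.  EAGAIN from xenbus_transaction_end() means
 * another transaction touched the same nodes and the writes must be
 * replayed.
 */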
static int
talk_to_backend(struct xb_softc *sc)
{
	device_t dev;
	struct xenbus_transaction xbt;
	const char *message = NULL;
	int err;

	/* Create shared ring, alloc event channel. */
	dev = sc->xb_dev;
	err = setup_blkring(sc);
	if (err)
		goto out;

 again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		xenbus_dev_fatal(dev, err, "starting transaction");
		goto destroy_blkring;
	}

	err = xenbus_printf(xbt, xenbus_get_node(dev),
			    "ring-ref", "%u", sc->ring_ref);
	if (err) {
		message = "writing ring-ref";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, xenbus_get_node(dev),
		"event-channel", "%u", irq_to_evtchn_port(sc->irq));
	if (err) {
		message = "writing event-channel";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, xenbus_get_node(dev),
		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
	if (err) {
		message = "writing protocol";
		goto abort_transaction;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err) {
		if (err == EAGAIN)
			goto again;
		xenbus_dev_fatal(dev, err, "completing transaction");
		goto destroy_blkring;
	}
	xenbus_set_state(dev, XenbusStateInitialised);

	return (0);

 abort_transaction:
	xenbus_transaction_end(xbt, 1);
	if (message)
		xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
	blkif_free(sc, 0);
 out:
	return (err);
}

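/*
 * Allocate the shared ring page, initialize both halves of the ring,
 * grant the backend access to the page, and bind an interrupt handler
 * to the interdomain event channel.
 */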
static int
setup_blkring(struct xb_softc *sc)
{
	blkif_sring_t *sring;
	int error;

	sc->ring_ref = GRANT_INVALID_REF;

	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sring == NULL) {
		xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring");
		return (ENOMEM);
	}
	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&sc->ring, sring, PAGE_SIZE);

	error = xenbus_grant_ring(sc->xb_dev,
	    (vtomach(sc->ring.sring) >> PAGE_SHIFT), &sc->ring_ref);
	if (error) {
		free(sring, M_DEVBUF);
		sc->ring.sring = NULL;
		goto fail;
	}

	error = bind_listening_port_to_irqhandler(
	    xenbus_get_otherend_id(sc->xb_dev),
	    "xbd", (driver_intr_t *)blkif_int, sc,
	    INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq);
	if (error) {
		xenbus_dev_fatal(sc->xb_dev, error,
		    "bind_listening_port_to_irqhandler failed");
		goto fail;
	}

	return (0);
 fail:
	blkif_free(sc, 0);
	return (error);
}

/**
 * Callback received when the backend's state changes.
 */
static int
blkfront_backend_changed(device_t dev, XenbusState backend_state)
{
	struct xb_softc *sc = device_get_softc(dev);

	DPRINTK("backend_state=%d\n", backend_state);

	switch (backend_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
	case XenbusStateClosed:
	case XenbusStateReconfigured:
	case XenbusStateReconfiguring:
		break;

	case XenbusStateConnected:
		connect(sc);
		break;

	case XenbusStateClosing:
		if (sc->users > 0)
			xenbus_dev_error(dev, -EBUSY,
					 "Device in use; refusing to close");
		else
			blkfront_closing(dev);
#ifdef notyet
		bd = bdget(sc->dev);
		if (bd == NULL)
			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");

		down(&bd->bd_sem);
		if (sc->users > 0)
			xenbus_dev_error(dev, -EBUSY,
					 "Device in use; refusing to close");
		else
			blkfront_closing(dev);
		up(&bd->bd_sem);
		bdput(bd);
#endif
	}

	return (0);
}

/*
 * Invoked when the backend is finally 'ready' (and has published the
 * details about the physical device - #sectors, size, etc).
 */
static void
connect(struct xb_softc *sc)
{
	device_t dev = sc->xb_dev;
	unsigned long sectors, sector_size;
	unsigned int binfo;
	int err, feature_barrier;

	if ((sc->connected == BLKIF_STATE_CONNECTED) ||
	    (sc->connected == BLKIF_STATE_SUSPENDED))
		return;

	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));

	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
			    "sectors", "%lu", &sectors,
			    "info", "%u", &binfo,
			    "sector-size", "%lu", &sector_size,
			    NULL);
	if (err) {
		xenbus_dev_fatal(dev, err,
		    "reading backend fields at %s",
		    xenbus_get_otherend_path(dev));
		return;
	}
	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
			    "feature-barrier", "%d", &feature_barrier,
			    NULL);
	if (err == 0 && feature_barrier != 0)
		sc->xb_flags |= XB_BARRIER;

	device_printf(dev, "%juMB <%s> at %s",
	    (uintmax_t) sectors / (1048576 / sector_size),
	    device_get_desc(dev),
	    xenbus_get_node(dev));
	bus_print_child_footer(device_get_parent(dev), dev);

	xlvbd_add(sc, sectors, sc->vdevice, binfo, sector_size);

	(void)xenbus_set_state(dev, XenbusStateConnected);

	/* Kick pending requests. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = BLKIF_STATE_CONNECTED;
	xb_startio(sc);
	sc->xb_flags |= XB_READY;
	mtx_unlock(&sc->xb_io_lock);
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void
blkfront_closing(device_t dev)
{
	struct xb_softc *sc = device_get_softc(dev);

	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));

	if (sc->mi) {
		DPRINTK("Calling xlvbd_del\n");
		xlvbd_del(sc);
		sc->mi = NULL;
	}

	xenbus_set_state(dev, XenbusStateClosed);
}

static int
blkfront_detach(device_t dev)
{
	struct xb_softc *sc = device_get_softc(dev);

	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));

	blkif_free(sc, 0);
	mtx_destroy(&sc->xb_io_lock);

	return (0);
}

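/*
 * Push queued requests out to the backend.  The ring macro publishes
 * our private producer index and, based on the request event index the
 * backend last advertised, reports whether an event-channel
 * notification is actually required.
 */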
static inline void
flush_requests(struct xb_softc *sc)
{
	int notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->ring, notify);

	if (notify)
		notify_remote_via_irq(sc->irq);
}

static void
blkif_restart_queue_callback(void *arg)
{
	struct xb_softc *sc = arg;

	xb_startio(sc);
}

static int
blkif_open(struct disk *dp)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL) {
		printf("xb%d: not found\n", dp->d_unit);
		return (ENXIO);
	}

	sc->xb_flags |= XB_OPEN;
	sc->users++;
	return (0);
}

static int
blkif_close(struct disk *dp)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL)
		return (ENXIO);
	sc->xb_flags &= ~XB_OPEN;
	if (--(sc->users) == 0) {
		/*
		 * Check whether we have been instructed to close.  We will
		 * have ignored this request initially, as the device was
		 * still mounted.
		 */
		device_t dev = sc->xb_dev;
		XenbusState state =
			xenbus_read_driver_state(xenbus_get_otherend_path(dev));

		if (state == XenbusStateClosing)
			blkfront_closing(dev);
	}
	return (0);
}

static int
blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
{
	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;

	if (sc == NULL)
		return (ENXIO);

	return (ENOTTY);
}

static void
xb_free_command(struct xb_command *cm)
{

	KASSERT((cm->cm_flags & XB_ON_XBQ_MASK) == 0,
	    ("Freeing command that is still on a queue\n"));

	cm->cm_flags = 0;
	cm->bp = NULL;
	cm->cm_complete = NULL;
	xb_enqueue_free(cm);
}

/*
 * Request block I/O.
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into; this should be a
 *   virtual address in the guest OS.
 */
static struct xb_command *
xb_bio_command(struct xb_softc *sc)
{
	struct xb_command *cm;
	struct bio *bp;

	if (unlikely(sc->connected != BLKIF_STATE_CONNECTED))
		return (NULL);

	bp = xb_dequeue_bio(sc);
	if (bp == NULL)
		return (NULL);

	if ((cm = xb_dequeue_free(sc)) == NULL) {
		xb_requeue_bio(sc, bp);
		return (NULL);
	}

	if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST,
	    &cm->gref_head) < 0) {
		gnttab_request_free_callback(&sc->callback,
			blkif_restart_queue_callback, sc,
			BLKIF_MAX_SEGMENTS_PER_REQUEST);
		xb_requeue_bio(sc, bp);
		xb_enqueue_free(cm);
		sc->xb_flags |= XB_FROZEN;
		return (NULL);
	}

	/*
	 * XXX Can we grab refs before doing the load so that the ref can
	 * be filled out here?
	 */
	cm->bp = bp;
	cm->data = bp->bio_data;
	cm->datalen = bp->bio_bcount;
	cm->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
	    BLKIF_OP_WRITE;
	cm->sector_number = (blkif_sector_t)bp->bio_pblkno;

	return (cm);
}

static int
blkif_queue_request(struct xb_softc *sc, struct xb_command *cm)
{
	int	error;

	error = bus_dmamap_load(sc->xb_io_dmat, cm->map, cm->data, cm->datalen,
	    blkif_queue_cb, cm, 0);
	if (error == EINPROGRESS) {
		printf("EINPROGRESS\n");
		sc->xb_flags |= XB_FROZEN;
		cm->cm_flags |= XB_CMD_FROZEN;
		return (0);
	}

	return (error);
}

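/*
 * Called by busdma with the command's scatter/gather list; this may
 * happen from the busdma swi rather than from xb_startio() if
 * bus_dmamap_load() returned EINPROGRESS.  Fill out a ring slot for the
 * command, installing one grant reference per physical segment and
 * expressing each segment as a first/last sector range within its page.
 */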
static void
blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
	struct xb_softc *sc;
	struct xb_command *cm;
	blkif_request_t	*ring_req;
	vm_paddr_t buffer_ma;
	uint64_t fsect, lsect;
	int ref, i, op;

	cm = arg;
	sc = cm->cm_sc;

	if (error) {
		printf("error %d in blkif_queue_cb\n", error);
		cm->bp->bio_error = EIO;
		biodone(cm->bp);
		xb_free_command(cm);
		return;
	}

	/* Fill out a communications ring structure. */
	ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
	if (ring_req == NULL) {
		/* XXX Is this possible? */
		printf("ring_req NULL, requeuing\n");
		xb_enqueue_ready(cm);
		return;
	}
	ring_req->id = cm->req.id;
	ring_req->operation = cm->operation;
	ring_req->sector_number = cm->sector_number;
	ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
	ring_req->nr_segments = nsegs;

	for (i = 0; i < nsegs; i++) {
		buffer_ma = segs[i].ds_addr;
		fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
		lsect = fsect + (segs[i].ds_len >> XBD_SECTOR_SHFT) - 1;

		KASSERT(lsect <= 7,
		    ("XEN disk driver data cannot cross a page boundary"));

		/* install a grant reference. */
		ref = gnttab_claim_grant_reference(&cm->gref_head);
		KASSERT(ref >= 0, ("grant_reference failed"));

		gnttab_grant_foreign_access_ref(
			ref,
			xenbus_get_otherend_id(sc->xb_dev),
			buffer_ma >> PAGE_SHIFT,
			ring_req->operation & 1); /* read-only for writes */

		ring_req->seg[i] =
			(struct blkif_request_segment) {
				.gref       = ref,
				.first_sect = fsect,
				.last_sect  = lsect };
	}

	if (cm->operation == BLKIF_OP_READ)
		op = BUS_DMASYNC_PREREAD;
	else if (cm->operation == BLKIF_OP_WRITE)
		op = BUS_DMASYNC_PREWRITE;
	else
		op = 0;
	bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);

	sc->ring.req_prod_pvt++;

	/* Keep a private copy so we can reissue requests when recovering. */
	cm->req = *ring_req;

	xb_enqueue_busy(cm);

	gnttab_free_grant_references(cm->gref_head);

	/*
	 * This flag means that we're probably executing in the busdma swi
	 * instead of in the startio context, so an explicit flush is needed.
	 */
	if (cm->cm_flags & XB_CMD_FROZEN)
		flush_requests(sc);

	return;
}

/*
 * Dequeue buffers and place them in the shared communication ring.
 * Return when no more requests can be accepted or all buffers have
 * been queued.
 *
 * Signal XEN once the ring has been filled out.
 */
static void
xb_startio(struct xb_softc *sc)
{
	struct xb_command *cm;
	int error, queued = 0;

	mtx_assert(&sc->xb_io_lock, MA_OWNED);

	while (!RING_FULL(&sc->ring)) {
		if (sc->xb_flags & XB_FROZEN)
			break;

		cm = xb_dequeue_ready(sc);

		if (cm == NULL)
			cm = xb_bio_command(sc);

		if (cm == NULL)
			break;

		if ((error = blkif_queue_request(sc, cm)) != 0) {
			printf("blkif_queue_request returned %d\n", error);
			break;
		}
		queued++;
	}

	if (queued != 0)
		flush_requests(sc);
}

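/*
 * Interrupt handler: drain the response ring, sync and unload each
 * completed command's DMA map, and complete the associated bio.  The
 * RING_FINAL_CHECK_FOR_RESPONSES() re-check closes the race with
 * responses that arrive after the loop finishes but before the response
 * event is re-armed.
 */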
static void
blkif_int(void *xsc)
{
	struct xb_softc *sc = xsc;
	struct xb_command *cm;
	blkif_response_t *bret;
	RING_IDX i, rp;
	int op;

	mtx_lock(&sc->xb_io_lock);

	if (unlikely(sc->connected != BLKIF_STATE_CONNECTED)) {
		mtx_unlock(&sc->xb_io_lock);
		return;
	}

 again:
	rp = sc->ring.sring->rsp_prod;
	rmb(); /* Ensure we see queued responses up to 'rp'. */

	for (i = sc->ring.rsp_cons; i != rp; i++) {
		bret = RING_GET_RESPONSE(&sc->ring, i);
		cm   = &sc->shadow[bret->id];

		xb_remove_busy(cm);
		blkif_completion(cm);

		if (cm->operation == BLKIF_OP_READ)
			op = BUS_DMASYNC_POSTREAD;
		else if (cm->operation == BLKIF_OP_WRITE)
			op = BUS_DMASYNC_POSTWRITE;
		else
			op = 0;
		bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
		bus_dmamap_unload(sc->xb_io_dmat, cm->map);

		/*
		 * If commands are completing then resources are probably
		 * being freed as well.  It's a cheap assumption even when
		 * wrong.
		 */
		sc->xb_flags &= ~XB_FROZEN;

		/*
		 * Directly call the i/o complete routine to save an
		 * indirection in the common case.
		 */
		cm->status = bret->status;
		if (cm->bp)
			xb_bio_complete(sc, cm);
		else if (cm->cm_complete)
			(cm->cm_complete)(cm);
		else
			xb_free_command(cm);
	}

	sc->ring.rsp_cons = i;

	if (i != sc->ring.req_prod_pvt) {
		int more_to_do;
		RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, more_to_do);
		if (more_to_do)
			goto again;
	} else {
		sc->ring.sring->rsp_event = i + 1;
	}

	xb_startio(sc);

	mtx_unlock(&sc->xb_io_lock);
}

static void
blkif_free(struct xb_softc *sc, int suspend)
{

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = suspend ?
		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
	mtx_unlock(&sc->xb_io_lock);

	/* Free resources associated with old device channel. */
	if (sc->ring_ref != GRANT_INVALID_REF) {
		gnttab_end_foreign_access(sc->ring_ref,
					  sc->ring.sring);
		sc->ring_ref = GRANT_INVALID_REF;
		sc->ring.sring = NULL;
	}
	if (sc->irq)
		unbind_from_irqhandler(sc->irq);
	sc->irq = 0;
}

static void
blkif_completion(struct xb_command *s)
{
	int i;

	for (i = 0; i < s->req.nr_segments; i++)
		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
}

static void
blkif_recover(struct xb_softc *sc)
{
	/*
	 * XXX The whole concept of not quiescing and completing all i/o
	 * during suspend, and then hoping to recover and replay the
	 * resulting abandoned I/O during resume, is laughable.  At best,
	 * it invalidates the i/o ordering rules required by just about
	 * every filesystem, and at worst it'll corrupt data.  The code
	 * has been removed until further notice.
	 */
}

/* ** Driver registration ** */
static device_method_t blkfront_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,         blkfront_probe),
	DEVMETHOD(device_attach,        blkfront_attach),
	DEVMETHOD(device_detach,        blkfront_detach),
	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
	DEVMETHOD(device_suspend,       blkfront_suspend),
	DEVMETHOD(device_resume,        blkfront_resume),

	/* Xenbus interface */
	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),

	{ 0, 0 }
};

static driver_t blkfront_driver = {
	"xbd",
	blkfront_methods,
	sizeof(struct xb_softc),
};
devclass_t blkfront_devclass;

DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);