xref: /freebsd/sys/dev/xen/blkfront/blkfront.c (revision 8698b76c3d2dbfbfee3563aac3f5ffc533c0e83d)
1 /*-
2  * All rights reserved.
3  *
4  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
5  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
6  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
7  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
8  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
9  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
10  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
11  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
12  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
13  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
14  * SUCH DAMAGE.
15  *
16  */
17 
18 /*
19  * XenBSD block device driver
20  *
21  * Copyright (c) 2009 Frank Suchomel, Citrix
22  */
23 
24 #include <sys/cdefs.h>
25 __FBSDID("$FreeBSD$");
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/malloc.h>
30 #include <sys/kernel.h>
31 #include <vm/vm.h>
32 #include <vm/pmap.h>
33 
34 #include <sys/bio.h>
35 #include <sys/bus.h>
36 #include <sys/conf.h>
37 #include <sys/module.h>
38 
39 #include <machine/bus.h>
40 #include <sys/rman.h>
41 #include <machine/resource.h>
42 #include <machine/intr_machdep.h>
43 #include <machine/vmparam.h>
44 
45 #include <machine/xen/xen-os.h>
46 #include <machine/xen/xenfunc.h>
47 #include <xen/hypervisor.h>
48 #include <xen/xen_intr.h>
49 #include <xen/evtchn.h>
50 #include <xen/gnttab.h>
51 #include <xen/interface/grant_table.h>
52 #include <xen/interface/io/protocols.h>
53 #include <xen/xenbus/xenbusvar.h>
54 
55 #include <geom/geom_disk.h>
56 
57 #include <dev/xen/blkfront/block.h>
58 
59 #include "xenbus_if.h"
60 
61 #define    ASSERT(S)       KASSERT(S, (#S))
62 /* prototypes */
63 struct xb_softc;
64 static void xb_startio(struct xb_softc *sc);
65 static void connect(device_t, struct blkfront_info *);
66 static void blkfront_closing(device_t);
67 static int blkfront_detach(device_t);
68 static int talk_to_backend(device_t, struct blkfront_info *);
69 static int setup_blkring(device_t, struct blkfront_info *);
70 static void blkif_int(void *);
71 #if 0
72 static void blkif_restart_queue(void *arg);
73 #endif
74 static void blkif_recover(struct blkfront_info *);
75 static void blkif_completion(struct blk_shadow *);
76 static void blkif_free(struct blkfront_info *, int);
77 
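/*
 * GRANT_INVALID_REF marks a grant-reference slot as unused; BLK_RING_SIZE is
 * the number of request slots that fit in one page-sized shared ring.
 */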
78 #define GRANT_INVALID_REF 0
79 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
80 
81 LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head;
82 
83 /* Control whether runtime update of vbds is enabled. */
84 #define ENABLE_VBD_UPDATE 0
85 
86 #if ENABLE_VBD_UPDATE
87 static void vbd_update(void);
88 #endif
89 
90 
91 #define BLKIF_STATE_DISCONNECTED 0
92 #define BLKIF_STATE_CONNECTED    1
93 #define BLKIF_STATE_SUSPENDED    2
94 
95 #ifdef notyet
96 static char *blkif_state_name[] = {
97 	[BLKIF_STATE_DISCONNECTED] = "disconnected",
98 	[BLKIF_STATE_CONNECTED]    = "connected",
99 	[BLKIF_STATE_SUSPENDED]    = "closed",
100 };
101 
102 static char * blkif_status_name[] = {
103 	[BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
104 	[BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
105 	[BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
106 	[BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
107 };
108 #endif
109 #define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args)
110 #if 0
111 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
112 #else
113 #define DPRINTK(fmt, args...)
114 #endif
115 
116 static grant_ref_t gref_head;
117 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
118     (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
119 
120 static void kick_pending_request_queues(struct blkfront_info *);
121 static int blkif_open(struct disk *dp);
122 static int blkif_close(struct disk *dp);
123 static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
124 static int blkif_queue_request(struct bio *bp);
125 static void xb_strategy(struct bio *bp);
126 
127 // In order to quiesce the device during kernel dumps, outstanding requests to
128 // DOM0 for disk reads/writes need to be accounted for.
129 static	int	blkif_queued_requests;
130 static	int	xb_dump(void *, void *, vm_offset_t, off_t, size_t);
131 
132 
133 /* XXX move to xb_vbd.c when VBD update support is added */
134 #define MAX_VBDS 64
135 
136 #define XBD_SECTOR_SIZE		512	/* XXX: assume for now */
137 #define XBD_SECTOR_SHFT		9
138 
139 static struct mtx blkif_io_lock;
140 
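/*
 * Convert a guest pseudo-physical frame number to the corresponding
 * machine frame number.
 */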
141 static vm_paddr_t
142 pfn_to_mfn(vm_paddr_t pfn)
143 {
144 	return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT);
145 }
146 
147 /*
148  * Translate Linux major/minor to an appropriate name and unit
149  * number. For HVM guests, this allows us to use the same drive names
150  * with blkfront as the emulated drives, easing transition slightly.
151  */
152 static void
153 blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
154 {
155 	static struct vdev_info {
156 		int major;
157 		int shift;
158 		int base;
159 		const char *name;
160 	} info[] = {
161 		{3,	6,	0,	"ad"},	/* ide0 */
162 		{22,	6,	2,	"ad"},	/* ide1 */
163 		{33,	6,	4,	"ad"},	/* ide2 */
164 		{34,	6,	6,	"ad"},	/* ide3 */
165 		{56,	6,	8,	"ad"},	/* ide4 */
166 		{57,	6,	10,	"ad"},	/* ide5 */
167 		{88,	6,	12,	"ad"},	/* ide6 */
168 		{89,	6,	14,	"ad"},	/* ide7 */
169 		{90,	6,	16,	"ad"},	/* ide8 */
170 		{91,	6,	18,	"ad"},	/* ide9 */
171 
172 		{8,	4,	0,	"da"},	/* scsi disk0 */
173 		{65,	4,	16,	"da"},	/* scsi disk1 */
174 		{66,	4,	32,	"da"},	/* scsi disk2 */
175 		{67,	4,	48,	"da"},	/* scsi disk3 */
176 		{68,	4,	64,	"da"},	/* scsi disk4 */
177 		{69,	4,	80,	"da"},	/* scsi disk5 */
178 		{70,	4,	96,	"da"},	/* scsi disk6 */
179 		{71,	4,	112,	"da"},	/* scsi disk7 */
180 		{128,	4,	128,	"da"},	/* scsi disk8 */
181 		{129,	4,	144,	"da"},	/* scsi disk9 */
182 		{130,	4,	160,	"da"},	/* scsi disk10 */
183 		{131,	4,	176,	"da"},	/* scsi disk11 */
184 		{132,	4,	192,	"da"},	/* scsi disk12 */
185 		{133,	4,	208,	"da"},	/* scsi disk13 */
186 		{134,	4,	224,	"da"},	/* scsi disk14 */
187 		{135,	4,	240,	"da"},	/* scsi disk15 */
188 
189 		{202,	4,	0,	"xbd"},	/* xbd */
190 
191 		{0,	0,	0,	NULL},
192 	};
193 	int major = vdevice >> 8;
194 	int minor = vdevice & 0xff;
195 	int i;
196 
197 	if (vdevice & (1 << 28)) {
198 		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
199 		*name = "xbd";
		return;
200 	}
201 
202 	for (i = 0; info[i].major; i++) {
203 		if (info[i].major == major) {
204 			*unit = info[i].base + (minor >> info[i].shift);
205 			*name = info[i].name;
206 			return;
207 		}
208 	}
209 
210 	*unit = minor >> 4;
211 	*name = "xbd";
212 }
213 
214 int
215 xlvbd_add(device_t dev, blkif_sector_t capacity,
216     int vdevice, uint16_t vdisk_info, uint16_t sector_size,
217     struct blkfront_info *info)
218 {
219 	struct xb_softc	*sc;
220 	int	unit, error = 0;
221 	const char *name;
222 
223 	blkfront_vdevice_to_unit(vdevice, &unit, &name);
224 
225 	sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
226 	sc->xb_unit = unit;
227 	sc->xb_info = info;
228 	info->sc = sc;
229 
230 	if (strcmp(name, "xbd"))
231 		device_printf(dev, "attaching as %s%d\n", name, unit);
232 
234 	sc->xb_disk = disk_alloc();
235 	sc->xb_disk->d_unit = sc->xb_unit;
236 	sc->xb_disk->d_open = blkif_open;
237 	sc->xb_disk->d_close = blkif_close;
238 	sc->xb_disk->d_ioctl = blkif_ioctl;
239 	sc->xb_disk->d_strategy = xb_strategy;
240 	sc->xb_disk->d_dump = xb_dump;
241 	sc->xb_disk->d_name = name;
242 	sc->xb_disk->d_drv1 = sc;
243 	sc->xb_disk->d_sectorsize = sector_size;
244 
245 	/* XXX */
246 	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
247 #if 0
248 	sc->xb_disk->d_maxsize = DFLTPHYS;
249 #else /* XXX: xen can't handle large single i/o requests */
250 	sc->xb_disk->d_maxsize = 4096;
251 #endif
252 #ifdef notyet
253 	XENPRINTF("attaching device 0x%x unit %d capacity %llu\n",
254 		  xb_diskinfo[sc->xb_unit].device, sc->xb_unit,
255 		  sc->xb_disk->d_mediasize);
256 #endif
257 	sc->xb_disk->d_flags = 0;
258 	disk_create(sc->xb_disk, DISK_VERSION_00);
259 	bioq_init(&sc->xb_bioq);
260 
261 	return error;
262 }
263 
264 void
265 xlvbd_del(struct blkfront_info *info)
266 {
267 	struct xb_softc	*sc;
268 
269 	sc = info->sc;
270 	disk_destroy(sc->xb_disk);
271 }
272 /************************ end VBD support *****************/
273 
274 /*
275  * Read/write routine for a buffer: find the proper unit, place the bio on
276  * its sort queue, and kick the controller.
277  */
278 static void
279 xb_strategy(struct bio *bp)
280 {
281 	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;
282 
283 	/* bogus disk? */
284 	if (sc == NULL) {
285 		bp->bio_error = EINVAL;
286 		bp->bio_flags |= BIO_ERROR;
287 		goto bad;
288 	}
289 
290 	DPRINTK("");
291 
292 	/*
293 	 * Place it in the queue of disk activities for this disk
294 	 */
295 	mtx_lock(&blkif_io_lock);
296 
297 	bioq_disksort(&sc->xb_bioq, bp);
298 	xb_startio(sc);
299 
300 	mtx_unlock(&blkif_io_lock);
301 	return;
302 
303  bad:
304 	/*
305 	 * Correctly set the bio to indicate a failed transfer.
306 	 */
307 	bp->bio_resid = bp->bio_bcount;
308 	biodone(bp);
309 	return;
310 }
311 
312 static void xb_quiesce(struct blkfront_info *info);
313 // Quiesce the disk writes for a dump file before allowing the next buffer.
314 static void
315 xb_quiesce(struct blkfront_info *info)
316 {
317 	int		mtd;
318 
319 	// While there are outstanding requests
320 	while (blkif_queued_requests) {
321 		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, mtd);
322 		if (mtd) {
323 			// Received request completions; update the queue.
324 			blkif_int(info);
325 		}
326 		if (blkif_queued_requests) {
327 			// Still pending requests, wait for the disk i/o to complete
328 			HYPERVISOR_block();
329 		}
330 	}
331 }
332 
333 // Some bio structures for dumping core
334 #define DUMP_BIO_NO 16				// 16 * 4KB = 64KB dump block
335 static	struct bio		xb_dump_bp[DUMP_BIO_NO];
336 
337 // Kernel dump function for a paravirtualized disk device
338 static int
339 xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
340         size_t length)
341 {
342 	int			sbp;
343 	int			mbp;
344 	size_t			chunk;
345 	struct disk		*dp = arg;
346 	struct xb_softc		*sc = (struct xb_softc *)dp->d_drv1;
347 	int			rc = 0;
348 
349 	xb_quiesce(sc->xb_info);		// All quiet on the western front.
350 	if (length > 0) {
351 		// If this lock is held, then this module is failing, and a successful
352 		// kernel dump is highly unlikely anyway.
353 		mtx_lock(&blkif_io_lock);
354 		// Split the 64KB block into 16 4KB blocks
355 		for (sbp=0; length>0 && sbp<DUMP_BIO_NO; sbp++) {
356 			chunk = length > PAGE_SIZE ? PAGE_SIZE : length;
357 			xb_dump_bp[sbp].bio_disk   = dp;
358 			xb_dump_bp[sbp].bio_pblkno = offset / dp->d_sectorsize;
359 			xb_dump_bp[sbp].bio_bcount = chunk;
360 			xb_dump_bp[sbp].bio_resid  = chunk;
361 			xb_dump_bp[sbp].bio_data   = virtual;
362 			xb_dump_bp[sbp].bio_cmd    = BIO_WRITE;
363 			xb_dump_bp[sbp].bio_done   = NULL;
364 
365 			bioq_disksort(&sc->xb_bioq, &xb_dump_bp[sbp]);
366 
367 			length -= chunk;
368 			offset += chunk;
369 			virtual = (char *) virtual + chunk;
370 		}
371 		// Tell DOM0 to do the I/O
372 		xb_startio(sc);
373 		mtx_unlock(&blkif_io_lock);
374 
375 		// Must wait for the completion: the dump routine reuses the same
376 		//                               16 x 4KB buffer space.
377 		xb_quiesce(sc->xb_info);	// All quiet on the eastern front
378 		// If there were any errors, bail out...
379 		for (mbp=0; mbp<sbp; mbp++) {
380 			if ((rc = xb_dump_bp[mbp].bio_error)) break;
381 		}
382 	}
383 	return (rc);
384 }
385 
386 
387 static int
388 blkfront_probe(device_t dev)
389 {
390 
391 	if (!strcmp(xenbus_get_type(dev), "vbd")) {
392 		device_set_desc(dev, "Virtual Block Device");
393 		device_quiet(dev);
394 		return (0);
395 	}
396 
397 	return (ENXIO);
398 }
399 
400 /*
401  * Setup supplies the backend directory and virtual device.  We allocate an
402  * event channel and shared ring entries, then watch the backend to learn
403  * when it is ready.
404  */
405 static int
406 blkfront_attach(device_t dev)
407 {
408 	int error, vdevice, i, unit;
409 	struct blkfront_info *info;
410 	const char *name;
411 
412 	/* FIXME: Use dynamic device id if this is not set. */
413 	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
414 	    "virtual-device", NULL, "%i", &vdevice);
415 	if (error) {
416 		xenbus_dev_fatal(dev, error, "reading virtual-device");
417 		printf("couldn't find virtual device\n");
418 		return (error);
419 	}
420 
421 	blkfront_vdevice_to_unit(vdevice, &unit, &name);
422 	if (!strcmp(name, "xbd"))
423 		device_set_unit(dev, unit);
424 
425 	info = device_get_softc(dev);
426 
427 	/*
428 	 * XXX debug only
429 	 */
430 	for (i = 0; i < sizeof(*info); i++)
431 		if (((uint8_t *)info)[i] != 0)
432 			panic("non-null memory");
433 
434 	info->shadow_free = 0;
435 	info->xbdev = dev;
436 	info->vdevice = vdevice;
437 	info->connected = BLKIF_STATE_DISCONNECTED;
438 
439 	/* work queue needed ? */
440 	for (i = 0; i < BLK_RING_SIZE; i++)
441 		info->shadow[i].req.id = i+1;
442 	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
443 
444 	/* Front end dir is a number, which is used as the id. */
445 	info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
446 
447 	error = talk_to_backend(dev, info);
448 	if (error)
449 		return (error);
450 
451 	return (0);
452 }
453 
454 static int
455 blkfront_suspend(device_t dev)
456 {
457 	struct blkfront_info *info = device_get_softc(dev);
458 
459 	/* Prevent new requests being issued until we fix things up. */
460 	mtx_lock(&blkif_io_lock);
461 	info->connected = BLKIF_STATE_SUSPENDED;
462 	mtx_unlock(&blkif_io_lock);
463 
464 	return (0);
465 }
466 
467 static int
468 blkfront_resume(device_t dev)
469 {
470 	struct blkfront_info *info = device_get_softc(dev);
471 	int err;
472 
473 	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
474 
475 	blkif_free(info, 1);
476 	err = talk_to_backend(dev, info);
477 	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
478 		blkif_recover(info);
479 
480 	return (err);
481 }
482 
483 /* Common code used when first setting up, and when resuming. */
484 static int
485 talk_to_backend(device_t dev, struct blkfront_info *info)
486 {
487 	const char *message = NULL;
488 	struct xenbus_transaction xbt;
489 	int err;
490 
491 	/* Create shared ring, alloc event channel. */
492 	err = setup_blkring(dev, info);
493 	if (err)
494 		goto out;
495 
496  again:
497 	err = xenbus_transaction_start(&xbt);
498 	if (err) {
499 		xenbus_dev_fatal(dev, err, "starting transaction");
500 		goto destroy_blkring;
501 	}
502 
503 	err = xenbus_printf(xbt, xenbus_get_node(dev),
504 			    "ring-ref","%u", info->ring_ref);
505 	if (err) {
506 		message = "writing ring-ref";
507 		goto abort_transaction;
508 	}
509 	err = xenbus_printf(xbt, xenbus_get_node(dev),
510 		"event-channel", "%u", irq_to_evtchn_port(info->irq));
511 	if (err) {
512 		message = "writing event-channel";
513 		goto abort_transaction;
514 	}
515 	err = xenbus_printf(xbt, xenbus_get_node(dev),
516 		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
517 	if (err) {
518 		message = "writing protocol";
519 		goto abort_transaction;
520 	}
521 
522 	err = xenbus_transaction_end(xbt, 0);
523 	if (err) {
524 		if (err == EAGAIN)
525 			goto again;
526 		xenbus_dev_fatal(dev, err, "completing transaction");
527 		goto destroy_blkring;
528 	}
529 	xenbus_set_state(dev, XenbusStateInitialised);
530 
531 	return 0;
532 
533  abort_transaction:
534 	xenbus_transaction_end(xbt, 1);
535 	if (message)
536 		xenbus_dev_fatal(dev, err, "%s", message);
537  destroy_blkring:
538 	blkif_free(info, 0);
539  out:
540 	return err;
541 }
542 
543 static int
544 setup_blkring(device_t dev, struct blkfront_info *info)
545 {
546 	blkif_sring_t *sring;
547 	int error;
548 
549 	info->ring_ref = GRANT_INVALID_REF;
550 
551 	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
552 	if (sring == NULL) {
553 		xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring");
554 		return ENOMEM;
555 	}
556 	SHARED_RING_INIT(sring);
557 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
558 
559 	error = xenbus_grant_ring(dev,
560 	    (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
561 	if (error) {
562 		free(sring, M_DEVBUF);
563 		info->ring.sring = NULL;
564 		goto fail;
565 	}
566 
567 	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
568 	    "xbd", (driver_intr_t *)blkif_int, info,
569 	    INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
570 	if (error) {
571 		xenbus_dev_fatal(dev, error,
572 		    "bind_evtchn_to_irqhandler failed");
573 		goto fail;
574 	}
575 
576 	return (0);
577  fail:
578 	blkif_free(info, 0);
579 	return (error);
580 }
581 
582 
583 /**
584  * Callback received when the backend's state changes.
585  */
586 static int
587 blkfront_backend_changed(device_t dev, XenbusState backend_state)
588 {
589 	struct blkfront_info *info = device_get_softc(dev);
590 
591 	DPRINTK("backend_state=%d\n", backend_state);
592 
593 	switch (backend_state) {
594 	case XenbusStateUnknown:
595 	case XenbusStateInitialising:
596 	case XenbusStateInitWait:
597 	case XenbusStateInitialised:
598 	case XenbusStateClosed:
599 	case XenbusStateReconfigured:
600 	case XenbusStateReconfiguring:
601 		break;
602 
603 	case XenbusStateConnected:
604 		connect(dev, info);
605 		break;
606 
607 	case XenbusStateClosing:
608 		if (info->users > 0)
609 			xenbus_dev_error(dev, -EBUSY,
610 					 "Device in use; refusing to close");
611 		else
612 			blkfront_closing(dev);
613 #ifdef notyet
614 		bd = bdget(info->dev);
615 		if (bd == NULL)
616 			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
617 
618 		down(&bd->bd_sem);
619 		if (info->users > 0)
620 			xenbus_dev_error(dev, -EBUSY,
621 					 "Device in use; refusing to close");
622 		else
623 			blkfront_closing(dev);
624 		up(&bd->bd_sem);
625 		bdput(bd);
626 #endif
627 	}
628 
629 	return (0);
630 }
631 
632 /*
633 ** Invoked when the backend is finally 'ready' (and has produced the
634 ** details about the physical device - #sectors, sector size, etc.).
635 */
636 static void
637 connect(device_t dev, struct blkfront_info *info)
638 {
639 	unsigned long sectors, sector_size;
640 	unsigned int binfo;
641 	int err;
642 
643 	if ((info->connected == BLKIF_STATE_CONNECTED) ||
644 	    (info->connected == BLKIF_STATE_SUSPENDED))
645 		return;
646 
647 	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
648 
649 	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
650 			    "sectors", "%lu", &sectors,
651 			    "info", "%u", &binfo,
652 			    "sector-size", "%lu", &sector_size,
653 			    NULL);
654 	if (err) {
655 		xenbus_dev_fatal(dev, err,
656 		    "reading backend fields at %s",
657 		    xenbus_get_otherend_path(dev));
658 		return;
659 	}
660 	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
661 			    "feature-barrier", "%lu", &info->feature_barrier,
662 			    NULL);
663 	if (err)
664 		info->feature_barrier = 0;
665 
666 	device_printf(dev, "%juMB <%s> at %s",
667 	    (uintmax_t) sectors / (1048576 / sector_size),
668 	    device_get_desc(dev),
669 	    xenbus_get_node(dev));
670 	bus_print_child_footer(device_get_parent(dev), dev);
671 
672 	xlvbd_add(dev, sectors, info->vdevice, binfo, sector_size, info);
673 
674 	(void)xenbus_set_state(dev, XenbusStateConnected);
675 
676 	/* Kick pending requests. */
677 	mtx_lock(&blkif_io_lock);
678 	info->connected = BLKIF_STATE_CONNECTED;
679 	kick_pending_request_queues(info);
680 	mtx_unlock(&blkif_io_lock);
681 	info->is_ready = 1;
682 
683 #if 0
684 	add_disk(info->gd);
685 #endif
686 }
687 
688 /**
689  * Handle the change of state of the backend to Closing.  We must delete our
690  * device-layer structures now, to ensure that writes are flushed through to
691  * the backend.  Once this is done, we can switch to Closed in
692  * acknowledgement.
693  */
694 static void
695 blkfront_closing(device_t dev)
696 {
697 	struct blkfront_info *info = device_get_softc(dev);
698 
699 	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
700 
701 	if (info->mi) {
702 		DPRINTK("Calling xlvbd_del\n");
703 		xlvbd_del(info);
704 		info->mi = NULL;
705 	}
706 
707 	xenbus_set_state(dev, XenbusStateClosed);
708 }
709 
710 
711 static int
712 blkfront_detach(device_t dev)
713 {
714 	struct blkfront_info *info = device_get_softc(dev);
715 
716 	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));
717 
718 	blkif_free(info, 0);
719 
720 	return 0;
721 }
722 
723 
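/*
 * Pop a free slot off the shadow request free list and account for one more
 * outstanding request.
 */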
724 static inline int
725 GET_ID_FROM_FREELIST(struct blkfront_info *info)
726 {
727 	unsigned long nfree = info->shadow_free;
728 
729 	KASSERT(nfree <= BLK_RING_SIZE, ("free %lu > RING_SIZE", nfree));
730 	info->shadow_free = info->shadow[nfree].req.id;
731 	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
732 	atomic_add_int(&blkif_queued_requests, 1);
733 	return nfree;
734 }
735 
736 static inline void
737 ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
738 {
739 	info->shadow[id].req.id  = info->shadow_free;
740 	info->shadow[id].request = 0;
741 	info->shadow_free = id;
742 	atomic_subtract_int(&blkif_queued_requests, 1);
743 }
744 
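/*
 * Push any new requests onto the shared ring and notify the backend via its
 * event channel if it is waiting for work.
 */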
745 static inline void
746 flush_requests(struct blkfront_info *info)
747 {
748 	int notify;
749 
750 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
751 
752 	if (notify)
753 		notify_remote_via_irq(info->irq);
754 }
755 
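/* Restart queued I/O if there is room on the shared ring. */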
756 static void
757 kick_pending_request_queues(struct blkfront_info *info)
758 {
759 	/* XXX check if we can't simplify */
760 #if 0
761 	if (!RING_FULL(&info->ring)) {
762 		/* Re-enable calldowns. */
763 		blk_start_queue(info->rq);
764 		/* Kick things off immediately. */
765 		do_blkif_request(info->rq);
766 	}
767 #endif
768 	if (!RING_FULL(&info->ring)) {
769 #if 0
770 		sc = LIST_FIRST(&xbsl_head);
771 		LIST_REMOVE(sc, entry);
772 		/* Re-enable calldowns. */
773 		blk_start_queue(di->rq);
774 #endif
775 		/* Kick things off immediately. */
776 		xb_startio(info->sc);
777 	}
778 }
779 
780 #if 0
781 /* XXX */
782 static void blkif_restart_queue(void *arg)
783 {
784 	struct blkfront_info *info = (struct blkfront_info *)arg;
785 
786 	mtx_lock(&blkif_io_lock);
787 	kick_pending_request_queues(info);
788 	mtx_unlock(&blkif_io_lock);
789 }
790 #endif
791 
792 static void blkif_restart_queue_callback(void *arg)
793 {
794 #if 0
795 	struct blkfront_info *info = (struct blkfront_info *)arg;
796 	/* XXX BSD equiv ? */
797 
798 	schedule_work(&info->work);
799 #endif
800 }
801 
802 static int
803 blkif_open(struct disk *dp)
804 {
805 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
806 
807 	if (sc == NULL) {
808 		printf("xb%d: not found\n", dp->d_unit);
809 		return (ENXIO);
810 	}
811 
812 	sc->xb_flags |= XB_OPEN;
813 	sc->xb_info->users++;
814 	return (0);
815 }
816 
817 static int
818 blkif_close(struct disk *dp)
819 {
820 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
821 
822 	if (sc == NULL)
823 		return (ENXIO);
824 	sc->xb_flags &= ~XB_OPEN;
825 	if (--(sc->xb_info->users) == 0) {
826 		/* Check whether we have been instructed to close.  We will
827 		   have ignored this request initially, as the device was
828 		   still mounted. */
829 		device_t dev = sc->xb_info->xbdev;
830 		XenbusState state =
831 			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
832 
833 		if (state == XenbusStateClosing)
834 			blkfront_closing(dev);
835 	}
836 	return (0);
837 }
838 
839 static int
840 blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
841 {
842 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
843 
844 	if (sc == NULL)
845 		return (ENXIO);
846 
847 	return (ENOTTY);
848 }
849 
850 
851 /*
852  * blkif_queue_request
853  *
854  * request block io
855  *
856  * id: for guest use only.
857  * operation: BLKIF_OP_{READ,WRITE,PROBE}
858  * buffer: buffer to read/write into.  This should be a
859  *   virtual address in the guest OS.
860  */
861 static int blkif_queue_request(struct bio *bp)
862 {
863 	caddr_t alignbuf;
864 	vm_paddr_t buffer_ma;
865 	blkif_request_t     *ring_req;
866 	unsigned long id;
867 	uint64_t fsect, lsect;
868 	struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
869 	struct blkfront_info *info = sc->xb_info;
870 	int ref;
871 
872 	if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED))
873 		return 1;
874 
875 	if (gnttab_alloc_grant_references(
876 		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
877 		gnttab_request_free_callback(
878 			&info->callback,
879 			blkif_restart_queue_callback,
880 			info,
881 			BLKIF_MAX_SEGMENTS_PER_REQUEST);
882 		return 1;
883 	}
884 
885 	/* Check if the buffer is properly aligned */
886 	if ((vm_offset_t)bp->bio_data & PAGE_MASK) {
887 		int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE :
888 			PAGE_SIZE;
889 		caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF,
890 					M_NOWAIT);
891 
		if (newbuf == NULL) {
			/* Cannot bounce this bio right now; retry it later. */
			gnttab_free_grant_references(gref_head);
			return 1;
		}

892 		alignbuf = (char *)roundup2((u_long)newbuf, align);
893 
894 		/* remember the bounce buffer so it can be copied back and freed */
895 		bp->bio_driver1 = newbuf;
896 		bp->bio_driver2 = alignbuf;
897 
898 		/* Copy the data for a write */
899 		if (bp->bio_cmd == BIO_WRITE)
900 			bcopy(bp->bio_data, alignbuf, bp->bio_bcount);
901 	} else
902 		alignbuf = bp->bio_data;
903 
904 	/* Fill out a communications ring structure. */
905 	ring_req 	         = RING_GET_REQUEST(&info->ring,
906 						    info->ring.req_prod_pvt);
907 	id		         = GET_ID_FROM_FREELIST(info);
908 	info->shadow[id].request = (unsigned long)bp;
909 
910 	ring_req->id 	         = id;
911 	ring_req->operation 	 = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
912 		BLKIF_OP_WRITE;
913 
914 	ring_req->sector_number = (blkif_sector_t)bp->bio_pblkno;
915 	ring_req->handle 	  = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
916 
917 	ring_req->nr_segments  = 0;	/* XXX not doing scatter/gather since buffer
918 					 * chaining is not supported.
919 					 */
920 
921 	buffer_ma = vtomach(alignbuf);
922 	fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
923 	lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1;
924 	/* install a grant reference. */
925 	ref = gnttab_claim_grant_reference(&gref_head);
926 	KASSERT(ref != -ENOSPC, ("grant_reference failed"));
927 
928 	gnttab_grant_foreign_access_ref(
929 		ref,
930 		xenbus_get_otherend_id(info->xbdev),
931 		buffer_ma >> PAGE_SHIFT,
932 		ring_req->operation & 1); /* read-only iff this is a write */
933 	info->shadow[id].frame[ring_req->nr_segments] =
934 		buffer_ma >> PAGE_SHIFT;
935 
936 	ring_req->seg[ring_req->nr_segments] =
937 		(struct blkif_request_segment) {
938 			.gref       = ref,
939 			.first_sect = fsect,
940 			.last_sect  = lsect };
941 
942 	ring_req->nr_segments++;
943 	KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0,
944 		("XEN buffer must be sector aligned"));
945 	KASSERT(lsect <= 7,
946 		("XEN disk driver data cannot cross a page boundary"));
947 
948 	buffer_ma &= ~PAGE_MASK;
949 
950 	info->ring.req_prod_pvt++;
951 
952 	/* Keep a private copy so we can reissue requests when recovering. */
953 	info->shadow[id].req = *ring_req;
954 
955 	gnttab_free_grant_references(gref_head);
956 
957 	return 0;
958 }
959 
960 
961 
962 /*
963  * Dequeue buffers and place them in the shared communication ring.
964  * Return when no more requests can be accepted or all buffers have
965  * been queued.
966  *
967  * Signal XEN once the ring has been filled out.
968  */
969 static void
970 xb_startio(struct xb_softc *sc)
971 {
972 	struct bio		*bp;
973 	int			queued = 0;
974 	struct blkfront_info *info = sc->xb_info;
975 	DPRINTK("");
976 
977 	mtx_assert(&blkif_io_lock, MA_OWNED);
978 
979 	while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) {
980 
981 		if (RING_FULL(&info->ring))
982 			goto wait;
983 
984 		if (blkif_queue_request(bp)) {
985 		wait:
986 			bioq_insert_head(&sc->xb_bioq, bp);
987 			break;
988 		}
989 		queued++;
990 	}
991 
992 	if (queued != 0)
993 		flush_requests(sc->xb_info);
994 }
995 
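/*
 * Interrupt handler: consume completed responses from the shared ring,
 * finish the corresponding bios, and restart any queued I/O.
 */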
996 static void
997 blkif_int(void *xsc)
998 {
999 	struct xb_softc *sc = NULL;
1000 	struct bio *bp;
1001 	blkif_response_t *bret;
1002 	RING_IDX i, rp;
1003 	struct blkfront_info *info = xsc;
1004 	DPRINTK("");
1005 
1006 	TRACE_ENTER;
1007 
1008 	mtx_lock(&blkif_io_lock);
1009 
1010 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1011 		mtx_unlock(&blkif_io_lock);
1012 		return;
1013 	}
1014 
1015  again:
1016 	rp = info->ring.sring->rsp_prod;
1017 	rmb(); /* Ensure we see queued responses up to 'rp'. */
1018 
1019 	for (i = info->ring.rsp_cons; i != rp; i++) {
1020 		unsigned long id;
1021 
1022 		bret = RING_GET_RESPONSE(&info->ring, i);
1023 		id   = bret->id;
1024 		bp   = (struct bio *)info->shadow[id].request;
1025 
1026 		blkif_completion(&info->shadow[id]);
1027 
1028 		ADD_ID_TO_FREELIST(info, id);
1029 
1030 		switch (bret->operation) {
1031 		case BLKIF_OP_READ:
1032 			/* had an unaligned buffer that needs to be copied */
1033 			if (bp->bio_driver1)
1034 				bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount);
1035 			/* FALLTHROUGH */
1036 		case BLKIF_OP_WRITE:
1037 
1038 			/* free the copy buffer */
1039 			if (bp->bio_driver1) {
1040 				free(bp->bio_driver1, M_DEVBUF);
1041 				bp->bio_driver1 = NULL;
1042 			}
1043 
1044 			if (unlikely(bret->status != BLKIF_RSP_OKAY)) {
1045 				printf("Bad return from blkdev data request: %x\n",
1046 				    bret->status);
1047 				bp->bio_flags |= BIO_ERROR;
1048 			}
1049 
1050 			sc = (struct xb_softc *)bp->bio_disk->d_drv1;
1051 
1052 			if (bp->bio_flags & BIO_ERROR)
1053 				bp->bio_error = EIO;
1054 			else
1055 				bp->bio_resid = 0;
1056 
1057 			biodone(bp);
1058 			break;
1059 		default:
1060 			panic("received invalid operation");
1061 			break;
1062 		}
1063 	}
1064 
1065 	info->ring.rsp_cons = i;
1066 
1067 	if (i != info->ring.req_prod_pvt) {
1068 		int more_to_do;
1069 		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
1070 		if (more_to_do)
1071 			goto again;
1072 	} else {
1073 		info->ring.sring->rsp_event = i + 1;
1074 	}
1075 
1076 	kick_pending_request_queues(info);
1077 
1078 	mtx_unlock(&blkif_io_lock);
1079 }
1080 
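/*
 * Tear down the device channel: mark the device disconnected (or suspended),
 * revoke the shared ring grant, and unbind the event channel handler.
 */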
1081 static void
1082 blkif_free(struct blkfront_info *info, int suspend)
1083 {
1084 
1085 	/* Prevent new requests being issued until we fix things up. */
1086 	mtx_lock(&blkif_io_lock);
1087 	info->connected = suspend ?
1088 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1089 	mtx_unlock(&blkif_io_lock);
1090 
1091 	/* Free resources associated with old device channel. */
1092 	if (info->ring_ref != GRANT_INVALID_REF) {
1093 		gnttab_end_foreign_access(info->ring_ref,
1094 					  info->ring.sring);
1095 		info->ring_ref = GRANT_INVALID_REF;
1096 		info->ring.sring = NULL;
1097 	}
1098 	if (info->irq)
1099 		unbind_from_irqhandler(info->irq);
1100 	info->irq = 0;
1101 
1102 }
1103 
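/* Release the grant references used by a completed request. */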
1104 static void
1105 blkif_completion(struct blk_shadow *s)
1106 {
1107 	int i;
1108 
1109 	for (i = 0; i < s->req.nr_segments; i++)
1110 		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
1111 }
1112 
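/*
 * Re-issue any requests that were outstanding when the domain was suspended,
 * rebuilding the shadow free list and re-granting buffer access as we go.
 */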
1113 static void
1114 blkif_recover(struct blkfront_info *info)
1115 {
1116 	int i, j;
1117 	blkif_request_t *req;
1118 	struct blk_shadow *copy;
1119 
1120 	if (!info->sc)
1121 		return;
1122 
1123 	/* Stage 1: Make a safe copy of the shadow state. */
1124 	copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO);
1125 	memcpy(copy, info->shadow, sizeof(info->shadow));
1126 
1127 	/* Stage 2: Set up free list. */
1128 	memset(&info->shadow, 0, sizeof(info->shadow));
1129 	for (i = 0; i < BLK_RING_SIZE; i++)
1130 		info->shadow[i].req.id = i+1;
1131 	info->shadow_free = info->ring.req_prod_pvt;
1132 	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1133 
1134 	/* Stage 3: Find pending requests and requeue them. */
1135 	for (i = 0; i < BLK_RING_SIZE; i++) {
1136 		/* Not in use? */
1137 		if (copy[i].request == 0)
1138 			continue;
1139 
1140 		/* Grab a request slot and copy shadow state into it. */
1141 		req = RING_GET_REQUEST(
1142 			&info->ring, info->ring.req_prod_pvt);
1143 		*req = copy[i].req;
1144 
1145 		/* We get a new request id, and must reset the shadow state. */
1146 		req->id = GET_ID_FROM_FREELIST(info);
1147 		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
1148 
1149 		/* Rewrite any grant references invalidated by suspend/resume. */
1150 		for (j = 0; j < req->nr_segments; j++)
1151 			gnttab_grant_foreign_access_ref(
1152 				req->seg[j].gref,
1153 				xenbus_get_otherend_id(info->xbdev),
1154 				pfn_to_mfn(info->shadow[req->id].frame[j]),
1155 				0 /* assume not readonly */);
1156 
1157 		info->shadow[req->id].req = *req;
1158 
1159 		info->ring.req_prod_pvt++;
1160 	}
1161 
1162 	free(copy, M_DEVBUF);
1163 
1164 	xenbus_set_state(info->xbdev, XenbusStateConnected);
1165 
1166 	/* Now safe for us to use the shared ring */
1167 	mtx_lock(&blkif_io_lock);
1168 	info->connected = BLKIF_STATE_CONNECTED;
1169 	mtx_unlock(&blkif_io_lock);
1170 
1171 	/* Send off requeued requests */
1172 	mtx_lock(&blkif_io_lock);
1173 	flush_requests(info);
1174 
1175 	/* Kick any other new requests queued since we resumed */
1176 	kick_pending_request_queues(info);
1177 	mtx_unlock(&blkif_io_lock);
1178 }
1179 
1180 /* ** Driver registration ** */
1181 static device_method_t blkfront_methods[] = {
1182 	/* Device interface */
1183 	DEVMETHOD(device_probe,         blkfront_probe),
1184 	DEVMETHOD(device_attach,        blkfront_attach),
1185 	DEVMETHOD(device_detach,        blkfront_detach),
1186 	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
1187 	DEVMETHOD(device_suspend,       blkfront_suspend),
1188 	DEVMETHOD(device_resume,        blkfront_resume),
1189 
1190 	/* Xenbus interface */
1191 	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
1192 
1193 	{ 0, 0 }
1194 };
1195 
1196 static driver_t blkfront_driver = {
1197 	"xbd",
1198 	blkfront_methods,
1199 	sizeof(struct blkfront_info),
1200 };
1201 devclass_t blkfront_devclass;
1202 
1203 DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);
1204 
1205 MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock? */
1206 
1207