xref: /freebsd/sys/dev/xen/blkfront/blkfront.c (revision 9999d2cb7248eb9be1a287a25f2d1fbb64044091)
1 /*
2  * XenBSD block device driver
3  *
4  * Copyright (c) 2009 Frank Suchomel, Citrix
5  * Copyright (c) 2009 Doug F. Rabson, Citrix
6  * Copyright (c) 2005 Kip Macy
7  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
8  * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
9  *
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to
13  * deal in the Software without restriction, including without limitation the
14  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
15  * sell copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26  * DEALINGS IN THE SOFTWARE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/malloc.h>
35 #include <sys/kernel.h>
36 #include <vm/vm.h>
37 #include <vm/pmap.h>
38 
39 #include <sys/bio.h>
40 #include <sys/bus.h>
41 #include <sys/conf.h>
42 #include <sys/module.h>
43 
44 #include <machine/bus.h>
45 #include <sys/rman.h>
46 #include <machine/resource.h>
47 #include <machine/intr_machdep.h>
48 #include <machine/vmparam.h>
49 
50 #include <machine/xen/xen-os.h>
51 #include <machine/xen/xenfunc.h>
52 #include <xen/hypervisor.h>
53 #include <xen/xen_intr.h>
54 #include <xen/evtchn.h>
55 #include <xen/gnttab.h>
56 #include <xen/interface/grant_table.h>
57 #include <xen/interface/io/protocols.h>
58 #include <xen/xenbus/xenbusvar.h>
59 
60 #include <geom/geom_disk.h>
61 
62 #include <dev/xen/blkfront/block.h>
63 
64 #include "xenbus_if.h"
65 
66 #define    ASSERT(S)       KASSERT(S, (#S))
67 /* prototypes */
68 struct xb_softc;
69 static void xb_startio(struct xb_softc *sc);
70 static void connect(device_t, struct blkfront_info *);
71 static void blkfront_closing(device_t);
72 static int blkfront_detach(device_t);
73 static int talk_to_backend(device_t, struct blkfront_info *);
74 static int setup_blkring(device_t, struct blkfront_info *);
75 static void blkif_int(void *);
76 #if 0
77 static void blkif_restart_queue(void *arg);
78 #endif
79 static void blkif_recover(struct blkfront_info *);
80 static void blkif_completion(struct blk_shadow *);
81 static void blkif_free(struct blkfront_info *, int);
82 
83 #define GRANT_INVALID_REF 0
84 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
85 
86 LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head;
87 
88 /* Control whether runtime update of vbds is enabled. */
89 #define ENABLE_VBD_UPDATE 0
90 
91 #if ENABLE_VBD_UPDATE
92 static void vbd_update(void);
93 #endif
94 
95 
96 #define BLKIF_STATE_DISCONNECTED 0
97 #define BLKIF_STATE_CONNECTED    1
98 #define BLKIF_STATE_SUSPENDED    2
99 
100 #ifdef notyet
101 static char *blkif_state_name[] = {
102 	[BLKIF_STATE_DISCONNECTED] = "disconnected",
103 	[BLKIF_STATE_CONNECTED]    = "connected",
104 	[BLKIF_STATE_SUSPENDED]    = "closed",
105 };
106 
107 static char * blkif_status_name[] = {
108 	[BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
109 	[BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
110 	[BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
111 	[BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
112 };
113 #endif
114 #define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args)
115 #if 0
116 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
117 #else
118 #define DPRINTK(fmt, args...)
119 #endif
120 
121 static grant_ref_t gref_head;
122 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
123     (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
124 
125 static void kick_pending_request_queues(struct blkfront_info *);
126 static int blkif_open(struct disk *dp);
127 static int blkif_close(struct disk *dp);
128 static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
129 static int blkif_queue_request(struct bio *bp);
130 static void xb_strategy(struct bio *bp);
131 
132 // In order to quiesce the device during kernel dumps, outstanding requests to
133 // DOM0 for disk reads/writes need to be accounted for.
134 static	int	blkif_queued_requests;
135 static	int	xb_dump(void *, void *, vm_offset_t, off_t, size_t);
136 
137 
138 /* XXX move to xb_vbd.c when VBD update support is added */
139 #define MAX_VBDS 64
140 
141 #define XBD_SECTOR_SIZE		512	/* XXX: assume for now */
142 #define XBD_SECTOR_SHFT		9
143 
144 static struct mtx blkif_io_lock;
145 
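/*
 * Convert a guest pseudo-physical frame number into the machine frame
 * number that backs it, via the physical-to-machine translation.
 */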
146 static vm_paddr_t
147 pfn_to_mfn(vm_paddr_t pfn)
148 {
149 	return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT);
150 }
151 
152 /*
153  * Translate Linux major/minor to an appropriate name and unit
154  * number. For HVM guests, this allows us to use the same drive names
155  * with blkfront as the emulated drives, easing transition slightly.
156  */
157 static void
158 blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
159 {
160 	static struct vdev_info {
161 		int major;
162 		int shift;
163 		int base;
164 		const char *name;
165 	} info[] = {
166 		{3,	6,	0,	"ad"},	/* ide0 */
167 		{22,	6,	2,	"ad"},	/* ide1 */
168 		{33,	6,	4,	"ad"},	/* ide2 */
169 		{34,	6,	6,	"ad"},	/* ide3 */
170 		{56,	6,	8,	"ad"},	/* ide4 */
171 		{57,	6,	10,	"ad"},	/* ide5 */
172 		{88,	6,	12,	"ad"},	/* ide6 */
173 		{89,	6,	14,	"ad"},	/* ide7 */
174 		{90,	6,	16,	"ad"},	/* ide8 */
175 		{91,	6,	18,	"ad"},	/* ide9 */
176 
177 		{8,	4,	0,	"da"},	/* scsi disk0 */
178 		{65,	4,	16,	"da"},	/* scsi disk1 */
179 		{66,	4,	32,	"da"},	/* scsi disk2 */
180 		{67,	4,	48,	"da"},	/* scsi disk3 */
181 		{68,	4,	64,	"da"},	/* scsi disk4 */
182 		{69,	4,	80,	"da"},	/* scsi disk5 */
183 		{70,	4,	96,	"da"},	/* scsi disk6 */
184 		{71,	4,	112,	"da"},	/* scsi disk7 */
185 		{128,	4,	128,	"da"},	/* scsi disk8 */
186 		{129,	4,	144,	"da"},	/* scsi disk9 */
187 		{130,	4,	160,	"da"},	/* scsi disk10 */
188 		{131,	4,	176,	"da"},	/* scsi disk11 */
189 		{132,	4,	192,	"da"},	/* scsi disk12 */
190 		{133,	4,	208,	"da"},	/* scsi disk13 */
191 		{134,	4,	224,	"da"},	/* scsi disk14 */
192 		{135,	4,	240,	"da"},	/* scsi disk15 */
193 
194 		{202,	4,	0,	"xbd"},	/* xbd */
195 
196 		{0,	0,	0,	NULL},
197 	};
198 	int major = vdevice >> 8;
199 	int minor = vdevice & 0xff;
200 	int i;
201 
202 	if (vdevice & (1 << 28)) {
203 		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
204 		*name = "xbd";
		return;
205 	}
206 
207 	for (i = 0; info[i].major; i++) {
208 		if (info[i].major == major) {
209 			*unit = info[i].base + (minor >> info[i].shift);
210 			*name = info[i].name;
211 			return;
212 		}
213 	}
214 
215 	*unit = minor >> 4;
216 	*name = "xbd";
217 }
218 
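/*
 * Allocate the per-device softc, fill in a GEOM disk structure with the
 * device's methods and geometry, and register it with disk_create() so
 * it appears as /dev/<name><unit>.
 */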
219 int
220 xlvbd_add(device_t dev, blkif_sector_t capacity,
221     int vdevice, uint16_t vdisk_info, uint16_t sector_size,
222     struct blkfront_info *info)
223 {
224 	struct xb_softc	*sc;
225 	int	unit, error = 0;
226 	const char *name;
227 
228 	blkfront_vdevice_to_unit(vdevice, &unit, &name);
229 
230 	sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
231 	sc->xb_unit = unit;
232 	sc->xb_info = info;
233 	info->sc = sc;
234 
235 	if (strcmp(name, "xbd"))
236 		device_printf(dev, "attaching as %s%d\n", name, unit);
237 
239 	sc->xb_disk = disk_alloc();
240 	sc->xb_disk->d_unit = sc->xb_unit;
241 	sc->xb_disk->d_open = blkif_open;
242 	sc->xb_disk->d_close = blkif_close;
243 	sc->xb_disk->d_ioctl = blkif_ioctl;
244 	sc->xb_disk->d_strategy = xb_strategy;
245 	sc->xb_disk->d_dump = xb_dump;
246 	sc->xb_disk->d_name = name;
247 	sc->xb_disk->d_drv1 = sc;
248 	sc->xb_disk->d_sectorsize = sector_size;
249 
250 	/* XXX */
251 	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
252 #if 0
253 	sc->xb_disk->d_maxsize = DFLTPHYS;
254 #else /* XXX: xen can't handle large single i/o requests */
255 	sc->xb_disk->d_maxsize = 4096;
256 #endif
257 #ifdef notyet
258 	XENPRINTF("attaching device 0x%x unit %d capacity %llu\n",
259 		  xb_diskinfo[sc->xb_unit].device, sc->xb_unit,
260 		  sc->xb_disk->d_mediasize);
261 #endif
262 	sc->xb_disk->d_flags = 0;
263 	disk_create(sc->xb_disk, DISK_VERSION_00);
264 	bioq_init(&sc->xb_bioq);
265 
266 	return error;
267 }
268 
269 void
270 xlvbd_del(struct blkfront_info *info)
271 {
272 	struct xb_softc	*sc;
273 
274 	sc = info->sc;
275 	disk_destroy(sc->xb_disk);
276 }
277 /************************ end VBD support *****************/
278 
279 /*
280  * Read/write routine for a buffer.  Finds the proper unit, places it on
281  * the sort queue and kicks the controller.
282  */
283 static void
284 xb_strategy(struct bio *bp)
285 {
286 	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;
287 
288 	/* bogus disk? */
289 	if (sc == NULL) {
290 		bp->bio_error = EINVAL;
291 		bp->bio_flags |= BIO_ERROR;
292 		goto bad;
293 	}
294 
295 	DPRINTK("");
296 
297 	/*
298 	 * Place it in the queue of disk activities for this disk
299 	 */
300 	mtx_lock(&blkif_io_lock);
301 
302 	bioq_disksort(&sc->xb_bioq, bp);
303 	xb_startio(sc);
304 
305 	mtx_unlock(&blkif_io_lock);
306 	return;
307 
308  bad:
309 	/*
310 	 * Correctly set the bio to indicate a failed transfer.
311 	 */
312 	bp->bio_resid = bp->bio_bcount;
313 	biodone(bp);
314 	return;
315 }
316 
317 static void xb_quiesce(struct blkfront_info *info);
318 // Quiesce outstanding disk writes for a kernel dump before submitting the next buffer.
319 static void
320 xb_quiesce(struct blkfront_info *info)
321 {
322 	int		mtd;
323 
324 	// While there are outstanding requests
325 	while (blkif_queued_requests) {
326 		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, mtd);
327 		if (mtd) {
328 			// Received request completions; update the queue.
329 			blkif_int(info);
330 		}
331 		if (blkif_queued_requests) {
332 			// Requests still pending; wait for the disk I/O to complete.
333 			HYPERVISOR_yield();
334 		}
335 	}
336 }
337 
338 // Some bio structures for dumping core
339 #define DUMP_BIO_NO 16				// 16 * 4KB = 64KB dump block
340 static	struct bio		xb_dump_bp[DUMP_BIO_NO];
341 
342 // Kernel dump function for a paravirtualized disk device
343 static int
344 xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
345         size_t length)
346 {
347 	int			 sbp;
348 	int			 mbp;
349 	size_t			 chunk;
350 	struct disk		*dp = arg;
351 	struct xb_softc		*sc = (struct xb_softc *) dp->d_drv1;
352 	int			 rc = 0;
353 
354 	xb_quiesce(sc->xb_info);		// All quiet on the western front.
355 	if (length > 0) {
356 		// If this lock is held, then this module is failing, and a successful
357 		// kernel dump is highly unlikely anyway.
358 		mtx_lock(&blkif_io_lock);
359 		// Split the 64KB block into 16 4KB blocks
360 		for (sbp=0; length>0 && sbp<DUMP_BIO_NO; sbp++) {
361 			chunk = length > PAGE_SIZE ? PAGE_SIZE : length;
362 			xb_dump_bp[sbp].bio_disk   = dp;
363 			xb_dump_bp[sbp].bio_pblkno = offset / dp->d_sectorsize;
364 			xb_dump_bp[sbp].bio_bcount = chunk;
365 			xb_dump_bp[sbp].bio_resid  = chunk;
366 			xb_dump_bp[sbp].bio_data   = virtual;
367 			xb_dump_bp[sbp].bio_cmd    = BIO_WRITE;
368 			xb_dump_bp[sbp].bio_done   = NULL;
369 
370 			bioq_disksort(&sc->xb_bioq, &xb_dump_bp[sbp]);
371 
372 			length -= chunk;
373 			offset += chunk;
374 			virtual = (char *) virtual + chunk;
375 		}
376 		// Tell DOM0 to do the I/O
377 		xb_startio(sc);
378 		mtx_unlock(&blkif_io_lock);
379 
380 		// Must wait for the completion: the dump routine reuses the same
381 		//                               16 x 4KB buffer space.
382 		xb_quiesce(sc->xb_info);	// All quiet on the eastern front
383 		// If there were any errors, bail out...
384 		for (mbp=0; mbp<sbp; mbp++) {
385 			if ((rc = xb_dump_bp[mbp].bio_error)) break;
386 		}
387 	}
388 	return (rc);
389 }
390 
391 
392 static int
393 blkfront_probe(device_t dev)
394 {
395 
396 	if (!strcmp(xenbus_get_type(dev), "vbd")) {
397 		device_set_desc(dev, "Virtual Block Device");
398 		device_quiet(dev);
399 		return (0);
400 	}
401 
402 	return (ENXIO);
403 }
404 
405 /*
406  * Setup supplies the backend directory and the virtual device number.  We
407  * publish our event channel and shared ring reference, then watch the
408  * backend to learn when it is ready.
409  */
410 static int
411 blkfront_attach(device_t dev)
412 {
413 	int error, vdevice, i, unit;
414 	struct blkfront_info *info;
415 	const char *name;
416 
417 	/* FIXME: Use dynamic device id if this is not set. */
418 	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
419 	    "virtual-device", NULL, "%i", &vdevice);
420 	if (error) {
421 		xenbus_dev_fatal(dev, error, "reading virtual-device");
422 		printf("couldn't find virtual device\n");
423 		return (error);
424 	}
425 
426 	blkfront_vdevice_to_unit(vdevice, &unit, &name);
427 	if (!strcmp(name, "xbd"))
428 		device_set_unit(dev, unit);
429 
430 	info = device_get_softc(dev);
431 
432 	/*
433 	 * XXX debug only
434 	 */
435 	for (i = 0; i < sizeof(*info); i++)
436 		if (((uint8_t *)info)[i] != 0)
437 			panic("non-null memory");
438 
439 	info->shadow_free = 0;
440 	info->xbdev = dev;
441 	info->vdevice = vdevice;
442 	info->connected = BLKIF_STATE_DISCONNECTED;
443 
444 	/* work queue needed ? */
445 	for (i = 0; i < BLK_RING_SIZE; i++)
446 		info->shadow[i].req.id = i+1;
447 	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
448 
449 	/* The front-end directory name is a number, which is used as the id. */
450 	info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
451 
452 	error = talk_to_backend(dev, info);
453 	if (error)
454 		return (error);
455 
456 	return (0);
457 }
458 
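/*
 * Mark the interface as suspended under the I/O lock so that no new
 * requests are issued while the domain is being suspended.
 */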
459 static int
460 blkfront_suspend(device_t dev)
461 {
462 	struct blkfront_info *info = device_get_softc(dev);
463 
464 	/* Prevent new requests being issued until we fix things up. */
465 	mtx_lock(&blkif_io_lock);
466 	info->connected = BLKIF_STATE_SUSPENDED;
467 	mtx_unlock(&blkif_io_lock);
468 
469 	return (0);
470 }
471 
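/*
 * Re-establish the connection to the backend after a suspend/resume
 * cycle and replay any requests that were outstanding at suspend time.
 */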
472 static int
473 blkfront_resume(device_t dev)
474 {
475 	struct blkfront_info *info = device_get_softc(dev);
476 	int err;
477 
478 	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
479 
480 	blkif_free(info, 1);
481 	err = talk_to_backend(dev, info);
482 	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
483 		blkif_recover(info);
484 
485 	return (err);
486 }
487 
488 /* Common code used when first setting up, and when resuming. */
489 static int
490 talk_to_backend(device_t dev, struct blkfront_info *info)
491 {
492 	const char *message = NULL;
493 	struct xenbus_transaction xbt;
494 	int err;
495 
496 	/* Create shared ring, alloc event channel. */
497 	err = setup_blkring(dev, info);
498 	if (err)
499 		goto out;
500 
501  again:
502 	err = xenbus_transaction_start(&xbt);
503 	if (err) {
504 		xenbus_dev_fatal(dev, err, "starting transaction");
505 		goto destroy_blkring;
506 	}
507 
508 	err = xenbus_printf(xbt, xenbus_get_node(dev),
509 			    "ring-ref","%u", info->ring_ref);
510 	if (err) {
511 		message = "writing ring-ref";
512 		goto abort_transaction;
513 	}
514 	err = xenbus_printf(xbt, xenbus_get_node(dev),
515 		"event-channel", "%u", irq_to_evtchn_port(info->irq));
516 	if (err) {
517 		message = "writing event-channel";
518 		goto abort_transaction;
519 	}
520 	err = xenbus_printf(xbt, xenbus_get_node(dev),
521 		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
522 	if (err) {
523 		message = "writing protocol";
524 		goto abort_transaction;
525 	}
526 
527 	err = xenbus_transaction_end(xbt, 0);
528 	if (err) {
529 		if (err == EAGAIN)
530 			goto again;
531 		xenbus_dev_fatal(dev, err, "completing transaction");
532 		goto destroy_blkring;
533 	}
534 	xenbus_set_state(dev, XenbusStateInitialised);
535 
536 	return 0;
537 
538  abort_transaction:
539 	xenbus_transaction_end(xbt, 1);
540 	if (message)
541 		xenbus_dev_fatal(dev, err, "%s", message);
542  destroy_blkring:
543 	blkif_free(info, 0);
544  out:
545 	return err;
546 }
547 
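/*
 * Allocate and initialize the shared request ring, grant the backend
 * access to it, and bind an event channel interrupt handler for
 * completion notifications.
 */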
548 static int
549 setup_blkring(device_t dev, struct blkfront_info *info)
550 {
551 	blkif_sring_t *sring;
552 	int error;
553 
554 	info->ring_ref = GRANT_INVALID_REF;
555 
556 	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
557 	if (sring == NULL) {
558 		xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring");
559 		return ENOMEM;
560 	}
561 	SHARED_RING_INIT(sring);
562 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
563 
564 	error = xenbus_grant_ring(dev,
565 	    (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
566 	if (error) {
567 		free(sring, M_DEVBUF);
568 		info->ring.sring = NULL;
569 		goto fail;
570 	}
571 
572 	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
573 	    "xbd", (driver_intr_t *)blkif_int, info,
574 	    INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
575 	if (error) {
576 		xenbus_dev_fatal(dev, error,
577 		    "bind_evtchn_to_irqhandler failed");
578 		goto fail;
579 	}
580 
581 	return (0);
582  fail:
583 	blkif_free(info, 0);
584 	return (error);
585 }
586 
587 
588 /**
589  * Callback received when the backend's state changes.
590  */
591 static int
592 blkfront_backend_changed(device_t dev, XenbusState backend_state)
593 {
594 	struct blkfront_info *info = device_get_softc(dev);
595 
596 	DPRINTK("backend_state=%d\n", backend_state);
597 
598 	switch (backend_state) {
599 	case XenbusStateUnknown:
600 	case XenbusStateInitialising:
601 	case XenbusStateInitWait:
602 	case XenbusStateInitialised:
603 	case XenbusStateClosed:
604 	case XenbusStateReconfigured:
605 	case XenbusStateReconfiguring:
606 		break;
607 
608 	case XenbusStateConnected:
609 		connect(dev, info);
610 		break;
611 
612 	case XenbusStateClosing:
613 		if (info->users > 0)
614 			xenbus_dev_error(dev, -EBUSY,
615 					 "Device in use; refusing to close");
616 		else
617 			blkfront_closing(dev);
618 #ifdef notyet
619 		bd = bdget(info->dev);
620 		if (bd == NULL)
621 			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
622 
623 		down(&bd->bd_sem);
624 		if (info->users > 0)
625 			xenbus_dev_error(dev, -EBUSY,
626 					 "Device in use; refusing to close");
627 		else
628 			blkfront_closing(dev);
629 		up(&bd->bd_sem);
630 		bdput(bd);
631 #endif
632 	}
633 
634 	return (0);
635 }
636 
637 /*
638 ** Invoked when the backend is finally 'ready' (and has told us the
639 ** details about the physical device - #sectors, size, etc).
640 */
641 static void
642 connect(device_t dev, struct blkfront_info *info)
643 {
644 	unsigned long sectors, sector_size;
645 	unsigned int binfo;
646 	int err;
647 
648 	if ((info->connected == BLKIF_STATE_CONNECTED) ||
649 	    (info->connected == BLKIF_STATE_SUSPENDED))
650 		return;
651 
652 	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
653 
654 	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
655 			    "sectors", "%lu", &sectors,
656 			    "info", "%u", &binfo,
657 			    "sector-size", "%lu", &sector_size,
658 			    NULL);
659 	if (err) {
660 		xenbus_dev_fatal(dev, err,
661 		    "reading backend fields at %s",
662 		    xenbus_get_otherend_path(dev));
663 		return;
664 	}
665 	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
666 			    "feature-barrier", "%lu", &info->feature_barrier,
667 			    NULL);
668 	if (err)
669 		info->feature_barrier = 0;
670 
671 	device_printf(dev, "%juMB <%s> at %s",
672 	    (uintmax_t) sectors / (1048576 / sector_size),
673 	    device_get_desc(dev),
674 	    xenbus_get_node(dev));
675 	bus_print_child_footer(device_get_parent(dev), dev);
676 
677 	xlvbd_add(dev, sectors, info->vdevice, binfo, sector_size, info);
678 
679 	(void)xenbus_set_state(dev, XenbusStateConnected);
680 
681 	/* Kick pending requests. */
682 	mtx_lock(&blkif_io_lock);
683 	info->connected = BLKIF_STATE_CONNECTED;
684 	kick_pending_request_queues(info);
685 	mtx_unlock(&blkif_io_lock);
686 	info->is_ready = 1;
687 
688 #if 0
689 	add_disk(info->gd);
690 #endif
691 }
692 
693 /**
694  * Handle the change of state of the backend to Closing.  We must delete our
695  * device-layer structures now, to ensure that writes are flushed through to
696  * the backend.  Once this is done, we can switch to Closed in
697  * acknowledgement.
698  */
699 static void
700 blkfront_closing(device_t dev)
701 {
702 	struct blkfront_info *info = device_get_softc(dev);
703 
704 	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
705 
706 	if (info->mi) {
707 		DPRINTK("Calling xlvbd_del\n");
708 		xlvbd_del(info);
709 		info->mi = NULL;
710 	}
711 
712 	xenbus_set_state(dev, XenbusStateClosed);
713 }
714 
715 
716 static int
717 blkfront_detach(device_t dev)
718 {
719 	struct blkfront_info *info = device_get_softc(dev);
720 
721 	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));
722 
723 	blkif_free(info, 0);
724 
725 	return 0;
726 }
727 
728 
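/*
 * Pop a free shadow-request slot off the free list and count one more
 * outstanding request (used by xb_quiesce() when dumping core).
 */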
729 static inline int
730 GET_ID_FROM_FREELIST(struct blkfront_info *info)
731 {
732 	unsigned long nfree = info->shadow_free;
733 
734 	KASSERT(nfree <= BLK_RING_SIZE, ("free %lu > RING_SIZE", nfree));
735 	info->shadow_free = info->shadow[nfree].req.id;
736 	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
737 	atomic_add_int(&blkif_queued_requests, 1);
738 	return nfree;
739 }
740 
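/*
 * Return a shadow-request slot to the free list and drop the count of
 * outstanding requests.
 */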
741 static inline void
742 ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
743 {
744 	info->shadow[id].req.id  = info->shadow_free;
745 	info->shadow[id].request = 0;
746 	info->shadow_free = id;
747 	atomic_subtract_int(&blkif_queued_requests, 1);
748 }
749 
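/*
 * Publish the queued requests on the shared ring and notify the backend
 * through the event channel if it is waiting for more work.
 */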
750 static inline void
751 flush_requests(struct blkfront_info *info)
752 {
753 	int notify;
754 
755 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
756 
757 	if (notify)
758 		notify_remote_via_irq(info->irq);
759 }
760 
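/* Restart I/O submission if there is room on the shared ring. */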
761 static void
762 kick_pending_request_queues(struct blkfront_info *info)
763 {
764 	/* XXX check if we can't simplify */
765 #if 0
766 	if (!RING_FULL(&info->ring)) {
767 		/* Re-enable calldowns. */
768 		blk_start_queue(info->rq);
769 		/* Kick things off immediately. */
770 		do_blkif_request(info->rq);
771 	}
772 #endif
773 	if (!RING_FULL(&info->ring)) {
774 #if 0
775 		sc = LIST_FIRST(&xbsl_head);
776 		LIST_REMOVE(sc, entry);
777 		/* Re-enable calldowns. */
778 		blk_start_queue(di->rq);
779 #endif
780 		/* Kick things off immediately. */
781 		xb_startio(info->sc);
782 	}
783 }
784 
785 #if 0
786 /* XXX */
787 static void blkif_restart_queue(void *arg)
788 {
789 	struct blkfront_info *info = (struct blkfront_info *)arg;
790 
791 	mtx_lock(&blkif_io_lock);
792 	kick_pending_request_queues(info);
793 	mtx_unlock(&blkif_io_lock);
794 }
795 #endif
796 
797 static void blkif_restart_queue_callback(void *arg)
798 {
799 #if 0
800 	struct blkfront_info *info = (struct blkfront_info *)arg;
801 	/* XXX BSD equiv ? */
802 
803 	schedule_work(&info->work);
804 #endif
805 }
806 
807 static int
808 blkif_open(struct disk *dp)
809 {
810 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
811 
812 	if (sc == NULL) {
813 		printf("xbd: disk not found\n");
814 		return (ENXIO);
815 	}
816 
817 	sc->xb_flags |= XB_OPEN;
818 	sc->xb_info->users++;
819 	return (0);
820 }
821 
822 static int
823 blkif_close(struct disk *dp)
824 {
825 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
826 
827 	if (sc == NULL)
828 		return (ENXIO);
829 	sc->xb_flags &= ~XB_OPEN;
830 	if (--(sc->xb_info->users) == 0) {
831 		/* Check whether we have been instructed to close.  We will
832 		   have ignored this request initially, as the device was
833 		   still mounted. */
834 		device_t dev = sc->xb_info->xbdev;
835 		XenbusState state =
836 			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
837 
838 		if (state == XenbusStateClosing)
839 			blkfront_closing(dev);
840 	}
841 	return (0);
842 }
843 
844 static int
845 blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
846 {
847 	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
848 
849 	if (sc == NULL)
850 		return (ENXIO);
851 
852 	return (ENOTTY);
853 }
854 
855 
856 /*
857  * blkif_queue_request
858  *
859  * Request block I/O.
860  *
861  * id: for guest use only.
862  * operation: BLKIF_OP_{READ,WRITE,PROBE}
863  * buffer: buffer to read/write into.  This should be a
864  *   virtual address in the guest OS.
865  */
866 static int blkif_queue_request(struct bio *bp)
867 {
868 	caddr_t alignbuf;
869 	vm_paddr_t buffer_ma;
870 	blkif_request_t     *ring_req;
871 	unsigned long id;
872 	uint64_t fsect, lsect;
873 	struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
874 	struct blkfront_info *info = sc->xb_info;
875 	int ref;
876 
877 	if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED))
878 		return 1;
879 
880 	if (gnttab_alloc_grant_references(
881 		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
882 		gnttab_request_free_callback(
883 			&info->callback,
884 			blkif_restart_queue_callback,
885 			info,
886 			BLKIF_MAX_SEGMENTS_PER_REQUEST);
887 		return 1;
888 	}
889 
890 	/* Check if the buffer is properly aligned */
891 	if ((vm_offset_t)bp->bio_data & PAGE_MASK) {
892 		int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE :
893 			PAGE_SIZE;
894 		caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF,
895 					M_NOWAIT);
896 
		/*
		 * If the bounce buffer cannot be allocated, give back the
		 * grant references and let the caller retry this bio later.
		 */
		if (newbuf == NULL) {
			gnttab_free_grant_references(gref_head);
			return 1;
		}

897 		alignbuf = (char *)roundup2((u_long)newbuf, align);
898 
899 		/* save a copy of the current buffer */
900 		bp->bio_driver1 = newbuf;
901 		bp->bio_driver2 = alignbuf;
902 
903 		/* Copy the data for a write */
904 		if (bp->bio_cmd == BIO_WRITE)
905 			bcopy(bp->bio_data, alignbuf, bp->bio_bcount);
906 	} else
907 		alignbuf = bp->bio_data;
908 
909 	/* Fill out a communications ring structure. */
910 	ring_req 	         = RING_GET_REQUEST(&info->ring,
911 						    info->ring.req_prod_pvt);
912 	id		         = GET_ID_FROM_FREELIST(info);
913 	info->shadow[id].request = (unsigned long)bp;
914 
915 	ring_req->id 	         = id;
916 	ring_req->operation 	 = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
917 		BLKIF_OP_WRITE;
918 
919 	ring_req->sector_number = (blkif_sector_t)bp->bio_pblkno;
920 	ring_req->handle 	  = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
921 
922 	ring_req->nr_segments  = 0;	/* XXX not doing scatter/gather since buffer
923 					 * chaining is not supported.
924 					 */
925 
926 	buffer_ma = vtomach(alignbuf);
927 	fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
928 	lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1;
929 	/* install a grant reference. */
930 	ref = gnttab_claim_grant_reference(&gref_head);
931 	KASSERT( ref != -ENOSPC, ("grant_reference failed") );
932 
933 	gnttab_grant_foreign_access_ref(
934 		ref,
935 		xenbus_get_otherend_id(info->xbdev),
936 		buffer_ma >> PAGE_SHIFT,
937 		ring_req->operation & 1 ); /* read-only for writes, writable for reads */
938 	info->shadow[id].frame[ring_req->nr_segments] =
939 		buffer_ma >> PAGE_SHIFT;
940 
941 	ring_req->seg[ring_req->nr_segments] =
942 		(struct blkif_request_segment) {
943 			.gref       = ref,
944 			.first_sect = fsect,
945 			.last_sect  = lsect };
946 
947 	ring_req->nr_segments++;
948 	KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0,
949 		("XEN buffer must be sector aligned"));
950 	KASSERT(lsect <= 7,
951 		("XEN disk driver data cannot cross a page boundary"));
952 
953 	buffer_ma &= ~PAGE_MASK;
954 
955 	info->ring.req_prod_pvt++;
956 
957 	/* Keep a private copy so we can reissue requests when recovering. */
958 	info->shadow[id].req = *ring_req;
959 
960 	gnttab_free_grant_references(gref_head);
961 
962 	return 0;
963 }
964 
965 
966 
967 /*
968  * Dequeue buffers and place them in the shared communication ring.
969  * Return when no more requests can be accepted or all buffers have
970  * been queued.
971  *
972  * Signal XEN once the ring has been filled out.
973  */
974 static void
975 xb_startio(struct xb_softc *sc)
976 {
977 	struct bio		*bp;
978 	int			queued = 0;
979 	struct blkfront_info *info = sc->xb_info;
980 	DPRINTK("");
981 
982 	mtx_assert(&blkif_io_lock, MA_OWNED);
983 
984 	while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) {
985 
986 		if (RING_FULL(&info->ring))
987 			goto wait;
988 
989 		if (blkif_queue_request(bp)) {
990 		wait:
991 			bioq_insert_head(&sc->xb_bioq, bp);
992 			break;
993 		}
994 		queued++;
995 	}
996 
997 	if (queued != 0)
998 		flush_requests(sc->xb_info);
999 }
1000 
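/*
 * Interrupt handler: consume responses from the shared ring, complete
 * the corresponding bios, recycle their shadow slots, and restart
 * submission of any queued requests.
 */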
1001 static void
1002 blkif_int(void *xsc)
1003 {
1004 	struct xb_softc *sc = NULL;
1005 	struct bio *bp;
1006 	blkif_response_t *bret;
1007 	RING_IDX i, rp;
1008 	struct blkfront_info *info = xsc;
1009 	DPRINTK("");
1010 
1011 	TRACE_ENTER;
1012 
1013 	mtx_lock(&blkif_io_lock);
1014 
1015 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1016 		mtx_unlock(&blkif_io_lock);
1017 		return;
1018 	}
1019 
1020  again:
1021 	rp = info->ring.sring->rsp_prod;
1022 	rmb(); /* Ensure we see queued responses up to 'rp'. */
1023 
1024 	for (i = info->ring.rsp_cons; i != rp; i++) {
1025 		unsigned long id;
1026 
1027 		bret = RING_GET_RESPONSE(&info->ring, i);
1028 		id   = bret->id;
1029 		bp   = (struct bio *)info->shadow[id].request;
1030 
1031 		blkif_completion(&info->shadow[id]);
1032 
1033 		ADD_ID_TO_FREELIST(info, id);
1034 
1035 		switch (bret->operation) {
1036 		case BLKIF_OP_READ:
1037 			/* had an unaligned buffer that needs to be copied */
1038 			if (bp->bio_driver1)
1039 				bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount);
1040 			/* FALLTHROUGH */
1041 		case BLKIF_OP_WRITE:
1042 
1043 			/* free the copy buffer */
1044 			if (bp->bio_driver1) {
1045 				free(bp->bio_driver1, M_DEVBUF);
1046 				bp->bio_driver1 = NULL;
1047 			}
1048 
1049 			if (unlikely(bret->status != BLKIF_RSP_OKAY)) {
1050 				printf("Bad return from blkdev data request: %x\n",
1051 				    bret->status);
1052 				bp->bio_flags |= BIO_ERROR;
1053 			}
1054 
1055 			sc = (struct xb_softc *)bp->bio_disk->d_drv1;
1056 
1057 			if (bp->bio_flags & BIO_ERROR)
1058 				bp->bio_error = EIO;
1059 			else
1060 				bp->bio_resid = 0;
1061 
1062 			biodone(bp);
1063 			break;
1064 		default:
1065 			panic("received invalid operation");
1066 			break;
1067 		}
1068 	}
1069 
1070 	info->ring.rsp_cons = i;
1071 
1072 	if (i != info->ring.req_prod_pvt) {
1073 		int more_to_do;
1074 		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
1075 		if (more_to_do)
1076 			goto again;
1077 	} else {
1078 		info->ring.sring->rsp_event = i + 1;
1079 	}
1080 
1081 	kick_pending_request_queues(info);
1082 
1083 	mtx_unlock(&blkif_io_lock);
1084 }
1085 
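/*
 * Tear down the device channel: mark the interface disconnected (or
 * suspended), revoke the shared ring grant, and unbind the interrupt
 * handler.
 */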
1086 static void
1087 blkif_free(struct blkfront_info *info, int suspend)
1088 {
1089 
1090 	/* Prevent new requests being issued until we fix things up. */
1091 	mtx_lock(&blkif_io_lock);
1092 	info->connected = suspend ?
1093 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1094 	mtx_unlock(&blkif_io_lock);
1095 
1096 	/* Free resources associated with old device channel. */
1097 	if (info->ring_ref != GRANT_INVALID_REF) {
1098 		gnttab_end_foreign_access(info->ring_ref,
1099 					  info->ring.sring);
1100 		info->ring_ref = GRANT_INVALID_REF;
1101 		info->ring.sring = NULL;
1102 	}
1103 	if (info->irq)
1104 		unbind_from_irqhandler(info->irq);
1105 	info->irq = 0;
1106 
1107 }
1108 
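/* End foreign access on every grant reference used by a completed request. */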
1109 static void
1110 blkif_completion(struct blk_shadow *s)
1111 {
1112 	int i;
1113 
1114 	for (i = 0; i < s->req.nr_segments; i++)
1115 		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
1116 }
1117 
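/*
 * After a resume, rebuild the shadow free list and requeue the requests
 * that were in flight when the domain was suspended, re-granting the
 * backend access to their data pages.
 */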
1118 static void
1119 blkif_recover(struct blkfront_info *info)
1120 {
1121 	int i, j;
1122 	blkif_request_t *req;
1123 	struct blk_shadow *copy;
1124 
1125 	if (!info->sc)
1126 		return;
1127 
1128 	/* Stage 1: Make a safe copy of the shadow state. */
1129 	copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO);
1130 	memcpy(copy, info->shadow, sizeof(info->shadow));
1131 
1132 	/* Stage 2: Set up free list. */
1133 	memset(&info->shadow, 0, sizeof(info->shadow));
1134 	for (i = 0; i < BLK_RING_SIZE; i++)
1135 		info->shadow[i].req.id = i+1;
1136 	info->shadow_free = info->ring.req_prod_pvt;
1137 	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1138 
1139 	/* Stage 3: Find pending requests and requeue them. */
1140 	for (i = 0; i < BLK_RING_SIZE; i++) {
1141 		/* Not in use? */
1142 		if (copy[i].request == 0)
1143 			continue;
1144 
1145 		/* Grab a request slot and copy shadow state into it. */
1146 		req = RING_GET_REQUEST(
1147 			&info->ring, info->ring.req_prod_pvt);
1148 		*req = copy[i].req;
1149 
1150 		/* We get a new request id, and must reset the shadow state. */
1151 		req->id = GET_ID_FROM_FREELIST(info);
1152 		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
1153 
1154 		/* Rewrite any grant references invalidated by suspend/resume. */
1155 		for (j = 0; j < req->nr_segments; j++)
1156 			gnttab_grant_foreign_access_ref(
1157 				req->seg[j].gref,
1158 				xenbus_get_otherend_id(info->xbdev),
1159 				pfn_to_mfn(info->shadow[req->id].frame[j]),
1160 				0 /* assume not readonly */);
1161 
1162 		info->shadow[req->id].req = *req;
1163 
1164 		info->ring.req_prod_pvt++;
1165 	}
1166 
1167 	free(copy, M_DEVBUF);
1168 
1169 	xenbus_set_state(info->xbdev, XenbusStateConnected);
1170 
1171 	/* Now safe for us to use the shared ring */
1172 	mtx_lock(&blkif_io_lock);
1173 	info->connected = BLKIF_STATE_CONNECTED;
1174 	mtx_unlock(&blkif_io_lock);
1175 
1176 	/* Send off requeued requests */
1177 	mtx_lock(&blkif_io_lock);
1178 	flush_requests(info);
1179 
1180 	/* Kick any other new requests queued since we resumed */
1181 	kick_pending_request_queues(info);
1182 	mtx_unlock(&blkif_io_lock);
1183 }
1184 
1185 /* ** Driver registration ** */
1186 static device_method_t blkfront_methods[] = {
1187 	/* Device interface */
1188 	DEVMETHOD(device_probe,         blkfront_probe),
1189 	DEVMETHOD(device_attach,        blkfront_attach),
1190 	DEVMETHOD(device_detach,        blkfront_detach),
1191 	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
1192 	DEVMETHOD(device_suspend,       blkfront_suspend),
1193 	DEVMETHOD(device_resume,        blkfront_resume),
1194 
1195 	/* Xenbus interface */
1196 	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
1197 
1198 	{ 0, 0 }
1199 };
1200 
1201 static driver_t blkfront_driver = {
1202 	"xbd",
1203 	blkfront_methods,
1204 	sizeof(struct blkfront_info),
1205 };
1206 devclass_t blkfront_devclass;
1207 
1208 DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);
1209 
1210 MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock? */
1211 
1212