1 /*- 2 * Copyright (c) 2009-2010 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/malloc.h> 47 48 #include <sys/bio.h> 49 #include <sys/bus.h> 50 #include <sys/conf.h> 51 #include <sys/devicestat.h> 52 #include <sys/disk.h> 53 #include <sys/fcntl.h> 54 #include <sys/filedesc.h> 55 #include <sys/kdb.h> 56 #include <sys/module.h> 57 #include <sys/namei.h> 58 #include <sys/proc.h> 59 #include <sys/rman.h> 60 #include <sys/taskqueue.h> 61 #include <sys/types.h> 62 #include <sys/vnode.h> 63 #include <sys/mount.h> 64 65 #include <geom/geom.h> 66 67 #include <machine/_inttypes.h> 68 #include <machine/xen/xen-os.h> 69 70 #include <vm/vm.h> 71 #include <vm/vm_extern.h> 72 #include <vm/vm_kern.h> 73 74 #include <xen/blkif.h> 75 #include <xen/evtchn.h> 76 #include <xen/gnttab.h> 77 #include <xen/xen_intr.h> 78 79 #include <xen/interface/event_channel.h> 80 #include <xen/interface/grant_table.h> 81 82 #include <xen/xenbus/xenbusvar.h> 83 84 /*--------------------------- Compile-time Tunables --------------------------*/ 85 /** 86 * The maximum number of outstanding request blocks (request headers plus 87 * additional segment blocks) we will allow in a negotiated block-front/back 88 * communication channel. 89 */ 90 #define XBB_MAX_REQUESTS 256 91 92 /** 93 * \brief Define to force all I/O to be performed on memory owned by the 94 * backend device, with a copy-in/out to the remote domain's memory. 95 * 96 * \note This option is currently required when this driver's domain is 97 * operating in HVM mode on a system using an IOMMU. 98 * 99 * This driver uses Xen's grant table API to gain access to the memory of 100 * the remote domains it serves. 
When our domain is operating in PV mode, 101 * the grant table mechanism directly updates our domain's page table entries 102 * to point to the physical pages of the remote domain. This scheme guarantees 103 * that blkback and the backing devices it uses can safely perform DMA 104 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 105 * insure that our domain cannot DMA to pages owned by another domain. As 106 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 107 * table API. For this reason, in HVM mode, we must bounce all requests into 108 * memory that is mapped into our domain at domain startup and thus has 109 * valid IOMMU mappings. 110 */ 111 #define XBB_USE_BOUNCE_BUFFERS 112 113 /** 114 * \brief Define to enable rudimentary request logging to the console. 115 */ 116 #undef XBB_DEBUG 117 118 /*---------------------------------- Macros ----------------------------------*/ 119 /** 120 * Custom malloc type for all driver allocations. 121 */ 122 MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 123 124 #ifdef XBB_DEBUG 125 #define DPRINTF(fmt, args...) \ 126 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 127 #else 128 #define DPRINTF(fmt, args...) do {} while(0) 129 #endif 130 131 /** 132 * The maximum mapped region size per request we will allow in a negotiated 133 * block-front/back communication channel. 134 */ 135 #define XBB_MAX_REQUEST_SIZE \ 136 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 137 138 /** 139 * The maximum number of segments (within a request header and accompanying 140 * segment blocks) per request we will allow in a negotiated block-front/back 141 * communication channel. 142 */ 143 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 144 (MIN(UIO_MAXIOV, \ 145 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 146 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 147 148 /** 149 * The maximum number of shared memory ring pages we will allow in a 150 * negotiated block-front/back communication channel. Allow enough 151 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 152 */ 153 #define XBB_MAX_RING_PAGES \ 154 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ 155 * XBB_MAX_REQUESTS) 156 157 /*--------------------------- Forward Declarations ---------------------------*/ 158 struct xbb_softc; 159 160 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 161 ...) __attribute__((format(printf, 3, 4))); 162 static int xbb_shutdown(struct xbb_softc *xbb); 163 static int xbb_detach(device_t dev); 164 165 /*------------------------------ Data Structures -----------------------------*/ 166 /** 167 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 168 */ 169 struct xbb_xen_req { 170 /** 171 * Linked list links used to aggregate idle request in the 172 * request free pool (xbb->request_free_slist). 173 */ 174 SLIST_ENTRY(xbb_xen_req) links; 175 176 /** 177 * Back reference to the parent block back instance for this 178 * request. Used during bio_done handling. 179 */ 180 struct xbb_softc *xbb; 181 182 /** 183 * The remote domain's identifier for this I/O request. 184 */ 185 uint64_t id; 186 187 /** 188 * Kernel virtual address space reserved for this request 189 * structure and used to map the remote domain's pages for 190 * this I/O, into our domain's address space. 191 */ 192 uint8_t *kva; 193 194 #ifdef XBB_USE_BOUNCE_BUFFERS 195 /** 196 * Pre-allocated domain local memory used to proxy remote 197 * domain memory during I/O operations. 
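	 *
	 * For writes, the front-end's data is copied from the mapped kva
	 * region into this buffer before the backend I/O is issued; for
	 * reads, the completed data is copied back out to the kva region
	 * (see xbb_dispatch_dev(), xbb_dispatch_file() and xbb_bio_done()).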
	 */
	uint8_t		    *bounce;
#endif

	/**
	 * Base pseudo-physical address corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t	     gnt_base;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int		     nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int		     nr_512b_sectors;

	/**
	 * The number of struct bio requests still outstanding for this
	 * request on the backend device.  This field is only used for
	 * device (rather than file) backed I/O.
	 */
	int		     pendcnt;

	/**
	 * BLKIF_OP code for this request.
	 */
	int		     operation;

	/**
	 * BLKIF_RSP status code for this request.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int		     status;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type     ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags  ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime	     ds_t0;

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t	    *gnt_handles;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt-driven event channel used to signal ring events. */
	evtchn_port_t	evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown.
*/ 330 XBB_TYPE_NONE = 0x00, 331 332 /** 333 * Backend type disk (access via cdev switch 334 * strategy routine). 335 */ 336 XBB_TYPE_DISK = 0x01, 337 338 /** Backend type file (access vnode operations.). */ 339 XBB_TYPE_FILE = 0x02 340 } xbb_type; 341 342 /** 343 * \brief Structure used to memoize information about a per-request 344 * scatter-gather list. 345 * 346 * The chief benefit of using this data structure is it avoids having 347 * to reparse the possibly discontiguous S/G list in the original 348 * request. Due to the way that the mapping of the memory backing an 349 * I/O transaction is handled by Xen, a second pass is unavoidable. 350 * At least this way the second walk is a simple array traversal. 351 * 352 * \note A single Scatter/Gather element in the block interface covers 353 * at most 1 machine page. In this context a sector (blkif 354 * nomenclature, not what I'd choose) is a 512b aligned unit 355 * of mapping within the machine page referenced by an S/G 356 * element. 357 */ 358 struct xbb_sg { 359 /** The number of 512b data chunks mapped in this S/G element. */ 360 int16_t nsect; 361 362 /** 363 * The index (0 based) of the first 512b data chunk mapped 364 * in this S/G element. 365 */ 366 uint8_t first_sect; 367 368 /** 369 * The index (0 based) of the last 512b data chunk mapped 370 * in this S/G element. 371 */ 372 uint8_t last_sect; 373 }; 374 375 /** 376 * Character device backend specific configuration data. 377 */ 378 struct xbb_dev_data { 379 /** Cdev used for device backend access. */ 380 struct cdev *cdev; 381 382 /** Cdev switch used for device backend access. */ 383 struct cdevsw *csw; 384 385 /** Used to hold a reference on opened cdev backend devices. */ 386 int dev_ref; 387 }; 388 389 /** 390 * File backend specific configuration data. 391 */ 392 struct xbb_file_data { 393 /** Credentials to use for vnode backed (file based) I/O. */ 394 struct ucred *cred; 395 396 /** 397 * \brief Array of io vectors used to process file based I/O. 398 * 399 * Only a single file based request is outstanding per-xbb instance, 400 * so we only need one of these. 401 */ 402 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; 403 #ifdef XBB_USE_BOUNCE_BUFFERS 404 405 /** 406 * \brief Array of io vectors used to handle bouncing of file reads. 407 * 408 * Vnode operations are free to modify uio data during their 409 * exectuion. In the case of a read with bounce buffering active, 410 * we need some of the data from the original uio in order to 411 * bounce-out the read data. This array serves as the temporary 412 * storage for this saved data. 413 */ 414 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; 415 416 /** 417 * \brief Array of memoized bounce buffer kva offsets used 418 * in the file based backend. 419 * 420 * Due to the way that the mapping of the memory backing an 421 * I/O transaction is handled by Xen, a second pass through 422 * the request sg elements is unavoidable. We memoize the computed 423 * bounce address here to reduce the cost of the second walk. 424 */ 425 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST]; 426 #endif /* XBB_USE_BOUNCE_BUFFERS */ 427 }; 428 429 /** 430 * Collection of backend type specific data. 431 */ 432 union xbb_backend_data { 433 struct xbb_dev_data dev; 434 struct xbb_file_data file; 435 }; 436 437 /** 438 * Function signature of backend specific I/O handlers. 
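 *
 * The parameters mirror those documented for xbb_dispatch_dev() and
 * xbb_dispatch_file() below:
 *
 * \param xbb        Per-instance xbb configuration structure.
 * \param ring_req   Front-end's I/O request as pulled from the shared
 *                   communication ring.
 * \param req        Allocated internal request structure.
 * \param nseg       The number of valid segments for this request in
 *                   xbb->xbb_sgs.
 * \param operation  BIO_* I/O operation code.
 * \param flags      Additional bio_flag data to pass to any generated
 *                   bios (e.g. BIO_ORDERED).
 *
 * \return  0 for success, errno codes for failure.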
439 */ 440 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req, 441 struct xbb_xen_req *req, int nseg, 442 int operation, int flags); 443 444 /** 445 * Per-instance configuration data. 446 */ 447 struct xbb_softc { 448 449 /** 450 * Task-queue used to process I/O requests. 451 */ 452 struct taskqueue *io_taskqueue; 453 454 /** 455 * Single "run the request queue" task enqueued 456 * on io_taskqueue. 457 */ 458 struct task io_task; 459 460 /** Device type for this instance. */ 461 xbb_type device_type; 462 463 /** NewBus device corresponding to this instance. */ 464 device_t dev; 465 466 /** Backend specific dispatch routine for this instance. */ 467 xbb_dispatch_t dispatch_io; 468 469 /** The number of requests outstanding on the backend device/file. */ 470 u_int active_request_count; 471 472 /** Free pool of request tracking structures. */ 473 struct xbb_xen_req_slist request_free_slist; 474 475 /** Array, sized at connection time, of request tracking structures. */ 476 struct xbb_xen_req *requests; 477 478 /** 479 * Global pool of kva used for mapping remote domain ring 480 * and I/O transaction data. 481 */ 482 vm_offset_t kva; 483 484 /** Psuedo-physical address corresponding to kva. */ 485 uint64_t gnt_base_addr; 486 487 /** The size of the global kva pool. */ 488 int kva_size; 489 490 /** 491 * \brief Cached value of the front-end's domain id. 492 * 493 * This value is used at once for each mapped page in 494 * a transaction. We cache it to avoid incuring the 495 * cost of an ivar access every time this is needed. 496 */ 497 domid_t otherend_id; 498 499 /** 500 * \brief The blkif protocol abi in effect. 501 * 502 * There are situations where the back and front ends can 503 * have a different, native abi (e.g. intel x86_64 and 504 * 32bit x86 domains on the same machine). The back-end 505 * always accomodates the front-end's native abi. That 506 * value is pulled from the XenStore and recorded here. 507 */ 508 int abi; 509 510 /** 511 * \brief The maximum number of requests allowed to be in 512 * flight at a time. 513 * 514 * This value is negotiated via the XenStore. 515 */ 516 uint32_t max_requests; 517 518 /** 519 * \brief The maximum number of segments (1 page per segment) 520 * that can be mapped by a request. 521 * 522 * This value is negotiated via the XenStore. 523 */ 524 uint32_t max_request_segments; 525 526 /** 527 * The maximum size of any request to this back-end 528 * device. 529 * 530 * This value is negotiated via the XenStore. 531 */ 532 uint32_t max_request_size; 533 534 /** Various configuration and state bit flags. */ 535 xbb_flag_t flags; 536 537 /** Ring mapping and interrupt configuration data. */ 538 struct xbb_ring_config ring_config; 539 540 /** Runtime, cross-abi safe, structures for ring access. */ 541 blkif_back_rings_t rings; 542 543 /** IRQ mapping for the communication ring event channel. */ 544 int irq; 545 546 /** 547 * \brief Backend access mode flags (e.g. write, or read-only). 548 * 549 * This value is passed to us by the front-end via the XenStore. 550 */ 551 char *dev_mode; 552 553 /** 554 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 555 * 556 * This value is passed to us by the front-end via the XenStore. 557 * Currently unused. 558 */ 559 char *dev_type; 560 561 /** 562 * \brief Backend device/file identifier. 563 * 564 * This value is passed to us by the front-end via the XenStore. 565 * We expect this to be a POSIX path indicating the file or 566 * device to open. 
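	 * For example, this might name a device node such as "/dev/ada0"
	 * or the path to a regular file used as a disk image.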
567 */ 568 char *dev_name; 569 570 /** 571 * Vnode corresponding to the backend device node or file 572 * we are acessing. 573 */ 574 struct vnode *vn; 575 576 union xbb_backend_data backend; 577 /** The native sector size of the backend. */ 578 u_int sector_size; 579 580 /** log2 of sector_size. */ 581 u_int sector_size_shift; 582 583 /** Size in bytes of the backend device or file. */ 584 off_t media_size; 585 586 /** 587 * \brief media_size expressed in terms of the backend native 588 * sector size. 589 * 590 * (e.g. xbb->media_size >> xbb->sector_size_shift). 591 */ 592 uint64_t media_num_sectors; 593 594 /** 595 * \brief Array of memoized scatter gather data computed during the 596 * conversion of blkif ring requests to internal xbb_xen_req 597 * structures. 598 * 599 * Ring processing is serialized so we only need one of these. 600 */ 601 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST]; 602 603 /** Mutex protecting per-instance data. */ 604 struct mtx lock; 605 606 #ifdef XENHVM 607 /** 608 * Resource representing allocated physical address space 609 * associated with our per-instance kva region. 610 */ 611 struct resource *pseudo_phys_res; 612 613 /** Resource id for allocated physical address space. */ 614 int pseudo_phys_res_id; 615 #endif 616 617 /** I/O statistics. */ 618 struct devstat *xbb_stats; 619 }; 620 621 /*---------------------------- Request Processing ----------------------------*/ 622 /** 623 * Allocate an internal transaction tracking structure from the free pool. 624 * 625 * \param xbb Per-instance xbb configuration structure. 626 * 627 * \return On success, a pointer to the allocated xbb_xen_req structure. 628 * Otherwise NULL. 629 */ 630 static inline struct xbb_xen_req * 631 xbb_get_req(struct xbb_softc *xbb) 632 { 633 struct xbb_xen_req *req; 634 635 req = NULL; 636 mtx_lock(&xbb->lock); 637 638 /* 639 * Do not allow new requests to be allocated while we 640 * are shutting down. 641 */ 642 if ((xbb->flags & XBBF_SHUTDOWN) == 0) { 643 if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) { 644 SLIST_REMOVE_HEAD(&xbb->request_free_slist, links); 645 xbb->active_request_count++; 646 } else { 647 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 648 } 649 } 650 mtx_unlock(&xbb->lock); 651 return (req); 652 } 653 654 /** 655 * Return an allocated transaction tracking structure to the free pool. 656 * 657 * \param xbb Per-instance xbb configuration structure. 658 * \param req The request structure to free. 659 */ 660 static inline void 661 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 662 { 663 int wake_thread; 664 665 mtx_lock(&xbb->lock); 666 wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE; 667 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 668 SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); 669 xbb->active_request_count--; 670 671 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 672 /* 673 * Shutdown is in progress. See if we can 674 * progress further now that one more request 675 * has completed and been returned to the 676 * free pool. 677 */ 678 xbb_shutdown(xbb); 679 } 680 mtx_unlock(&xbb->lock); 681 682 if (wake_thread != 0) 683 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 684 } 685 686 /** 687 * Given a page index and 512b sector offset within that page, 688 * calculate an offset into a request's kva region. 689 * 690 * \param req The request structure whose kva region will be accessed. 691 * \param pagenr The page index used to compute the kva offset. 
 * \param sector  The 512b sector index used to compute the page relative
 *                kva offset.
 *
 * \return  The computed global KVA offset.
 */
static inline uint8_t *
xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
	return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}

#ifdef XBB_USE_BOUNCE_BUFFERS
/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's local bounce memory region.
 *
 * \param req     The request structure whose bounce region will be accessed.
 * \param pagenr  The page index used to compute the bounce offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                bounce offset.
 *
 * \return  The computed global bounce buffer address.
 */
static inline uint8_t *
xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
{
	return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
#endif

/**
 * Given a page number and 512b sector offset within that page,
 * calculate an offset into the request's memory region that the
 * underlying backend device/file should use for I/O.
 *
 * \param req     The request structure whose I/O region will be accessed.
 * \param pagenr  The page index used to compute the I/O offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                I/O offset.
 *
 * \return  The computed global I/O address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_req_bounce_addr(req, pagenr, sector));
#else
	return (xbb_req_vaddr(req, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param req     The request structure whose pseudo-physical region
 *                will be accessed.
 * \param pagenr  The page index used to compute the pseudo-physical offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
	return ((uintptr_t)(req->gnt_base
			  + (PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Unmap the front-end pages associated with this I/O request.
 *
 * \param req  The request structure to unmap.
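 *
 * Segments whose grant handle is still GRANT_REF_INVALID are skipped,
 * so a request whose mapping only partially succeeded can be unmapped
 * safely.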
776 */ 777 static void 778 xbb_unmap_req(struct xbb_xen_req *req) 779 { 780 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST]; 781 u_int i; 782 u_int invcount; 783 int error; 784 785 invcount = 0; 786 for (i = 0; i < req->nr_pages; i++) { 787 788 if (req->gnt_handles[i] == GRANT_REF_INVALID) 789 continue; 790 791 unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0); 792 unmap[invcount].dev_bus_addr = 0; 793 unmap[invcount].handle = req->gnt_handles[i]; 794 req->gnt_handles[i] = GRANT_REF_INVALID; 795 invcount++; 796 } 797 798 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 799 unmap, invcount); 800 KASSERT(error == 0, ("Grant table operation failed")); 801 } 802 803 /** 804 * Create and transmit a response to a blkif request. 805 * 806 * \param xbb Per-instance xbb configuration structure. 807 * \param req The request structure to which to respond. 808 * \param status The status code to report. See BLKIF_RSP_* 809 * in sys/xen/interface/io/blkif.h. 810 */ 811 static void 812 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 813 { 814 blkif_response_t *resp; 815 int more_to_do; 816 int notify; 817 818 more_to_do = 0; 819 820 /* 821 * Place on the response ring for the relevant domain. 822 * For now, only the spacing between entries is different 823 * in the different ABIs, not the response entry layout. 824 */ 825 mtx_lock(&xbb->lock); 826 switch (xbb->abi) { 827 case BLKIF_PROTOCOL_NATIVE: 828 resp = RING_GET_RESPONSE(&xbb->rings.native, 829 xbb->rings.native.rsp_prod_pvt); 830 break; 831 case BLKIF_PROTOCOL_X86_32: 832 resp = (blkif_response_t *) 833 RING_GET_RESPONSE(&xbb->rings.x86_32, 834 xbb->rings.x86_32.rsp_prod_pvt); 835 break; 836 case BLKIF_PROTOCOL_X86_64: 837 resp = (blkif_response_t *) 838 RING_GET_RESPONSE(&xbb->rings.x86_64, 839 xbb->rings.x86_64.rsp_prod_pvt); 840 break; 841 default: 842 panic("Unexpected blkif protocol ABI."); 843 } 844 845 resp->id = req->id; 846 resp->operation = req->operation; 847 resp->status = status; 848 849 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); 850 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); 851 852 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 853 854 /* 855 * Tail check for pending requests. Allows frontend to avoid 856 * notifications if requests are already in flight (lower 857 * overheads and promotes batching). 858 */ 859 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 860 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 861 862 more_to_do = 1; 863 } 864 865 mtx_unlock(&xbb->lock); 866 867 if (more_to_do) 868 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 869 870 if (notify) 871 notify_remote_via_irq(xbb->irq); 872 } 873 874 /** 875 * Completion handler for buffer I/O requests issued by the device 876 * backend driver. 877 * 878 * \param bio The buffer I/O request on which to perform completion 879 * processing. 880 */ 881 static void 882 xbb_bio_done(struct bio *bio) 883 { 884 struct xbb_softc *xbb; 885 struct xbb_xen_req *req; 886 887 req = bio->bio_caller1; 888 xbb = req->xbb; 889 890 /* Only include transferred I/O in stats. 
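	 * bio_resid holds the number of bytes that were not transferred,
	 * so subtracting (bio_resid >> 9) leaves only the sectors that
	 * actually completed.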
*/ 891 req->nr_512b_sectors -= bio->bio_resid >> 9; 892 if (bio->bio_error) { 893 DPRINTF("BIO returned error %d for operation on device %s\n", 894 bio->bio_error, xbb->dev_name); 895 req->status = BLKIF_RSP_ERROR; 896 897 if (bio->bio_error == ENXIO 898 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 899 900 /* 901 * Backend device has disappeared. Signal the 902 * front-end that we (the device proxy) want to 903 * go away. 904 */ 905 xenbus_set_state(xbb->dev, XenbusStateClosing); 906 } 907 } 908 909 #ifdef XBB_USE_BOUNCE_BUFFERS 910 if (bio->bio_cmd == BIO_READ) { 911 vm_offset_t kva_offset; 912 913 kva_offset = (vm_offset_t)bio->bio_data 914 - (vm_offset_t)req->bounce; 915 memcpy((uint8_t *)req->kva + kva_offset, 916 bio->bio_data, bio->bio_bcount); 917 } 918 #endif /* XBB_USE_BOUNCE_BUFFERS */ 919 920 if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) { 921 xbb_unmap_req(req); 922 xbb_send_response(xbb, req, req->status); 923 devstat_end_transaction(xbb->xbb_stats, 924 /*bytes*/req->nr_512b_sectors << 9, 925 req->ds_tag_type, 926 req->ds_trans_type, 927 /*now*/NULL, 928 /*then*/&req->ds_t0); 929 xbb_release_req(xbb, req); 930 } 931 932 g_destroy_bio(bio); 933 } 934 935 /** 936 * Parse a blkif request into an internal request structure and send 937 * it to the backend for processing. 938 * 939 * \param xbb Per-instance xbb configuration structure. 940 * \param ring_req Front-end's I/O request as pulled from the shared 941 * communication ring. 942 * \param req Allocated internal request structure. 943 * \param req_ring_idx The location of ring_req within the shared 944 * communication ring. 945 * 946 * This routine performs the backend common aspects of request parsing 947 * including compiling an internal request structure, parsing the S/G 948 * list and any secondary ring requests in which they may reside, and 949 * the mapping of front-end I/O pages into our domain. 
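 *
 * If parsing or mapping fails, an error response is sent to the
 * front-end and the request object is returned to the free pool without
 * invoking the backend handler.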
950 */ 951 static void 952 xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req, 953 struct xbb_xen_req *req, RING_IDX req_ring_idx) 954 { 955 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST]; 956 struct xbb_sg *xbb_sg; 957 struct gnttab_map_grant_ref *map; 958 struct blkif_request_segment *sg; 959 struct blkif_request_segment *last_block_sg; 960 u_int nseg; 961 u_int seg_idx; 962 u_int block_segs; 963 int nr_sects; 964 int operation; 965 uint8_t bio_flags; 966 int error; 967 968 nseg = ring_req->nr_segments; 969 nr_sects = 0; 970 req->xbb = xbb; 971 req->id = ring_req->id; 972 req->operation = ring_req->operation; 973 req->status = BLKIF_RSP_OKAY; 974 req->ds_tag_type = DEVSTAT_TAG_SIMPLE; 975 req->nr_pages = nseg; 976 req->nr_512b_sectors = 0; 977 bio_flags = 0; 978 sg = NULL; 979 980 binuptime(&req->ds_t0); 981 devstat_start_transaction(xbb->xbb_stats, &req->ds_t0); 982 983 switch (req->operation) { 984 case BLKIF_OP_WRITE_BARRIER: 985 bio_flags |= BIO_ORDERED; 986 req->ds_tag_type = DEVSTAT_TAG_ORDERED; 987 /* FALLTHROUGH */ 988 case BLKIF_OP_WRITE: 989 operation = BIO_WRITE; 990 req->ds_trans_type = DEVSTAT_WRITE; 991 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 992 DPRINTF("Attempt to write to read only device %s\n", 993 xbb->dev_name); 994 goto fail_send_response; 995 } 996 break; 997 case BLKIF_OP_READ: 998 operation = BIO_READ; 999 req->ds_trans_type = DEVSTAT_READ; 1000 break; 1001 case BLKIF_OP_FLUSH_DISKCACHE: 1002 operation = BIO_FLUSH; 1003 req->ds_tag_type = DEVSTAT_TAG_ORDERED; 1004 req->ds_trans_type = DEVSTAT_NO_DATA; 1005 goto do_dispatch; 1006 /*NOTREACHED*/ 1007 default: 1008 DPRINTF("error: unknown block io operation [%d]\n", 1009 req->operation); 1010 goto fail_send_response; 1011 } 1012 1013 /* Check that number of segments is sane. */ 1014 if (unlikely(nseg == 0) 1015 || unlikely(nseg > xbb->max_request_segments)) { 1016 DPRINTF("Bad number of segments in request (%d)\n", nseg); 1017 goto fail_send_response; 1018 } 1019 1020 map = maps; 1021 xbb_sg = xbb->xbb_sgs; 1022 block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); 1023 sg = ring_req->seg; 1024 last_block_sg = sg + block_segs; 1025 seg_idx = 0; 1026 while (1) { 1027 1028 while (sg < last_block_sg) { 1029 1030 xbb_sg->first_sect = sg->first_sect; 1031 xbb_sg->last_sect = sg->last_sect; 1032 xbb_sg->nsect = 1033 (int8_t)(sg->last_sect - sg->first_sect + 1); 1034 1035 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1036 || (xbb_sg->nsect <= 0)) 1037 goto fail_send_response; 1038 1039 nr_sects += xbb_sg->nsect; 1040 map->host_addr = xbb_req_gntaddr(req, seg_idx, 1041 /*sector*/0); 1042 map->flags = GNTMAP_host_map; 1043 map->ref = sg->gref; 1044 map->dom = xbb->otherend_id; 1045 if (operation == BIO_WRITE) 1046 map->flags |= GNTMAP_readonly; 1047 sg++; 1048 map++; 1049 xbb_sg++; 1050 seg_idx++; 1051 } 1052 1053 block_segs = MIN(nseg - seg_idx, 1054 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); 1055 if (block_segs == 0) 1056 break; 1057 1058 /* 1059 * Fetch the next request block full of SG elements. 1060 * For now, only the spacing between entries is different 1061 * in the different ABIs, not the sg entry layout. 
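		 * Requests with more segments than fit in the request
		 * header continue into the following ring slot(s), which
		 * carry only additional segment descriptors.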
1062 */ 1063 req_ring_idx++; 1064 switch (xbb->abi) { 1065 case BLKIF_PROTOCOL_NATIVE: 1066 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, 1067 req_ring_idx); 1068 break; 1069 case BLKIF_PROTOCOL_X86_32: 1070 { 1071 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, 1072 req_ring_idx); 1073 break; 1074 } 1075 case BLKIF_PROTOCOL_X86_64: 1076 { 1077 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, 1078 req_ring_idx); 1079 break; 1080 } 1081 default: 1082 panic("Unexpected blkif protocol ABI."); 1083 /* NOTREACHED */ 1084 } 1085 last_block_sg = sg + block_segs; 1086 } 1087 1088 /* Convert to the disk's sector size */ 1089 req->nr_512b_sectors = nr_sects; 1090 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1091 1092 if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { 1093 device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple " 1094 "of the backing store sector size (%d)\n", 1095 __func__, req->nr_512b_sectors << 9, 1096 xbb->sector_size); 1097 goto fail_send_response; 1098 } 1099 1100 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1101 maps, req->nr_pages); 1102 if (error != 0) 1103 panic("Grant table operation failed (%d)", error); 1104 1105 for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) { 1106 1107 if (unlikely(map->status != 0)) { 1108 DPRINTF("invalid buffer -- could not remap it (%d)\n", 1109 map->status); 1110 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x " 1111 "ref 0x%x, dom %d\n", seg_idx, 1112 map->host_addr, map->flags, map->ref, 1113 map->dom); 1114 goto fail_unmap_req; 1115 } 1116 1117 req->gnt_handles[seg_idx] = map->handle; 1118 } 1119 if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) { 1120 1121 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1122 "extends past end of device %s\n", 1123 operation == BIO_READ ? "read" : "write", 1124 ring_req->sector_number, 1125 ring_req->sector_number + nr_sects, xbb->dev_name); 1126 goto fail_unmap_req; 1127 } 1128 1129 do_dispatch: 1130 1131 error = xbb->dispatch_io(xbb, 1132 ring_req, 1133 req, 1134 nseg, 1135 operation, 1136 bio_flags); 1137 1138 if (error != 0) { 1139 if (operation == BIO_FLUSH) 1140 goto fail_send_response; 1141 else 1142 goto fail_unmap_req; 1143 } 1144 1145 return; 1146 1147 1148 fail_unmap_req: 1149 xbb_unmap_req(req); 1150 /* FALLTHROUGH */ 1151 1152 fail_send_response: 1153 xbb_send_response(xbb, req, BLKIF_RSP_ERROR); 1154 xbb_release_req(xbb, req); 1155 devstat_end_transaction(xbb->xbb_stats, 1156 /*bytes*/0, 1157 req->ds_tag_type, 1158 req->ds_trans_type, 1159 /*now*/NULL, 1160 /*then*/&req->ds_t0); 1161 } 1162 1163 /** 1164 * Process incoming requests from the shared communication ring in response 1165 * to a signal on the ring's event channel. 1166 * 1167 * \param context Callback argument registerd during task initialization - 1168 * the xbb_softc for this instance. 1169 * \param pending The number of taskqueue_enqueue events that have 1170 * occurred since this handler was last run. 1171 */ 1172 static void 1173 xbb_run_queue(void *context, int pending) 1174 { 1175 struct xbb_softc *xbb; 1176 blkif_back_rings_t *rings; 1177 RING_IDX rp; 1178 1179 1180 xbb = (struct xbb_softc *)context; 1181 rings = &xbb->rings; 1182 1183 /* 1184 * Cache req_prod to avoid accessing a cache line shared 1185 * with the frontend. 1186 */ 1187 rp = rings->common.sring->req_prod; 1188 1189 /* Ensure we see queued requests up to 'rp'. 
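	 * The read barrier below pairs with the write barrier the
	 * front-end issues when publishing a new req_prod, so the request
	 * contents are guaranteed to be visible before we parse them.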
*/ 1190 rmb(); 1191 1192 /** 1193 * Run so long as there is work to consume and the generation 1194 * of a response will not overflow the ring. 1195 * 1196 * @note There's a 1 to 1 relationship between requests and responses, 1197 * so an overflow should never occur. This test is to protect 1198 * our domain from digesting bogus data. Shouldn't we log this? 1199 */ 1200 while (rings->common.req_cons != rp 1201 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1202 rings->common.req_cons) == 0) { 1203 blkif_request_t ring_req_storage; 1204 blkif_request_t *ring_req; 1205 struct xbb_xen_req *req; 1206 RING_IDX req_ring_idx; 1207 1208 req = xbb_get_req(xbb); 1209 if (req == NULL) { 1210 /* 1211 * Resource shortage has been recorded. 1212 * We'll be scheduled to run once a request 1213 * object frees up due to a completion. 1214 */ 1215 break; 1216 } 1217 1218 switch (xbb->abi) { 1219 case BLKIF_PROTOCOL_NATIVE: 1220 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1221 rings->common.req_cons); 1222 break; 1223 case BLKIF_PROTOCOL_X86_32: 1224 { 1225 struct blkif_x86_32_request *ring_req32; 1226 1227 ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32, 1228 rings->common.req_cons); 1229 blkif_get_x86_32_req(&ring_req_storage, ring_req32); 1230 ring_req = &ring_req_storage; 1231 break; 1232 } 1233 case BLKIF_PROTOCOL_X86_64: 1234 { 1235 struct blkif_x86_64_request *ring_req64; 1236 1237 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64, 1238 rings->common.req_cons); 1239 blkif_get_x86_64_req(&ring_req_storage, ring_req64); 1240 ring_req = &ring_req_storage; 1241 break; 1242 } 1243 default: 1244 panic("Unexpected blkif protocol ABI."); 1245 /* NOTREACHED */ 1246 } 1247 1248 /* 1249 * Signify that we can overwrite this request with a 1250 * response by incrementing our consumer index. The 1251 * response won't be generated until after we've already 1252 * consumed all necessary data out of the version of the 1253 * request in the ring buffer (for native mode). We 1254 * must update the consumer index before issueing back-end 1255 * I/O so there is no possibility that it will complete 1256 * and a response be generated before we make room in 1257 * the queue for that response. 1258 */ 1259 req_ring_idx = xbb->rings.common.req_cons; 1260 xbb->rings.common.req_cons += 1261 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 1262 1263 xbb_dispatch_io(xbb, ring_req, req, req_ring_idx); 1264 } 1265 } 1266 1267 /** 1268 * Interrupt handler bound to the shared ring's event channel. 1269 * 1270 * \param arg Callback argument registerd during event channel 1271 * binding - the xbb_softc for this instance. 1272 */ 1273 static void 1274 xbb_intr(void *arg) 1275 { 1276 struct xbb_softc *xbb; 1277 1278 /* Defer to kernel thread. */ 1279 xbb = (struct xbb_softc *)arg; 1280 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1281 } 1282 1283 /*----------------------------- Backend Handlers -----------------------------*/ 1284 /** 1285 * Backend handler for character device access. 1286 * 1287 * \param xbb Per-instance xbb configuration structure. 1288 * \param ring_req Front-end's I/O request as pulled from the shared 1289 * communication ring. 1290 * \param req Allocated internal request structure. 1291 * \param nseg The number of valid segments for this request in 1292 * xbb->xbb_sgs. 1293 * \param operation BIO_* I/O operation code. 1294 * \param bio_flags Additional bio_flag data to pass to any generated 1295 * bios (e.g. BIO_ORDERED).. 1296 * 1297 * \return 0 for success, errno codes for failure. 
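 *
 * The request is translated into one or more struct bio objects; a new
 * bio is started whenever the mapped kva for the next segment is not
 * contiguous with that of the previous one.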
1298 */ 1299 static int 1300 xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, 1301 struct xbb_xen_req *req, int nseg, int operation, 1302 int bio_flags) 1303 { 1304 struct xbb_dev_data *dev_data; 1305 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST]; 1306 off_t bio_offset; 1307 struct bio *bio; 1308 struct xbb_sg *xbb_sg; 1309 u_int nbio; 1310 u_int bio_idx; 1311 u_int seg_idx; 1312 int error; 1313 1314 dev_data = &xbb->backend.dev; 1315 bio_offset = (off_t)ring_req->sector_number 1316 << xbb->sector_size_shift; 1317 error = 0; 1318 nbio = 0; 1319 bio_idx = 0; 1320 1321 if (operation == BIO_FLUSH) { 1322 bio = g_new_bio(); 1323 if (unlikely(bio == NULL)) { 1324 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 1325 error = ENOMEM; 1326 return (error); 1327 } 1328 1329 bio->bio_cmd = BIO_FLUSH; 1330 bio->bio_flags |= BIO_ORDERED; 1331 bio->bio_dev = dev_data->cdev; 1332 bio->bio_offset = 0; 1333 bio->bio_data = 0; 1334 bio->bio_done = xbb_bio_done; 1335 bio->bio_caller1 = req; 1336 bio->bio_pblkno = 0; 1337 1338 req->pendcnt = 1; 1339 1340 (*dev_data->csw->d_strategy)(bios[bio_idx]); 1341 1342 return (0); 1343 } 1344 1345 for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs; 1346 seg_idx < nseg; 1347 seg_idx++, xbb_sg++) { 1348 1349 /* 1350 * KVA will not be contiguous, so any additional 1351 * I/O will need to be represented in a new bio. 1352 */ 1353 if ((bio != NULL) 1354 && (xbb_sg->first_sect != 0)) { 1355 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 1356 printf("%s: Discontiguous I/O request from " 1357 "domain %d ends on non-sector " 1358 "boundary\n", __func__, 1359 xbb->otherend_id); 1360 error = EINVAL; 1361 goto fail_free_bios; 1362 } 1363 bio = NULL; 1364 } 1365 1366 if (bio == NULL) { 1367 /* 1368 * Make sure that the start of this bio is aligned 1369 * to a device sector. 1370 */ 1371 if ((bio_offset & (xbb->sector_size - 1)) != 0) { 1372 printf("%s: Misaligned I/O request from " 1373 "domain %d\n", __func__, 1374 xbb->otherend_id); 1375 error = EINVAL; 1376 goto fail_free_bios; 1377 } 1378 1379 bio = bios[nbio++] = g_new_bio(); 1380 if (unlikely(bio == NULL)) { 1381 error = ENOMEM; 1382 goto fail_free_bios; 1383 } 1384 bio->bio_cmd = operation; 1385 bio->bio_flags |= bio_flags; 1386 bio->bio_dev = dev_data->cdev; 1387 bio->bio_offset = bio_offset; 1388 bio->bio_data = xbb_req_ioaddr(req, seg_idx, 1389 xbb_sg->first_sect); 1390 bio->bio_done = xbb_bio_done; 1391 bio->bio_caller1 = req; 1392 bio->bio_pblkno = bio_offset 1393 >> xbb->sector_size_shift; 1394 } 1395 1396 bio->bio_length += xbb_sg->nsect << 9; 1397 bio->bio_bcount = bio->bio_length; 1398 bio_offset += xbb_sg->nsect << 9; 1399 1400 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 1401 1402 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 1403 printf("%s: Discontiguous I/O request from " 1404 "domain %d ends on non-sector " 1405 "boundary\n", __func__, 1406 xbb->otherend_id); 1407 error = EINVAL; 1408 goto fail_free_bios; 1409 } 1410 /* 1411 * KVA will not be contiguous, so any additional 1412 * I/O will need to be represented in a new bio. 
1413 */ 1414 bio = NULL; 1415 } 1416 } 1417 1418 req->pendcnt = nbio; 1419 1420 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 1421 { 1422 #ifdef XBB_USE_BOUNCE_BUFFERS 1423 vm_offset_t kva_offset; 1424 1425 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 1426 - (vm_offset_t)req->bounce; 1427 if (operation == BIO_WRITE) { 1428 memcpy(bios[bio_idx]->bio_data, 1429 (uint8_t *)req->kva + kva_offset, 1430 bios[bio_idx]->bio_bcount); 1431 } 1432 #endif 1433 (*dev_data->csw->d_strategy)(bios[bio_idx]); 1434 } 1435 1436 return (error); 1437 1438 fail_free_bios: 1439 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 1440 g_destroy_bio(bios[bio_idx]); 1441 1442 return (error); 1443 } 1444 1445 /** 1446 * Backend handler for file access. 1447 * 1448 * \param xbb Per-instance xbb configuration structure. 1449 * \param ring_req Front-end's I/O request as pulled from the shared 1450 * communication ring. 1451 * \param req Allocated internal request structure. 1452 * \param nseg The number of valid segments for this request in 1453 * xbb->xbb_sgs. 1454 * \param operation BIO_* I/O operation code. 1455 * \param bio_flags Additional bio_flag data to pass to any generated bios 1456 * (e.g. BIO_ORDERED).. 1457 * 1458 * \return 0 for success, errno codes for failure. 1459 */ 1460 static int 1461 xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, 1462 struct xbb_xen_req *req, int nseg, int operation, 1463 int flags) 1464 { 1465 struct xbb_file_data *file_data; 1466 u_int seg_idx; 1467 struct uio xuio; 1468 struct xbb_sg *xbb_sg; 1469 struct iovec *xiovec; 1470 #ifdef XBB_USE_BOUNCE_BUFFERS 1471 void **p_vaddr; 1472 int saved_uio_iovcnt; 1473 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1474 int vfs_is_locked; 1475 int error; 1476 1477 file_data = &xbb->backend.file; 1478 error = 0; 1479 bzero(&xuio, sizeof(xuio)); 1480 1481 req->pendcnt = 0; 1482 1483 switch (operation) { 1484 case BIO_READ: 1485 xuio.uio_rw = UIO_READ; 1486 break; 1487 case BIO_WRITE: 1488 xuio.uio_rw = UIO_WRITE; 1489 break; 1490 case BIO_FLUSH: { 1491 struct mount *mountpoint; 1492 1493 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 1494 1495 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 1496 1497 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 1498 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 1499 VOP_UNLOCK(xbb->vn, 0); 1500 1501 vn_finished_write(mountpoint); 1502 1503 VFS_UNLOCK_GIANT(vfs_is_locked); 1504 1505 goto bailout_send_response; 1506 /* NOTREACHED */ 1507 } 1508 default: 1509 panic("invalid operation %d", operation); 1510 /* NOTREACHED */ 1511 } 1512 xuio.uio_offset = (vm_offset_t)ring_req->sector_number 1513 << xbb->sector_size_shift; 1514 1515 xuio.uio_segflg = UIO_SYSSPACE; 1516 xuio.uio_iov = file_data->xiovecs; 1517 xuio.uio_iovcnt = 0; 1518 1519 for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs; 1520 seg_idx < nseg; seg_idx++, xbb_sg++) { 1521 1522 /* 1523 * If the first sector is not 0, the KVA will not be 1524 * contiguous and we'll need to go on to another segment. 1525 */ 1526 if (xbb_sg->first_sect != 0) 1527 xiovec = NULL; 1528 1529 if (xiovec == NULL) { 1530 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 1531 xiovec->iov_base = xbb_req_ioaddr(req, seg_idx, 1532 xbb_sg->first_sect); 1533 #ifdef XBB_USE_BOUNCE_BUFFERS 1534 /* 1535 * Store the address of the incoming buffer at this 1536 * particular offset as well, so we can do the copy 1537 * later without having to do more work to 1538 * recalculate this address. 
1539 */ 1540 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 1541 *p_vaddr = xbb_req_vaddr(req, seg_idx, 1542 xbb_sg->first_sect); 1543 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1544 xiovec->iov_len = 0; 1545 xuio.uio_iovcnt++; 1546 } 1547 1548 xiovec->iov_len += xbb_sg->nsect << 9; 1549 1550 xuio.uio_resid += xbb_sg->nsect << 9; 1551 1552 /* 1553 * If the last sector is not the full page size count, 1554 * the next segment will not be contiguous in KVA and we 1555 * need a new iovec. 1556 */ 1557 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 1558 xiovec = NULL; 1559 } 1560 1561 xuio.uio_td = curthread; 1562 1563 #ifdef XBB_USE_BOUNCE_BUFFERS 1564 saved_uio_iovcnt = xuio.uio_iovcnt; 1565 1566 if (operation == BIO_WRITE) { 1567 /* Copy the write data to the local buffer. */ 1568 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 1569 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 1570 seg_idx++, xiovec++, p_vaddr++) { 1571 1572 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 1573 } 1574 } else { 1575 /* 1576 * We only need to save off the iovecs in the case of a 1577 * read, because the copy for the read happens after the 1578 * VOP_READ(). (The uio will get modified in that call 1579 * sequence.) 1580 */ 1581 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 1582 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 1583 } 1584 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1585 1586 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 1587 switch (operation) { 1588 case BIO_READ: 1589 1590 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 1591 1592 /* 1593 * UFS pays attention to IO_DIRECT for reads. If the 1594 * DIRECTIO option is configured into the kernel, it calls 1595 * ffs_rawread(). But that only works for single-segment 1596 * uios with user space addresses. In our case, with a 1597 * kernel uio, it still reads into the buffer cache, but it 1598 * will just try to release the buffer from the cache later 1599 * on in ffs_read(). 1600 * 1601 * ZFS does not pay attention to IO_DIRECT for reads. 1602 * 1603 * UFS does not pay attention to IO_SYNC for reads. 1604 * 1605 * ZFS pays attention to IO_SYNC (which translates into the 1606 * Solaris define FRSYNC for zfs_read()) for reads. It 1607 * attempts to sync the file before reading. 1608 * 1609 * So, to attempt to provide some barrier semantics in the 1610 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 1611 */ 1612 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 1613 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 1614 1615 VOP_UNLOCK(xbb->vn, 0); 1616 break; 1617 case BIO_WRITE: { 1618 struct mount *mountpoint; 1619 1620 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 1621 1622 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 1623 1624 /* 1625 * UFS pays attention to IO_DIRECT for writes. The write 1626 * is done asynchronously. (Normally the write would just 1627 * get put into cache. 1628 * 1629 * UFS pays attention to IO_SYNC for writes. It will 1630 * attempt to write the buffer out synchronously if that 1631 * flag is set. 1632 * 1633 * ZFS does not pay attention to IO_DIRECT for writes. 1634 * 1635 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 1636 * for writes. It will flush the transaction from the 1637 * cache before returning. 1638 * 1639 * So if we've got the BIO_ORDERED flag set, we want 1640 * IO_SYNC in either the UFS or ZFS case. 1641 */ 1642 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
1643 IO_SYNC : 0, file_data->cred); 1644 VOP_UNLOCK(xbb->vn, 0); 1645 1646 vn_finished_write(mountpoint); 1647 1648 break; 1649 } 1650 default: 1651 panic("invalid operation %d", operation); 1652 /* NOTREACHED */ 1653 } 1654 VFS_UNLOCK_GIANT(vfs_is_locked); 1655 1656 #ifdef XBB_USE_BOUNCE_BUFFERS 1657 /* We only need to copy here for read operations */ 1658 if (operation == BIO_READ) { 1659 1660 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 1661 xiovec = file_data->saved_xiovecs; 1662 seg_idx < saved_uio_iovcnt; seg_idx++, 1663 xiovec++, p_vaddr++) { 1664 1665 /* 1666 * Note that we have to use the copy of the 1667 * io vector we made above. uiomove() modifies 1668 * the uio and its referenced vector as uiomove 1669 * performs the copy, so we can't rely on any 1670 * state from the original uio. 1671 */ 1672 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 1673 } 1674 } 1675 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1676 1677 bailout_send_response: 1678 1679 /* 1680 * All I/O is already done, send the response. A lock is not 1681 * necessary here because we're single threaded, and therefore the 1682 * only context accessing this request right now. If that changes, 1683 * we may need some locking here. 1684 */ 1685 xbb_unmap_req(req); 1686 xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY : 1687 BLKIF_RSP_ERROR); 1688 devstat_end_transaction(xbb->xbb_stats, 1689 /*bytes*/error == 0 ? req->nr_512b_sectors << 9 1690 : 0, 1691 req->ds_tag_type, 1692 req->ds_trans_type, 1693 /*now*/NULL, 1694 /*then*/&req->ds_t0); 1695 xbb_release_req(xbb, req); 1696 1697 return (0); 1698 } 1699 1700 /*--------------------------- Backend Configuration --------------------------*/ 1701 /** 1702 * Close and cleanup any backend device/file specific state for this 1703 * block back instance. 1704 * 1705 * \param xbb Per-instance xbb configuration structure. 1706 */ 1707 static void 1708 xbb_close_backend(struct xbb_softc *xbb) 1709 { 1710 DROP_GIANT(); 1711 DPRINTF("closing dev=%s\n", xbb->dev_name); 1712 if (xbb->vn) { 1713 int flags = FREAD; 1714 int vfs_is_locked = 0; 1715 1716 if ((xbb->flags & XBBF_READ_ONLY) == 0) 1717 flags |= FWRITE; 1718 1719 switch (xbb->device_type) { 1720 case XBB_TYPE_DISK: 1721 if (xbb->backend.dev.csw) { 1722 dev_relthread(xbb->backend.dev.cdev, 1723 xbb->backend.dev.dev_ref); 1724 xbb->backend.dev.csw = NULL; 1725 xbb->backend.dev.cdev = NULL; 1726 } 1727 break; 1728 case XBB_TYPE_FILE: 1729 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 1730 break; 1731 case XBB_TYPE_NONE: 1732 default: 1733 panic("Unexpected backend type."); 1734 break; 1735 } 1736 1737 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 1738 xbb->vn = NULL; 1739 1740 switch (xbb->device_type) { 1741 case XBB_TYPE_DISK: 1742 break; 1743 case XBB_TYPE_FILE: 1744 VFS_UNLOCK_GIANT(vfs_is_locked); 1745 if (xbb->backend.file.cred != NULL) { 1746 crfree(xbb->backend.file.cred); 1747 xbb->backend.file.cred = NULL; 1748 } 1749 break; 1750 case XBB_TYPE_NONE: 1751 default: 1752 panic("Unexpected backend type."); 1753 break; 1754 } 1755 } 1756 PICKUP_GIANT(); 1757 } 1758 1759 /** 1760 * Open a character device to be used for backend I/O. 1761 * 1762 * \param xbb Per-instance xbb configuration structure. 1763 * 1764 * \return 0 for success, errno codes for failure. 
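 *
 * \note xbb->vn is expected to already reference an open, locked disk
 *       vnode (see xbb_open_backend()) when this routine is called.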
1765 */ 1766 static int 1767 xbb_open_dev(struct xbb_softc *xbb) 1768 { 1769 struct vattr vattr; 1770 struct cdev *dev; 1771 struct cdevsw *devsw; 1772 int error; 1773 1774 xbb->device_type = XBB_TYPE_DISK; 1775 xbb->dispatch_io = xbb_dispatch_dev; 1776 xbb->backend.dev.cdev = xbb->vn->v_rdev; 1777 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 1778 &xbb->backend.dev.dev_ref); 1779 if (xbb->backend.dev.csw == NULL) 1780 panic("Unable to retrieve device switch"); 1781 1782 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 1783 if (error) { 1784 xenbus_dev_fatal(xbb->dev, error, "error getting " 1785 "vnode attributes for device %s", 1786 xbb->dev_name); 1787 return (error); 1788 } 1789 1790 1791 dev = xbb->vn->v_rdev; 1792 devsw = dev->si_devsw; 1793 if (!devsw->d_ioctl) { 1794 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 1795 "device %s!", xbb->dev_name); 1796 return (ENODEV); 1797 } 1798 1799 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 1800 (caddr_t)&xbb->sector_size, FREAD, 1801 curthread); 1802 if (error) { 1803 xenbus_dev_fatal(xbb->dev, error, 1804 "error calling ioctl DIOCGSECTORSIZE " 1805 "for device %s", xbb->dev_name); 1806 return (error); 1807 } 1808 1809 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 1810 (caddr_t)&xbb->media_size, FREAD, 1811 curthread); 1812 if (error) { 1813 xenbus_dev_fatal(xbb->dev, error, 1814 "error calling ioctl DIOCGMEDIASIZE " 1815 "for device %s", xbb->dev_name); 1816 return (error); 1817 } 1818 1819 return (0); 1820 } 1821 1822 /** 1823 * Open a file to be used for backend I/O. 1824 * 1825 * \param xbb Per-instance xbb configuration structure. 1826 * 1827 * \return 0 for success, errno codes for failure. 1828 */ 1829 static int 1830 xbb_open_file(struct xbb_softc *xbb) 1831 { 1832 struct xbb_file_data *file_data; 1833 struct vattr vattr; 1834 int error; 1835 1836 file_data = &xbb->backend.file; 1837 xbb->device_type = XBB_TYPE_FILE; 1838 xbb->dispatch_io = xbb_dispatch_file; 1839 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 1840 if (error != 0) { 1841 xenbus_dev_fatal(xbb->dev, error, 1842 "error calling VOP_GETATTR()" 1843 "for file %s", xbb->dev_name); 1844 return (error); 1845 } 1846 1847 /* 1848 * Verify that we have the ability to upgrade to exclusive 1849 * access on this file so we can trap errors at open instead 1850 * of reporting them during first access. 1851 */ 1852 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 1853 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 1854 if (xbb->vn->v_iflag & VI_DOOMED) { 1855 error = EBADF; 1856 xenbus_dev_fatal(xbb->dev, error, 1857 "error locking file %s", 1858 xbb->dev_name); 1859 1860 return (error); 1861 } 1862 } 1863 1864 file_data->cred = crhold(curthread->td_ucred); 1865 xbb->media_size = vattr.va_size; 1866 1867 /* 1868 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 1869 * With ZFS, it is 131072 bytes. Block sizes that large don't work 1870 * with disklabel and UFS on FreeBSD at least. Large block sizes 1871 * may not work with other OSes as well. So just export a sector 1872 * size of 512 bytes, which should work with any OS or 1873 * application. Since our backing is a file, any block size will 1874 * work fine for the backing store. 1875 */ 1876 #if 0 1877 xbb->sector_size = vattr.va_blocksize; 1878 #endif 1879 xbb->sector_size = 512; 1880 1881 /* 1882 * Sanity check. The media size has to be at least one 1883 * sector long. 
1884 */ 1885 if (xbb->media_size < xbb->sector_size) { 1886 error = EINVAL; 1887 xenbus_dev_fatal(xbb->dev, error, 1888 "file %s size %ju < block size %u", 1889 xbb->dev_name, 1890 (uintmax_t)xbb->media_size, 1891 xbb->sector_size); 1892 } 1893 return (error); 1894 } 1895 1896 /** 1897 * Open the backend provider for this connection. 1898 * 1899 * \param xbb Per-instance xbb configuration structure. 1900 * 1901 * \return 0 for success, errno codes for failure. 1902 */ 1903 static int 1904 xbb_open_backend(struct xbb_softc *xbb) 1905 { 1906 struct nameidata nd; 1907 int flags; 1908 int error; 1909 int vfs_is_locked; 1910 1911 flags = FREAD; 1912 error = 0; 1913 1914 DPRINTF("opening dev=%s\n", xbb->dev_name); 1915 1916 if ((xbb->flags & XBBF_READ_ONLY) == 0) 1917 flags |= FWRITE; 1918 1919 if (!curthread->td_proc->p_fd->fd_cdir) { 1920 curthread->td_proc->p_fd->fd_cdir = rootvnode; 1921 VREF(rootvnode); 1922 } 1923 if (!curthread->td_proc->p_fd->fd_rdir) { 1924 curthread->td_proc->p_fd->fd_rdir = rootvnode; 1925 VREF(rootvnode); 1926 } 1927 if (!curthread->td_proc->p_fd->fd_jdir) { 1928 curthread->td_proc->p_fd->fd_jdir = rootvnode; 1929 VREF(rootvnode); 1930 } 1931 1932 again: 1933 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 1934 error = vn_open(&nd, &flags, 0, NULL); 1935 if (error) { 1936 /* 1937 * This is the only reasonable guess we can make as far as 1938 * path if the user doesn't give us a fully qualified path. 1939 * If they want to specify a file, they need to specify the 1940 * full path. 1941 */ 1942 if (xbb->dev_name[0] != '/') { 1943 char *dev_path = "/dev/"; 1944 char *dev_name; 1945 1946 /* Try adding device path at beginning of name */ 1947 dev_name = malloc(strlen(xbb->dev_name) 1948 + strlen(dev_path) + 1, 1949 M_XENBLOCKBACK, M_NOWAIT); 1950 if (dev_name) { 1951 sprintf(dev_name, "%s%s", dev_path, 1952 xbb->dev_name); 1953 free(xbb->dev_name, M_XENBLOCKBACK); 1954 xbb->dev_name = dev_name; 1955 goto again; 1956 } 1957 } 1958 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 1959 xbb->dev_name); 1960 return (error); 1961 } 1962 1963 vfs_is_locked = NDHASGIANT(&nd); 1964 1965 NDFREE(&nd, NDF_ONLY_PNBUF); 1966 1967 xbb->vn = nd.ni_vp; 1968 1969 /* We only support disks and files. */ 1970 if (vn_isdisk(xbb->vn, &error)) { 1971 error = xbb_open_dev(xbb); 1972 } else if (xbb->vn->v_type == VREG) { 1973 error = xbb_open_file(xbb); 1974 } else { 1975 error = EINVAL; 1976 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 1977 "or file", xbb->dev_name); 1978 } 1979 VOP_UNLOCK(xbb->vn, 0); 1980 VFS_UNLOCK_GIANT(vfs_is_locked); 1981 1982 if (error != 0) { 1983 xbb_close_backend(xbb); 1984 return (error); 1985 } 1986 1987 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 1988 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 1989 1990 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 1991 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 1992 xbb->dev_name, xbb->sector_size, xbb->media_size); 1993 1994 return (0); 1995 } 1996 1997 /*------------------------ Inter-Domain Communication ------------------------*/ 1998 /** 1999 * Cleanup all inter-domain communication mechanisms. 2000 * 2001 * \param xbb Per-instance xbb configuration structure. 
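 *
 * This is a no-op if the ring was never connected.  Otherwise the event
 * channel interrupt is unbound and all mapped ring pages are released
 * via a grant table unmap operation.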
2002 */ 2003 static void 2004 xbb_disconnect(struct xbb_softc *xbb) 2005 { 2006 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2007 struct gnttab_unmap_grant_ref *op; 2008 u_int ring_idx; 2009 int error; 2010 2011 DPRINTF("\n"); 2012 2013 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2014 return; 2015 2016 if (xbb->irq != 0) { 2017 unbind_from_irqhandler(xbb->irq); 2018 xbb->irq = 0; 2019 } 2020 2021 for (ring_idx = 0, op = ops; 2022 ring_idx < xbb->ring_config.ring_pages; 2023 ring_idx++, op++) { 2024 2025 op->host_addr = xbb->ring_config.gnt_addr 2026 + (ring_idx * PAGE_SIZE); 2027 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2028 op->handle = xbb->ring_config.handle[ring_idx]; 2029 } 2030 2031 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2032 xbb->ring_config.ring_pages); 2033 if (error != 0) 2034 panic("Grant table op failed (%d)", error); 2035 2036 xbb->flags &= ~XBBF_RING_CONNECTED; 2037 } 2038 2039 /** 2040 * Map shared memory ring into domain local address space, initialize 2041 * ring control structures, and bind an interrupt to the event channel 2042 * used to notify us of ring changes. 2043 * 2044 * \param xbb Per-instance xbb configuration structure. 2045 */ 2046 static int 2047 xbb_connect_ring(struct xbb_softc *xbb) 2048 { 2049 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2050 struct gnttab_map_grant_ref *gnt; 2051 u_int ring_idx; 2052 int error; 2053 2054 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2055 return (0); 2056 2057 /* 2058 * Kva for our ring is at the tail of the region of kva allocated 2059 * by xbb_alloc_communication_mem(). 2060 */ 2061 xbb->ring_config.va = xbb->kva 2062 + (xbb->kva_size 2063 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2064 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2065 + (xbb->kva_size 2066 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2067 2068 for (ring_idx = 0, gnt = gnts; 2069 ring_idx < xbb->ring_config.ring_pages; 2070 ring_idx++, gnt++) { 2071 2072 gnt->host_addr = xbb->ring_config.gnt_addr 2073 + (ring_idx * PAGE_SIZE); 2074 gnt->flags = GNTMAP_host_map; 2075 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2076 gnt->dom = xbb->otherend_id; 2077 } 2078 2079 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2080 xbb->ring_config.ring_pages); 2081 if (error) 2082 panic("blkback: Ring page grant table op failed (%d)", error); 2083 2084 for (ring_idx = 0, gnt = gnts; 2085 ring_idx < xbb->ring_config.ring_pages; 2086 ring_idx++, gnt++) { 2087 if (gnt->status != 0) { 2088 xbb->ring_config.va = 0; 2089 xenbus_dev_fatal(xbb->dev, EACCES, 2090 "Ring shared page mapping failed. " 2091 "Status %d.", gnt->status); 2092 return (EACCES); 2093 } 2094 xbb->ring_config.handle[ring_idx] = gnt->handle; 2095 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2096 } 2097 2098 /* Initialize the ring based on ABI. 
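 A front-end built for a different word size publishes a 32-bit or 64-bit request layout whose padding differs from our native one, so the shared pages must be interpreted with the sring structure matching the advertised ABI.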
*/ 2099 switch (xbb->abi) { 2100 case BLKIF_PROTOCOL_NATIVE: 2101 { 2102 blkif_sring_t *sring; 2103 sring = (blkif_sring_t *)xbb->ring_config.va; 2104 BACK_RING_INIT(&xbb->rings.native, sring, 2105 xbb->ring_config.ring_pages * PAGE_SIZE); 2106 break; 2107 } 2108 case BLKIF_PROTOCOL_X86_32: 2109 { 2110 blkif_x86_32_sring_t *sring_x86_32; 2111 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2112 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2113 xbb->ring_config.ring_pages * PAGE_SIZE); 2114 break; 2115 } 2116 case BLKIF_PROTOCOL_X86_64: 2117 { 2118 blkif_x86_64_sring_t *sring_x86_64; 2119 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2120 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2121 xbb->ring_config.ring_pages * PAGE_SIZE); 2122 break; 2123 } 2124 default: 2125 panic("Unexpected blkif protocol ABI."); 2126 } 2127 2128 xbb->flags |= XBBF_RING_CONNECTED; 2129 2130 error = 2131 bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id, 2132 xbb->ring_config.evtchn, 2133 device_get_nameunit(xbb->dev), 2134 xbb_intr, /*arg*/xbb, 2135 INTR_TYPE_BIO | INTR_MPSAFE, 2136 &xbb->irq); 2137 if (error) { 2138 xbb_disconnect(xbb); 2139 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2140 return (error); 2141 } 2142 2143 DPRINTF("rings connected!\n"); 2144 2145 return 0; 2146 } 2147 2148 /** 2149 * Size KVA and pseudo-physical address allocations based on negotiated 2150 * values for the size and number of I/O requests, and the size of our 2151 * communication ring. 2152 * 2153 * \param xbb Per-instance xbb configuration structure. 2154 * 2155 * These address spaces are used to dynamically map pages in the 2156 * front-end's domain into our own. 2157 */ 2158 static int 2159 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2160 { 2161 xbb->kva_size = (xbb->ring_config.ring_pages 2162 + (xbb->max_requests * xbb->max_request_segments)) 2163 * PAGE_SIZE; 2164 #ifndef XENHVM 2165 xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size); 2166 if (xbb->kva == 0) 2167 return (ENOMEM); 2168 xbb->gnt_base_addr = xbb->kva; 2169 #else /* XENHVM */ 2170 /* 2171 * Reserve a range of pseudo physical memory that we can map 2172 * into kva. These pages will only be backed by machine 2173 * pages ("real memory") during the lifetime of front-end requests 2174 * via grant table operations. 2175 */ 2176 xbb->pseudo_phys_res_id = 0; 2177 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 2178 &xbb->pseudo_phys_res_id, 2179 0, ~0, xbb->kva_size, 2180 RF_ACTIVE); 2181 if (xbb->pseudo_phys_res == NULL) { 2182 xbb->kva = 0; 2183 return (ENOMEM); 2184 } 2185 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2186 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2187 #endif /* XENHVM */ 2188 return (0); 2189 } 2190 2191 /** 2192 * Free dynamically allocated KVA or pseudo-physical address allocations. 2193 * 2194 * \param xbb Per-instance xbb configuration structure. 2195 */ 2196 static void 2197 xbb_free_communication_mem(struct xbb_softc *xbb) 2198 { 2199 if (xbb->kva != 0) { 2200 #ifndef XENHVM 2201 kmem_free(kernel_map, xbb->kva, xbb->kva_size); 2202 #else 2203 if (xbb->pseudo_phys_res != NULL) { 2204 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2205 xbb->pseudo_phys_res_id, 2206 xbb->pseudo_phys_res); 2207 xbb->pseudo_phys_res = NULL; 2208 } 2209 #endif 2210 } 2211 xbb->kva = 0; 2212 xbb->gnt_base_addr = 0; 2213 } 2214 2215 /** 2216 * Collect front-end information from the XenStore. 
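 *
 * The front-end's XenStore tree supplies the mandatory ring-ref and
 * event-channel nodes, the optional ring-pages, max-requests,
 * max-request-segments, and max-request-size negotiation nodes, and the
 * optional protocol node. A minimal front-end tree might look like this
 * (values are purely illustrative):
 *
 *     ring-ref = "2049"
 *     event-channel = "15"
 *     protocol = "x86_64-abi"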
2217 * 2218 * \param xbb Per-instance xbb configuration structure. 2219 */ 2220 static int 2221 xbb_collect_frontend_info(struct xbb_softc *xbb) 2222 { 2223 char protocol_abi[64]; 2224 const char *otherend_path; 2225 int error; 2226 u_int ring_idx; 2227 2228 otherend_path = xenbus_get_otherend_path(xbb->dev); 2229 2230 /* 2231 * Mandatory data (used in all versions of the protocol) first. 2232 */ 2233 error = xs_gather(XST_NIL, otherend_path, 2234 "ring-ref", "%" PRIu32, 2235 &xbb->ring_config.ring_ref[0], 2236 "event-channel", "%" PRIu32, 2237 &xbb->ring_config.evtchn, 2238 NULL); 2239 if (error != 0) { 2240 xenbus_dev_fatal(xbb->dev, error, 2241 "Unable to retrieve ring information from " 2242 "frontend %s. Unable to connect.", 2243 xenbus_get_otherend_path(xbb->dev)); 2244 return (error); 2245 } 2246 2247 /* 2248 * These fields are initialized to legacy protocol defaults 2249 * so we only need to fail if reading the updated value succeeds 2250 * and the new value is outside of its allowed range. 2251 * 2252 * \note xs_gather() returns on the first encountered error, so 2253 * we must use independent calls in order to guarantee 2254 * we don't miss information in a sparsely populated front-end 2255 * tree. 2256 */ 2257 (void)xs_scanf(XST_NIL, otherend_path, 2258 "ring-pages", NULL, "%" PRIu32, 2259 &xbb->ring_config.ring_pages); 2260 2261 (void)xs_scanf(XST_NIL, otherend_path, 2262 "max-requests", NULL, "%" PRIu32, 2263 &xbb->max_requests); 2264 2265 (void)xs_scanf(XST_NIL, otherend_path, 2266 "max-request-segments", NULL, "%" PRIu32, 2267 &xbb->max_request_segments); 2268 2269 (void)xs_scanf(XST_NIL, otherend_path, 2270 "max-request-size", NULL, "%" PRIu32, 2271 &xbb->max_request_size); 2272 2273 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 2274 xenbus_dev_fatal(xbb->dev, EINVAL, 2275 "Front-end specified ring-pages of %u " 2276 "exceeds backend limit of %zu. " 2277 "Unable to connect.", 2278 xbb->ring_config.ring_pages, 2279 XBB_MAX_RING_PAGES); 2280 return (EINVAL); 2281 } else if (xbb->max_requests > XBB_MAX_REQUESTS) { 2282 xenbus_dev_fatal(xbb->dev, EINVAL, 2283 "Front-end specified max_requests of %u " 2284 "exceeds backend limit of %u. " 2285 "Unable to connect.", 2286 xbb->max_requests, 2287 XBB_MAX_REQUESTS); 2288 return (EINVAL); 2289 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) { 2290 xenbus_dev_fatal(xbb->dev, EINVAL, 2291 "Front-end specified max_request_segments " 2292 "of %u exceeds backend limit of %u. " 2293 "Unable to connect.", 2294 xbb->max_request_segments, 2295 XBB_MAX_SEGMENTS_PER_REQUEST); 2296 return (EINVAL); 2297 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) { 2298 xenbus_dev_fatal(xbb->dev, EINVAL, 2299 "Front-end specified max_request_size " 2300 "of %u exceeds backend limit of %u. " 2301 "Unable to connect.", 2302 xbb->max_request_size, 2303 XBB_MAX_REQUEST_SIZE); 2304 return (EINVAL); 2305 } 2306 2307 /* If using a multi-page ring, pull in the remaining references. */ 2308 for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { 2309 char ring_ref_name[] = "ring-refXX"; 2310 2311 snprintf(ring_ref_name, sizeof(ring_ref_name), 2312 "ring-ref%u", ring_idx); 2313 error = xs_scanf(XST_NIL, otherend_path, 2314 ring_ref_name, NULL, "%" PRIu32, 2315 &xbb->ring_config.ring_ref[ring_idx]); 2316 if (error != 0) { 2317 xenbus_dev_fatal(xbb->dev, error, 2318 "Failed to retrieve grant reference " 2319 "for page %u of shared ring.
Unable " "to connect.", ring_idx); 2320 2321 return (error); 2322 } 2323 } 2324 2325 error = xs_gather(XST_NIL, otherend_path, 2326 "protocol", "%63s", protocol_abi, 2327 NULL); 2328 if (error != 0 2329 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 2330 /* 2331 * Assume native if the frontend has not 2332 * published ABI data or if the ABI it has published 2333 * matches our own. 2334 */ 2335 xbb->abi = BLKIF_PROTOCOL_NATIVE; 2336 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 2337 2338 xbb->abi = BLKIF_PROTOCOL_X86_32; 2339 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 2340 2341 xbb->abi = BLKIF_PROTOCOL_X86_64; 2342 } else { 2343 2344 xenbus_dev_fatal(xbb->dev, EINVAL, 2345 "Unknown protocol ABI (%s) published by " 2346 "frontend. Unable to connect.", protocol_abi); 2347 return (EINVAL); 2348 } 2349 return (0); 2350 } 2351 2352 /** 2353 * Allocate per-request data structures given request size and number 2354 * information negotiated with the front-end. 2355 * 2356 * \param xbb Per-instance xbb configuration structure. 2357 */ 2358 static int 2359 xbb_alloc_requests(struct xbb_softc *xbb) 2360 { 2361 struct xbb_xen_req *req; 2362 struct xbb_xen_req *last_req; 2363 uint8_t *req_kva; 2364 u_long gnt_base; 2365 2366 /* 2367 * Allocate request bookkeeping data structures. 2368 */ 2369 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 2370 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 2371 if (xbb->requests == NULL) { 2372 xenbus_dev_fatal(xbb->dev, ENOMEM, 2373 "Unable to allocate request structures"); 2374 return (ENOMEM); 2375 } 2376 2377 req_kva = (uint8_t *)xbb->kva; 2378 gnt_base = xbb->gnt_base_addr; 2379 req = xbb->requests; 2380 last_req = &xbb->requests[xbb->max_requests - 1]; 2381 while (req <= last_req) { 2382 int seg; 2383 2384 req->xbb = xbb; 2385 req->kva = req_kva; 2386 req->gnt_handles = malloc(xbb->max_request_segments 2387 * sizeof(*req->gnt_handles), 2388 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 2389 if (req->gnt_handles == NULL) { 2390 xenbus_dev_fatal(xbb->dev, ENOMEM, 2391 "Unable to allocate request " 2392 "grant references"); 2393 return (ENOMEM); 2394 } 2395 #ifdef XBB_USE_BOUNCE_BUFFERS 2396 req->bounce = malloc(xbb->max_request_size, 2397 M_XENBLOCKBACK, M_NOWAIT); 2398 if (req->bounce == NULL) { 2399 xenbus_dev_fatal(xbb->dev, ENOMEM, 2400 "Unable to allocate request " 2401 "bounce buffers"); 2402 return (ENOMEM); 2403 } 2404 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2405 req->gnt_base = gnt_base; 2406 req_kva += xbb->max_request_segments * PAGE_SIZE; 2407 gnt_base += xbb->max_request_segments * PAGE_SIZE; 2408 SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); 2409 2410 for (seg = 0; seg < xbb->max_request_segments; seg++) 2411 req->gnt_handles[seg] = GRANT_REF_INVALID; 2412 2413 req++; 2414 } 2415 return (0); 2416 } 2417 2418 /** 2419 * Supply information about the physical device to the frontend 2420 * via XenBus. 2421 * 2422 * \param xbb Per-instance xbb configuration structure.
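 *
 * \note The nodes written below our backend path are sectors, info, and
 *       sector-size, all updated within a single XenStore transaction.
 *       For a writable 100MB file backend exported with 512 byte sectors
 *       the result would be (values are illustrative):
 *
 *           sectors = "204800"
 *           info = "0"
 *           sector-size = "512"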
2423 */ 2424 static int 2425 xbb_publish_backend_info(struct xbb_softc *xbb) 2426 { 2427 struct xs_transaction xst; 2428 const char *our_path; 2429 const char *leaf; 2430 int error; 2431 2432 our_path = xenbus_get_node(xbb->dev); 2433 while (1) { 2434 error = xs_transaction_start(&xst); 2435 if (error != 0) { 2436 xenbus_dev_fatal(xbb->dev, error, 2437 "Error publishing backend info " 2438 "(start transaction)"); 2439 return (error); 2440 } 2441 2442 leaf = "sectors"; 2443 error = xs_printf(xst, our_path, leaf, 2444 "%"PRIu64, xbb->media_num_sectors); 2445 if (error != 0) 2446 break; 2447 2448 /* XXX Support all VBD attributes here. */ 2449 leaf = "info"; 2450 error = xs_printf(xst, our_path, leaf, "%u", 2451 xbb->flags & XBBF_READ_ONLY 2452 ? VDISK_READONLY : 0); 2453 if (error != 0) 2454 break; 2455 2456 leaf = "sector-size"; 2457 error = xs_printf(xst, our_path, leaf, "%u", 2458 xbb->sector_size); 2459 if (error != 0) 2460 break; 2461 2462 error = xs_transaction_end(xst, 0); 2463 if (error == 0) { 2464 return (0); 2465 } else if (error != EAGAIN) { 2466 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 2467 return (error); 2468 } 2469 } 2470 2471 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 2472 our_path, leaf); 2473 xs_transaction_end(xst, 1); 2474 return (error); 2475 } 2476 2477 /** 2478 * Connect to our blkfront peer now that it has completed publishing 2479 * its configuration into the XenStore. 2480 * 2481 * \param xbb Per-instance xbb configuration structure. 2482 */ 2483 static void 2484 xbb_connect(struct xbb_softc *xbb) 2485 { 2486 int error; 2487 2488 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 2489 return; 2490 2491 if (xbb_collect_frontend_info(xbb) != 0) 2492 return; 2493 2494 /* Allocate resources whose size depends on front-end configuration. */ 2495 error = xbb_alloc_communication_mem(xbb); 2496 if (error != 0) { 2497 xenbus_dev_fatal(xbb->dev, error, 2498 "Unable to allocate communication memory"); 2499 return; 2500 } 2501 2502 error = xbb_alloc_requests(xbb); 2503 if (error != 0) { 2504 /* Specific errors are reported by xbb_alloc_requests(). */ 2505 return; 2506 } 2507 2508 /* 2509 * Connect communication channel. 2510 */ 2511 error = xbb_connect_ring(xbb); 2512 if (error != 0) { 2513 /* Specific errors are reported by xbb_connect_ring(). */ 2514 return; 2515 } 2516 2517 if (xbb_publish_backend_info(xbb) != 0) { 2518 /* 2519 * If we can't publish our data, we cannot participate 2520 * in this connection, and waiting for a front-end state 2521 * change will not help the situation. 2522 */ 2523 xbb_disconnect(xbb); 2524 return; 2525 } 2526 2527 /* Ready for I/O. */ 2528 xenbus_set_state(xbb->dev, XenbusStateConnected); 2529 } 2530 2531 /*-------------------------- Device Teardown Support -------------------------*/ 2532 /** 2533 * Perform device shutdown functions. 2534 * 2535 * \param xbb Per-instance xbb configuration structure. 2536 * 2537 * Mark this instance as shutting down, wait for any active I/O on the 2538 * backend device/file to drain, disconnect from the front-end, and notify 2539 * any waiters (e.g. a thread invoking our detach method) that detach can 2540 * now proceed. 2541 */ 2542 static int 2543 xbb_shutdown(struct xbb_softc *xbb) 2544 { 2545 static int in_shutdown; 2546 2547 DPRINTF("\n"); 2548 2549 /* 2550 * Due to the need to drop our mutex during some 2551 * xenbus operations, it is possible for two threads 2552 * to attempt to close out shutdown processing at 2553 * the same time. 
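 The static in_shutdown flag below marks the window in which our mutex is dropped for the XenBus state change.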
Tell the caller that hits this 2554 * race to try back later. 2555 */ 2556 if (in_shutdown != 0) 2557 return (EAGAIN); 2558 2559 DPRINTF("\n"); 2560 2561 /* Indicate shutdown is in progress. */ 2562 xbb->flags |= XBBF_SHUTDOWN; 2563 2564 /* Wait for requests to complete. */ 2565 if (xbb->active_request_count != 0) 2566 return (EAGAIN); 2567 2568 DPRINTF("\n"); 2569 2570 /* Disconnect from the front-end. */ 2571 xbb_disconnect(xbb); 2572 2573 in_shutdown = 1; 2574 mtx_unlock(&xbb->lock); 2575 xenbus_set_state(xbb->dev, XenbusStateClosed); 2576 mtx_lock(&xbb->lock); 2577 in_shutdown = 0; 2578 2579 /* Indicate to xbb_detach() that it is safe to proceed. */ 2580 wakeup(xbb); 2581 2582 return (0); 2583 } 2584 2585 /** 2586 * Report an attach time error to the console and Xen, and clean up 2587 * this instance by forcing immediate detach processing. 2588 * 2589 * \param xbb Per-instance xbb configuration structure. 2590 * \param err Errno describing the error. 2591 * \param fmt Printf style format string and arguments. 2592 */ 2593 static void 2594 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 2595 { 2596 va_list ap; 2597 va_list ap_hotplug; 2598 2599 va_start(ap, fmt); 2600 va_copy(ap_hotplug, ap); 2601 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 2602 "hotplug-error", fmt, ap_hotplug); 2603 va_end(ap_hotplug); 2604 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2605 "hotplug-status", "error"); 2606 2607 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 2608 va_end(ap); 2609 2610 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2611 "online", "0"); 2612 xbb_detach(xbb->dev); 2613 } 2614 2615 /*---------------------------- NewBus Entrypoints ----------------------------*/ 2616 /** 2617 * Inspect a XenBus device and claim it if it is of the appropriate type. 2618 * 2619 * \param dev NewBus device object representing a candidate XenBus device. 2620 * 2621 * \return 0 for success, errno codes for failure. 2622 */ 2623 static int 2624 xbb_probe(device_t dev) 2625 { 2626 2627 if (!strcmp(xenbus_get_type(dev), "vbd")) { 2628 device_set_desc(dev, "Backend Virtual Block Device"); 2629 device_quiet(dev); 2630 return (0); 2631 } 2632 2633 return (ENXIO); 2634 } 2635 2636 /** 2637 * Attach to a XenBus device that has been claimed by our probe routine. 2638 * 2639 * \param dev NewBus device object representing this Xen Block Back instance. 2640 * 2641 * \return 0 for success, errno codes for failure. 2642 */ 2643 static int 2644 xbb_attach(device_t dev) 2645 { 2646 struct xbb_softc *xbb; 2647 int error; 2648 2649 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 2650 2651 /* 2652 * Basic initialization. 2653 * After this block it is safe to call xbb_detach() 2654 * to clean up any allocated data for this instance. 2655 */ 2656 xbb = device_get_softc(dev); 2657 xbb->dev = dev; 2658 xbb->otherend_id = xenbus_get_otherend_id(dev); 2659 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 2660 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 2661 SLIST_INIT(&xbb->request_free_slist); 2662 2663 /* 2664 * Protocol defaults valid even if all negotiation fails. 2665 */ 2666 xbb->ring_config.ring_pages = 1; 2667 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); 2668 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; 2669 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 2670 2671 /* 2672 * Publish protocol capabilities for consumption by the 2673 * front-end.
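 This covers the feature-barrier and feature-flush-cache capabilities as well as the max-ring-pages, max-requests, max-request-segments, and max-request-size limits established by the compile-time tunables above.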
2674 */ 2675 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2676 "feature-barrier", "1"); 2677 if (error) { 2678 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 2679 xenbus_get_node(xbb->dev)); 2680 return (error); 2681 } 2682 2683 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2684 "feature-flush-cache", "1"); 2685 if (error) { 2686 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 2687 xenbus_get_node(xbb->dev)); 2688 return (error); 2689 } 2690 2691 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2692 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 2693 if (error) { 2694 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 2695 xenbus_get_node(xbb->dev)); 2696 return (error); 2697 } 2698 2699 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2700 "max-requests", "%u", XBB_MAX_REQUESTS); 2701 if (error) { 2702 xbb_attach_failed(xbb, error, "writing %s/max-requests", 2703 xenbus_get_node(xbb->dev)); 2704 return (error); 2705 } 2706 2707 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2708 "max-request-segments", "%u", 2709 XBB_MAX_SEGMENTS_PER_REQUEST); 2710 if (error) { 2711 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 2712 xenbus_get_node(xbb->dev)); 2713 return (error); 2714 } 2715 2716 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2717 "max-request-size", "%u", 2718 XBB_MAX_REQUEST_SIZE); 2719 if (error) { 2720 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 2721 xenbus_get_node(xbb->dev)); 2722 return (error); 2723 } 2724 2725 /* Collect physical device information. */ 2726 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 2727 "device-type", NULL, &xbb->dev_type, 2728 NULL); 2729 if (error != 0) 2730 xbb->dev_type = NULL; 2731 2732 error = xs_gather(XST_NIL, xenbus_get_node(dev), 2733 "mode", NULL, &xbb->dev_mode, 2734 "params", NULL, &xbb->dev_name, 2735 NULL); 2736 if (error != 0) { 2737 xbb_attach_failed(xbb, error, "reading backend fields at %s", 2738 xenbus_get_node(dev)); 2739 return (ENXIO); 2740 } 2741 2742 /* Parse fopen style mode flags. */ 2743 if (strchr(xbb->dev_mode, 'w') == NULL) 2744 xbb->flags |= XBBF_READ_ONLY; 2745 2746 /* 2747 * Verify the physical device is present and can support 2748 * the desired I/O mode. 2749 */ 2750 DROP_GIANT(); 2751 error = xbb_open_backend(xbb); 2752 PICKUP_GIANT(); 2753 if (error != 0) { 2754 xbb_attach_failed(xbb, error, "Unable to open %s", 2755 xbb->dev_name); 2756 return (ENXIO); 2757 } 2758 2759 /* Use devstat(9) for recording statistics. */ 2760 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 2761 xbb->sector_size, 2762 DEVSTAT_ALL_SUPPORTED, 2763 DEVSTAT_TYPE_DIRECT 2764 | DEVSTAT_TYPE_IF_OTHER, 2765 DEVSTAT_PRIORITY_OTHER); 2766 /* 2767 * Create a taskqueue for doing work that must occur from a 2768 * thread context. 2769 */ 2770 xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT, 2771 taskqueue_thread_enqueue, 2772 /*context*/&xbb->io_taskqueue); 2773 if (xbb->io_taskqueue == NULL) { 2774 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue"); 2775 return (ENOMEM); 2776 } 2777 2778 taskqueue_start_threads(&xbb->io_taskqueue, 2779 /*num threads*/1, 2780 /*priority*/PWAIT, 2781 /*thread name*/ 2782 "%s taskq", device_get_nameunit(dev)); 2783 2784 /* Update hot-plug status to satisfy xend.
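 xend expects this node to be set (normally by a hotplug script) before it considers the backend fully configured, so we write it ourselves here.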
*/ 2785 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 2786 "hotplug-status", "connected"); 2787 if (error) { 2788 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 2789 xenbus_get_node(xbb->dev)); 2790 return (error); 2791 } 2792 2793 /* Tell the front end that we are ready to connect. */ 2794 xenbus_set_state(dev, XenbusStateInitWait); 2795 2796 return (0); 2797 } 2798 2799 /** 2800 * Detach from a block back device instance. 2801 * 2802 * \param dev NewBus device object representing this Xen Block Back instance. 2803 * 2804 * \return 0 for success, errno codes for failure. 2805 * 2806 * \note A block back device may be detached at any time in its life-cycle, 2807 * including part way through the attach process. For this reason, 2808 * initialization order and the initialization state checks in this 2809 * routine must be carefully coupled so that attach time failures 2810 * are gracefully handled. 2811 */ 2812 static int 2813 xbb_detach(device_t dev) 2814 { 2815 struct xbb_softc *xbb; 2816 2817 DPRINTF("\n"); 2818 2819 xbb = device_get_softc(dev); 2820 mtx_lock(&xbb->lock); 2821 while (xbb_shutdown(xbb) == EAGAIN) { 2822 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 2823 "xbb_shutdown", 0); 2824 } 2825 mtx_unlock(&xbb->lock); 2826 mtx_destroy(&xbb->lock); 2827 2828 DPRINTF("\n"); 2829 2830 if (xbb->io_taskqueue != NULL) 2831 taskqueue_free(xbb->io_taskqueue); 2832 2833 if (xbb->xbb_stats != NULL) 2834 devstat_remove_entry(xbb->xbb_stats); 2835 2836 xbb_close_backend(xbb); 2837 xbb_free_communication_mem(xbb); 2838 2839 if (xbb->dev_mode != NULL) { 2840 free(xbb->dev_mode, M_XENBUS); 2841 xbb->dev_mode = NULL; 2842 } 2843 2844 if (xbb->dev_type != NULL) { 2845 free(xbb->dev_type, M_XENBUS); 2846 xbb->dev_type = NULL; 2847 } 2848 2849 if (xbb->dev_name != NULL) { 2850 free(xbb->dev_name, M_XENBUS); 2851 xbb->dev_name = NULL; 2852 } 2853 2854 if (xbb->requests != NULL) { 2855 struct xbb_xen_req *req; 2856 struct xbb_xen_req *last_req; 2857 2858 req = xbb->requests; 2859 last_req = &xbb->requests[xbb->max_requests - 1]; 2860 while (req <= last_req) { 2861 #ifdef XBB_USE_BOUNCE_BUFFERS 2862 if (req->bounce != NULL) { 2863 free(req->bounce, M_XENBLOCKBACK); 2864 req->bounce = NULL; 2865 } 2866 #endif 2867 if (req->gnt_handles != NULL) { 2868 free(req->gnt_handles, M_XENBLOCKBACK); 2869 req->gnt_handles = NULL; 2870 } 2871 req++; 2872 } 2873 free(xbb->requests, M_XENBLOCKBACK); 2874 xbb->requests = NULL; 2875 } 2876 2877 return (0); 2878 } 2879 2880 /** 2881 * Prepare this block back device for suspension of this VM. 2882 * 2883 * \param dev NewBus device object representing this Xen Block Back instance. 2884 * 2885 * \return 0 for success, errno codes for failure. 2886 */ 2887 static int 2888 xbb_suspend(device_t dev) 2889 { 2890 #ifdef NOT_YET 2891 struct xbb_softc *sc = device_get_softc(dev); 2892 2893 /* Prevent new requests being issued until we fix things up. */ 2894 mtx_lock(&sc->xb_io_lock); 2895 sc->connected = BLKIF_STATE_SUSPENDED; 2896 mtx_unlock(&sc->xb_io_lock); 2897 #endif 2898 2899 return (0); 2900 } 2901 2902 /** 2903 * Perform any processing required to recover from a suspended state. 2904 * 2905 * \param dev NewBus device object representing this Xen Block Back instance. 2906 * 2907 * \return 0 for success, errno codes for failure. 2908 */ 2909 static int 2910 xbb_resume(device_t dev) 2911 { 2912 return (0); 2913 } 2914 2915 /** 2916 * Handle state changes expressed via the XenStore by our front-end peer.
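 *
 * Connection is attempted when the front-end reports Initialised or
 * Connected; shutdown processing is started when it reports Closed or
 * drops back to InitWait. Initialising and Closing require no action
 * from the backend.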
2917 * 2918 * \param dev NewBus device object representing this Xen 2919 * Block Back instance. 2920 * \param frontend_state The new state of the front-end. 2921 * 2922 * \return 0 for success, errno codes for failure. 2923 */ 2924 static int 2925 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 2926 { 2927 struct xbb_softc *xbb = device_get_softc(dev); 2928 2929 DPRINTF("state=%s\n", xenbus_strstate(frontend_state)); 2930 2931 switch (frontend_state) { 2932 case XenbusStateInitialising: 2933 case XenbusStateClosing: 2934 break; 2935 case XenbusStateInitialised: 2936 case XenbusStateConnected: 2937 xbb_connect(xbb); 2938 break; 2939 case XenbusStateClosed: 2940 case XenbusStateInitWait: 2941 2942 mtx_lock(&xbb->lock); 2943 xbb_shutdown(xbb); 2944 mtx_unlock(&xbb->lock); 2945 break; 2946 default: 2947 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 2948 frontend_state); 2949 break; 2950 } 2951 return (0); 2952 } 2953 2954 /*---------------------------- NewBus Registration ---------------------------*/ 2955 static device_method_t xbb_methods[] = { 2956 /* Device interface */ 2957 DEVMETHOD(device_probe, xbb_probe), 2958 DEVMETHOD(device_attach, xbb_attach), 2959 DEVMETHOD(device_detach, xbb_detach), 2960 DEVMETHOD(device_shutdown, bus_generic_shutdown), 2961 DEVMETHOD(device_suspend, xbb_suspend), 2962 DEVMETHOD(device_resume, xbb_resume), 2963 2964 /* Xenbus interface */ 2965 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 2966 2967 { 0, 0 } 2968 }; 2969 2970 static driver_t xbb_driver = { 2971 "xbbd", 2972 xbb_methods, 2973 sizeof(struct xbb_softc), 2974 }; 2975 devclass_t xbb_devclass; 2976 2977 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 2978