1 /*- 2 * Copyright (c) 2009-2011 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/malloc.h> 47 48 #include <sys/bio.h> 49 #include <sys/bus.h> 50 #include <sys/conf.h> 51 #include <sys/devicestat.h> 52 #include <sys/disk.h> 53 #include <sys/fcntl.h> 54 #include <sys/filedesc.h> 55 #include <sys/kdb.h> 56 #include <sys/module.h> 57 #include <sys/namei.h> 58 #include <sys/proc.h> 59 #include <sys/rman.h> 60 #include <sys/taskqueue.h> 61 #include <sys/types.h> 62 #include <sys/vnode.h> 63 #include <sys/mount.h> 64 #include <sys/sysctl.h> 65 #include <sys/bitstring.h> 66 67 #include <geom/geom.h> 68 69 #include <machine/_inttypes.h> 70 #include <machine/xen/xen-os.h> 71 72 #include <vm/vm.h> 73 #include <vm/vm_extern.h> 74 #include <vm/vm_kern.h> 75 76 #include <xen/blkif.h> 77 #include <xen/evtchn.h> 78 #include <xen/gnttab.h> 79 #include <xen/xen_intr.h> 80 81 #include <xen/interface/event_channel.h> 82 #include <xen/interface/grant_table.h> 83 84 #include <xen/xenbus/xenbusvar.h> 85 86 /*--------------------------- Compile-time Tunables --------------------------*/ 87 /** 88 * The maximum number of outstanding request blocks (request headers plus 89 * additional segment blocks) we will allow in a negotiated block-front/back 90 * communication channel. 91 */ 92 #define XBB_MAX_REQUESTS 256 93 94 /** 95 * \brief Define to force all I/O to be performed on memory owned by the 96 * backend device, with a copy-in/out to the remote domain's memory. 97 * 98 * \note This option is currently required when this driver's domain is 99 * operating in HVM mode on a system using an IOMMU. 
100 * 101 * This driver uses Xen's grant table API to gain access to the memory of 102 * the remote domains it serves. When our domain is operating in PV mode, 103 * the grant table mechanism directly updates our domain's page table entries 104 * to point to the physical pages of the remote domain. This scheme guarantees 105 * that blkback and the backing devices it uses can safely perform DMA 106 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 107 * insure that our domain cannot DMA to pages owned by another domain. As 108 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 109 * table API. For this reason, in HVM mode, we must bounce all requests into 110 * memory that is mapped into our domain at domain startup and thus has 111 * valid IOMMU mappings. 112 */ 113 #define XBB_USE_BOUNCE_BUFFERS 114 115 /** 116 * \brief Define to enable rudimentary request logging to the console. 117 */ 118 #undef XBB_DEBUG 119 120 /*---------------------------------- Macros ----------------------------------*/ 121 /** 122 * Custom malloc type for all driver allocations. 123 */ 124 MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 125 126 #ifdef XBB_DEBUG 127 #define DPRINTF(fmt, args...) \ 128 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 129 #else 130 #define DPRINTF(fmt, args...) do {} while(0) 131 #endif 132 133 /** 134 * The maximum mapped region size per request we will allow in a negotiated 135 * block-front/back communication channel. 136 */ 137 #define XBB_MAX_REQUEST_SIZE \ 138 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 139 140 /** 141 * The maximum number of segments (within a request header and accompanying 142 * segment blocks) per request we will allow in a negotiated block-front/back 143 * communication channel. 144 */ 145 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 146 (MIN(UIO_MAXIOV, \ 147 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 148 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 149 150 /** 151 * The maximum number of shared memory ring pages we will allow in a 152 * negotiated block-front/back communication channel. Allow enough 153 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 154 */ 155 #define XBB_MAX_RING_PAGES \ 156 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ 157 * XBB_MAX_REQUESTS) 158 /** 159 * The maximum number of ring pages that we can allow per request list. 160 * We limit this to the maximum number of segments per request, because 161 * that is already a reasonable number of segments to aggregate. This 162 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 163 * because that would leave situations where we can't dispatch even one 164 * large request. 165 */ 166 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 167 168 /*--------------------------- Forward Declarations ---------------------------*/ 169 struct xbb_softc; 170 struct xbb_xen_req; 171 172 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 173 ...) 
__attribute__((format(printf, 3, 4))); 174 static int xbb_shutdown(struct xbb_softc *xbb); 175 static int xbb_detach(device_t dev); 176 177 /*------------------------------ Data Structures -----------------------------*/ 178 179 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 180 181 typedef enum { 182 XBB_REQLIST_NONE = 0x00, 183 XBB_REQLIST_MAPPED = 0x01 184 } xbb_reqlist_flags; 185 186 struct xbb_xen_reqlist { 187 /** 188 * Back reference to the parent block back instance for this 189 * request. Used during bio_done handling. 190 */ 191 struct xbb_softc *xbb; 192 193 /** 194 * BLKIF_OP code for this request. 195 */ 196 int operation; 197 198 /** 199 * Set to BLKIF_RSP_* to indicate request status. 200 * 201 * This field allows an error status to be recorded even if the 202 * delivery of this status must be deferred. Deferred reporting 203 * is necessary, for example, when an error is detected during 204 * completion processing of one bio when other bios for this 205 * request are still outstanding. 206 */ 207 int status; 208 209 /** 210 * Number of 512 byte sectors not transferred. 211 */ 212 int residual_512b_sectors; 213 214 /** 215 * Starting sector number of the first request in the list. 216 */ 217 off_t starting_sector_number; 218 219 /** 220 * If we're going to coalesce, the next contiguous sector would be 221 * this one. 222 */ 223 off_t next_contig_sector; 224 225 /** 226 * Number of child requests in the list. 227 */ 228 int num_children; 229 230 /** 231 * Number of I/O requests dispatched to the backend. 232 */ 233 int pendcnt; 234 235 /** 236 * Total number of segments for requests in the list. 237 */ 238 int nr_segments; 239 240 /** 241 * Flags for this particular request list. 242 */ 243 xbb_reqlist_flags flags; 244 245 /** 246 * Kernel virtual address space reserved for this request 247 * list structure and used to map the remote domain's pages for 248 * this I/O, into our domain's address space. 249 */ 250 uint8_t *kva; 251 252 /** 253 * Base, psuedo-physical address, corresponding to the start 254 * of this request's kva region. 255 */ 256 uint64_t gnt_base; 257 258 259 #ifdef XBB_USE_BOUNCE_BUFFERS 260 /** 261 * Pre-allocated domain local memory used to proxy remote 262 * domain memory during I/O operations. 263 */ 264 uint8_t *bounce; 265 #endif 266 267 /** 268 * Array of grant handles (one per page) used to map this request. 269 */ 270 grant_handle_t *gnt_handles; 271 272 /** 273 * Device statistics request ordering type (ordered or simple). 274 */ 275 devstat_tag_type ds_tag_type; 276 277 /** 278 * Device statistics request type (read, write, no_data). 279 */ 280 devstat_trans_flags ds_trans_type; 281 282 /** 283 * The start time for this request. 284 */ 285 struct bintime ds_t0; 286 287 /** 288 * Linked list of contiguous requests with the same operation type. 289 */ 290 struct xbb_xen_req_list contig_req_list; 291 292 /** 293 * Linked list links used to aggregate idle requests in the 294 * request list free pool (xbb->reqlist_free_stailq) and pending 295 * requests waiting for execution (xbb->reqlist_pending_stailq). 296 */ 297 STAILQ_ENTRY(xbb_xen_reqlist) links; 298 }; 299 300 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 301 302 /** 303 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 304 */ 305 struct xbb_xen_req { 306 /** 307 * Linked list links used to aggregate requests into a reqlist 308 * and to store them in the request free pool. 
309 */ 310 STAILQ_ENTRY(xbb_xen_req) links; 311 312 /** 313 * The remote domain's identifier for this I/O request. 314 */ 315 uint64_t id; 316 317 /** 318 * The number of pages currently mapped for this request. 319 */ 320 int nr_pages; 321 322 /** 323 * The number of 512 byte sectors comprising this requests. 324 */ 325 int nr_512b_sectors; 326 327 /** 328 * The number of struct bio requests still outstanding for this 329 * request on the backend device. This field is only used for 330 * device (rather than file) backed I/O. 331 */ 332 int pendcnt; 333 334 /** 335 * BLKIF_OP code for this request. 336 */ 337 int operation; 338 339 /** 340 * Storage used for non-native ring requests. 341 */ 342 blkif_request_t ring_req_storage; 343 344 /** 345 * Pointer to the Xen request in the ring. 346 */ 347 blkif_request_t *ring_req; 348 349 /** 350 * Consumer index for this request. 351 */ 352 RING_IDX req_ring_idx; 353 354 /** 355 * The start time for this request. 356 */ 357 struct bintime ds_t0; 358 359 /** 360 * Pointer back to our parent request list. 361 */ 362 struct xbb_xen_reqlist *reqlist; 363 }; 364 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 365 366 /** 367 * \brief Configuration data for the shared memory request ring 368 * used to communicate with the front-end client of this 369 * this driver. 370 */ 371 struct xbb_ring_config { 372 /** KVA address where ring memory is mapped. */ 373 vm_offset_t va; 374 375 /** The pseudo-physical address where ring memory is mapped.*/ 376 uint64_t gnt_addr; 377 378 /** 379 * Grant table handles, one per-ring page, returned by the 380 * hyperpervisor upon mapping of the ring and required to 381 * unmap it when a connection is torn down. 382 */ 383 grant_handle_t handle[XBB_MAX_RING_PAGES]; 384 385 /** 386 * The device bus address returned by the hypervisor when 387 * mapping the ring and required to unmap it when a connection 388 * is torn down. 389 */ 390 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 391 392 /** The number of ring pages mapped for the current connection. */ 393 u_int ring_pages; 394 395 /** 396 * The grant references, one per-ring page, supplied by the 397 * front-end, allowing us to reference the ring pages in the 398 * front-end's domain and to map these pages into our own domain. 399 */ 400 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 401 402 /** The interrupt driven even channel used to signal ring events. */ 403 evtchn_port_t evtchn; 404 }; 405 406 /** 407 * Per-instance connection state flags. 408 */ 409 typedef enum 410 { 411 /** 412 * The front-end requested a read-only mount of the 413 * back-end device/file. 414 */ 415 XBBF_READ_ONLY = 0x01, 416 417 /** Communication with the front-end has been established. */ 418 XBBF_RING_CONNECTED = 0x02, 419 420 /** 421 * Front-end requests exist in the ring and are waiting for 422 * xbb_xen_req objects to free up. 423 */ 424 XBBF_RESOURCE_SHORTAGE = 0x04, 425 426 /** Connection teardown in progress. */ 427 XBBF_SHUTDOWN = 0x08, 428 429 /** A thread is already performing shutdown processing. */ 430 XBBF_IN_SHUTDOWN = 0x10 431 } xbb_flag_t; 432 433 /** Backend device type. */ 434 typedef enum { 435 /** Backend type unknown. */ 436 XBB_TYPE_NONE = 0x00, 437 438 /** 439 * Backend type disk (access via cdev switch 440 * strategy routine). 441 */ 442 XBB_TYPE_DISK = 0x01, 443 444 /** Backend type file (access vnode operations.). */ 445 XBB_TYPE_FILE = 0x02 446 } xbb_type; 447 448 /** 449 * \brief Structure used to memoize information about a per-request 450 * scatter-gather list. 
451 * 452 * The chief benefit of using this data structure is it avoids having 453 * to reparse the possibly discontiguous S/G list in the original 454 * request. Due to the way that the mapping of the memory backing an 455 * I/O transaction is handled by Xen, a second pass is unavoidable. 456 * At least this way the second walk is a simple array traversal. 457 * 458 * \note A single Scatter/Gather element in the block interface covers 459 * at most 1 machine page. In this context a sector (blkif 460 * nomenclature, not what I'd choose) is a 512b aligned unit 461 * of mapping within the machine page referenced by an S/G 462 * element. 463 */ 464 struct xbb_sg { 465 /** The number of 512b data chunks mapped in this S/G element. */ 466 int16_t nsect; 467 468 /** 469 * The index (0 based) of the first 512b data chunk mapped 470 * in this S/G element. 471 */ 472 uint8_t first_sect; 473 474 /** 475 * The index (0 based) of the last 512b data chunk mapped 476 * in this S/G element. 477 */ 478 uint8_t last_sect; 479 }; 480 481 /** 482 * Character device backend specific configuration data. 483 */ 484 struct xbb_dev_data { 485 /** Cdev used for device backend access. */ 486 struct cdev *cdev; 487 488 /** Cdev switch used for device backend access. */ 489 struct cdevsw *csw; 490 491 /** Used to hold a reference on opened cdev backend devices. */ 492 int dev_ref; 493 }; 494 495 /** 496 * File backend specific configuration data. 497 */ 498 struct xbb_file_data { 499 /** Credentials to use for vnode backed (file based) I/O. */ 500 struct ucred *cred; 501 502 /** 503 * \brief Array of io vectors used to process file based I/O. 504 * 505 * Only a single file based request is outstanding per-xbb instance, 506 * so we only need one of these. 507 */ 508 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 509 #ifdef XBB_USE_BOUNCE_BUFFERS 510 511 /** 512 * \brief Array of io vectors used to handle bouncing of file reads. 513 * 514 * Vnode operations are free to modify uio data during their 515 * exectuion. In the case of a read with bounce buffering active, 516 * we need some of the data from the original uio in order to 517 * bounce-out the read data. This array serves as the temporary 518 * storage for this saved data. 519 */ 520 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 521 522 /** 523 * \brief Array of memoized bounce buffer kva offsets used 524 * in the file based backend. 525 * 526 * Due to the way that the mapping of the memory backing an 527 * I/O transaction is handled by Xen, a second pass through 528 * the request sg elements is unavoidable. We memoize the computed 529 * bounce address here to reduce the cost of the second walk. 530 */ 531 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 532 #endif /* XBB_USE_BOUNCE_BUFFERS */ 533 }; 534 535 /** 536 * Collection of backend type specific data. 537 */ 538 union xbb_backend_data { 539 struct xbb_dev_data dev; 540 struct xbb_file_data file; 541 }; 542 543 /** 544 * Function signature of backend specific I/O handlers. 545 */ 546 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 547 struct xbb_xen_reqlist *reqlist, int operation, 548 int flags); 549 550 /** 551 * Per-instance configuration data. 552 */ 553 struct xbb_softc { 554 555 /** 556 * Task-queue used to process I/O requests. 557 */ 558 struct taskqueue *io_taskqueue; 559 560 /** 561 * Single "run the request queue" task enqueued 562 * on io_taskqueue. 563 */ 564 struct task io_task; 565 566 /** Device type for this instance. 
*/ 567 xbb_type device_type; 568 569 /** NewBus device corresponding to this instance. */ 570 device_t dev; 571 572 /** Backend specific dispatch routine for this instance. */ 573 xbb_dispatch_t dispatch_io; 574 575 /** The number of requests outstanding on the backend device/file. */ 576 int active_request_count; 577 578 /** Free pool of request tracking structures. */ 579 struct xbb_xen_req_list request_free_stailq; 580 581 /** Array, sized at connection time, of request tracking structures. */ 582 struct xbb_xen_req *requests; 583 584 /** Free pool of request list structures. */ 585 struct xbb_xen_reqlist_list reqlist_free_stailq; 586 587 /** List of pending request lists awaiting execution. */ 588 struct xbb_xen_reqlist_list reqlist_pending_stailq; 589 590 /** Array, sized at connection time, of request list structures. */ 591 struct xbb_xen_reqlist *request_lists; 592 593 /** 594 * Global pool of kva used for mapping remote domain ring 595 * and I/O transaction data. 596 */ 597 vm_offset_t kva; 598 599 /** Psuedo-physical address corresponding to kva. */ 600 uint64_t gnt_base_addr; 601 602 /** The size of the global kva pool. */ 603 int kva_size; 604 605 /** The size of the KVA area used for request lists. */ 606 int reqlist_kva_size; 607 608 /** The number of pages of KVA used for request lists */ 609 int reqlist_kva_pages; 610 611 /** Bitmap of free KVA pages */ 612 bitstr_t *kva_free; 613 614 /** 615 * \brief Cached value of the front-end's domain id. 616 * 617 * This value is used at once for each mapped page in 618 * a transaction. We cache it to avoid incuring the 619 * cost of an ivar access every time this is needed. 620 */ 621 domid_t otherend_id; 622 623 /** 624 * \brief The blkif protocol abi in effect. 625 * 626 * There are situations where the back and front ends can 627 * have a different, native abi (e.g. intel x86_64 and 628 * 32bit x86 domains on the same machine). The back-end 629 * always accomodates the front-end's native abi. That 630 * value is pulled from the XenStore and recorded here. 631 */ 632 int abi; 633 634 /** 635 * \brief The maximum number of requests and request lists allowed 636 * to be in flight at a time. 637 * 638 * This value is negotiated via the XenStore. 639 */ 640 u_int max_requests; 641 642 /** 643 * \brief The maximum number of segments (1 page per segment) 644 * that can be mapped by a request. 645 * 646 * This value is negotiated via the XenStore. 647 */ 648 u_int max_request_segments; 649 650 /** 651 * \brief Maximum number of segments per request list. 652 * 653 * This value is derived from and will generally be larger than 654 * max_request_segments. 655 */ 656 u_int max_reqlist_segments; 657 658 /** 659 * The maximum size of any request to this back-end 660 * device. 661 * 662 * This value is negotiated via the XenStore. 663 */ 664 u_int max_request_size; 665 666 /** 667 * The maximum size of any request list. This is derived directly 668 * from max_reqlist_segments. 669 */ 670 u_int max_reqlist_size; 671 672 /** Various configuration and state bit flags. */ 673 xbb_flag_t flags; 674 675 /** Ring mapping and interrupt configuration data. */ 676 struct xbb_ring_config ring_config; 677 678 /** Runtime, cross-abi safe, structures for ring access. */ 679 blkif_back_rings_t rings; 680 681 /** IRQ mapping for the communication ring event channel. */ 682 int irq; 683 684 /** 685 * \brief Backend access mode flags (e.g. write, or read-only). 686 * 687 * This value is passed to us by the front-end via the XenStore. 
688 */ 689 char *dev_mode; 690 691 /** 692 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 693 * 694 * This value is passed to us by the front-end via the XenStore. 695 * Currently unused. 696 */ 697 char *dev_type; 698 699 /** 700 * \brief Backend device/file identifier. 701 * 702 * This value is passed to us by the front-end via the XenStore. 703 * We expect this to be a POSIX path indicating the file or 704 * device to open. 705 */ 706 char *dev_name; 707 708 /** 709 * Vnode corresponding to the backend device node or file 710 * we are acessing. 711 */ 712 struct vnode *vn; 713 714 union xbb_backend_data backend; 715 716 /** The native sector size of the backend. */ 717 u_int sector_size; 718 719 /** log2 of sector_size. */ 720 u_int sector_size_shift; 721 722 /** Size in bytes of the backend device or file. */ 723 off_t media_size; 724 725 /** 726 * \brief media_size expressed in terms of the backend native 727 * sector size. 728 * 729 * (e.g. xbb->media_size >> xbb->sector_size_shift). 730 */ 731 uint64_t media_num_sectors; 732 733 /** 734 * \brief Array of memoized scatter gather data computed during the 735 * conversion of blkif ring requests to internal xbb_xen_req 736 * structures. 737 * 738 * Ring processing is serialized so we only need one of these. 739 */ 740 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 741 742 /** 743 * Temporary grant table map used in xbb_dispatch_io(). When 744 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 745 * stack could cause a stack overflow. 746 */ 747 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 748 749 /** Mutex protecting per-instance data. */ 750 struct mtx lock; 751 752 #ifdef XENHVM 753 /** 754 * Resource representing allocated physical address space 755 * associated with our per-instance kva region. 756 */ 757 struct resource *pseudo_phys_res; 758 759 /** Resource id for allocated physical address space. */ 760 int pseudo_phys_res_id; 761 #endif 762 763 /** 764 * I/O statistics from BlockBack dispatch down. These are 765 * coalesced requests, and we start them right before execution. 766 */ 767 struct devstat *xbb_stats; 768 769 /** 770 * I/O statistics coming into BlockBack. These are the requests as 771 * we get them from BlockFront. They are started as soon as we 772 * receive a request, and completed when the I/O is complete. 773 */ 774 struct devstat *xbb_stats_in; 775 776 /** Disable sending flush to the backend */ 777 int disable_flush; 778 779 /** Send a real flush for every N flush requests */ 780 int flush_interval; 781 782 /** Count of flush requests in the interval */ 783 int flush_count; 784 785 /** Don't coalesce requests if this is set */ 786 int no_coalesce_reqs; 787 788 /** Number of requests we have received */ 789 uint64_t reqs_received; 790 791 /** Number of requests we have completed*/ 792 uint64_t reqs_completed; 793 794 /** How many forced dispatches (i.e. without coalescing) have happend */ 795 uint64_t forced_dispatch; 796 797 /** How many normal dispatches have happend */ 798 uint64_t normal_dispatch; 799 800 /** How many total dispatches have happend */ 801 uint64_t total_dispatch; 802 803 /** How many times we have run out of KVA */ 804 uint64_t kva_shortages; 805 806 /** How many times we have run out of request structures */ 807 uint64_t request_shortages; 808 }; 809 810 /*---------------------------- Request Processing ----------------------------*/ 811 /** 812 * Allocate an internal transaction tracking structure from the free pool. 
813 * 814 * \param xbb Per-instance xbb configuration structure. 815 * 816 * \return On success, a pointer to the allocated xbb_xen_req structure. 817 * Otherwise NULL. 818 */ 819 static inline struct xbb_xen_req * 820 xbb_get_req(struct xbb_softc *xbb) 821 { 822 struct xbb_xen_req *req; 823 824 req = NULL; 825 826 mtx_assert(&xbb->lock, MA_OWNED); 827 828 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 829 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 830 xbb->active_request_count++; 831 } 832 833 return (req); 834 } 835 836 /** 837 * Return an allocated transaction tracking structure to the free pool. 838 * 839 * \param xbb Per-instance xbb configuration structure. 840 * \param req The request structure to free. 841 */ 842 static inline void 843 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 844 { 845 mtx_assert(&xbb->lock, MA_OWNED); 846 847 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 848 xbb->active_request_count--; 849 850 KASSERT(xbb->active_request_count >= 0, 851 ("xbb_release_req: negative active count")); 852 } 853 854 /** 855 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 856 * 857 * \param xbb Per-instance xbb configuration structure. 858 * \param req_list The list of requests to free. 859 * \param nreqs The number of items in the list. 860 */ 861 static inline void 862 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 863 int nreqs) 864 { 865 mtx_assert(&xbb->lock, MA_OWNED); 866 867 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 868 xbb->active_request_count -= nreqs; 869 870 KASSERT(xbb->active_request_count >= 0, 871 ("xbb_release_reqs: negative active count")); 872 } 873 874 /** 875 * Given a page index and 512b sector offset within that page, 876 * calculate an offset into a request's kva region. 877 * 878 * \param reqlist The request structure whose kva region will be accessed. 879 * \param pagenr The page index used to compute the kva offset. 880 * \param sector The 512b sector index used to compute the page relative 881 * kva offset. 882 * 883 * \return The computed global KVA offset. 884 */ 885 static inline uint8_t * 886 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 887 { 888 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 889 } 890 891 #ifdef XBB_USE_BOUNCE_BUFFERS 892 /** 893 * Given a page index and 512b sector offset within that page, 894 * calculate an offset into a request's local bounce memory region. 895 * 896 * \param reqlist The request structure whose bounce region will be accessed. 897 * \param pagenr The page index used to compute the bounce offset. 898 * \param sector The 512b sector index used to compute the page relative 899 * bounce offset. 900 * 901 * \return The computed global bounce buffer address. 902 */ 903 static inline uint8_t * 904 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 905 { 906 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 907 } 908 #endif 909 910 /** 911 * Given a page number and 512b sector offset within that page, 912 * calculate an offset into the request's memory region that the 913 * underlying backend device/file should use for I/O. 914 * 915 * \param reqlist The request structure whose I/O region will be accessed. 916 * \param pagenr The page index used to compute the I/O offset. 917 * \param sector The 512b sector index used to compute the page relative 918 * I/O offset. 
919 * 920 * \return The computed global I/O address. 921 * 922 * Depending on configuration, this will either be a local bounce buffer 923 * or a pointer to the memory mapped in from the front-end domain for 924 * this request. 925 */ 926 static inline uint8_t * 927 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 928 { 929 #ifdef XBB_USE_BOUNCE_BUFFERS 930 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 931 #else 932 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 933 #endif 934 } 935 936 /** 937 * Given a page index and 512b sector offset within that page, calculate 938 * an offset into the local psuedo-physical address space used to map a 939 * front-end's request data into a request. 940 * 941 * \param reqlist The request list structure whose pseudo-physical region 942 * will be accessed. 943 * \param pagenr The page index used to compute the pseudo-physical offset. 944 * \param sector The 512b sector index used to compute the page relative 945 * pseudo-physical offset. 946 * 947 * \return The computed global pseudo-phsyical address. 948 * 949 * Depending on configuration, this will either be a local bounce buffer 950 * or a pointer to the memory mapped in from the front-end domain for 951 * this request. 952 */ 953 static inline uintptr_t 954 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 955 { 956 struct xbb_softc *xbb; 957 958 xbb = reqlist->xbb; 959 960 return ((uintptr_t)(xbb->gnt_base_addr + 961 (uintptr_t)(reqlist->kva - xbb->kva) + 962 (PAGE_SIZE * pagenr) + (sector << 9))); 963 } 964 965 /** 966 * Get Kernel Virtual Address space for mapping requests. 967 * 968 * \param xbb Per-instance xbb configuration structure. 969 * \param nr_pages Number of pages needed. 970 * \param check_only If set, check for free KVA but don't allocate it. 971 * \param have_lock If set, xbb lock is already held. 972 * 973 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 974 * 975 * Note: This should be unnecessary once we have either chaining or 976 * scatter/gather support for struct bio. At that point we'll be able to 977 * put multiple addresses and lengths in one bio/bio chain and won't need 978 * to map everything into one virtual segment. 979 */ 980 static uint8_t * 981 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 982 { 983 intptr_t first_clear, num_clear; 984 uint8_t *free_kva; 985 int i; 986 987 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 988 989 first_clear = 0; 990 free_kva = NULL; 991 992 mtx_lock(&xbb->lock); 993 994 /* 995 * Look for the first available page. If there are none, we're done. 996 */ 997 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 998 999 if (first_clear == -1) 1000 goto bailout; 1001 1002 /* 1003 * Starting at the first available page, look for consecutive free 1004 * pages that will satisfy the user's request. 1005 */ 1006 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1007 /* 1008 * If this is true, the page is used, so we have to reset 1009 * the number of clear pages and the first clear page 1010 * (since it pointed to a region with an insufficient number 1011 * of clear pages). 1012 */ 1013 if (bit_test(xbb->kva_free, i)) { 1014 num_clear = 0; 1015 first_clear = -1; 1016 continue; 1017 } 1018 1019 if (first_clear == -1) 1020 first_clear = i; 1021 1022 /* 1023 * If this is true, we've found a large enough free region 1024 * to satisfy the request. 
1025 */ 1026 if (++num_clear == nr_pages) { 1027 1028 bit_nset(xbb->kva_free, first_clear, 1029 first_clear + nr_pages - 1); 1030 1031 free_kva = xbb->kva + 1032 (uint8_t *)(first_clear * PAGE_SIZE); 1033 1034 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1035 free_kva + (nr_pages * PAGE_SIZE) <= 1036 (uint8_t *)xbb->ring_config.va, 1037 ("Free KVA %p len %d out of range, " 1038 "kva = %#jx, ring VA = %#jx\n", free_kva, 1039 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1040 (uintmax_t)xbb->ring_config.va)); 1041 break; 1042 } 1043 } 1044 1045 bailout: 1046 1047 if (free_kva == NULL) { 1048 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1049 xbb->kva_shortages++; 1050 } 1051 1052 mtx_unlock(&xbb->lock); 1053 1054 return (free_kva); 1055 } 1056 1057 /** 1058 * Free allocated KVA. 1059 * 1060 * \param xbb Per-instance xbb configuration structure. 1061 * \param kva_ptr Pointer to allocated KVA region. 1062 * \param nr_pages Number of pages in the KVA region. 1063 */ 1064 static void 1065 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1066 { 1067 intptr_t start_page; 1068 1069 mtx_assert(&xbb->lock, MA_OWNED); 1070 1071 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1072 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1073 1074 } 1075 1076 /** 1077 * Unmap the front-end pages associated with this I/O request. 1078 * 1079 * \param req The request structure to unmap. 1080 */ 1081 static void 1082 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1083 { 1084 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1085 u_int i; 1086 u_int invcount; 1087 int error; 1088 1089 invcount = 0; 1090 for (i = 0; i < reqlist->nr_segments; i++) { 1091 1092 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1093 continue; 1094 1095 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1096 unmap[invcount].dev_bus_addr = 0; 1097 unmap[invcount].handle = reqlist->gnt_handles[i]; 1098 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1099 invcount++; 1100 } 1101 1102 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1103 unmap, invcount); 1104 KASSERT(error == 0, ("Grant table operation failed")); 1105 } 1106 1107 /** 1108 * Allocate an internal transaction tracking structure from the free pool. 1109 * 1110 * \param xbb Per-instance xbb configuration structure. 1111 * 1112 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1113 * Otherwise NULL. 1114 */ 1115 static inline struct xbb_xen_reqlist * 1116 xbb_get_reqlist(struct xbb_softc *xbb) 1117 { 1118 struct xbb_xen_reqlist *reqlist; 1119 1120 reqlist = NULL; 1121 1122 mtx_assert(&xbb->lock, MA_OWNED); 1123 1124 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1125 1126 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1127 reqlist->flags = XBB_REQLIST_NONE; 1128 reqlist->kva = NULL; 1129 reqlist->status = BLKIF_RSP_OKAY; 1130 reqlist->residual_512b_sectors = 0; 1131 reqlist->num_children = 0; 1132 reqlist->nr_segments = 0; 1133 STAILQ_INIT(&reqlist->contig_req_list); 1134 } 1135 1136 return (reqlist); 1137 } 1138 1139 /** 1140 * Return an allocated transaction tracking structure to the free pool. 1141 * 1142 * \param xbb Per-instance xbb configuration structure. 1143 * \param req The request list structure to free. 1144 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1145 * during a resource shortage condition. 
1146 */ 1147 static inline void 1148 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1149 int wakeup) 1150 { 1151 1152 mtx_lock(&xbb->lock); 1153 1154 if (wakeup) { 1155 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1156 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1157 } 1158 1159 if (reqlist->kva != NULL) 1160 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1161 1162 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1163 1164 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1165 1166 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1167 /* 1168 * Shutdown is in progress. See if we can 1169 * progress further now that one more request 1170 * has completed and been returned to the 1171 * free pool. 1172 */ 1173 xbb_shutdown(xbb); 1174 } 1175 1176 mtx_unlock(&xbb->lock); 1177 1178 if (wakeup != 0) 1179 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1180 } 1181 1182 /** 1183 * Request resources and do basic request setup. 1184 * 1185 * \param xbb Per-instance xbb configuration structure. 1186 * \param reqlist Pointer to reqlist pointer. 1187 * \param ring_req Pointer to a block ring request. 1188 * \param ring_index The ring index of this request. 1189 * 1190 * \return 0 for success, non-zero for failure. 1191 */ 1192 static int 1193 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1194 blkif_request_t *ring_req, RING_IDX ring_idx) 1195 { 1196 struct xbb_xen_reqlist *nreqlist; 1197 struct xbb_xen_req *nreq; 1198 1199 nreqlist = NULL; 1200 nreq = NULL; 1201 1202 mtx_lock(&xbb->lock); 1203 1204 /* 1205 * We don't allow new resources to be allocated if we're in the 1206 * process of shutting down. 1207 */ 1208 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1209 mtx_unlock(&xbb->lock); 1210 return (1); 1211 } 1212 1213 /* 1214 * Allocate a reqlist if the caller doesn't have one already. 1215 */ 1216 if (*reqlist == NULL) { 1217 nreqlist = xbb_get_reqlist(xbb); 1218 if (nreqlist == NULL) 1219 goto bailout_error; 1220 } 1221 1222 /* We always allocate a request. */ 1223 nreq = xbb_get_req(xbb); 1224 if (nreq == NULL) 1225 goto bailout_error; 1226 1227 mtx_unlock(&xbb->lock); 1228 1229 if (*reqlist == NULL) { 1230 *reqlist = nreqlist; 1231 nreqlist->operation = ring_req->operation; 1232 nreqlist->starting_sector_number = ring_req->sector_number; 1233 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1234 links); 1235 } 1236 1237 nreq->reqlist = *reqlist; 1238 nreq->req_ring_idx = ring_idx; 1239 1240 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1241 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1242 nreq->ring_req = &nreq->ring_req_storage; 1243 } else { 1244 nreq->ring_req = ring_req; 1245 } 1246 1247 binuptime(&nreq->ds_t0); 1248 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1249 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1250 (*reqlist)->num_children++; 1251 (*reqlist)->nr_segments += ring_req->nr_segments; 1252 1253 return (0); 1254 1255 bailout_error: 1256 1257 /* 1258 * We're out of resources, so set the shortage flag. The next time 1259 * a request is released, we'll try waking up the work thread to 1260 * see if we can allocate more resources. 
1261 */ 1262 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1263 xbb->request_shortages++; 1264 1265 if (nreq != NULL) 1266 xbb_release_req(xbb, nreq); 1267 1268 mtx_unlock(&xbb->lock); 1269 1270 if (nreqlist != NULL) 1271 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1272 1273 return (1); 1274 } 1275 1276 /** 1277 * Create and transmit a response to a blkif request. 1278 * 1279 * \param xbb Per-instance xbb configuration structure. 1280 * \param req The request structure to which to respond. 1281 * \param status The status code to report. See BLKIF_RSP_* 1282 * in sys/xen/interface/io/blkif.h. 1283 */ 1284 static void 1285 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1286 { 1287 blkif_response_t *resp; 1288 int more_to_do; 1289 int notify; 1290 1291 more_to_do = 0; 1292 1293 /* 1294 * Place on the response ring for the relevant domain. 1295 * For now, only the spacing between entries is different 1296 * in the different ABIs, not the response entry layout. 1297 */ 1298 mtx_lock(&xbb->lock); 1299 switch (xbb->abi) { 1300 case BLKIF_PROTOCOL_NATIVE: 1301 resp = RING_GET_RESPONSE(&xbb->rings.native, 1302 xbb->rings.native.rsp_prod_pvt); 1303 break; 1304 case BLKIF_PROTOCOL_X86_32: 1305 resp = (blkif_response_t *) 1306 RING_GET_RESPONSE(&xbb->rings.x86_32, 1307 xbb->rings.x86_32.rsp_prod_pvt); 1308 break; 1309 case BLKIF_PROTOCOL_X86_64: 1310 resp = (blkif_response_t *) 1311 RING_GET_RESPONSE(&xbb->rings.x86_64, 1312 xbb->rings.x86_64.rsp_prod_pvt); 1313 break; 1314 default: 1315 panic("Unexpected blkif protocol ABI."); 1316 } 1317 1318 resp->id = req->id; 1319 resp->operation = req->operation; 1320 resp->status = status; 1321 1322 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); 1323 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); 1324 1325 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1326 1327 /* 1328 * Tail check for pending requests. Allows frontend to avoid 1329 * notifications if requests are already in flight (lower 1330 * overheads and promotes batching). 1331 */ 1332 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1333 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1334 1335 more_to_do = 1; 1336 } 1337 1338 xbb->reqs_completed++; 1339 1340 mtx_unlock(&xbb->lock); 1341 1342 if (more_to_do) 1343 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1344 1345 if (notify) 1346 notify_remote_via_irq(xbb->irq); 1347 } 1348 1349 /** 1350 * Complete a request list. 1351 * 1352 * \param xbb Per-instance xbb configuration structure. 1353 * \param reqlist Allocated internal request list structure. 1354 */ 1355 static void 1356 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1357 { 1358 struct xbb_xen_req *nreq; 1359 off_t sectors_sent; 1360 1361 sectors_sent = 0; 1362 1363 if (reqlist->flags & XBB_REQLIST_MAPPED) 1364 xbb_unmap_reqlist(reqlist); 1365 1366 /* 1367 * All I/O is done, send the response. A lock should not be 1368 * necessary here because the request list is complete, and 1369 * therefore this is the only context accessing this request 1370 * right now. The functions we call do their own locking if 1371 * necessary. 1372 */ 1373 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1374 off_t cur_sectors_sent; 1375 1376 xbb_send_response(xbb, nreq, reqlist->status); 1377 1378 /* We don't report bytes sent if there is an error. 
*/ 1379 if (reqlist->status == BLKIF_RSP_OKAY) 1380 cur_sectors_sent = nreq->nr_512b_sectors; 1381 else 1382 cur_sectors_sent = 0; 1383 1384 sectors_sent += cur_sectors_sent; 1385 1386 devstat_end_transaction(xbb->xbb_stats_in, 1387 /*bytes*/cur_sectors_sent << 9, 1388 reqlist->ds_tag_type, 1389 reqlist->ds_trans_type, 1390 /*now*/NULL, 1391 /*then*/&nreq->ds_t0); 1392 } 1393 1394 /* 1395 * Take out any sectors not sent. If we wind up negative (which 1396 * might happen if an error is reported as well as a residual), just 1397 * report 0 sectors sent. 1398 */ 1399 sectors_sent -= reqlist->residual_512b_sectors; 1400 if (sectors_sent < 0) 1401 sectors_sent = 0; 1402 1403 devstat_end_transaction(xbb->xbb_stats, 1404 /*bytes*/ sectors_sent << 9, 1405 reqlist->ds_tag_type, 1406 reqlist->ds_trans_type, 1407 /*now*/NULL, 1408 /*then*/&reqlist->ds_t0); 1409 1410 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1411 } 1412 1413 /** 1414 * Completion handler for buffer I/O requests issued by the device 1415 * backend driver. 1416 * 1417 * \param bio The buffer I/O request on which to perform completion 1418 * processing. 1419 */ 1420 static void 1421 xbb_bio_done(struct bio *bio) 1422 { 1423 struct xbb_softc *xbb; 1424 struct xbb_xen_reqlist *reqlist; 1425 1426 reqlist = bio->bio_caller1; 1427 xbb = reqlist->xbb; 1428 1429 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1430 1431 /* 1432 * This is a bit imprecise. With aggregated I/O a single 1433 * request list can contain multiple front-end requests and 1434 * a multiple bios may point to a single request. By carefully 1435 * walking the request list, we could map residuals and errors 1436 * back to the original front-end request, but the interface 1437 * isn't sufficiently rich for us to properly report the error. 1438 * So, we just treat the entire request list as having failed if an 1439 * error occurs on any part. And, if an error occurs, we treat 1440 * the amount of data transferred as 0. 1441 * 1442 * For residuals, we report it on the overall aggregated device, 1443 * but not on the individual requests, since we don't currently 1444 * do the work to determine which front-end request to which the 1445 * residual applies. 1446 */ 1447 if (bio->bio_error) { 1448 DPRINTF("BIO returned error %d for operation on device %s\n", 1449 bio->bio_error, xbb->dev_name); 1450 reqlist->status = BLKIF_RSP_ERROR; 1451 1452 if (bio->bio_error == ENXIO 1453 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1454 1455 /* 1456 * Backend device has disappeared. Signal the 1457 * front-end that we (the device proxy) want to 1458 * go away. 1459 */ 1460 xenbus_set_state(xbb->dev, XenbusStateClosing); 1461 } 1462 } 1463 1464 #ifdef XBB_USE_BOUNCE_BUFFERS 1465 if (bio->bio_cmd == BIO_READ) { 1466 vm_offset_t kva_offset; 1467 1468 kva_offset = (vm_offset_t)bio->bio_data 1469 - (vm_offset_t)reqlist->bounce; 1470 memcpy((uint8_t *)reqlist->kva + kva_offset, 1471 bio->bio_data, bio->bio_bcount); 1472 } 1473 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1474 1475 /* 1476 * Decrement the pending count for the request list. When we're 1477 * done with the requests, send status back for all of them. 1478 */ 1479 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1480 xbb_complete_reqlist(xbb, reqlist); 1481 1482 g_destroy_bio(bio); 1483 } 1484 1485 /** 1486 * Parse a blkif request into an internal request structure and send 1487 * it to the backend for processing. 1488 * 1489 * \param xbb Per-instance xbb configuration structure. 
1490 * \param reqlist Allocated internal request list structure. 1491 * 1492 * \return On success, 0. For resource shortages, non-zero. 1493 * 1494 * This routine performs the backend common aspects of request parsing 1495 * including compiling an internal request structure, parsing the S/G 1496 * list and any secondary ring requests in which they may reside, and 1497 * the mapping of front-end I/O pages into our domain. 1498 */ 1499 static int 1500 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1501 { 1502 struct xbb_sg *xbb_sg; 1503 struct gnttab_map_grant_ref *map; 1504 struct blkif_request_segment *sg; 1505 struct blkif_request_segment *last_block_sg; 1506 struct xbb_xen_req *nreq; 1507 u_int nseg; 1508 u_int seg_idx; 1509 u_int block_segs; 1510 int nr_sects; 1511 int total_sects; 1512 int operation; 1513 uint8_t bio_flags; 1514 int error; 1515 1516 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1517 bio_flags = 0; 1518 total_sects = 0; 1519 nr_sects = 0; 1520 1521 /* 1522 * First determine whether we have enough free KVA to satisfy this 1523 * request list. If not, tell xbb_run_queue() so it can go to 1524 * sleep until we have more KVA. 1525 */ 1526 reqlist->kva = NULL; 1527 if (reqlist->nr_segments != 0) { 1528 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1529 if (reqlist->kva == NULL) { 1530 /* 1531 * If we're out of KVA, return ENOMEM. 1532 */ 1533 return (ENOMEM); 1534 } 1535 } 1536 1537 binuptime(&reqlist->ds_t0); 1538 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1539 1540 switch (reqlist->operation) { 1541 case BLKIF_OP_WRITE_BARRIER: 1542 bio_flags |= BIO_ORDERED; 1543 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1544 /* FALLTHROUGH */ 1545 case BLKIF_OP_WRITE: 1546 operation = BIO_WRITE; 1547 reqlist->ds_trans_type = DEVSTAT_WRITE; 1548 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1549 DPRINTF("Attempt to write to read only device %s\n", 1550 xbb->dev_name); 1551 reqlist->status = BLKIF_RSP_ERROR; 1552 goto send_response; 1553 } 1554 break; 1555 case BLKIF_OP_READ: 1556 operation = BIO_READ; 1557 reqlist->ds_trans_type = DEVSTAT_READ; 1558 break; 1559 case BLKIF_OP_FLUSH_DISKCACHE: 1560 /* 1561 * If this is true, the user has requested that we disable 1562 * flush support. So we just complete the requests 1563 * successfully. 1564 */ 1565 if (xbb->disable_flush != 0) { 1566 goto send_response; 1567 } 1568 1569 /* 1570 * The user has requested that we only send a real flush 1571 * for every N flush requests. So keep count, and either 1572 * complete the request immediately or queue it for the 1573 * backend. 
1574 */ 1575 if (xbb->flush_interval != 0) { 1576 if (++(xbb->flush_count) < xbb->flush_interval) { 1577 goto send_response; 1578 } else 1579 xbb->flush_count = 0; 1580 } 1581 1582 operation = BIO_FLUSH; 1583 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1584 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1585 goto do_dispatch; 1586 /*NOTREACHED*/ 1587 default: 1588 DPRINTF("error: unknown block io operation [%d]\n", 1589 reqlist->operation); 1590 reqlist->status = BLKIF_RSP_ERROR; 1591 goto send_response; 1592 } 1593 1594 reqlist->xbb = xbb; 1595 xbb_sg = xbb->xbb_sgs; 1596 map = xbb->maps; 1597 seg_idx = 0; 1598 1599 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1600 blkif_request_t *ring_req; 1601 RING_IDX req_ring_idx; 1602 u_int req_seg_idx; 1603 1604 ring_req = nreq->ring_req; 1605 req_ring_idx = nreq->req_ring_idx; 1606 nr_sects = 0; 1607 nseg = ring_req->nr_segments; 1608 nreq->id = ring_req->id; 1609 nreq->nr_pages = nseg; 1610 nreq->nr_512b_sectors = 0; 1611 req_seg_idx = 0; 1612 sg = NULL; 1613 1614 /* Check that number of segments is sane. */ 1615 if (unlikely(nseg == 0) 1616 || unlikely(nseg > xbb->max_request_segments)) { 1617 DPRINTF("Bad number of segments in request (%d)\n", 1618 nseg); 1619 reqlist->status = BLKIF_RSP_ERROR; 1620 goto send_response; 1621 } 1622 1623 block_segs = MIN(nreq->nr_pages, 1624 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); 1625 sg = ring_req->seg; 1626 last_block_sg = sg + block_segs; 1627 while (1) { 1628 1629 while (sg < last_block_sg) { 1630 KASSERT(seg_idx < 1631 XBB_MAX_SEGMENTS_PER_REQLIST, 1632 ("seg_idx %d is too large, max " 1633 "segs %d\n", seg_idx, 1634 XBB_MAX_SEGMENTS_PER_REQLIST)); 1635 1636 xbb_sg->first_sect = sg->first_sect; 1637 xbb_sg->last_sect = sg->last_sect; 1638 xbb_sg->nsect = 1639 (int8_t)(sg->last_sect - 1640 sg->first_sect + 1); 1641 1642 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1643 || (xbb_sg->nsect <= 0)) { 1644 reqlist->status = BLKIF_RSP_ERROR; 1645 goto send_response; 1646 } 1647 1648 nr_sects += xbb_sg->nsect; 1649 map->host_addr = xbb_get_gntaddr(reqlist, 1650 seg_idx, /*sector*/0); 1651 KASSERT(map->host_addr + PAGE_SIZE <= 1652 xbb->ring_config.gnt_addr, 1653 ("Host address %#jx len %d overlaps " 1654 "ring address %#jx\n", 1655 (uintmax_t)map->host_addr, PAGE_SIZE, 1656 (uintmax_t)xbb->ring_config.gnt_addr)); 1657 1658 map->flags = GNTMAP_host_map; 1659 map->ref = sg->gref; 1660 map->dom = xbb->otherend_id; 1661 if (operation == BIO_WRITE) 1662 map->flags |= GNTMAP_readonly; 1663 sg++; 1664 map++; 1665 xbb_sg++; 1666 seg_idx++; 1667 req_seg_idx++; 1668 } 1669 1670 block_segs = MIN(nseg - req_seg_idx, 1671 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); 1672 if (block_segs == 0) 1673 break; 1674 1675 /* 1676 * Fetch the next request block full of SG elements. 1677 * For now, only the spacing between entries is 1678 * different in the different ABIs, not the sg entry 1679 * layout. 
1680 */ 1681 req_ring_idx++; 1682 switch (xbb->abi) { 1683 case BLKIF_PROTOCOL_NATIVE: 1684 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, 1685 req_ring_idx); 1686 break; 1687 case BLKIF_PROTOCOL_X86_32: 1688 { 1689 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, 1690 req_ring_idx); 1691 break; 1692 } 1693 case BLKIF_PROTOCOL_X86_64: 1694 { 1695 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, 1696 req_ring_idx); 1697 break; 1698 } 1699 default: 1700 panic("Unexpected blkif protocol ABI."); 1701 /* NOTREACHED */ 1702 } 1703 last_block_sg = sg + block_segs; 1704 } 1705 1706 /* Convert to the disk's sector size */ 1707 nreq->nr_512b_sectors = nr_sects; 1708 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1709 total_sects += nr_sects; 1710 1711 if ((nreq->nr_512b_sectors & 1712 ((xbb->sector_size >> 9) - 1)) != 0) { 1713 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1714 "a multiple of the backing store sector " 1715 "size (%d)\n", __func__, 1716 nreq->nr_512b_sectors << 9, 1717 xbb->sector_size); 1718 reqlist->status = BLKIF_RSP_ERROR; 1719 goto send_response; 1720 } 1721 } 1722 1723 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1724 xbb->maps, reqlist->nr_segments); 1725 if (error != 0) 1726 panic("Grant table operation failed (%d)", error); 1727 1728 reqlist->flags |= XBB_REQLIST_MAPPED; 1729 1730 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1731 seg_idx++, map++){ 1732 1733 if (unlikely(map->status != 0)) { 1734 DPRINTF("invalid buffer -- could not remap " 1735 "it (%d)\n", map->status); 1736 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags " 1737 "0x%x ref 0x%x, dom %d\n", seg_idx, 1738 map->host_addr, map->flags, map->ref, 1739 map->dom); 1740 reqlist->status = BLKIF_RSP_ERROR; 1741 goto send_response; 1742 } 1743 1744 reqlist->gnt_handles[seg_idx] = map->handle; 1745 } 1746 if (reqlist->starting_sector_number + total_sects > 1747 xbb->media_num_sectors) { 1748 1749 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1750 "extends past end of device %s\n", 1751 operation == BIO_READ ? "read" : "write", 1752 reqlist->starting_sector_number, 1753 reqlist->starting_sector_number + total_sects, 1754 xbb->dev_name); 1755 reqlist->status = BLKIF_RSP_ERROR; 1756 goto send_response; 1757 } 1758 1759 do_dispatch: 1760 1761 error = xbb->dispatch_io(xbb, 1762 reqlist, 1763 operation, 1764 bio_flags); 1765 1766 if (error != 0) { 1767 reqlist->status = BLKIF_RSP_ERROR; 1768 goto send_response; 1769 } 1770 1771 return (0); 1772 1773 send_response: 1774 1775 xbb_complete_reqlist(xbb, reqlist); 1776 1777 return (0); 1778 } 1779 1780 static __inline int 1781 xbb_count_sects(blkif_request_t *ring_req) 1782 { 1783 int i; 1784 int cur_size = 0; 1785 1786 for (i = 0; i < ring_req->nr_segments; i++) { 1787 int nsect; 1788 1789 nsect = (int8_t)(ring_req->seg[i].last_sect - 1790 ring_req->seg[i].first_sect + 1); 1791 if (nsect <= 0) 1792 break; 1793 1794 cur_size += nsect; 1795 } 1796 1797 return (cur_size); 1798 } 1799 1800 /** 1801 * Process incoming requests from the shared communication ring in response 1802 * to a signal on the ring's event channel. 1803 * 1804 * \param context Callback argument registerd during task initialization - 1805 * the xbb_softc for this instance. 1806 * \param pending The number of taskqueue_enqueue events that have 1807 * occurred since this handler was last run. 
1808 */ 1809 static void 1810 xbb_run_queue(void *context, int pending) 1811 { 1812 struct xbb_softc *xbb; 1813 blkif_back_rings_t *rings; 1814 RING_IDX rp; 1815 uint64_t cur_sector; 1816 int cur_operation; 1817 struct xbb_xen_reqlist *reqlist; 1818 1819 1820 xbb = (struct xbb_softc *)context; 1821 rings = &xbb->rings; 1822 1823 /* 1824 * Work gather and dispatch loop. Note that we have a bias here 1825 * towards gathering I/O sent by blockfront. We first gather up 1826 * everything in the ring, as long as we have resources. Then we 1827 * dispatch one request, and then attempt to gather up any 1828 * additional requests that have come in while we were dispatching 1829 * the request. 1830 * 1831 * This allows us to get a clearer picture (via devstat) of how 1832 * many requests blockfront is queueing to us at any given time. 1833 */ 1834 for (;;) { 1835 int retval; 1836 1837 /* 1838 * Initialize reqlist to the last element in the pending 1839 * queue, if there is one. This allows us to add more 1840 * requests to that request list, if we have room. 1841 */ 1842 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1843 xbb_xen_reqlist, links); 1844 if (reqlist != NULL) { 1845 cur_sector = reqlist->next_contig_sector; 1846 cur_operation = reqlist->operation; 1847 } else { 1848 cur_operation = 0; 1849 cur_sector = 0; 1850 } 1851 1852 /* 1853 * Cache req_prod to avoid accessing a cache line shared 1854 * with the frontend. 1855 */ 1856 rp = rings->common.sring->req_prod; 1857 1858 /* Ensure we see queued requests up to 'rp'. */ 1859 rmb(); 1860 1861 /** 1862 * Run so long as there is work to consume and the generation 1863 * of a response will not overflow the ring. 1864 * 1865 * @note There's a 1 to 1 relationship between requests and 1866 * responses, so an overflow should never occur. This 1867 * test is to protect our domain from digesting bogus 1868 * data. Shouldn't we log this? 1869 */ 1870 while (rings->common.req_cons != rp 1871 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1872 rings->common.req_cons) == 0){ 1873 blkif_request_t ring_req_storage; 1874 blkif_request_t *ring_req; 1875 int cur_size; 1876 1877 switch (xbb->abi) { 1878 case BLKIF_PROTOCOL_NATIVE: 1879 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1880 rings->common.req_cons); 1881 break; 1882 case BLKIF_PROTOCOL_X86_32: 1883 { 1884 struct blkif_x86_32_request *ring_req32; 1885 1886 ring_req32 = RING_GET_REQUEST( 1887 &xbb->rings.x86_32, rings->common.req_cons); 1888 blkif_get_x86_32_req(&ring_req_storage, 1889 ring_req32); 1890 ring_req = &ring_req_storage; 1891 break; 1892 } 1893 case BLKIF_PROTOCOL_X86_64: 1894 { 1895 struct blkif_x86_64_request *ring_req64; 1896 1897 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1898 rings->common.req_cons); 1899 blkif_get_x86_64_req(&ring_req_storage, 1900 ring_req64); 1901 ring_req = &ring_req_storage; 1902 break; 1903 } 1904 default: 1905 panic("Unexpected blkif protocol ABI."); 1906 /* NOTREACHED */ 1907 } 1908 1909 /* 1910 * Check for situations that would require closing 1911 * off this I/O for further coalescing: 1912 * - Coalescing is turned off. 1913 * - Current I/O is out of sequence with the previous 1914 * I/O. 1915 * - Coalesced I/O would be too large. 
1916 */ 1917 if ((reqlist != NULL) 1918 && ((xbb->no_coalesce_reqs != 0) 1919 || ((xbb->no_coalesce_reqs == 0) 1920 && ((ring_req->sector_number != cur_sector) 1921 || (ring_req->operation != cur_operation) 1922 || ((ring_req->nr_segments + reqlist->nr_segments) > 1923 xbb->max_reqlist_segments))))) { 1924 reqlist = NULL; 1925 } 1926 1927 /* 1928 * Grab and check for all resources in one shot. 1929 * If we can't get all of the resources we need, 1930 * the shortage is noted and the thread will get 1931 * woken up when more resources are available. 1932 */ 1933 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1934 xbb->rings.common.req_cons); 1935 1936 if (retval != 0) { 1937 /* 1938 * Resource shortage has been recorded. 1939 * We'll be scheduled to run once a request 1940 * object frees up due to a completion. 1941 */ 1942 break; 1943 } 1944 1945 /* 1946 * Signify that we can overwrite this request with 1947 * a response by incrementing our consumer index. 1948 * The response won't be generated until after 1949 * we've already consumed all necessary data out 1950 * of the version of the request in the ring buffer 1951 * (for native mode). We must update the consumer 1952 * index before issueing back-end I/O so there is 1953 * no possibility that it will complete and a 1954 * response be generated before we make room in 1955 * the queue for that response. 1956 */ 1957 xbb->rings.common.req_cons += 1958 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 1959 xbb->reqs_received++; 1960 1961 cur_size = xbb_count_sects(ring_req); 1962 cur_sector = ring_req->sector_number + cur_size; 1963 reqlist->next_contig_sector = cur_sector; 1964 cur_operation = ring_req->operation; 1965 } 1966 1967 /* Check for I/O to dispatch */ 1968 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1969 if (reqlist == NULL) { 1970 /* 1971 * We're out of work to do, put the task queue to 1972 * sleep. 1973 */ 1974 break; 1975 } 1976 1977 /* 1978 * Grab the first request off the queue and attempt 1979 * to dispatch it. 1980 */ 1981 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1982 1983 retval = xbb_dispatch_io(xbb, reqlist); 1984 if (retval != 0) { 1985 /* 1986 * xbb_dispatch_io() returns non-zero only when 1987 * there is a resource shortage. If that's the 1988 * case, re-queue this request on the head of the 1989 * queue, and go to sleep until we have more 1990 * resources. 1991 */ 1992 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1993 reqlist, links); 1994 break; 1995 } else { 1996 /* 1997 * If we still have anything on the queue after 1998 * removing the head entry, that is because we 1999 * met one of the criteria to create a new 2000 * request list (outlined above), and we'll call 2001 * that a forced dispatch for statistical purposes. 2002 * 2003 * Otherwise, if there is only one element on the 2004 * queue, we coalesced everything available on 2005 * the ring and we'll call that a normal dispatch. 2006 */ 2007 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2008 2009 if (reqlist != NULL) 2010 xbb->forced_dispatch++; 2011 else 2012 xbb->normal_dispatch++; 2013 2014 xbb->total_dispatch++; 2015 } 2016 } 2017 } 2018 2019 /** 2020 * Interrupt handler bound to the shared ring's event channel. 2021 * 2022 * \param arg Callback argument registerd during event channel 2023 * binding - the xbb_softc for this instance. 2024 */ 2025 static void 2026 xbb_intr(void *arg) 2027 { 2028 struct xbb_softc *xbb; 2029 2030 /* Defer to kernel thread. 
*/ 2031 xbb = (struct xbb_softc *)arg; 2032 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2033 } 2034 2035 /*----------------------------- Backend Handlers -----------------------------*/ 2036 /** 2037 * Backend handler for character device access. 2038 * 2039 * \param xbb Per-instance xbb configuration structure. 2040 * \param reqlist Allocated internal request list structure. 2041 * \param operation BIO_* I/O operation code. 2042 * \param bio_flags Additional bio_flag data to pass to any generated 2043 * bios (e.g. BIO_ORDERED).. 2044 * 2045 * \return 0 for success, errno codes for failure. 2046 */ 2047 static int 2048 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2049 int operation, int bio_flags) 2050 { 2051 struct xbb_dev_data *dev_data; 2052 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2053 struct xbb_xen_req *nreq; 2054 off_t bio_offset; 2055 struct bio *bio; 2056 struct xbb_sg *xbb_sg; 2057 u_int nbio; 2058 u_int bio_idx; 2059 u_int nseg; 2060 u_int seg_idx; 2061 int error; 2062 2063 dev_data = &xbb->backend.dev; 2064 bio_offset = (off_t)reqlist->starting_sector_number 2065 << xbb->sector_size_shift; 2066 error = 0; 2067 nbio = 0; 2068 bio_idx = 0; 2069 2070 if (operation == BIO_FLUSH) { 2071 nreq = STAILQ_FIRST(&reqlist->contig_req_list); 2072 bio = g_new_bio(); 2073 if (unlikely(bio == NULL)) { 2074 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2075 error = ENOMEM; 2076 return (error); 2077 } 2078 2079 bio->bio_cmd = BIO_FLUSH; 2080 bio->bio_flags |= BIO_ORDERED; 2081 bio->bio_dev = dev_data->cdev; 2082 bio->bio_offset = 0; 2083 bio->bio_data = 0; 2084 bio->bio_done = xbb_bio_done; 2085 bio->bio_caller1 = nreq; 2086 bio->bio_pblkno = 0; 2087 2088 nreq->pendcnt = 1; 2089 2090 (*dev_data->csw->d_strategy)(bio); 2091 2092 return (0); 2093 } 2094 2095 xbb_sg = xbb->xbb_sgs; 2096 bio = NULL; 2097 nseg = reqlist->nr_segments; 2098 2099 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2100 2101 /* 2102 * KVA will not be contiguous, so any additional 2103 * I/O will need to be represented in a new bio. 2104 */ 2105 if ((bio != NULL) 2106 && (xbb_sg->first_sect != 0)) { 2107 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2108 printf("%s: Discontiguous I/O request " 2109 "from domain %d ends on " 2110 "non-sector boundary\n", 2111 __func__, xbb->otherend_id); 2112 error = EINVAL; 2113 goto fail_free_bios; 2114 } 2115 bio = NULL; 2116 } 2117 2118 if (bio == NULL) { 2119 /* 2120 * Make sure that the start of this bio is 2121 * aligned to a device sector. 
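		 *
		 * sector_size is a power of two, so the mask test below
		 * suffices; e.g. with 512 byte sectors the mask is 0x1ff,
		 * so an offset of 0x10200 (sector 129) passes while
		 * 0x10201 would be rejected as mid-sector.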
2122 */ 2123 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2124 printf("%s: Misaligned I/O request " 2125 "from domain %d\n", __func__, 2126 xbb->otherend_id); 2127 error = EINVAL; 2128 goto fail_free_bios; 2129 } 2130 2131 bio = bios[nbio++] = g_new_bio(); 2132 if (unlikely(bio == NULL)) { 2133 error = ENOMEM; 2134 goto fail_free_bios; 2135 } 2136 bio->bio_cmd = operation; 2137 bio->bio_flags |= bio_flags; 2138 bio->bio_dev = dev_data->cdev; 2139 bio->bio_offset = bio_offset; 2140 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2141 xbb_sg->first_sect); 2142 bio->bio_done = xbb_bio_done; 2143 bio->bio_caller1 = reqlist; 2144 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2145 } 2146 2147 bio->bio_length += xbb_sg->nsect << 9; 2148 bio->bio_bcount = bio->bio_length; 2149 bio_offset += xbb_sg->nsect << 9; 2150 2151 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2152 2153 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2154 printf("%s: Discontiguous I/O request " 2155 "from domain %d ends on " 2156 "non-sector boundary\n", 2157 __func__, xbb->otherend_id); 2158 error = EINVAL; 2159 goto fail_free_bios; 2160 } 2161 /* 2162 * KVA will not be contiguous, so any additional 2163 * I/O will need to be represented in a new bio. 2164 */ 2165 bio = NULL; 2166 } 2167 } 2168 2169 reqlist->pendcnt = nbio; 2170 2171 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2172 { 2173 #ifdef XBB_USE_BOUNCE_BUFFERS 2174 vm_offset_t kva_offset; 2175 2176 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2177 - (vm_offset_t)reqlist->bounce; 2178 if (operation == BIO_WRITE) { 2179 memcpy(bios[bio_idx]->bio_data, 2180 (uint8_t *)reqlist->kva + kva_offset, 2181 bios[bio_idx]->bio_bcount); 2182 } 2183 #endif 2184 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2185 } 2186 2187 return (error); 2188 2189 fail_free_bios: 2190 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2191 g_destroy_bio(bios[bio_idx]); 2192 2193 return (error); 2194 } 2195 2196 /** 2197 * Backend handler for file access. 2198 * 2199 * \param xbb Per-instance xbb configuration structure. 2200 * \param reqlist Allocated internal request list. 2201 * \param operation BIO_* I/O operation code. 2202 * \param flags Additional bio_flag data to pass to any generated bios 2203 * (e.g. BIO_ORDERED).. 2204 * 2205 * \return 0 for success, errno codes for failure. 
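 *
 * As an illustration of the segment-to-iovec mapping performed here
 * (assuming 4KiB pages and 512 byte sectors, values hypothetical): two
 * back-to-back full-page segments (first_sect == 0, last_sect == 7) are
 * described by a single iovec of 2 * 8 * 512 bytes, while a segment with
 * first_sect != 0 always opens a new iovec because its KVA cannot be
 * contiguous with the preceding segment's.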
2206 */ 2207 static int 2208 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2209 int operation, int flags) 2210 { 2211 struct xbb_file_data *file_data; 2212 u_int seg_idx; 2213 u_int nseg; 2214 off_t sectors_sent; 2215 struct uio xuio; 2216 struct xbb_sg *xbb_sg; 2217 struct iovec *xiovec; 2218 #ifdef XBB_USE_BOUNCE_BUFFERS 2219 void **p_vaddr; 2220 int saved_uio_iovcnt; 2221 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2222 int vfs_is_locked; 2223 int error; 2224 2225 file_data = &xbb->backend.file; 2226 sectors_sent = 0; 2227 error = 0; 2228 bzero(&xuio, sizeof(xuio)); 2229 2230 switch (operation) { 2231 case BIO_READ: 2232 xuio.uio_rw = UIO_READ; 2233 break; 2234 case BIO_WRITE: 2235 xuio.uio_rw = UIO_WRITE; 2236 break; 2237 case BIO_FLUSH: { 2238 struct mount *mountpoint; 2239 2240 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 2241 2242 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2243 2244 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2245 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2246 VOP_UNLOCK(xbb->vn, 0); 2247 2248 vn_finished_write(mountpoint); 2249 2250 VFS_UNLOCK_GIANT(vfs_is_locked); 2251 2252 goto bailout_send_response; 2253 /* NOTREACHED */ 2254 } 2255 default: 2256 panic("invalid operation %d", operation); 2257 /* NOTREACHED */ 2258 } 2259 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2260 << xbb->sector_size_shift; 2261 xuio.uio_segflg = UIO_SYSSPACE; 2262 xuio.uio_iov = file_data->xiovecs; 2263 xuio.uio_iovcnt = 0; 2264 xbb_sg = xbb->xbb_sgs; 2265 nseg = reqlist->nr_segments; 2266 2267 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2268 2269 /* 2270 * If the first sector is not 0, the KVA will 2271 * not be contiguous and we'll need to go on 2272 * to another segment. 2273 */ 2274 if (xbb_sg->first_sect != 0) 2275 xiovec = NULL; 2276 2277 if (xiovec == NULL) { 2278 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2279 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2280 seg_idx, xbb_sg->first_sect); 2281 #ifdef XBB_USE_BOUNCE_BUFFERS 2282 /* 2283 * Store the address of the incoming 2284 * buffer at this particular offset 2285 * as well, so we can do the copy 2286 * later without having to do more 2287 * work to recalculate this address. 2288 */ 2289 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2290 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2291 xbb_sg->first_sect); 2292 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2293 xiovec->iov_len = 0; 2294 xuio.uio_iovcnt++; 2295 } 2296 2297 xiovec->iov_len += xbb_sg->nsect << 9; 2298 2299 xuio.uio_resid += xbb_sg->nsect << 9; 2300 2301 /* 2302 * If the last sector is not the full page 2303 * size count, the next segment will not be 2304 * contiguous in KVA and we need a new iovec. 2305 */ 2306 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2307 xiovec = NULL; 2308 } 2309 2310 xuio.uio_td = curthread; 2311 2312 #ifdef XBB_USE_BOUNCE_BUFFERS 2313 saved_uio_iovcnt = xuio.uio_iovcnt; 2314 2315 if (operation == BIO_WRITE) { 2316 /* Copy the write data to the local buffer. */ 2317 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2318 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2319 seg_idx++, xiovec++, p_vaddr++) { 2320 2321 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2322 } 2323 } else { 2324 /* 2325 * We only need to save off the iovecs in the case of a 2326 * read, because the copy for the read happens after the 2327 * VOP_READ(). (The uio will get modified in that call 2328 * sequence.) 
2329 */ 2330 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2331 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2332 } 2333 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2334 2335 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 2336 switch (operation) { 2337 case BIO_READ: 2338 2339 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2340 2341 /* 2342 * UFS pays attention to IO_DIRECT for reads. If the 2343 * DIRECTIO option is configured into the kernel, it calls 2344 * ffs_rawread(). But that only works for single-segment 2345 * uios with user space addresses. In our case, with a 2346 * kernel uio, it still reads into the buffer cache, but it 2347 * will just try to release the buffer from the cache later 2348 * on in ffs_read(). 2349 * 2350 * ZFS does not pay attention to IO_DIRECT for reads. 2351 * 2352 * UFS does not pay attention to IO_SYNC for reads. 2353 * 2354 * ZFS pays attention to IO_SYNC (which translates into the 2355 * Solaris define FRSYNC for zfs_read()) for reads. It 2356 * attempts to sync the file before reading. 2357 * 2358 * So, to attempt to provide some barrier semantics in the 2359 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2360 */ 2361 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2362 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2363 2364 VOP_UNLOCK(xbb->vn, 0); 2365 break; 2366 case BIO_WRITE: { 2367 struct mount *mountpoint; 2368 2369 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2370 2371 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2372 2373 /* 2374 * UFS pays attention to IO_DIRECT for writes. The write 2375 * is done asynchronously. (Normally the write would just 2376 * get put into cache. 2377 * 2378 * UFS pays attention to IO_SYNC for writes. It will 2379 * attempt to write the buffer out synchronously if that 2380 * flag is set. 2381 * 2382 * ZFS does not pay attention to IO_DIRECT for writes. 2383 * 2384 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2385 * for writes. It will flush the transaction from the 2386 * cache before returning. 2387 * 2388 * So if we've got the BIO_ORDERED flag set, we want 2389 * IO_SYNC in either the UFS or ZFS case. 2390 */ 2391 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2392 IO_SYNC : 0, file_data->cred); 2393 VOP_UNLOCK(xbb->vn, 0); 2394 2395 vn_finished_write(mountpoint); 2396 2397 break; 2398 } 2399 default: 2400 panic("invalid operation %d", operation); 2401 /* NOTREACHED */ 2402 } 2403 VFS_UNLOCK_GIANT(vfs_is_locked); 2404 2405 #ifdef XBB_USE_BOUNCE_BUFFERS 2406 /* We only need to copy here for read operations */ 2407 if (operation == BIO_READ) { 2408 2409 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2410 xiovec = file_data->saved_xiovecs; 2411 seg_idx < saved_uio_iovcnt; seg_idx++, 2412 xiovec++, p_vaddr++) { 2413 2414 /* 2415 * Note that we have to use the copy of the 2416 * io vector we made above. uiomove() modifies 2417 * the uio and its referenced vector as uiomove 2418 * performs the copy, so we can't rely on any 2419 * state from the original uio. 2420 */ 2421 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2422 } 2423 } 2424 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2425 2426 bailout_send_response: 2427 2428 if (error != 0) 2429 reqlist->status = BLKIF_RSP_ERROR; 2430 2431 xbb_complete_reqlist(xbb, reqlist); 2432 2433 return (0); 2434 } 2435 2436 /*--------------------------- Backend Configuration --------------------------*/ 2437 /** 2438 * Close and cleanup any backend device/file specific state for this 2439 * block back instance. 
2440 * 2441 * \param xbb Per-instance xbb configuration structure. 2442 */ 2443 static void 2444 xbb_close_backend(struct xbb_softc *xbb) 2445 { 2446 DROP_GIANT(); 2447 DPRINTF("closing dev=%s\n", xbb->dev_name); 2448 if (xbb->vn) { 2449 int flags = FREAD; 2450 int vfs_is_locked = 0; 2451 2452 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2453 flags |= FWRITE; 2454 2455 switch (xbb->device_type) { 2456 case XBB_TYPE_DISK: 2457 if (xbb->backend.dev.csw) { 2458 dev_relthread(xbb->backend.dev.cdev, 2459 xbb->backend.dev.dev_ref); 2460 xbb->backend.dev.csw = NULL; 2461 xbb->backend.dev.cdev = NULL; 2462 } 2463 break; 2464 case XBB_TYPE_FILE: 2465 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); 2466 break; 2467 case XBB_TYPE_NONE: 2468 default: 2469 panic("Unexpected backend type."); 2470 break; 2471 } 2472 2473 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2474 xbb->vn = NULL; 2475 2476 switch (xbb->device_type) { 2477 case XBB_TYPE_DISK: 2478 break; 2479 case XBB_TYPE_FILE: 2480 VFS_UNLOCK_GIANT(vfs_is_locked); 2481 if (xbb->backend.file.cred != NULL) { 2482 crfree(xbb->backend.file.cred); 2483 xbb->backend.file.cred = NULL; 2484 } 2485 break; 2486 case XBB_TYPE_NONE: 2487 default: 2488 panic("Unexpected backend type."); 2489 break; 2490 } 2491 } 2492 PICKUP_GIANT(); 2493 } 2494 2495 /** 2496 * Open a character device to be used for backend I/O. 2497 * 2498 * \param xbb Per-instance xbb configuration structure. 2499 * 2500 * \return 0 for success, errno codes for failure. 2501 */ 2502 static int 2503 xbb_open_dev(struct xbb_softc *xbb) 2504 { 2505 struct vattr vattr; 2506 struct cdev *dev; 2507 struct cdevsw *devsw; 2508 int error; 2509 2510 xbb->device_type = XBB_TYPE_DISK; 2511 xbb->dispatch_io = xbb_dispatch_dev; 2512 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2513 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2514 &xbb->backend.dev.dev_ref); 2515 if (xbb->backend.dev.csw == NULL) 2516 panic("Unable to retrieve device switch"); 2517 2518 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2519 if (error) { 2520 xenbus_dev_fatal(xbb->dev, error, "error getting " 2521 "vnode attributes for device %s", 2522 xbb->dev_name); 2523 return (error); 2524 } 2525 2526 2527 dev = xbb->vn->v_rdev; 2528 devsw = dev->si_devsw; 2529 if (!devsw->d_ioctl) { 2530 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2531 "device %s!", xbb->dev_name); 2532 return (ENODEV); 2533 } 2534 2535 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2536 (caddr_t)&xbb->sector_size, FREAD, 2537 curthread); 2538 if (error) { 2539 xenbus_dev_fatal(xbb->dev, error, 2540 "error calling ioctl DIOCGSECTORSIZE " 2541 "for device %s", xbb->dev_name); 2542 return (error); 2543 } 2544 2545 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2546 (caddr_t)&xbb->media_size, FREAD, 2547 curthread); 2548 if (error) { 2549 xenbus_dev_fatal(xbb->dev, error, 2550 "error calling ioctl DIOCGMEDIASIZE " 2551 "for device %s", xbb->dev_name); 2552 return (error); 2553 } 2554 2555 return (0); 2556 } 2557 2558 /** 2559 * Open a file to be used for backend I/O. 2560 * 2561 * \param xbb Per-instance xbb configuration structure. 2562 * 2563 * \return 0 for success, errno codes for failure. 
2564 */ 2565 static int 2566 xbb_open_file(struct xbb_softc *xbb) 2567 { 2568 struct xbb_file_data *file_data; 2569 struct vattr vattr; 2570 int error; 2571 2572 file_data = &xbb->backend.file; 2573 xbb->device_type = XBB_TYPE_FILE; 2574 xbb->dispatch_io = xbb_dispatch_file; 2575 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2576 if (error != 0) { 2577 xenbus_dev_fatal(xbb->dev, error, 2578 "error calling VOP_GETATTR()" 2579 "for file %s", xbb->dev_name); 2580 return (error); 2581 } 2582 2583 /* 2584 * Verify that we have the ability to upgrade to exclusive 2585 * access on this file so we can trap errors at open instead 2586 * of reporting them during first access. 2587 */ 2588 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2589 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2590 if (xbb->vn->v_iflag & VI_DOOMED) { 2591 error = EBADF; 2592 xenbus_dev_fatal(xbb->dev, error, 2593 "error locking file %s", 2594 xbb->dev_name); 2595 2596 return (error); 2597 } 2598 } 2599 2600 file_data->cred = crhold(curthread->td_ucred); 2601 xbb->media_size = vattr.va_size; 2602 2603 /* 2604 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2605 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2606 * with disklabel and UFS on FreeBSD at least. Large block sizes 2607 * may not work with other OSes as well. So just export a sector 2608 * size of 512 bytes, which should work with any OS or 2609 * application. Since our backing is a file, any block size will 2610 * work fine for the backing store. 2611 */ 2612 #if 0 2613 xbb->sector_size = vattr.va_blocksize; 2614 #endif 2615 xbb->sector_size = 512; 2616 2617 /* 2618 * Sanity check. The media size has to be at least one 2619 * sector long. 2620 */ 2621 if (xbb->media_size < xbb->sector_size) { 2622 error = EINVAL; 2623 xenbus_dev_fatal(xbb->dev, error, 2624 "file %s size %ju < block size %u", 2625 xbb->dev_name, 2626 (uintmax_t)xbb->media_size, 2627 xbb->sector_size); 2628 } 2629 return (error); 2630 } 2631 2632 /** 2633 * Open the backend provider for this connection. 2634 * 2635 * \param xbb Per-instance xbb configuration structure. 2636 * 2637 * \return 0 for success, errno codes for failure. 2638 */ 2639 static int 2640 xbb_open_backend(struct xbb_softc *xbb) 2641 { 2642 struct nameidata nd; 2643 int flags; 2644 int error; 2645 int vfs_is_locked; 2646 2647 flags = FREAD; 2648 error = 0; 2649 2650 DPRINTF("opening dev=%s\n", xbb->dev_name); 2651 2652 if (rootvnode == NULL) { 2653 xenbus_dev_fatal(xbb->dev, ENOENT, 2654 "Root file system not mounted"); 2655 return (ENOENT); 2656 } 2657 2658 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2659 flags |= FWRITE; 2660 2661 if (!curthread->td_proc->p_fd->fd_cdir) { 2662 curthread->td_proc->p_fd->fd_cdir = rootvnode; 2663 VREF(rootvnode); 2664 } 2665 if (!curthread->td_proc->p_fd->fd_rdir) { 2666 curthread->td_proc->p_fd->fd_rdir = rootvnode; 2667 VREF(rootvnode); 2668 } 2669 if (!curthread->td_proc->p_fd->fd_jdir) { 2670 curthread->td_proc->p_fd->fd_jdir = rootvnode; 2671 VREF(rootvnode); 2672 } 2673 2674 again: 2675 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2676 error = vn_open(&nd, &flags, 0, NULL); 2677 if (error) { 2678 /* 2679 * This is the only reasonable guess we can make as far as 2680 * path if the user doesn't give us a fully qualified path. 2681 * If they want to specify a file, they need to specify the 2682 * full path. 
2683 */ 2684 if (xbb->dev_name[0] != '/') { 2685 char *dev_path = "/dev/"; 2686 char *dev_name; 2687 2688 /* Try adding device path at beginning of name */ 2689 dev_name = malloc(strlen(xbb->dev_name) 2690 + strlen(dev_path) + 1, 2691 M_XENBLOCKBACK, M_NOWAIT); 2692 if (dev_name) { 2693 sprintf(dev_name, "%s%s", dev_path, 2694 xbb->dev_name); 2695 free(xbb->dev_name, M_XENBLOCKBACK); 2696 xbb->dev_name = dev_name; 2697 goto again; 2698 } 2699 } 2700 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2701 xbb->dev_name); 2702 return (error); 2703 } 2704 2705 vfs_is_locked = NDHASGIANT(&nd); 2706 2707 NDFREE(&nd, NDF_ONLY_PNBUF); 2708 2709 xbb->vn = nd.ni_vp; 2710 2711 /* We only support disks and files. */ 2712 if (vn_isdisk(xbb->vn, &error)) { 2713 error = xbb_open_dev(xbb); 2714 } else if (xbb->vn->v_type == VREG) { 2715 error = xbb_open_file(xbb); 2716 } else { 2717 error = EINVAL; 2718 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2719 "or file", xbb->dev_name); 2720 } 2721 VOP_UNLOCK(xbb->vn, 0); 2722 VFS_UNLOCK_GIANT(vfs_is_locked); 2723 2724 if (error != 0) { 2725 xbb_close_backend(xbb); 2726 return (error); 2727 } 2728 2729 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2730 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2731 2732 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2733 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2734 xbb->dev_name, xbb->sector_size, xbb->media_size); 2735 2736 return (0); 2737 } 2738 2739 /*------------------------ Inter-Domain Communication ------------------------*/ 2740 /** 2741 * Free dynamically allocated KVA or pseudo-physical address allocations. 2742 * 2743 * \param xbb Per-instance xbb configuration structure. 2744 */ 2745 static void 2746 xbb_free_communication_mem(struct xbb_softc *xbb) 2747 { 2748 if (xbb->kva != 0) { 2749 #ifndef XENHVM 2750 kmem_free(kernel_map, xbb->kva, xbb->kva_size); 2751 #else 2752 if (xbb->pseudo_phys_res != NULL) { 2753 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2754 xbb->pseudo_phys_res_id, 2755 xbb->pseudo_phys_res); 2756 xbb->pseudo_phys_res = NULL; 2757 } 2758 #endif 2759 } 2760 xbb->kva = 0; 2761 xbb->gnt_base_addr = 0; 2762 if (xbb->kva_free != NULL) { 2763 free(xbb->kva_free, M_XENBLOCKBACK); 2764 xbb->kva_free = NULL; 2765 } 2766 } 2767 2768 /** 2769 * Cleanup all inter-domain communication mechanisms. 2770 * 2771 * \param xbb Per-instance xbb configuration structure. 2772 */ 2773 static int 2774 xbb_disconnect(struct xbb_softc *xbb) 2775 { 2776 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2777 struct gnttab_unmap_grant_ref *op; 2778 u_int ring_idx; 2779 int error; 2780 2781 DPRINTF("\n"); 2782 2783 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2784 return (0); 2785 2786 if (xbb->irq != 0) { 2787 unbind_from_irqhandler(xbb->irq); 2788 xbb->irq = 0; 2789 } 2790 2791 mtx_unlock(&xbb->lock); 2792 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2793 mtx_lock(&xbb->lock); 2794 2795 /* 2796 * No new interrupts can generate work, but we must wait 2797 * for all currently active requests to drain. 
2798 */ 2799 if (xbb->active_request_count != 0) 2800 return (EAGAIN); 2801 2802 for (ring_idx = 0, op = ops; 2803 ring_idx < xbb->ring_config.ring_pages; 2804 ring_idx++, op++) { 2805 2806 op->host_addr = xbb->ring_config.gnt_addr 2807 + (ring_idx * PAGE_SIZE); 2808 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2809 op->handle = xbb->ring_config.handle[ring_idx]; 2810 } 2811 2812 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2813 xbb->ring_config.ring_pages); 2814 if (error != 0) 2815 panic("Grant table op failed (%d)", error); 2816 2817 xbb_free_communication_mem(xbb); 2818 2819 if (xbb->requests != NULL) { 2820 free(xbb->requests, M_XENBLOCKBACK); 2821 xbb->requests = NULL; 2822 } 2823 2824 if (xbb->request_lists != NULL) { 2825 struct xbb_xen_reqlist *reqlist; 2826 int i; 2827 2828 /* There is one request list for ever allocated request. */ 2829 for (i = 0, reqlist = xbb->request_lists; 2830 i < xbb->max_requests; i++, reqlist++){ 2831 #ifdef XBB_USE_BOUNCE_BUFFERS 2832 if (reqlist->bounce != NULL) { 2833 free(reqlist->bounce, M_XENBLOCKBACK); 2834 reqlist->bounce = NULL; 2835 } 2836 #endif 2837 if (reqlist->gnt_handles != NULL) { 2838 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2839 reqlist->gnt_handles = NULL; 2840 } 2841 } 2842 free(xbb->request_lists, M_XENBLOCKBACK); 2843 xbb->request_lists = NULL; 2844 } 2845 2846 xbb->flags &= ~XBBF_RING_CONNECTED; 2847 return (0); 2848 } 2849 2850 /** 2851 * Map shared memory ring into domain local address space, initialize 2852 * ring control structures, and bind an interrupt to the event channel 2853 * used to notify us of ring changes. 2854 * 2855 * \param xbb Per-instance xbb configuration structure. 2856 */ 2857 static int 2858 xbb_connect_ring(struct xbb_softc *xbb) 2859 { 2860 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2861 struct gnttab_map_grant_ref *gnt; 2862 u_int ring_idx; 2863 int error; 2864 2865 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2866 return (0); 2867 2868 /* 2869 * Kva for our ring is at the tail of the region of kva allocated 2870 * by xbb_alloc_communication_mem(). 2871 */ 2872 xbb->ring_config.va = xbb->kva 2873 + (xbb->kva_size 2874 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2875 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2876 + (xbb->kva_size 2877 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2878 2879 for (ring_idx = 0, gnt = gnts; 2880 ring_idx < xbb->ring_config.ring_pages; 2881 ring_idx++, gnt++) { 2882 2883 gnt->host_addr = xbb->ring_config.gnt_addr 2884 + (ring_idx * PAGE_SIZE); 2885 gnt->flags = GNTMAP_host_map; 2886 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2887 gnt->dom = xbb->otherend_id; 2888 } 2889 2890 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2891 xbb->ring_config.ring_pages); 2892 if (error) 2893 panic("blkback: Ring page grant table op failed (%d)", error); 2894 2895 for (ring_idx = 0, gnt = gnts; 2896 ring_idx < xbb->ring_config.ring_pages; 2897 ring_idx++, gnt++) { 2898 if (gnt->status != 0) { 2899 xbb->ring_config.va = 0; 2900 xenbus_dev_fatal(xbb->dev, EACCES, 2901 "Ring shared page mapping failed. " 2902 "Status %d.", gnt->status); 2903 return (EACCES); 2904 } 2905 xbb->ring_config.handle[ring_idx] = gnt->handle; 2906 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2907 } 2908 2909 /* Initialize the ring based on ABI. 
*/ 2910 switch (xbb->abi) { 2911 case BLKIF_PROTOCOL_NATIVE: 2912 { 2913 blkif_sring_t *sring; 2914 sring = (blkif_sring_t *)xbb->ring_config.va; 2915 BACK_RING_INIT(&xbb->rings.native, sring, 2916 xbb->ring_config.ring_pages * PAGE_SIZE); 2917 break; 2918 } 2919 case BLKIF_PROTOCOL_X86_32: 2920 { 2921 blkif_x86_32_sring_t *sring_x86_32; 2922 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2923 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2924 xbb->ring_config.ring_pages * PAGE_SIZE); 2925 break; 2926 } 2927 case BLKIF_PROTOCOL_X86_64: 2928 { 2929 blkif_x86_64_sring_t *sring_x86_64; 2930 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2931 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2932 xbb->ring_config.ring_pages * PAGE_SIZE); 2933 break; 2934 } 2935 default: 2936 panic("Unexpected blkif protocol ABI."); 2937 } 2938 2939 xbb->flags |= XBBF_RING_CONNECTED; 2940 2941 error = 2942 bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id, 2943 xbb->ring_config.evtchn, 2944 device_get_nameunit(xbb->dev), 2945 xbb_intr, /*arg*/xbb, 2946 INTR_TYPE_BIO | INTR_MPSAFE, 2947 &xbb->irq); 2948 if (error) { 2949 (void)xbb_disconnect(xbb); 2950 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2951 return (error); 2952 } 2953 2954 DPRINTF("rings connected!\n"); 2955 2956 return 0; 2957 } 2958 2959 /* Needed to make bit_alloc() macro work */ 2960 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ 2961 M_NOWAIT|M_ZERO); 2962 2963 /** 2964 * Size KVA and pseudo-physical address allocations based on negotiated 2965 * values for the size and number of I/O requests, and the size of our 2966 * communication ring. 2967 * 2968 * \param xbb Per-instance xbb configuration structure. 2969 * 2970 * These address spaces are used to dynamically map pages in the 2971 * front-end's domain into our own. 2972 */ 2973 static int 2974 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2975 { 2976 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2977 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2978 xbb->kva_size = xbb->reqlist_kva_size + 2979 (xbb->ring_config.ring_pages * PAGE_SIZE); 2980 2981 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); 2982 if (xbb->kva_free == NULL) 2983 return (ENOMEM); 2984 2985 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2986 device_get_nameunit(xbb->dev), xbb->kva_size, 2987 xbb->reqlist_kva_size); 2988 #ifndef XENHVM 2989 xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size); 2990 if (xbb->kva == 0) 2991 return (ENOMEM); 2992 xbb->gnt_base_addr = xbb->kva; 2993 #else /* XENHVM */ 2994 /* 2995 * Reserve a range of pseudo physical memory that we can map 2996 * into kva. These pages will only be backed by machine 2997 * pages ("real memory") during the lifetime of front-end requests 2998 * via grant table operations. 2999 */ 3000 xbb->pseudo_phys_res_id = 0; 3001 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 3002 &xbb->pseudo_phys_res_id, 3003 0, ~0, xbb->kva_size, 3004 RF_ACTIVE); 3005 if (xbb->pseudo_phys_res == NULL) { 3006 xbb->kva = 0; 3007 return (ENOMEM); 3008 } 3009 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3010 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3011 #endif /* XENHVM */ 3012 3013 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3014 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3015 (uintmax_t)xbb->gnt_base_addr); 3016 return (0); 3017 } 3018 3019 /** 3020 * Collect front-end information from the XenStore. 
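 *
 * The values negotiated below are read from the front-end's XenStore
 * directory; an illustrative listing (all values hypothetical) might be:
 *
 *	ring-ref = "768"             event-channel = "11"
 *	ring-pages = "4"             ring-ref1 ... ring-ref3 = "769" ...
 *	max-requests = "256"         max-request-segments = "129"
 *	max-request-size = "524288"  protocol = "x86_64-abi"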
 *
 * \param xbb Per-instance xbb configuration structure.
 */
static int
xbb_collect_frontend_info(struct xbb_softc *xbb)
{
	char	    protocol_abi[64];
	const char *otherend_path;
	int	    error;
	u_int	    ring_idx;

	otherend_path = xenbus_get_otherend_path(xbb->dev);

	/*
	 * Protocol defaults valid even if all negotiation fails.
	 */
	xbb->ring_config.ring_pages = 1;
	xbb->max_requests	    = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
	xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
	xbb->max_request_size	    = xbb->max_request_segments * PAGE_SIZE;

	/*
	 * Mandatory data (used in all versions of the protocol) first.
	 */
	error = xs_gather(XST_NIL, otherend_path,
			  "ring-ref", "%" PRIu32,
			  &xbb->ring_config.ring_ref[0],
			  "event-channel", "%" PRIu32,
			  &xbb->ring_config.evtchn,
			  NULL);
	if (error != 0) {
		xenbus_dev_fatal(xbb->dev, error,
				 "Unable to retrieve ring information from "
				 "frontend %s. Unable to connect.",
				 xenbus_get_otherend_path(xbb->dev));
		return (error);
	}

	/*
	 * These fields are initialized to legacy protocol defaults
	 * so we only need to fail if reading the updated value succeeds
	 * and the new value is outside of its allowed range.
	 *
	 * \note xs_gather() returns on the first encountered error, so
	 *       we must use independent calls in order to guarantee
	 *       we don't miss information in a sparsely populated
	 *       front-end tree.
	 */
	(void)xs_scanf(XST_NIL, otherend_path,
		       "ring-pages", NULL, "%u",
		       &xbb->ring_config.ring_pages);

	(void)xs_scanf(XST_NIL, otherend_path,
		       "max-requests", NULL, "%u",
		       &xbb->max_requests);

	(void)xs_scanf(XST_NIL, otherend_path,
		       "max-request-segments", NULL, "%u",
		       &xbb->max_request_segments);

	(void)xs_scanf(XST_NIL, otherend_path,
		       "max-request-size", NULL, "%u",
		       &xbb->max_request_size);

	if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
		xenbus_dev_fatal(xbb->dev, EINVAL,
				 "Front-end specified ring-pages of %u "
				 "exceeds backend limit of %zu. "
				 "Unable to connect.",
				 xbb->ring_config.ring_pages,
				 XBB_MAX_RING_PAGES);
		return (EINVAL);
	} else if (xbb->max_requests > XBB_MAX_REQUESTS) {
		xenbus_dev_fatal(xbb->dev, EINVAL,
				 "Front-end specified max_requests of %u "
				 "exceeds backend limit of %u. "
				 "Unable to connect.",
				 xbb->max_requests,
				 XBB_MAX_REQUESTS);
		return (EINVAL);
	} else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
		xenbus_dev_fatal(xbb->dev, EINVAL,
				 "Front-end specified max_request_segments "
				 "of %u exceeds backend limit of %u. "
				 "Unable to connect.",
				 xbb->max_request_segments,
				 XBB_MAX_SEGMENTS_PER_REQUEST);
		return (EINVAL);
	} else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
		xenbus_dev_fatal(xbb->dev, EINVAL,
				 "Front-end specified max_request_size "
				 "of %u exceeds backend limit of %u. "
				 "Unable to connect.",
				 xbb->max_request_size,
				 XBB_MAX_REQUEST_SIZE);
		return (EINVAL);
	}

	/* If using a multi-page ring, pull in the remaining references.
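	 * Page 0's reference is expected in the plain "ring-ref" node
	 * gathered above, so, for example, a four page ring is published
	 * as "ring-ref", "ring-ref1", "ring-ref2" and "ring-ref3".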
*/ 3120 for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { 3121 char ring_ref_name[]= "ring_refXX"; 3122 3123 snprintf(ring_ref_name, sizeof(ring_ref_name), 3124 "ring-ref%u", ring_idx); 3125 error = xs_scanf(XST_NIL, otherend_path, 3126 ring_ref_name, NULL, "%" PRIu32, 3127 &xbb->ring_config.ring_ref[ring_idx]); 3128 if (error != 0) { 3129 xenbus_dev_fatal(xbb->dev, error, 3130 "Failed to retriev grant reference " 3131 "for page %u of shared ring. Unable " 3132 "to connect.", ring_idx); 3133 return (error); 3134 } 3135 } 3136 3137 error = xs_gather(XST_NIL, otherend_path, 3138 "protocol", "%63s", protocol_abi, 3139 NULL); 3140 if (error != 0 3141 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3142 /* 3143 * Assume native if the frontend has not 3144 * published ABI data or it has published and 3145 * matches our own ABI. 3146 */ 3147 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3148 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3149 3150 xbb->abi = BLKIF_PROTOCOL_X86_32; 3151 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3152 3153 xbb->abi = BLKIF_PROTOCOL_X86_64; 3154 } else { 3155 3156 xenbus_dev_fatal(xbb->dev, EINVAL, 3157 "Unknown protocol ABI (%s) published by " 3158 "frontend. Unable to connect.", protocol_abi); 3159 return (EINVAL); 3160 } 3161 return (0); 3162 } 3163 3164 /** 3165 * Allocate per-request data structures given request size and number 3166 * information negotiated with the front-end. 3167 * 3168 * \param xbb Per-instance xbb configuration structure. 3169 */ 3170 static int 3171 xbb_alloc_requests(struct xbb_softc *xbb) 3172 { 3173 struct xbb_xen_req *req; 3174 struct xbb_xen_req *last_req; 3175 3176 /* 3177 * Allocate request book keeping datastructures. 3178 */ 3179 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3180 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3181 if (xbb->requests == NULL) { 3182 xenbus_dev_fatal(xbb->dev, ENOMEM, 3183 "Unable to allocate request structures"); 3184 return (ENOMEM); 3185 } 3186 3187 req = xbb->requests; 3188 last_req = &xbb->requests[xbb->max_requests - 1]; 3189 STAILQ_INIT(&xbb->request_free_stailq); 3190 while (req <= last_req) { 3191 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3192 req++; 3193 } 3194 return (0); 3195 } 3196 3197 static int 3198 xbb_alloc_request_lists(struct xbb_softc *xbb) 3199 { 3200 int i; 3201 struct xbb_xen_reqlist *reqlist; 3202 3203 /* 3204 * If no requests can be merged, we need 1 request list per 3205 * in flight request. 
3206 */ 3207 xbb->request_lists = malloc(xbb->max_requests * 3208 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3209 if (xbb->request_lists == NULL) { 3210 xenbus_dev_fatal(xbb->dev, ENOMEM, 3211 "Unable to allocate request list structures"); 3212 return (ENOMEM); 3213 } 3214 3215 STAILQ_INIT(&xbb->reqlist_free_stailq); 3216 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3217 for (i = 0; i < xbb->max_requests; i++) { 3218 int seg; 3219 3220 reqlist = &xbb->request_lists[i]; 3221 3222 reqlist->xbb = xbb; 3223 3224 #ifdef XBB_USE_BOUNCE_BUFFERS 3225 reqlist->bounce = malloc(xbb->max_reqlist_size, 3226 M_XENBLOCKBACK, M_NOWAIT); 3227 if (reqlist->bounce == NULL) { 3228 xenbus_dev_fatal(xbb->dev, ENOMEM, 3229 "Unable to allocate request " 3230 "bounce buffers"); 3231 return (ENOMEM); 3232 } 3233 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3234 3235 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3236 sizeof(*reqlist->gnt_handles), 3237 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3238 if (reqlist->gnt_handles == NULL) { 3239 xenbus_dev_fatal(xbb->dev, ENOMEM, 3240 "Unable to allocate request " 3241 "grant references"); 3242 return (ENOMEM); 3243 } 3244 3245 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3246 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3247 3248 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3249 } 3250 return (0); 3251 } 3252 3253 /** 3254 * Supply information about the physical device to the frontend 3255 * via XenBus. 3256 * 3257 * \param xbb Per-instance xbb configuration structure. 3258 */ 3259 static int 3260 xbb_publish_backend_info(struct xbb_softc *xbb) 3261 { 3262 struct xs_transaction xst; 3263 const char *our_path; 3264 const char *leaf; 3265 int error; 3266 3267 our_path = xenbus_get_node(xbb->dev); 3268 while (1) { 3269 error = xs_transaction_start(&xst); 3270 if (error != 0) { 3271 xenbus_dev_fatal(xbb->dev, error, 3272 "Error publishing backend info " 3273 "(start transaction)"); 3274 return (error); 3275 } 3276 3277 leaf = "sectors"; 3278 error = xs_printf(xst, our_path, leaf, 3279 "%"PRIu64, xbb->media_num_sectors); 3280 if (error != 0) 3281 break; 3282 3283 /* XXX Support all VBD attributes here. */ 3284 leaf = "info"; 3285 error = xs_printf(xst, our_path, leaf, "%u", 3286 xbb->flags & XBBF_READ_ONLY 3287 ? VDISK_READONLY : 0); 3288 if (error != 0) 3289 break; 3290 3291 leaf = "sector-size"; 3292 error = xs_printf(xst, our_path, leaf, "%u", 3293 xbb->sector_size); 3294 if (error != 0) 3295 break; 3296 3297 error = xs_transaction_end(xst, 0); 3298 if (error == 0) { 3299 return (0); 3300 } else if (error != EAGAIN) { 3301 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3302 return (error); 3303 } 3304 } 3305 3306 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3307 our_path, leaf); 3308 xs_transaction_end(xst, 1); 3309 return (error); 3310 } 3311 3312 /** 3313 * Connect to our blkfront peer now that it has completed publishing 3314 * its configuration into the XenStore. 3315 * 3316 * \param xbb Per-instance xbb configuration structure. 3317 */ 3318 static void 3319 xbb_connect(struct xbb_softc *xbb) 3320 { 3321 int error; 3322 3323 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 3324 return; 3325 3326 if (xbb_collect_frontend_info(xbb) != 0) 3327 return; 3328 3329 xbb->flags &= ~XBBF_SHUTDOWN; 3330 3331 /* 3332 * We limit the maximum number of reqlist segments to the maximum 3333 * number of segments in the ring, or our absolute maximum, 3334 * whichever is smaller. 
3335 */ 3336 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3337 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3338 3339 /* 3340 * The maximum size is simply a function of the number of segments 3341 * we can handle. 3342 */ 3343 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3344 3345 /* Allocate resources whose size depends on front-end configuration. */ 3346 error = xbb_alloc_communication_mem(xbb); 3347 if (error != 0) { 3348 xenbus_dev_fatal(xbb->dev, error, 3349 "Unable to allocate communication memory"); 3350 return; 3351 } 3352 3353 error = xbb_alloc_requests(xbb); 3354 if (error != 0) { 3355 /* Specific errors are reported by xbb_alloc_requests(). */ 3356 return; 3357 } 3358 3359 error = xbb_alloc_request_lists(xbb); 3360 if (error != 0) { 3361 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3362 return; 3363 } 3364 3365 /* 3366 * Connect communication channel. 3367 */ 3368 error = xbb_connect_ring(xbb); 3369 if (error != 0) { 3370 /* Specific errors are reported by xbb_connect_ring(). */ 3371 return; 3372 } 3373 3374 if (xbb_publish_backend_info(xbb) != 0) { 3375 /* 3376 * If we can't publish our data, we cannot participate 3377 * in this connection, and waiting for a front-end state 3378 * change will not help the situation. 3379 */ 3380 (void)xbb_disconnect(xbb); 3381 return; 3382 } 3383 3384 /* Ready for I/O. */ 3385 xenbus_set_state(xbb->dev, XenbusStateConnected); 3386 } 3387 3388 /*-------------------------- Device Teardown Support -------------------------*/ 3389 /** 3390 * Perform device shutdown functions. 3391 * 3392 * \param xbb Per-instance xbb configuration structure. 3393 * 3394 * Mark this instance as shutting down, wait for any active I/O on the 3395 * backend device/file to drain, disconnect from the front-end, and notify 3396 * any waiters (e.g. a thread invoking our detach method) that detach can 3397 * now proceed. 3398 */ 3399 static int 3400 xbb_shutdown(struct xbb_softc *xbb) 3401 { 3402 int error; 3403 3404 DPRINTF("\n"); 3405 3406 /* 3407 * Due to the need to drop our mutex during some 3408 * xenbus operations, it is possible for two threads 3409 * to attempt to close out shutdown processing at 3410 * the same time. Tell the caller that hits this 3411 * race to try back later. 3412 */ 3413 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3414 return (EAGAIN); 3415 3416 DPRINTF("\n"); 3417 3418 /* Indicate shutdown is in progress. */ 3419 xbb->flags |= XBBF_SHUTDOWN; 3420 3421 /* Disconnect from the front-end. */ 3422 error = xbb_disconnect(xbb); 3423 if (error != 0) { 3424 /* 3425 * Requests still outstanding. We'll be called again 3426 * once they complete. 3427 */ 3428 KASSERT(error == EAGAIN, 3429 ("%s: Unexpected xbb_disconnect() failure %d", 3430 __func__, error)); 3431 3432 return (error); 3433 } 3434 3435 DPRINTF("\n"); 3436 3437 xbb->flags |= XBBF_IN_SHUTDOWN; 3438 mtx_unlock(&xbb->lock); 3439 3440 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3441 xenbus_set_state(xbb->dev, XenbusStateClosing); 3442 3443 mtx_lock(&xbb->lock); 3444 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3445 3446 /* Indicate to xbb_detach() that is it safe to proceed. */ 3447 wakeup(xbb); 3448 3449 return (0); 3450 } 3451 3452 /** 3453 * Report an attach time error to the console and Xen, and cleanup 3454 * this instance by forcing immediate detach processing. 3455 * 3456 * \param xbb Per-instance xbb configuration structure. 3457 * \param err Errno describing the error. 
3458 * \param fmt Printf style format and arguments 3459 */ 3460 static void 3461 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3462 { 3463 va_list ap; 3464 va_list ap_hotplug; 3465 3466 va_start(ap, fmt); 3467 va_copy(ap_hotplug, ap); 3468 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3469 "hotplug-error", fmt, ap_hotplug); 3470 va_end(ap_hotplug); 3471 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3472 "hotplug-status", "error"); 3473 3474 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3475 va_end(ap); 3476 3477 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3478 "online", "0"); 3479 xbb_detach(xbb->dev); 3480 } 3481 3482 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3483 /** 3484 * Inspect a XenBus device and claim it if is of the appropriate type. 3485 * 3486 * \param dev NewBus device object representing a candidate XenBus device. 3487 * 3488 * \return 0 for success, errno codes for failure. 3489 */ 3490 static int 3491 xbb_probe(device_t dev) 3492 { 3493 3494 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3495 device_set_desc(dev, "Backend Virtual Block Device"); 3496 device_quiet(dev); 3497 return (0); 3498 } 3499 3500 return (ENXIO); 3501 } 3502 3503 /** 3504 * Setup sysctl variables to control various Block Back parameters. 3505 * 3506 * \param xbb Xen Block Back softc. 3507 * 3508 */ 3509 static void 3510 xbb_setup_sysctl(struct xbb_softc *xbb) 3511 { 3512 struct sysctl_ctx_list *sysctl_ctx = NULL; 3513 struct sysctl_oid *sysctl_tree = NULL; 3514 3515 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3516 if (sysctl_ctx == NULL) 3517 return; 3518 3519 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3520 if (sysctl_tree == NULL) 3521 return; 3522 3523 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3524 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3525 "fake the flush command"); 3526 3527 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3528 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3529 "send a real flush for N flush requests"); 3530 3531 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3532 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3533 "Don't coalesce contiguous requests"); 3534 3535 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3536 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3537 "how many I/O requests we have received"); 3538 3539 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3540 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3541 "how many I/O requests have been completed"); 3542 3543 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3544 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3545 "how many I/O dispatches were forced"); 3546 3547 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3548 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3549 "how many I/O dispatches were normal"); 3550 3551 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3552 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3553 "total number of I/O dispatches"); 3554 3555 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3556 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3557 "how many times we have run out of KVA"); 3558 3559 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3560 "request_shortages", CTLFLAG_RW, 3561 &xbb->request_shortages, 3562 "how many times we have run out of 
requests"); 3563 3564 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3565 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3566 "maximum outstanding requests (negotiated)"); 3567 3568 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3569 "max_request_segments", CTLFLAG_RD, 3570 &xbb->max_request_segments, 0, 3571 "maximum number of pages per requests (negotiated)"); 3572 } 3573 3574 /** 3575 * Attach to a XenBus device that has been claimed by our probe routine. 3576 * 3577 * \param dev NewBus device object representing this Xen Block Back instance. 3578 * 3579 * \return 0 for success, errno codes for failure. 3580 */ 3581 static int 3582 xbb_attach(device_t dev) 3583 { 3584 struct xbb_softc *xbb; 3585 int error; 3586 3587 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3588 3589 /* 3590 * Basic initialization. 3591 * After this block it is safe to call xbb_detach() 3592 * to clean up any allocated data for this instance. 3593 */ 3594 xbb = device_get_softc(dev); 3595 xbb->dev = dev; 3596 xbb->otherend_id = xenbus_get_otherend_id(dev); 3597 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3598 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3599 3600 /* 3601 * Publish protocol capabilities for consumption by the 3602 * front-end. 3603 */ 3604 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3605 "feature-barrier", "1"); 3606 if (error) { 3607 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3608 xenbus_get_node(xbb->dev)); 3609 return (error); 3610 } 3611 3612 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3613 "feature-flush-cache", "1"); 3614 if (error) { 3615 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3616 xenbus_get_node(xbb->dev)); 3617 return (error); 3618 } 3619 3620 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3621 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 3622 if (error) { 3623 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 3624 xenbus_get_node(xbb->dev)); 3625 return (error); 3626 } 3627 3628 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3629 "max-requests", "%u", XBB_MAX_REQUESTS); 3630 if (error) { 3631 xbb_attach_failed(xbb, error, "writing %s/max-requests", 3632 xenbus_get_node(xbb->dev)); 3633 return (error); 3634 } 3635 3636 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3637 "max-request-segments", "%u", 3638 XBB_MAX_SEGMENTS_PER_REQUEST); 3639 if (error) { 3640 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 3641 xenbus_get_node(xbb->dev)); 3642 return (error); 3643 } 3644 3645 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3646 "max-request-size", "%u", 3647 XBB_MAX_REQUEST_SIZE); 3648 if (error) { 3649 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 3650 xenbus_get_node(xbb->dev)); 3651 return (error); 3652 } 3653 3654 /* Collect physical device information. */ 3655 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3656 "device-type", NULL, &xbb->dev_type, 3657 NULL); 3658 if (error != 0) 3659 xbb->dev_type = NULL; 3660 3661 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3662 "mode", NULL, &xbb->dev_mode, 3663 "params", NULL, &xbb->dev_name, 3664 NULL); 3665 if (error != 0) { 3666 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3667 xenbus_get_node(dev)); 3668 return (ENXIO); 3669 } 3670 3671 /* Parse fopen style mode flags. 
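	 *
	 * e.g.  mode = "w" (or "rw")  ->  writable export
	 *       mode = "r"            ->  XBBF_READ_ONLY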
 */
	if (strchr(xbb->dev_mode, 'w') == NULL)
		xbb->flags |= XBBF_READ_ONLY;

	/*
	 * Verify the physical device is present and can support
	 * the desired I/O mode.
	 */
	DROP_GIANT();
	error = xbb_open_backend(xbb);
	PICKUP_GIANT();
	if (error != 0) {
		xbb_attach_failed(xbb, error, "Unable to open %s",
				  xbb->dev_name);
		return (ENXIO);
	}

	/* Use devstat(9) for recording statistics. */
	xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
					   xbb->sector_size,
					   DEVSTAT_ALL_SUPPORTED,
					   DEVSTAT_TYPE_DIRECT
					 | DEVSTAT_TYPE_IF_OTHER,
					   DEVSTAT_PRIORITY_OTHER);

	xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
					      xbb->sector_size,
					      DEVSTAT_ALL_SUPPORTED,
					      DEVSTAT_TYPE_DIRECT
					    | DEVSTAT_TYPE_IF_OTHER,
					      DEVSTAT_PRIORITY_OTHER);
	/*
	 * Setup sysctl variables.
	 */
	xbb_setup_sysctl(xbb);

	/*
	 * Create a taskqueue for doing work that must occur from a
	 * thread context.
	 */
	xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
					     taskqueue_thread_enqueue,
					     /*context*/&xbb->io_taskqueue);
	if (xbb->io_taskqueue == NULL) {
		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
		return (ENOMEM);
	}

	taskqueue_start_threads(&xbb->io_taskqueue,
				/*num threads*/1,
				/*priority*/PWAIT,
				/*thread name*/
				"%s taskq", device_get_nameunit(dev));

	/* Update hot-plug status to satisfy xend. */
	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "hotplug-status", "connected");
	if (error) {
		xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
				  xenbus_get_node(xbb->dev));
		return (error);
	}

	/* Tell the front end that we are ready to connect. */
	xenbus_set_state(dev, XenbusStateInitWait);

	return (0);
}

/**
 * Detach from a block back device instance.
 *
 * \param dev NewBus device object representing this Xen Block Back instance.
 *
 * \return 0 for success, errno codes for failure.
 *
 * \note A block back device may be detached at any time in its life-cycle,
 *       including part way through the attach process. For this reason,
 *       initialization order and the initialization state checks in this
 *       routine must be carefully coupled so that attach time failures
 *       are gracefully handled.
 */
static int
xbb_detach(device_t dev)
{
	struct xbb_softc *xbb;

	DPRINTF("\n");

	xbb = device_get_softc(dev);
	mtx_lock(&xbb->lock);
	while (xbb_shutdown(xbb) == EAGAIN) {
		msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
		       "xbb_shutdown", 0);
	}
	mtx_unlock(&xbb->lock);

	DPRINTF("\n");

	if (xbb->io_taskqueue != NULL)
		taskqueue_free(xbb->io_taskqueue);

	if (xbb->xbb_stats != NULL)
		devstat_remove_entry(xbb->xbb_stats);

	if (xbb->xbb_stats_in != NULL)
		devstat_remove_entry(xbb->xbb_stats_in);

	xbb_close_backend(xbb);

	if (xbb->dev_mode != NULL) {
		free(xbb->dev_mode, M_XENBUS);
		xbb->dev_mode = NULL;
	}

	if (xbb->dev_type != NULL) {
		free(xbb->dev_type, M_XENBUS);
		xbb->dev_type = NULL;
	}

	if (xbb->dev_name != NULL) {
		free(xbb->dev_name, M_XENBUS);
		xbb->dev_name = NULL;
	}

	mtx_destroy(&xbb->lock);
	return (0);
}

/**
 * Prepare this block back device for suspension of this VM.
 *
 * \param dev NewBus device object representing this Xen Block Back instance.
 *
 * \return 0 for success, errno codes for failure.
 */
static int
xbb_suspend(device_t dev)
{
#ifdef NOT_YET
	struct xbb_softc *sc = device_get_softc(dev);

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = BLKIF_STATE_SUSPENDED;
	mtx_unlock(&sc->xb_io_lock);
#endif

	return (0);
}

/**
 * Perform any processing required to recover from a suspended state.
 *
 * \param dev NewBus device object representing this Xen Block Back instance.
 *
 * \return 0 for success, errno codes for failure.
 */
static int
xbb_resume(device_t dev)
{
	return (0);
}

/**
 * Handle state changes expressed via the XenStore by our front-end peer.
 *
 * \param dev            NewBus device object representing this Xen
 *                       Block Back instance.
 * \param frontend_state The new state of the front-end.
3843 */ 3844 static void 3845 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3846 { 3847 struct xbb_softc *xbb = device_get_softc(dev); 3848 3849 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3850 xenbus_strstate(frontend_state), 3851 xenbus_strstate(xenbus_get_state(xbb->dev))); 3852 3853 switch (frontend_state) { 3854 case XenbusStateInitialising: 3855 break; 3856 case XenbusStateInitialised: 3857 case XenbusStateConnected: 3858 xbb_connect(xbb); 3859 break; 3860 case XenbusStateClosing: 3861 case XenbusStateClosed: 3862 mtx_lock(&xbb->lock); 3863 xbb_shutdown(xbb); 3864 mtx_unlock(&xbb->lock); 3865 if (frontend_state == XenbusStateClosed) 3866 xenbus_set_state(xbb->dev, XenbusStateClosed); 3867 break; 3868 default: 3869 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3870 frontend_state); 3871 break; 3872 } 3873 } 3874 3875 /*---------------------------- NewBus Registration ---------------------------*/ 3876 static device_method_t xbb_methods[] = { 3877 /* Device interface */ 3878 DEVMETHOD(device_probe, xbb_probe), 3879 DEVMETHOD(device_attach, xbb_attach), 3880 DEVMETHOD(device_detach, xbb_detach), 3881 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3882 DEVMETHOD(device_suspend, xbb_suspend), 3883 DEVMETHOD(device_resume, xbb_resume), 3884 3885 /* Xenbus interface */ 3886 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3887 3888 { 0, 0 } 3889 }; 3890 3891 static driver_t xbb_driver = { 3892 "xbbd", 3893 xbb_methods, 3894 sizeof(struct xbb_softc), 3895 }; 3896 devclass_t xbb_devclass; 3897 3898 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3899