1 /*- 2 * Copyright (c) 2009-2011 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/malloc.h> 47 48 #include <sys/bio.h> 49 #include <sys/bus.h> 50 #include <sys/conf.h> 51 #include <sys/devicestat.h> 52 #include <sys/disk.h> 53 #include <sys/fcntl.h> 54 #include <sys/filedesc.h> 55 #include <sys/kdb.h> 56 #include <sys/module.h> 57 #include <sys/namei.h> 58 #include <sys/proc.h> 59 #include <sys/rman.h> 60 #include <sys/taskqueue.h> 61 #include <sys/types.h> 62 #include <sys/vnode.h> 63 #include <sys/mount.h> 64 #include <sys/sysctl.h> 65 #include <sys/bitstring.h> 66 #include <sys/sdt.h> 67 68 #include <geom/geom.h> 69 70 #include <machine/_inttypes.h> 71 72 #include <vm/vm.h> 73 #include <vm/vm_extern.h> 74 #include <vm/vm_kern.h> 75 76 #include <xen/xen-os.h> 77 #include <xen/blkif.h> 78 #include <xen/gnttab.h> 79 #include <xen/xen_intr.h> 80 81 #include <xen/interface/event_channel.h> 82 #include <xen/interface/grant_table.h> 83 84 #include <xen/xenbus/xenbusvar.h> 85 86 /*--------------------------- Compile-time Tunables --------------------------*/ 87 /** 88 * The maximum number of outstanding request blocks (request headers plus 89 * additional segment blocks) we will allow in a negotiated block-front/back 90 * communication channel. 91 */ 92 #define XBB_MAX_REQUESTS 256 93 94 /** 95 * \brief Define to force all I/O to be performed on memory owned by the 96 * backend device, with a copy-in/out to the remote domain's memory. 97 * 98 * \note This option is currently required when this driver's domain is 99 * operating in HVM mode on a system using an IOMMU. 
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves. When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain. This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain. As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API. For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
	printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of shared memory ring pages we will allow in a
 * negotiated block-front/back communication channel. Allow enough
 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
 */
#define	XBB_MAX_RING_PAGES					\
	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
		       * XBB_MAX_REQUESTS)
/**
 * The maximum number of ring pages that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate. This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...)
__attribute__((format(printf, 3, 4))); 174 static int xbb_shutdown(struct xbb_softc *xbb); 175 static int xbb_detach(device_t dev); 176 177 /*------------------------------ Data Structures -----------------------------*/ 178 179 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 180 181 typedef enum { 182 XBB_REQLIST_NONE = 0x00, 183 XBB_REQLIST_MAPPED = 0x01 184 } xbb_reqlist_flags; 185 186 struct xbb_xen_reqlist { 187 /** 188 * Back reference to the parent block back instance for this 189 * request. Used during bio_done handling. 190 */ 191 struct xbb_softc *xbb; 192 193 /** 194 * BLKIF_OP code for this request. 195 */ 196 int operation; 197 198 /** 199 * Set to BLKIF_RSP_* to indicate request status. 200 * 201 * This field allows an error status to be recorded even if the 202 * delivery of this status must be deferred. Deferred reporting 203 * is necessary, for example, when an error is detected during 204 * completion processing of one bio when other bios for this 205 * request are still outstanding. 206 */ 207 int status; 208 209 /** 210 * Number of 512 byte sectors not transferred. 211 */ 212 int residual_512b_sectors; 213 214 /** 215 * Starting sector number of the first request in the list. 216 */ 217 off_t starting_sector_number; 218 219 /** 220 * If we're going to coalesce, the next contiguous sector would be 221 * this one. 222 */ 223 off_t next_contig_sector; 224 225 /** 226 * Number of child requests in the list. 227 */ 228 int num_children; 229 230 /** 231 * Number of I/O requests still pending on the backend. 232 */ 233 int pendcnt; 234 235 /** 236 * Total number of segments for requests in the list. 237 */ 238 int nr_segments; 239 240 /** 241 * Flags for this particular request list. 242 */ 243 xbb_reqlist_flags flags; 244 245 /** 246 * Kernel virtual address space reserved for this request 247 * list structure and used to map the remote domain's pages for 248 * this I/O, into our domain's address space. 249 */ 250 uint8_t *kva; 251 252 /** 253 * Base, psuedo-physical address, corresponding to the start 254 * of this request's kva region. 255 */ 256 uint64_t gnt_base; 257 258 259 #ifdef XBB_USE_BOUNCE_BUFFERS 260 /** 261 * Pre-allocated domain local memory used to proxy remote 262 * domain memory during I/O operations. 263 */ 264 uint8_t *bounce; 265 #endif 266 267 /** 268 * Array of grant handles (one per page) used to map this request. 269 */ 270 grant_handle_t *gnt_handles; 271 272 /** 273 * Device statistics request ordering type (ordered or simple). 274 */ 275 devstat_tag_type ds_tag_type; 276 277 /** 278 * Device statistics request type (read, write, no_data). 279 */ 280 devstat_trans_flags ds_trans_type; 281 282 /** 283 * The start time for this request. 284 */ 285 struct bintime ds_t0; 286 287 /** 288 * Linked list of contiguous requests with the same operation type. 289 */ 290 struct xbb_xen_req_list contig_req_list; 291 292 /** 293 * Linked list links used to aggregate idle requests in the 294 * request list free pool (xbb->reqlist_free_stailq) and pending 295 * requests waiting for execution (xbb->reqlist_pending_stailq). 296 */ 297 STAILQ_ENTRY(xbb_xen_reqlist) links; 298 }; 299 300 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 301 302 /** 303 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 304 */ 305 struct xbb_xen_req { 306 /** 307 * Linked list links used to aggregate requests into a reqlist 308 * and to store them in the request free pool. 
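	 *
	 * For illustration only (not driver code): the sys/queue.h STAILQ
	 * free-pool pattern behind these declarations, sketched with a
	 * hypothetical "item" type. xbb_get_req() and xbb_release_req()
	 * below manage the request pool in this style:
	 *
	 *	#include <sys/queue.h>
	 *
	 *	struct item {
	 *		STAILQ_ENTRY(item) links;
	 *	};
	 *	STAILQ_HEAD(item_list, item) free_pool =
	 *	    STAILQ_HEAD_INITIALIZER(free_pool);
	 *
	 *	// Allocate: pop the head of the free pool (NULL when empty).
	 *	struct item *it = STAILQ_FIRST(&free_pool);
	 *	if (it != NULL)
	 *		STAILQ_REMOVE_HEAD(&free_pool, links);
	 *
	 *	// Release: push the item back onto the pool.
	 *	STAILQ_INSERT_HEAD(&free_pool, it, links);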
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t                  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int                       nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int                       nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int                       operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t           ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t          *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX                  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime            ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist   *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t	evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE = 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK = 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE = 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.
Due to the way that the mapping of the memory backing an 448 * I/O transaction is handled by Xen, a second pass is unavoidable. 449 * At least this way the second walk is a simple array traversal. 450 * 451 * \note A single Scatter/Gather element in the block interface covers 452 * at most 1 machine page. In this context a sector (blkif 453 * nomenclature, not what I'd choose) is a 512b aligned unit 454 * of mapping within the machine page referenced by an S/G 455 * element. 456 */ 457 struct xbb_sg { 458 /** The number of 512b data chunks mapped in this S/G element. */ 459 int16_t nsect; 460 461 /** 462 * The index (0 based) of the first 512b data chunk mapped 463 * in this S/G element. 464 */ 465 uint8_t first_sect; 466 467 /** 468 * The index (0 based) of the last 512b data chunk mapped 469 * in this S/G element. 470 */ 471 uint8_t last_sect; 472 }; 473 474 /** 475 * Character device backend specific configuration data. 476 */ 477 struct xbb_dev_data { 478 /** Cdev used for device backend access. */ 479 struct cdev *cdev; 480 481 /** Cdev switch used for device backend access. */ 482 struct cdevsw *csw; 483 484 /** Used to hold a reference on opened cdev backend devices. */ 485 int dev_ref; 486 }; 487 488 /** 489 * File backend specific configuration data. 490 */ 491 struct xbb_file_data { 492 /** Credentials to use for vnode backed (file based) I/O. */ 493 struct ucred *cred; 494 495 /** 496 * \brief Array of io vectors used to process file based I/O. 497 * 498 * Only a single file based request is outstanding per-xbb instance, 499 * so we only need one of these. 500 */ 501 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 502 #ifdef XBB_USE_BOUNCE_BUFFERS 503 504 /** 505 * \brief Array of io vectors used to handle bouncing of file reads. 506 * 507 * Vnode operations are free to modify uio data during their 508 * exectuion. In the case of a read with bounce buffering active, 509 * we need some of the data from the original uio in order to 510 * bounce-out the read data. This array serves as the temporary 511 * storage for this saved data. 512 */ 513 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 514 515 /** 516 * \brief Array of memoized bounce buffer kva offsets used 517 * in the file based backend. 518 * 519 * Due to the way that the mapping of the memory backing an 520 * I/O transaction is handled by Xen, a second pass through 521 * the request sg elements is unavoidable. We memoize the computed 522 * bounce address here to reduce the cost of the second walk. 523 */ 524 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 525 #endif /* XBB_USE_BOUNCE_BUFFERS */ 526 }; 527 528 /** 529 * Collection of backend type specific data. 530 */ 531 union xbb_backend_data { 532 struct xbb_dev_data dev; 533 struct xbb_file_data file; 534 }; 535 536 /** 537 * Function signature of backend specific I/O handlers. 538 */ 539 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 540 struct xbb_xen_reqlist *reqlist, int operation, 541 int flags); 542 543 /** 544 * Per-instance configuration data. 545 */ 546 struct xbb_softc { 547 548 /** 549 * Task-queue used to process I/O requests. 550 */ 551 struct taskqueue *io_taskqueue; 552 553 /** 554 * Single "run the request queue" task enqueued 555 * on io_taskqueue. 556 */ 557 struct task io_task; 558 559 /** Device type for this instance. */ 560 xbb_type device_type; 561 562 /** NewBus device corresponding to this instance. */ 563 device_t dev; 564 565 /** Backend specific dispatch routine for this instance. 
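	 *
	 * Illustrative sketch (not part of this driver): a minimal handler
	 * matching the xbb_dispatch_t signature defined above and how it
	 * might be installed. The handler name and its trivial behavior are
	 * hypothetical; the real handlers are xbb_dispatch_dev() and
	 * xbb_dispatch_file() later in this file.
	 *
	 *	static int
	 *	xbb_dispatch_null(struct xbb_softc *xbb,
	 *			  struct xbb_xen_reqlist *reqlist,
	 *			  int operation, int flags)
	 *	{
	 *		// Complete every request list immediately with its
	 *		// current status; a real handler issues I/O instead.
	 *		xbb_complete_reqlist(xbb, reqlist);
	 *		return (0);
	 *	}
	 *
	 *	xbb->dispatch_io = xbb_dispatch_null;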
	 */
	xbb_dispatch_t            dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int                       active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list   request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req       *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist   *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t               kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t                  gnt_base_addr;

	/** The size of the global kva pool. */
	int                       kva_size;

	/** The size of the KVA area used for request lists. */
	int                       reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int                       reqlist_kva_pages;

	/** Bitmap of free KVA pages. */
	bitstr_t                 *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used once for each mapped page in
	 * a transaction. We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t                   otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. intel x86_64 and
	 * 32bit x86 domains on the same machine). The back-end
	 * always accommodates the front-end's native abi. That
	 * value is pulled from the XenStore and recorded here.
	 */
	int                       abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int                     max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *        that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int                     max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int                     max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int                     max_request_size;

	/**
	 * The maximum size of any request list. This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int                     max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t                flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config    ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t        rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t         xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char                     *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char                     *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char                     *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode             *vn;

	union xbb_backend_data    backend;

	/** The native sector size of the backend. */
	u_int                     sector_size;

	/** log2 of sector_size. */
	u_int                     sector_size_shift;

	/** Size in bytes of the backend device or file. */
	off_t                     media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *        sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t                  media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *        conversion of blkif ring requests to internal xbb_xen_req
	 *        structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io(). When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx                lock;

#ifdef XENHVM
	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource          *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int                       pseudo_phys_res_id;
#endif

	/**
	 * I/O statistics from BlockBack dispatch down. These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat           *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack. These are the requests as
	 * we get them from BlockFront. They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat           *xbb_stats_in;

	/** Disable sending flush to the backend. */
	int                       disable_flush;

	/** Send a real flush for every N flush requests. */
	int                       flush_interval;

	/** Count of flush requests in the interval. */
	int                       flush_count;

	/** Don't coalesce requests if this is set. */
	int                       no_coalesce_reqs;

	/** Number of requests we have received. */
	uint64_t                  reqs_received;

	/** Number of requests we have completed. */
	uint64_t                  reqs_completed;

	/** How many forced dispatches (i.e. without coalescing) have happened. */
	uint64_t                  forced_dispatch;

	/** How many normal dispatches have happened. */
	uint64_t                  normal_dispatch;

	/** How many total dispatches have happened. */
	uint64_t                  total_dispatch;

	/** How many times we have run out of KVA. */
	uint64_t                  kva_shortages;

	/** How many times we have run out of request structures. */
	uint64_t                  request_shortages;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
808 * 809 * \return On success, a pointer to the allocated xbb_xen_req structure. 810 * Otherwise NULL. 811 */ 812 static inline struct xbb_xen_req * 813 xbb_get_req(struct xbb_softc *xbb) 814 { 815 struct xbb_xen_req *req; 816 817 req = NULL; 818 819 mtx_assert(&xbb->lock, MA_OWNED); 820 821 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 822 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 823 xbb->active_request_count++; 824 } 825 826 return (req); 827 } 828 829 /** 830 * Return an allocated transaction tracking structure to the free pool. 831 * 832 * \param xbb Per-instance xbb configuration structure. 833 * \param req The request structure to free. 834 */ 835 static inline void 836 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 837 { 838 mtx_assert(&xbb->lock, MA_OWNED); 839 840 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 841 xbb->active_request_count--; 842 843 KASSERT(xbb->active_request_count >= 0, 844 ("xbb_release_req: negative active count")); 845 } 846 847 /** 848 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 849 * 850 * \param xbb Per-instance xbb configuration structure. 851 * \param req_list The list of requests to free. 852 * \param nreqs The number of items in the list. 853 */ 854 static inline void 855 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 856 int nreqs) 857 { 858 mtx_assert(&xbb->lock, MA_OWNED); 859 860 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 861 xbb->active_request_count -= nreqs; 862 863 KASSERT(xbb->active_request_count >= 0, 864 ("xbb_release_reqs: negative active count")); 865 } 866 867 /** 868 * Given a page index and 512b sector offset within that page, 869 * calculate an offset into a request's kva region. 870 * 871 * \param reqlist The request structure whose kva region will be accessed. 872 * \param pagenr The page index used to compute the kva offset. 873 * \param sector The 512b sector index used to compute the page relative 874 * kva offset. 875 * 876 * \return The computed global KVA offset. 877 */ 878 static inline uint8_t * 879 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 880 { 881 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 882 } 883 884 #ifdef XBB_USE_BOUNCE_BUFFERS 885 /** 886 * Given a page index and 512b sector offset within that page, 887 * calculate an offset into a request's local bounce memory region. 888 * 889 * \param reqlist The request structure whose bounce region will be accessed. 890 * \param pagenr The page index used to compute the bounce offset. 891 * \param sector The 512b sector index used to compute the page relative 892 * bounce offset. 893 * 894 * \return The computed global bounce buffer address. 895 */ 896 static inline uint8_t * 897 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 898 { 899 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 900 } 901 #endif 902 903 /** 904 * Given a page number and 512b sector offset within that page, 905 * calculate an offset into the request's memory region that the 906 * underlying backend device/file should use for I/O. 907 * 908 * \param reqlist The request structure whose I/O region will be accessed. 909 * \param pagenr The page index used to compute the I/O offset. 910 * \param sector The 512b sector index used to compute the page relative 911 * I/O offset. 912 * 913 * \return The computed global I/O address. 
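 *
 * Worked example (illustrative, assuming 4KB pages): for pagenr 2 and
 * sector 3 the offset into the region is 2 * 4096 + 3 * 512 = 9728
 * (0x2600) bytes, i.e. kva + 0x2600 from xbb_reqlist_vaddr() and
 * bounce + 0x2600 from xbb_reqlist_bounce_addr().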
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist The request list structure whose pseudo-physical region
 *                will be accessed.
 * \param pagenr  The page index used to compute the pseudo-physical offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	return ((uintptr_t)(xbb->gnt_base_addr +
		(uintptr_t)(reqlist->kva - xbb->kva) +
		(PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param nr_pages  Number of pages needed.
 *
 * \return  On success, a pointer to the allocated KVA region. Otherwise NULL.
 *
 * Note: This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio. At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	intptr_t first_clear;
	intptr_t num_clear;
	uint8_t *free_kva;
	int      i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page. If there are none, we're done.
	 */
	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);

	if (first_clear == -1)
		goto bailout;

	/*
	 * Starting at the first available page, look for consecutive free
	 * pages that will satisfy the user's request.
	 */
	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
		/*
		 * If this is true, the page is used, so we have to reset
		 * the number of clear pages and the first clear page
		 * (since it pointed to a region with an insufficient number
		 * of clear pages).
		 */
		if (bit_test(xbb->kva_free, i)) {
			num_clear = 0;
			first_clear = -1;
			continue;
		}

		if (first_clear == -1)
			first_clear = i;

		/*
		 * If this is true, we've found a large enough free region
		 * to satisfy the request.
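		 *
		 * Worked example (illustrative): with pages 0-1 free, pages
		 * 2-3 in use, pages 4 and up free, and nr_pages == 3,
		 * bit_ffc() returns first_clear == 0; the scan resets at the
		 * in-use page 2, restarts at page 4, and completes at page 6,
		 * so bits 4-6 are marked used and the KVA for page 4 is
		 * returned.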
1019 */ 1020 if (++num_clear == nr_pages) { 1021 1022 bit_nset(xbb->kva_free, first_clear, 1023 first_clear + nr_pages - 1); 1024 1025 free_kva = xbb->kva + 1026 (uint8_t *)(first_clear * PAGE_SIZE); 1027 1028 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1029 free_kva + (nr_pages * PAGE_SIZE) <= 1030 (uint8_t *)xbb->ring_config.va, 1031 ("Free KVA %p len %d out of range, " 1032 "kva = %#jx, ring VA = %#jx\n", free_kva, 1033 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1034 (uintmax_t)xbb->ring_config.va)); 1035 break; 1036 } 1037 } 1038 1039 bailout: 1040 1041 if (free_kva == NULL) { 1042 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1043 xbb->kva_shortages++; 1044 } 1045 1046 mtx_unlock(&xbb->lock); 1047 1048 return (free_kva); 1049 } 1050 1051 /** 1052 * Free allocated KVA. 1053 * 1054 * \param xbb Per-instance xbb configuration structure. 1055 * \param kva_ptr Pointer to allocated KVA region. 1056 * \param nr_pages Number of pages in the KVA region. 1057 */ 1058 static void 1059 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1060 { 1061 intptr_t start_page; 1062 1063 mtx_assert(&xbb->lock, MA_OWNED); 1064 1065 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1066 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1067 1068 } 1069 1070 /** 1071 * Unmap the front-end pages associated with this I/O request. 1072 * 1073 * \param req The request structure to unmap. 1074 */ 1075 static void 1076 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1077 { 1078 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1079 u_int i; 1080 u_int invcount; 1081 int error; 1082 1083 invcount = 0; 1084 for (i = 0; i < reqlist->nr_segments; i++) { 1085 1086 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1087 continue; 1088 1089 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1090 unmap[invcount].dev_bus_addr = 0; 1091 unmap[invcount].handle = reqlist->gnt_handles[i]; 1092 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1093 invcount++; 1094 } 1095 1096 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1097 unmap, invcount); 1098 KASSERT(error == 0, ("Grant table operation failed")); 1099 } 1100 1101 /** 1102 * Allocate an internal transaction tracking structure from the free pool. 1103 * 1104 * \param xbb Per-instance xbb configuration structure. 1105 * 1106 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1107 * Otherwise NULL. 1108 */ 1109 static inline struct xbb_xen_reqlist * 1110 xbb_get_reqlist(struct xbb_softc *xbb) 1111 { 1112 struct xbb_xen_reqlist *reqlist; 1113 1114 reqlist = NULL; 1115 1116 mtx_assert(&xbb->lock, MA_OWNED); 1117 1118 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1119 1120 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1121 reqlist->flags = XBB_REQLIST_NONE; 1122 reqlist->kva = NULL; 1123 reqlist->status = BLKIF_RSP_OKAY; 1124 reqlist->residual_512b_sectors = 0; 1125 reqlist->num_children = 0; 1126 reqlist->nr_segments = 0; 1127 STAILQ_INIT(&reqlist->contig_req_list); 1128 } 1129 1130 return (reqlist); 1131 } 1132 1133 /** 1134 * Return an allocated transaction tracking structure to the free pool. 1135 * 1136 * \param xbb Per-instance xbb configuration structure. 1137 * \param req The request list structure to free. 1138 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1139 * during a resource shortage condition. 
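 *
 * For example (illustrative): if xbb_dispatch_io() previously failed for
 * lack of KVA, XBBF_RESOURCE_SHORTAGE is set; the next release with a
 * non-zero wakeup argument clears that flag and re-enqueues io_task so
 * that xbb_run_queue() retries the pending request lists.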
1140 */ 1141 static inline void 1142 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1143 int wakeup) 1144 { 1145 1146 mtx_lock(&xbb->lock); 1147 1148 if (wakeup) { 1149 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1150 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1151 } 1152 1153 if (reqlist->kva != NULL) 1154 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1155 1156 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1157 1158 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1159 1160 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1161 /* 1162 * Shutdown is in progress. See if we can 1163 * progress further now that one more request 1164 * has completed and been returned to the 1165 * free pool. 1166 */ 1167 xbb_shutdown(xbb); 1168 } 1169 1170 mtx_unlock(&xbb->lock); 1171 1172 if (wakeup != 0) 1173 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1174 } 1175 1176 /** 1177 * Request resources and do basic request setup. 1178 * 1179 * \param xbb Per-instance xbb configuration structure. 1180 * \param reqlist Pointer to reqlist pointer. 1181 * \param ring_req Pointer to a block ring request. 1182 * \param ring_index The ring index of this request. 1183 * 1184 * \return 0 for success, non-zero for failure. 1185 */ 1186 static int 1187 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1188 blkif_request_t *ring_req, RING_IDX ring_idx) 1189 { 1190 struct xbb_xen_reqlist *nreqlist; 1191 struct xbb_xen_req *nreq; 1192 1193 nreqlist = NULL; 1194 nreq = NULL; 1195 1196 mtx_lock(&xbb->lock); 1197 1198 /* 1199 * We don't allow new resources to be allocated if we're in the 1200 * process of shutting down. 1201 */ 1202 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1203 mtx_unlock(&xbb->lock); 1204 return (1); 1205 } 1206 1207 /* 1208 * Allocate a reqlist if the caller doesn't have one already. 1209 */ 1210 if (*reqlist == NULL) { 1211 nreqlist = xbb_get_reqlist(xbb); 1212 if (nreqlist == NULL) 1213 goto bailout_error; 1214 } 1215 1216 /* We always allocate a request. */ 1217 nreq = xbb_get_req(xbb); 1218 if (nreq == NULL) 1219 goto bailout_error; 1220 1221 mtx_unlock(&xbb->lock); 1222 1223 if (*reqlist == NULL) { 1224 *reqlist = nreqlist; 1225 nreqlist->operation = ring_req->operation; 1226 nreqlist->starting_sector_number = ring_req->sector_number; 1227 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1228 links); 1229 } 1230 1231 nreq->reqlist = *reqlist; 1232 nreq->req_ring_idx = ring_idx; 1233 nreq->id = ring_req->id; 1234 nreq->operation = ring_req->operation; 1235 1236 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1237 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1238 nreq->ring_req = &nreq->ring_req_storage; 1239 } else { 1240 nreq->ring_req = ring_req; 1241 } 1242 1243 binuptime(&nreq->ds_t0); 1244 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1245 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1246 (*reqlist)->num_children++; 1247 (*reqlist)->nr_segments += ring_req->nr_segments; 1248 1249 return (0); 1250 1251 bailout_error: 1252 1253 /* 1254 * We're out of resources, so set the shortage flag. The next time 1255 * a request is released, we'll try waking up the work thread to 1256 * see if we can allocate more resources. 
1257 */ 1258 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1259 xbb->request_shortages++; 1260 1261 if (nreq != NULL) 1262 xbb_release_req(xbb, nreq); 1263 1264 mtx_unlock(&xbb->lock); 1265 1266 if (nreqlist != NULL) 1267 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1268 1269 return (1); 1270 } 1271 1272 /** 1273 * Create and transmit a response to a blkif request. 1274 * 1275 * \param xbb Per-instance xbb configuration structure. 1276 * \param req The request structure to which to respond. 1277 * \param status The status code to report. See BLKIF_RSP_* 1278 * in sys/xen/interface/io/blkif.h. 1279 */ 1280 static void 1281 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1282 { 1283 blkif_response_t *resp; 1284 int more_to_do; 1285 int notify; 1286 1287 more_to_do = 0; 1288 1289 /* 1290 * Place on the response ring for the relevant domain. 1291 * For now, only the spacing between entries is different 1292 * in the different ABIs, not the response entry layout. 1293 */ 1294 mtx_lock(&xbb->lock); 1295 switch (xbb->abi) { 1296 case BLKIF_PROTOCOL_NATIVE: 1297 resp = RING_GET_RESPONSE(&xbb->rings.native, 1298 xbb->rings.native.rsp_prod_pvt); 1299 break; 1300 case BLKIF_PROTOCOL_X86_32: 1301 resp = (blkif_response_t *) 1302 RING_GET_RESPONSE(&xbb->rings.x86_32, 1303 xbb->rings.x86_32.rsp_prod_pvt); 1304 break; 1305 case BLKIF_PROTOCOL_X86_64: 1306 resp = (blkif_response_t *) 1307 RING_GET_RESPONSE(&xbb->rings.x86_64, 1308 xbb->rings.x86_64.rsp_prod_pvt); 1309 break; 1310 default: 1311 panic("Unexpected blkif protocol ABI."); 1312 } 1313 1314 resp->id = req->id; 1315 resp->operation = req->operation; 1316 resp->status = status; 1317 1318 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); 1319 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); 1320 1321 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1322 1323 /* 1324 * Tail check for pending requests. Allows frontend to avoid 1325 * notifications if requests are already in flight (lower 1326 * overheads and promotes batching). 1327 */ 1328 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1329 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1330 1331 more_to_do = 1; 1332 } 1333 1334 xbb->reqs_completed++; 1335 1336 mtx_unlock(&xbb->lock); 1337 1338 if (more_to_do) 1339 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1340 1341 if (notify) 1342 xen_intr_signal(xbb->xen_intr_handle); 1343 } 1344 1345 /** 1346 * Complete a request list. 1347 * 1348 * \param xbb Per-instance xbb configuration structure. 1349 * \param reqlist Allocated internal request list structure. 1350 */ 1351 static void 1352 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1353 { 1354 struct xbb_xen_req *nreq; 1355 off_t sectors_sent; 1356 1357 sectors_sent = 0; 1358 1359 if (reqlist->flags & XBB_REQLIST_MAPPED) 1360 xbb_unmap_reqlist(reqlist); 1361 1362 /* 1363 * All I/O is done, send the response. A lock should not be 1364 * necessary here because the request list is complete, and 1365 * therefore this is the only context accessing this request 1366 * right now. The functions we call do their own locking if 1367 * necessary. 1368 */ 1369 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1370 off_t cur_sectors_sent; 1371 1372 xbb_send_response(xbb, nreq, reqlist->status); 1373 1374 /* We don't report bytes sent if there is an error. 
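		 *
		 * Worked example (illustrative): a request list made up of
		 * three front-end requests of 8, 8, and 16 512b sectors that
		 * completes successfully with a residual of 4 sectors reports
		 * 8, 8, and 16 sectors on the per-request devstat below, and
		 * 8 + 8 + 16 - 4 = 28 sectors (28 << 9 = 14336 bytes) on the
		 * aggregate devstat after the loop.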
*/ 1375 if (reqlist->status == BLKIF_RSP_OKAY) 1376 cur_sectors_sent = nreq->nr_512b_sectors; 1377 else 1378 cur_sectors_sent = 0; 1379 1380 sectors_sent += cur_sectors_sent; 1381 1382 devstat_end_transaction(xbb->xbb_stats_in, 1383 /*bytes*/cur_sectors_sent << 9, 1384 reqlist->ds_tag_type, 1385 reqlist->ds_trans_type, 1386 /*now*/NULL, 1387 /*then*/&nreq->ds_t0); 1388 } 1389 1390 /* 1391 * Take out any sectors not sent. If we wind up negative (which 1392 * might happen if an error is reported as well as a residual), just 1393 * report 0 sectors sent. 1394 */ 1395 sectors_sent -= reqlist->residual_512b_sectors; 1396 if (sectors_sent < 0) 1397 sectors_sent = 0; 1398 1399 devstat_end_transaction(xbb->xbb_stats, 1400 /*bytes*/ sectors_sent << 9, 1401 reqlist->ds_tag_type, 1402 reqlist->ds_trans_type, 1403 /*now*/NULL, 1404 /*then*/&reqlist->ds_t0); 1405 1406 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1407 } 1408 1409 /** 1410 * Completion handler for buffer I/O requests issued by the device 1411 * backend driver. 1412 * 1413 * \param bio The buffer I/O request on which to perform completion 1414 * processing. 1415 */ 1416 static void 1417 xbb_bio_done(struct bio *bio) 1418 { 1419 struct xbb_softc *xbb; 1420 struct xbb_xen_reqlist *reqlist; 1421 1422 reqlist = bio->bio_caller1; 1423 xbb = reqlist->xbb; 1424 1425 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1426 1427 /* 1428 * This is a bit imprecise. With aggregated I/O a single 1429 * request list can contain multiple front-end requests and 1430 * a multiple bios may point to a single request. By carefully 1431 * walking the request list, we could map residuals and errors 1432 * back to the original front-end request, but the interface 1433 * isn't sufficiently rich for us to properly report the error. 1434 * So, we just treat the entire request list as having failed if an 1435 * error occurs on any part. And, if an error occurs, we treat 1436 * the amount of data transferred as 0. 1437 * 1438 * For residuals, we report it on the overall aggregated device, 1439 * but not on the individual requests, since we don't currently 1440 * do the work to determine which front-end request to which the 1441 * residual applies. 1442 */ 1443 if (bio->bio_error) { 1444 DPRINTF("BIO returned error %d for operation on device %s\n", 1445 bio->bio_error, xbb->dev_name); 1446 reqlist->status = BLKIF_RSP_ERROR; 1447 1448 if (bio->bio_error == ENXIO 1449 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1450 1451 /* 1452 * Backend device has disappeared. Signal the 1453 * front-end that we (the device proxy) want to 1454 * go away. 1455 */ 1456 xenbus_set_state(xbb->dev, XenbusStateClosing); 1457 } 1458 } 1459 1460 #ifdef XBB_USE_BOUNCE_BUFFERS 1461 if (bio->bio_cmd == BIO_READ) { 1462 vm_offset_t kva_offset; 1463 1464 kva_offset = (vm_offset_t)bio->bio_data 1465 - (vm_offset_t)reqlist->bounce; 1466 memcpy((uint8_t *)reqlist->kva + kva_offset, 1467 bio->bio_data, bio->bio_bcount); 1468 } 1469 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1470 1471 /* 1472 * Decrement the pending count for the request list. When we're 1473 * done with the requests, send status back for all of them. 1474 */ 1475 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1476 xbb_complete_reqlist(xbb, reqlist); 1477 1478 g_destroy_bio(bio); 1479 } 1480 1481 /** 1482 * Parse a blkif request into an internal request structure and send 1483 * it to the backend for processing. 1484 * 1485 * \param xbb Per-instance xbb configuration structure. 
1486 * \param reqlist Allocated internal request list structure. 1487 * 1488 * \return On success, 0. For resource shortages, non-zero. 1489 * 1490 * This routine performs the backend common aspects of request parsing 1491 * including compiling an internal request structure, parsing the S/G 1492 * list and any secondary ring requests in which they may reside, and 1493 * the mapping of front-end I/O pages into our domain. 1494 */ 1495 static int 1496 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1497 { 1498 struct xbb_sg *xbb_sg; 1499 struct gnttab_map_grant_ref *map; 1500 struct blkif_request_segment *sg; 1501 struct blkif_request_segment *last_block_sg; 1502 struct xbb_xen_req *nreq; 1503 u_int nseg; 1504 u_int seg_idx; 1505 u_int block_segs; 1506 int nr_sects; 1507 int total_sects; 1508 int operation; 1509 uint8_t bio_flags; 1510 int error; 1511 1512 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1513 bio_flags = 0; 1514 total_sects = 0; 1515 nr_sects = 0; 1516 1517 /* 1518 * First determine whether we have enough free KVA to satisfy this 1519 * request list. If not, tell xbb_run_queue() so it can go to 1520 * sleep until we have more KVA. 1521 */ 1522 reqlist->kva = NULL; 1523 if (reqlist->nr_segments != 0) { 1524 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1525 if (reqlist->kva == NULL) { 1526 /* 1527 * If we're out of KVA, return ENOMEM. 1528 */ 1529 return (ENOMEM); 1530 } 1531 } 1532 1533 binuptime(&reqlist->ds_t0); 1534 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1535 1536 switch (reqlist->operation) { 1537 case BLKIF_OP_WRITE_BARRIER: 1538 bio_flags |= BIO_ORDERED; 1539 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1540 /* FALLTHROUGH */ 1541 case BLKIF_OP_WRITE: 1542 operation = BIO_WRITE; 1543 reqlist->ds_trans_type = DEVSTAT_WRITE; 1544 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1545 DPRINTF("Attempt to write to read only device %s\n", 1546 xbb->dev_name); 1547 reqlist->status = BLKIF_RSP_ERROR; 1548 goto send_response; 1549 } 1550 break; 1551 case BLKIF_OP_READ: 1552 operation = BIO_READ; 1553 reqlist->ds_trans_type = DEVSTAT_READ; 1554 break; 1555 case BLKIF_OP_FLUSH_DISKCACHE: 1556 /* 1557 * If this is true, the user has requested that we disable 1558 * flush support. So we just complete the requests 1559 * successfully. 1560 */ 1561 if (xbb->disable_flush != 0) { 1562 goto send_response; 1563 } 1564 1565 /* 1566 * The user has requested that we only send a real flush 1567 * for every N flush requests. So keep count, and either 1568 * complete the request immediately or queue it for the 1569 * backend. 
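		 *
		 * For example (illustrative), with flush_interval set to 10,
		 * flushes 1 through 9 are completed without touching the
		 * backend; the 10th raises flush_count to 10, which is no
		 * longer < flush_interval, so the counter resets and a real
		 * BIO_FLUSH is dispatched.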
1570 */ 1571 if (xbb->flush_interval != 0) { 1572 if (++(xbb->flush_count) < xbb->flush_interval) { 1573 goto send_response; 1574 } else 1575 xbb->flush_count = 0; 1576 } 1577 1578 operation = BIO_FLUSH; 1579 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1580 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1581 goto do_dispatch; 1582 /*NOTREACHED*/ 1583 default: 1584 DPRINTF("error: unknown block io operation [%d]\n", 1585 reqlist->operation); 1586 reqlist->status = BLKIF_RSP_ERROR; 1587 goto send_response; 1588 } 1589 1590 reqlist->xbb = xbb; 1591 xbb_sg = xbb->xbb_sgs; 1592 map = xbb->maps; 1593 seg_idx = 0; 1594 1595 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1596 blkif_request_t *ring_req; 1597 RING_IDX req_ring_idx; 1598 u_int req_seg_idx; 1599 1600 ring_req = nreq->ring_req; 1601 req_ring_idx = nreq->req_ring_idx; 1602 nr_sects = 0; 1603 nseg = ring_req->nr_segments; 1604 nreq->nr_pages = nseg; 1605 nreq->nr_512b_sectors = 0; 1606 req_seg_idx = 0; 1607 sg = NULL; 1608 1609 /* Check that number of segments is sane. */ 1610 if (__predict_false(nseg == 0) 1611 || __predict_false(nseg > xbb->max_request_segments)) { 1612 DPRINTF("Bad number of segments in request (%d)\n", 1613 nseg); 1614 reqlist->status = BLKIF_RSP_ERROR; 1615 goto send_response; 1616 } 1617 1618 block_segs = MIN(nreq->nr_pages, 1619 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); 1620 sg = ring_req->seg; 1621 last_block_sg = sg + block_segs; 1622 while (1) { 1623 1624 while (sg < last_block_sg) { 1625 KASSERT(seg_idx < 1626 XBB_MAX_SEGMENTS_PER_REQLIST, 1627 ("seg_idx %d is too large, max " 1628 "segs %d\n", seg_idx, 1629 XBB_MAX_SEGMENTS_PER_REQLIST)); 1630 1631 xbb_sg->first_sect = sg->first_sect; 1632 xbb_sg->last_sect = sg->last_sect; 1633 xbb_sg->nsect = 1634 (int8_t)(sg->last_sect - 1635 sg->first_sect + 1); 1636 1637 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1638 || (xbb_sg->nsect <= 0)) { 1639 reqlist->status = BLKIF_RSP_ERROR; 1640 goto send_response; 1641 } 1642 1643 nr_sects += xbb_sg->nsect; 1644 map->host_addr = xbb_get_gntaddr(reqlist, 1645 seg_idx, /*sector*/0); 1646 KASSERT(map->host_addr + PAGE_SIZE <= 1647 xbb->ring_config.gnt_addr, 1648 ("Host address %#jx len %d overlaps " 1649 "ring address %#jx\n", 1650 (uintmax_t)map->host_addr, PAGE_SIZE, 1651 (uintmax_t)xbb->ring_config.gnt_addr)); 1652 1653 map->flags = GNTMAP_host_map; 1654 map->ref = sg->gref; 1655 map->dom = xbb->otherend_id; 1656 if (operation == BIO_WRITE) 1657 map->flags |= GNTMAP_readonly; 1658 sg++; 1659 map++; 1660 xbb_sg++; 1661 seg_idx++; 1662 req_seg_idx++; 1663 } 1664 1665 block_segs = MIN(nseg - req_seg_idx, 1666 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); 1667 if (block_segs == 0) 1668 break; 1669 1670 /* 1671 * Fetch the next request block full of SG elements. 1672 * For now, only the spacing between entries is 1673 * different in the different ABIs, not the sg entry 1674 * layout. 
1675 */ 1676 req_ring_idx++; 1677 switch (xbb->abi) { 1678 case BLKIF_PROTOCOL_NATIVE: 1679 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native, 1680 req_ring_idx); 1681 break; 1682 case BLKIF_PROTOCOL_X86_32: 1683 { 1684 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32, 1685 req_ring_idx); 1686 break; 1687 } 1688 case BLKIF_PROTOCOL_X86_64: 1689 { 1690 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64, 1691 req_ring_idx); 1692 break; 1693 } 1694 default: 1695 panic("Unexpected blkif protocol ABI."); 1696 /* NOTREACHED */ 1697 } 1698 last_block_sg = sg + block_segs; 1699 } 1700 1701 /* Convert to the disk's sector size */ 1702 nreq->nr_512b_sectors = nr_sects; 1703 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1704 total_sects += nr_sects; 1705 1706 if ((nreq->nr_512b_sectors & 1707 ((xbb->sector_size >> 9) - 1)) != 0) { 1708 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1709 "a multiple of the backing store sector " 1710 "size (%d)\n", __func__, 1711 nreq->nr_512b_sectors << 9, 1712 xbb->sector_size); 1713 reqlist->status = BLKIF_RSP_ERROR; 1714 goto send_response; 1715 } 1716 } 1717 1718 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1719 xbb->maps, reqlist->nr_segments); 1720 if (error != 0) 1721 panic("Grant table operation failed (%d)", error); 1722 1723 reqlist->flags |= XBB_REQLIST_MAPPED; 1724 1725 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1726 seg_idx++, map++){ 1727 1728 if (__predict_false(map->status != 0)) { 1729 DPRINTF("invalid buffer -- could not remap " 1730 "it (%d)\n", map->status); 1731 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags " 1732 "0x%x ref 0x%x, dom %d\n", seg_idx, 1733 map->host_addr, map->flags, map->ref, 1734 map->dom); 1735 reqlist->status = BLKIF_RSP_ERROR; 1736 goto send_response; 1737 } 1738 1739 reqlist->gnt_handles[seg_idx] = map->handle; 1740 } 1741 if (reqlist->starting_sector_number + total_sects > 1742 xbb->media_num_sectors) { 1743 1744 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1745 "extends past end of device %s\n", 1746 operation == BIO_READ ? "read" : "write", 1747 reqlist->starting_sector_number, 1748 reqlist->starting_sector_number + total_sects, 1749 xbb->dev_name); 1750 reqlist->status = BLKIF_RSP_ERROR; 1751 goto send_response; 1752 } 1753 1754 do_dispatch: 1755 1756 error = xbb->dispatch_io(xbb, 1757 reqlist, 1758 operation, 1759 bio_flags); 1760 1761 if (error != 0) { 1762 reqlist->status = BLKIF_RSP_ERROR; 1763 goto send_response; 1764 } 1765 1766 return (0); 1767 1768 send_response: 1769 1770 xbb_complete_reqlist(xbb, reqlist); 1771 1772 return (0); 1773 } 1774 1775 static __inline int 1776 xbb_count_sects(blkif_request_t *ring_req) 1777 { 1778 int i; 1779 int cur_size = 0; 1780 1781 for (i = 0; i < ring_req->nr_segments; i++) { 1782 int nsect; 1783 1784 nsect = (int8_t)(ring_req->seg[i].last_sect - 1785 ring_req->seg[i].first_sect + 1); 1786 if (nsect <= 0) 1787 break; 1788 1789 cur_size += nsect; 1790 } 1791 1792 return (cur_size); 1793 } 1794 1795 /** 1796 * Process incoming requests from the shared communication ring in response 1797 * to a signal on the ring's event channel. 1798 * 1799 * \param context Callback argument registerd during task initialization - 1800 * the xbb_softc for this instance. 1801 * \param pending The number of taskqueue_enqueue events that have 1802 * occurred since this handler was last run. 
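 *
 * Illustrative sketch (an assumption about typical wiring, not a claim
 * about this driver's attach path): a handler with this signature is
 * normally bound to a private taskqueue thread roughly as follows:
 *
 *	TASK_INIT(&xbb->io_task, 0, xbb_run_queue, xbb);
 *	xbb->io_taskqueue = taskqueue_create("xbb taskq", M_NOWAIT,
 *	    taskqueue_thread_enqueue, &xbb->io_taskqueue);
 *	taskqueue_start_threads(&xbb->io_taskqueue, 1, PWAIT, "xbb taskq");
 *
 * After that, taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task), as done
 * in xbb_filter() below, schedules this routine.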
1803 */ 1804 static void 1805 xbb_run_queue(void *context, int pending) 1806 { 1807 struct xbb_softc *xbb; 1808 blkif_back_rings_t *rings; 1809 RING_IDX rp; 1810 uint64_t cur_sector; 1811 int cur_operation; 1812 struct xbb_xen_reqlist *reqlist; 1813 1814 1815 xbb = (struct xbb_softc *)context; 1816 rings = &xbb->rings; 1817 1818 /* 1819 * Work gather and dispatch loop. Note that we have a bias here 1820 * towards gathering I/O sent by blockfront. We first gather up 1821 * everything in the ring, as long as we have resources. Then we 1822 * dispatch one request, and then attempt to gather up any 1823 * additional requests that have come in while we were dispatching 1824 * the request. 1825 * 1826 * This allows us to get a clearer picture (via devstat) of how 1827 * many requests blockfront is queueing to us at any given time. 1828 */ 1829 for (;;) { 1830 int retval; 1831 1832 /* 1833 * Initialize reqlist to the last element in the pending 1834 * queue, if there is one. This allows us to add more 1835 * requests to that request list, if we have room. 1836 */ 1837 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1838 xbb_xen_reqlist, links); 1839 if (reqlist != NULL) { 1840 cur_sector = reqlist->next_contig_sector; 1841 cur_operation = reqlist->operation; 1842 } else { 1843 cur_operation = 0; 1844 cur_sector = 0; 1845 } 1846 1847 /* 1848 * Cache req_prod to avoid accessing a cache line shared 1849 * with the frontend. 1850 */ 1851 rp = rings->common.sring->req_prod; 1852 1853 /* Ensure we see queued requests up to 'rp'. */ 1854 rmb(); 1855 1856 /** 1857 * Run so long as there is work to consume and the generation 1858 * of a response will not overflow the ring. 1859 * 1860 * @note There's a 1 to 1 relationship between requests and 1861 * responses, so an overflow should never occur. This 1862 * test is to protect our domain from digesting bogus 1863 * data. Shouldn't we log this? 1864 */ 1865 while (rings->common.req_cons != rp 1866 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1867 rings->common.req_cons) == 0){ 1868 blkif_request_t ring_req_storage; 1869 blkif_request_t *ring_req; 1870 int cur_size; 1871 1872 switch (xbb->abi) { 1873 case BLKIF_PROTOCOL_NATIVE: 1874 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1875 rings->common.req_cons); 1876 break; 1877 case BLKIF_PROTOCOL_X86_32: 1878 { 1879 struct blkif_x86_32_request *ring_req32; 1880 1881 ring_req32 = RING_GET_REQUEST( 1882 &xbb->rings.x86_32, rings->common.req_cons); 1883 blkif_get_x86_32_req(&ring_req_storage, 1884 ring_req32); 1885 ring_req = &ring_req_storage; 1886 break; 1887 } 1888 case BLKIF_PROTOCOL_X86_64: 1889 { 1890 struct blkif_x86_64_request *ring_req64; 1891 1892 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1893 rings->common.req_cons); 1894 blkif_get_x86_64_req(&ring_req_storage, 1895 ring_req64); 1896 ring_req = &ring_req_storage; 1897 break; 1898 } 1899 default: 1900 panic("Unexpected blkif protocol ABI."); 1901 /* NOTREACHED */ 1902 } 1903 1904 /* 1905 * Check for situations that would require closing 1906 * off this I/O for further coalescing: 1907 * - Coalescing is turned off. 1908 * - Current I/O is out of sequence with the previous 1909 * I/O. 1910 * - Coalesced I/O would be too large. 
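			 *
			 * For example (illustrative): two 8-sector reads at
			 * sectors 0 and 8 arrive back to back; the second
			 * starts at next_contig_sector, has the same
			 * operation, and still fits within
			 * max_reqlist_segments, so it joins the same request
			 * list. A following read at sector 100 is out of
			 * sequence and therefore starts a new request list.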
1911 */ 1912 if ((reqlist != NULL) 1913 && ((xbb->no_coalesce_reqs != 0) 1914 || ((xbb->no_coalesce_reqs == 0) 1915 && ((ring_req->sector_number != cur_sector) 1916 || (ring_req->operation != cur_operation) 1917 || ((ring_req->nr_segments + reqlist->nr_segments) > 1918 xbb->max_reqlist_segments))))) { 1919 reqlist = NULL; 1920 } 1921 1922 /* 1923 * Grab and check for all resources in one shot. 1924 * If we can't get all of the resources we need, 1925 * the shortage is noted and the thread will get 1926 * woken up when more resources are available. 1927 */ 1928 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1929 xbb->rings.common.req_cons); 1930 1931 if (retval != 0) { 1932 /* 1933 * Resource shortage has been recorded. 1934 * We'll be scheduled to run once a request 1935 * object frees up due to a completion. 1936 */ 1937 break; 1938 } 1939 1940 /* 1941 * Signify that we can overwrite this request with 1942 * a response by incrementing our consumer index. 1943 * The response won't be generated until after 1944 * we've already consumed all necessary data out 1945 * of the version of the request in the ring buffer 1946 * (for native mode). We must update the consumer 1947 * index before issueing back-end I/O so there is 1948 * no possibility that it will complete and a 1949 * response be generated before we make room in 1950 * the queue for that response. 1951 */ 1952 xbb->rings.common.req_cons += 1953 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 1954 xbb->reqs_received++; 1955 1956 cur_size = xbb_count_sects(ring_req); 1957 cur_sector = ring_req->sector_number + cur_size; 1958 reqlist->next_contig_sector = cur_sector; 1959 cur_operation = ring_req->operation; 1960 } 1961 1962 /* Check for I/O to dispatch */ 1963 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1964 if (reqlist == NULL) { 1965 /* 1966 * We're out of work to do, put the task queue to 1967 * sleep. 1968 */ 1969 break; 1970 } 1971 1972 /* 1973 * Grab the first request off the queue and attempt 1974 * to dispatch it. 1975 */ 1976 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1977 1978 retval = xbb_dispatch_io(xbb, reqlist); 1979 if (retval != 0) { 1980 /* 1981 * xbb_dispatch_io() returns non-zero only when 1982 * there is a resource shortage. If that's the 1983 * case, re-queue this request on the head of the 1984 * queue, and go to sleep until we have more 1985 * resources. 1986 */ 1987 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1988 reqlist, links); 1989 break; 1990 } else { 1991 /* 1992 * If we still have anything on the queue after 1993 * removing the head entry, that is because we 1994 * met one of the criteria to create a new 1995 * request list (outlined above), and we'll call 1996 * that a forced dispatch for statistical purposes. 1997 * 1998 * Otherwise, if there is only one element on the 1999 * queue, we coalesced everything available on 2000 * the ring and we'll call that a normal dispatch. 2001 */ 2002 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2003 2004 if (reqlist != NULL) 2005 xbb->forced_dispatch++; 2006 else 2007 xbb->normal_dispatch++; 2008 2009 xbb->total_dispatch++; 2010 } 2011 } 2012 } 2013 2014 /** 2015 * Interrupt handler bound to the shared ring's event channel. 2016 * 2017 * \param arg Callback argument registerd during event channel 2018 * binding - the xbb_softc for this instance. 2019 */ 2020 static int 2021 xbb_filter(void *arg) 2022 { 2023 struct xbb_softc *xbb; 2024 2025 /* Defer to taskqueue thread. 
*/ 2026 xbb = (struct xbb_softc *)arg; 2027 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2028 2029 return (FILTER_HANDLED); 2030 } 2031 2032 SDT_PROVIDER_DEFINE(xbb); 2033 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 2034 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 2035 "uint64_t"); 2036 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 2037 "uint64_t", "uint64_t"); 2038 2039 /*----------------------------- Backend Handlers -----------------------------*/ 2040 /** 2041 * Backend handler for character device access. 2042 * 2043 * \param xbb Per-instance xbb configuration structure. 2044 * \param reqlist Allocated internal request list structure. 2045 * \param operation BIO_* I/O operation code. 2046 * \param bio_flags Additional bio_flag data to pass to any generated 2047 * bios (e.g. BIO_ORDERED).. 2048 * 2049 * \return 0 for success, errno codes for failure. 2050 */ 2051 static int 2052 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2053 int operation, int bio_flags) 2054 { 2055 struct xbb_dev_data *dev_data; 2056 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2057 off_t bio_offset; 2058 struct bio *bio; 2059 struct xbb_sg *xbb_sg; 2060 u_int nbio; 2061 u_int bio_idx; 2062 u_int nseg; 2063 u_int seg_idx; 2064 int error; 2065 2066 dev_data = &xbb->backend.dev; 2067 bio_offset = (off_t)reqlist->starting_sector_number 2068 << xbb->sector_size_shift; 2069 error = 0; 2070 nbio = 0; 2071 bio_idx = 0; 2072 2073 if (operation == BIO_FLUSH) { 2074 bio = g_new_bio(); 2075 if (__predict_false(bio == NULL)) { 2076 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2077 error = ENOMEM; 2078 return (error); 2079 } 2080 2081 bio->bio_cmd = BIO_FLUSH; 2082 bio->bio_flags |= BIO_ORDERED; 2083 bio->bio_dev = dev_data->cdev; 2084 bio->bio_offset = 0; 2085 bio->bio_data = 0; 2086 bio->bio_done = xbb_bio_done; 2087 bio->bio_caller1 = reqlist; 2088 bio->bio_pblkno = 0; 2089 2090 reqlist->pendcnt = 1; 2091 2092 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2093 device_get_unit(xbb->dev)); 2094 2095 (*dev_data->csw->d_strategy)(bio); 2096 2097 return (0); 2098 } 2099 2100 xbb_sg = xbb->xbb_sgs; 2101 bio = NULL; 2102 nseg = reqlist->nr_segments; 2103 2104 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2105 2106 /* 2107 * KVA will not be contiguous, so any additional 2108 * I/O will need to be represented in a new bio. 2109 */ 2110 if ((bio != NULL) 2111 && (xbb_sg->first_sect != 0)) { 2112 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2113 printf("%s: Discontiguous I/O request " 2114 "from domain %d ends on " 2115 "non-sector boundary\n", 2116 __func__, xbb->otherend_id); 2117 error = EINVAL; 2118 goto fail_free_bios; 2119 } 2120 bio = NULL; 2121 } 2122 2123 if (bio == NULL) { 2124 /* 2125 * Make sure that the start of this bio is 2126 * aligned to a device sector. 
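 *
 * (Aside, for illustration: because sector_size is a power of two,
 *  the mask test below is the usual branch-free equivalent of a
 *  modulo check.  A self-contained version, with hypothetical names:
 *
 *    static int
 *    is_sector_aligned(off_t offset, u_int sector_size)
 *    {
 *            // Only valid for power-of-two sector sizes.
 *            return ((offset & ((off_t)sector_size - 1)) == 0);
 *    }
 *
 *  With a 512-byte sector size this rejects an offset of 700 bytes
 *  but accepts 1024.)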
2127 */ 2128 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2129 printf("%s: Misaligned I/O request " 2130 "from domain %d\n", __func__, 2131 xbb->otherend_id); 2132 error = EINVAL; 2133 goto fail_free_bios; 2134 } 2135 2136 bio = bios[nbio++] = g_new_bio(); 2137 if (__predict_false(bio == NULL)) { 2138 error = ENOMEM; 2139 goto fail_free_bios; 2140 } 2141 bio->bio_cmd = operation; 2142 bio->bio_flags |= bio_flags; 2143 bio->bio_dev = dev_data->cdev; 2144 bio->bio_offset = bio_offset; 2145 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2146 xbb_sg->first_sect); 2147 bio->bio_done = xbb_bio_done; 2148 bio->bio_caller1 = reqlist; 2149 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2150 } 2151 2152 bio->bio_length += xbb_sg->nsect << 9; 2153 bio->bio_bcount = bio->bio_length; 2154 bio_offset += xbb_sg->nsect << 9; 2155 2156 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2157 2158 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2159 printf("%s: Discontiguous I/O request " 2160 "from domain %d ends on " 2161 "non-sector boundary\n", 2162 __func__, xbb->otherend_id); 2163 error = EINVAL; 2164 goto fail_free_bios; 2165 } 2166 /* 2167 * KVA will not be contiguous, so any additional 2168 * I/O will need to be represented in a new bio. 2169 */ 2170 bio = NULL; 2171 } 2172 } 2173 2174 reqlist->pendcnt = nbio; 2175 2176 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2177 { 2178 #ifdef XBB_USE_BOUNCE_BUFFERS 2179 vm_offset_t kva_offset; 2180 2181 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2182 - (vm_offset_t)reqlist->bounce; 2183 if (operation == BIO_WRITE) { 2184 memcpy(bios[bio_idx]->bio_data, 2185 (uint8_t *)reqlist->kva + kva_offset, 2186 bios[bio_idx]->bio_bcount); 2187 } 2188 #endif 2189 if (operation == BIO_READ) { 2190 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2191 device_get_unit(xbb->dev), 2192 bios[bio_idx]->bio_offset, 2193 bios[bio_idx]->bio_length); 2194 } else if (operation == BIO_WRITE) { 2195 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2196 device_get_unit(xbb->dev), 2197 bios[bio_idx]->bio_offset, 2198 bios[bio_idx]->bio_length); 2199 } 2200 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2201 } 2202 2203 return (error); 2204 2205 fail_free_bios: 2206 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2207 g_destroy_bio(bios[bio_idx]); 2208 2209 return (error); 2210 } 2211 2212 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2213 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2214 "uint64_t"); 2215 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2216 "uint64_t", "uint64_t"); 2217 2218 /** 2219 * Backend handler for file access. 2220 * 2221 * \param xbb Per-instance xbb configuration structure. 2222 * \param reqlist Allocated internal request list. 2223 * \param operation BIO_* I/O operation code. 2224 * \param flags Additional bio_flag data to pass to any generated bios 2225 * (e.g. BIO_ORDERED).. 2226 * 2227 * \return 0 for success, errno codes for failure. 
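 *
 * As a rough, self-contained sketch of the approach taken below
 * (the helper and its parameters are hypothetical, not part of the
 * driver): contiguous KVA runs are folded into one iovec each, and
 * the resulting iovec array is handed to VOP_READ()/VOP_WRITE()
 * through a struct uio.
 *
 *   // Append 'len' bytes at 'base', starting a new iovec only when
 *   // this run is not contiguous with the previous one.
 *   static void
 *   append_run(struct iovec *iov, int *iovcnt, void *base,
 *              size_t len, int contiguous)
 *   {
 *           if (!contiguous || *iovcnt == 0) {
 *                   iov[*iovcnt].iov_base = base;
 *                   iov[*iovcnt].iov_len = 0;
 *                   (*iovcnt)++;
 *           }
 *           iov[*iovcnt - 1].iov_len += len;
 *   }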
2228 */ 2229 static int 2230 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2231 int operation, int flags) 2232 { 2233 struct xbb_file_data *file_data; 2234 u_int seg_idx; 2235 u_int nseg; 2236 off_t sectors_sent; 2237 struct uio xuio; 2238 struct xbb_sg *xbb_sg; 2239 struct iovec *xiovec; 2240 #ifdef XBB_USE_BOUNCE_BUFFERS 2241 void **p_vaddr; 2242 int saved_uio_iovcnt; 2243 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2244 int error; 2245 2246 file_data = &xbb->backend.file; 2247 sectors_sent = 0; 2248 error = 0; 2249 bzero(&xuio, sizeof(xuio)); 2250 2251 switch (operation) { 2252 case BIO_READ: 2253 xuio.uio_rw = UIO_READ; 2254 break; 2255 case BIO_WRITE: 2256 xuio.uio_rw = UIO_WRITE; 2257 break; 2258 case BIO_FLUSH: { 2259 struct mount *mountpoint; 2260 2261 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2262 device_get_unit(xbb->dev)); 2263 2264 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2265 2266 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2267 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2268 VOP_UNLOCK(xbb->vn, 0); 2269 2270 vn_finished_write(mountpoint); 2271 2272 goto bailout_send_response; 2273 /* NOTREACHED */ 2274 } 2275 default: 2276 panic("invalid operation %d", operation); 2277 /* NOTREACHED */ 2278 } 2279 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2280 << xbb->sector_size_shift; 2281 xuio.uio_segflg = UIO_SYSSPACE; 2282 xuio.uio_iov = file_data->xiovecs; 2283 xuio.uio_iovcnt = 0; 2284 xbb_sg = xbb->xbb_sgs; 2285 nseg = reqlist->nr_segments; 2286 2287 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2288 2289 /* 2290 * If the first sector is not 0, the KVA will 2291 * not be contiguous and we'll need to go on 2292 * to another segment. 2293 */ 2294 if (xbb_sg->first_sect != 0) 2295 xiovec = NULL; 2296 2297 if (xiovec == NULL) { 2298 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2299 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2300 seg_idx, xbb_sg->first_sect); 2301 #ifdef XBB_USE_BOUNCE_BUFFERS 2302 /* 2303 * Store the address of the incoming 2304 * buffer at this particular offset 2305 * as well, so we can do the copy 2306 * later without having to do more 2307 * work to recalculate this address. 2308 */ 2309 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2310 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2311 xbb_sg->first_sect); 2312 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2313 xiovec->iov_len = 0; 2314 xuio.uio_iovcnt++; 2315 } 2316 2317 xiovec->iov_len += xbb_sg->nsect << 9; 2318 2319 xuio.uio_resid += xbb_sg->nsect << 9; 2320 2321 /* 2322 * If the last sector is not the full page 2323 * size count, the next segment will not be 2324 * contiguous in KVA and we need a new iovec. 2325 */ 2326 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2327 xiovec = NULL; 2328 } 2329 2330 xuio.uio_td = curthread; 2331 2332 #ifdef XBB_USE_BOUNCE_BUFFERS 2333 saved_uio_iovcnt = xuio.uio_iovcnt; 2334 2335 if (operation == BIO_WRITE) { 2336 /* Copy the write data to the local buffer. */ 2337 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2338 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2339 seg_idx++, xiovec++, p_vaddr++) { 2340 2341 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2342 } 2343 } else { 2344 /* 2345 * We only need to save off the iovecs in the case of a 2346 * read, because the copy for the read happens after the 2347 * VOP_READ(). (The uio will get modified in that call 2348 * sequence.) 
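 *
 * (Sketch of the save/copy-back pattern, with hypothetical names;
 *  the VOP consumes the live uio, so the later copy-out must walk a
 *  private snapshot of the descriptors:
 *
 *    memcpy(saved, iov, iovcnt * sizeof(*iov));    // before the VOP
 *    // ... VOP_READ() advances the live iovecs ...
 *    for (i = 0; i < iovcnt; i++)                  // after it returns
 *            memcpy(guest_dst[i], saved[i].iov_base,
 *                   saved[i].iov_len);
 *  )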
2349 */ 2350 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2351 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2352 } 2353 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2354 2355 switch (operation) { 2356 case BIO_READ: 2357 2358 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2359 device_get_unit(xbb->dev), xuio.uio_offset, 2360 xuio.uio_resid); 2361 2362 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2363 2364 /* 2365 * UFS pays attention to IO_DIRECT for reads. If the 2366 * DIRECTIO option is configured into the kernel, it calls 2367 * ffs_rawread(). But that only works for single-segment 2368 * uios with user space addresses. In our case, with a 2369 * kernel uio, it still reads into the buffer cache, but it 2370 * will just try to release the buffer from the cache later 2371 * on in ffs_read(). 2372 * 2373 * ZFS does not pay attention to IO_DIRECT for reads. 2374 * 2375 * UFS does not pay attention to IO_SYNC for reads. 2376 * 2377 * ZFS pays attention to IO_SYNC (which translates into the 2378 * Solaris define FRSYNC for zfs_read()) for reads. It 2379 * attempts to sync the file before reading. 2380 * 2381 * So, to attempt to provide some barrier semantics in the 2382 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2383 */ 2384 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2385 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2386 2387 VOP_UNLOCK(xbb->vn, 0); 2388 break; 2389 case BIO_WRITE: { 2390 struct mount *mountpoint; 2391 2392 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2393 device_get_unit(xbb->dev), xuio.uio_offset, 2394 xuio.uio_resid); 2395 2396 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2397 2398 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2399 2400 /* 2401 * UFS pays attention to IO_DIRECT for writes. The write 2402 * is done asynchronously. (Normally the write would just 2403 * get put into cache. 2404 * 2405 * UFS pays attention to IO_SYNC for writes. It will 2406 * attempt to write the buffer out synchronously if that 2407 * flag is set. 2408 * 2409 * ZFS does not pay attention to IO_DIRECT for writes. 2410 * 2411 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2412 * for writes. It will flush the transaction from the 2413 * cache before returning. 2414 * 2415 * So if we've got the BIO_ORDERED flag set, we want 2416 * IO_SYNC in either the UFS or ZFS case. 2417 */ 2418 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2419 IO_SYNC : 0, file_data->cred); 2420 VOP_UNLOCK(xbb->vn, 0); 2421 2422 vn_finished_write(mountpoint); 2423 2424 break; 2425 } 2426 default: 2427 panic("invalid operation %d", operation); 2428 /* NOTREACHED */ 2429 } 2430 2431 #ifdef XBB_USE_BOUNCE_BUFFERS 2432 /* We only need to copy here for read operations */ 2433 if (operation == BIO_READ) { 2434 2435 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2436 xiovec = file_data->saved_xiovecs; 2437 seg_idx < saved_uio_iovcnt; seg_idx++, 2438 xiovec++, p_vaddr++) { 2439 2440 /* 2441 * Note that we have to use the copy of the 2442 * io vector we made above. uiomove() modifies 2443 * the uio and its referenced vector as uiomove 2444 * performs the copy, so we can't rely on any 2445 * state from the original uio. 
2446 */ 2447 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2448 } 2449 } 2450 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2451 2452 bailout_send_response: 2453 2454 if (error != 0) 2455 reqlist->status = BLKIF_RSP_ERROR; 2456 2457 xbb_complete_reqlist(xbb, reqlist); 2458 2459 return (0); 2460 } 2461 2462 /*--------------------------- Backend Configuration --------------------------*/ 2463 /** 2464 * Close and cleanup any backend device/file specific state for this 2465 * block back instance. 2466 * 2467 * \param xbb Per-instance xbb configuration structure. 2468 */ 2469 static void 2470 xbb_close_backend(struct xbb_softc *xbb) 2471 { 2472 DROP_GIANT(); 2473 DPRINTF("closing dev=%s\n", xbb->dev_name); 2474 if (xbb->vn) { 2475 int flags = FREAD; 2476 2477 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2478 flags |= FWRITE; 2479 2480 switch (xbb->device_type) { 2481 case XBB_TYPE_DISK: 2482 if (xbb->backend.dev.csw) { 2483 dev_relthread(xbb->backend.dev.cdev, 2484 xbb->backend.dev.dev_ref); 2485 xbb->backend.dev.csw = NULL; 2486 xbb->backend.dev.cdev = NULL; 2487 } 2488 break; 2489 case XBB_TYPE_FILE: 2490 break; 2491 case XBB_TYPE_NONE: 2492 default: 2493 panic("Unexpected backend type."); 2494 break; 2495 } 2496 2497 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2498 xbb->vn = NULL; 2499 2500 switch (xbb->device_type) { 2501 case XBB_TYPE_DISK: 2502 break; 2503 case XBB_TYPE_FILE: 2504 if (xbb->backend.file.cred != NULL) { 2505 crfree(xbb->backend.file.cred); 2506 xbb->backend.file.cred = NULL; 2507 } 2508 break; 2509 case XBB_TYPE_NONE: 2510 default: 2511 panic("Unexpected backend type."); 2512 break; 2513 } 2514 } 2515 PICKUP_GIANT(); 2516 } 2517 2518 /** 2519 * Open a character device to be used for backend I/O. 2520 * 2521 * \param xbb Per-instance xbb configuration structure. 2522 * 2523 * \return 0 for success, errno codes for failure. 2524 */ 2525 static int 2526 xbb_open_dev(struct xbb_softc *xbb) 2527 { 2528 struct vattr vattr; 2529 struct cdev *dev; 2530 struct cdevsw *devsw; 2531 int error; 2532 2533 xbb->device_type = XBB_TYPE_DISK; 2534 xbb->dispatch_io = xbb_dispatch_dev; 2535 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2536 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2537 &xbb->backend.dev.dev_ref); 2538 if (xbb->backend.dev.csw == NULL) 2539 panic("Unable to retrieve device switch"); 2540 2541 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2542 if (error) { 2543 xenbus_dev_fatal(xbb->dev, error, "error getting " 2544 "vnode attributes for device %s", 2545 xbb->dev_name); 2546 return (error); 2547 } 2548 2549 2550 dev = xbb->vn->v_rdev; 2551 devsw = dev->si_devsw; 2552 if (!devsw->d_ioctl) { 2553 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2554 "device %s!", xbb->dev_name); 2555 return (ENODEV); 2556 } 2557 2558 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2559 (caddr_t)&xbb->sector_size, FREAD, 2560 curthread); 2561 if (error) { 2562 xenbus_dev_fatal(xbb->dev, error, 2563 "error calling ioctl DIOCGSECTORSIZE " 2564 "for device %s", xbb->dev_name); 2565 return (error); 2566 } 2567 2568 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2569 (caddr_t)&xbb->media_size, FREAD, 2570 curthread); 2571 if (error) { 2572 xenbus_dev_fatal(xbb->dev, error, 2573 "error calling ioctl DIOCGMEDIASIZE " 2574 "for device %s", xbb->dev_name); 2575 return (error); 2576 } 2577 2578 return (0); 2579 } 2580 2581 /** 2582 * Open a file to be used for backend I/O. 2583 * 2584 * \param xbb Per-instance xbb configuration structure. 
2585 * 2586 * \return 0 for success, errno codes for failure. 2587 */ 2588 static int 2589 xbb_open_file(struct xbb_softc *xbb) 2590 { 2591 struct xbb_file_data *file_data; 2592 struct vattr vattr; 2593 int error; 2594 2595 file_data = &xbb->backend.file; 2596 xbb->device_type = XBB_TYPE_FILE; 2597 xbb->dispatch_io = xbb_dispatch_file; 2598 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2599 if (error != 0) { 2600 xenbus_dev_fatal(xbb->dev, error, 2601 "error calling VOP_GETATTR()" 2602 "for file %s", xbb->dev_name); 2603 return (error); 2604 } 2605 2606 /* 2607 * Verify that we have the ability to upgrade to exclusive 2608 * access on this file so we can trap errors at open instead 2609 * of reporting them during first access. 2610 */ 2611 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2612 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2613 if (xbb->vn->v_iflag & VI_DOOMED) { 2614 error = EBADF; 2615 xenbus_dev_fatal(xbb->dev, error, 2616 "error locking file %s", 2617 xbb->dev_name); 2618 2619 return (error); 2620 } 2621 } 2622 2623 file_data->cred = crhold(curthread->td_ucred); 2624 xbb->media_size = vattr.va_size; 2625 2626 /* 2627 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2628 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2629 * with disklabel and UFS on FreeBSD at least. Large block sizes 2630 * may not work with other OSes as well. So just export a sector 2631 * size of 512 bytes, which should work with any OS or 2632 * application. Since our backing is a file, any block size will 2633 * work fine for the backing store. 2634 */ 2635 #if 0 2636 xbb->sector_size = vattr.va_blocksize; 2637 #endif 2638 xbb->sector_size = 512; 2639 2640 /* 2641 * Sanity check. The media size has to be at least one 2642 * sector long. 2643 */ 2644 if (xbb->media_size < xbb->sector_size) { 2645 error = EINVAL; 2646 xenbus_dev_fatal(xbb->dev, error, 2647 "file %s size %ju < block size %u", 2648 xbb->dev_name, 2649 (uintmax_t)xbb->media_size, 2650 xbb->sector_size); 2651 } 2652 return (error); 2653 } 2654 2655 /** 2656 * Open the backend provider for this connection. 2657 * 2658 * \param xbb Per-instance xbb configuration structure. 2659 * 2660 * \return 0 for success, errno codes for failure. 2661 */ 2662 static int 2663 xbb_open_backend(struct xbb_softc *xbb) 2664 { 2665 struct nameidata nd; 2666 int flags; 2667 int error; 2668 2669 flags = FREAD; 2670 error = 0; 2671 2672 DPRINTF("opening dev=%s\n", xbb->dev_name); 2673 2674 if (rootvnode == NULL) { 2675 xenbus_dev_fatal(xbb->dev, ENOENT, 2676 "Root file system not mounted"); 2677 return (ENOENT); 2678 } 2679 2680 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2681 flags |= FWRITE; 2682 2683 if (!curthread->td_proc->p_fd->fd_cdir) { 2684 curthread->td_proc->p_fd->fd_cdir = rootvnode; 2685 VREF(rootvnode); 2686 } 2687 if (!curthread->td_proc->p_fd->fd_rdir) { 2688 curthread->td_proc->p_fd->fd_rdir = rootvnode; 2689 VREF(rootvnode); 2690 } 2691 if (!curthread->td_proc->p_fd->fd_jdir) { 2692 curthread->td_proc->p_fd->fd_jdir = rootvnode; 2693 VREF(rootvnode); 2694 } 2695 2696 again: 2697 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2698 error = vn_open(&nd, &flags, 0, NULL); 2699 if (error) { 2700 /* 2701 * This is the only reasonable guess we can make as far as 2702 * path if the user doesn't give us a fully qualified path. 2703 * If they want to specify a file, they need to specify the 2704 * full path. 
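 *
 * (For illustration, with hypothetical names: the retry below turns
 *  a bare name such as "ada0" into "/dev/ada0" and jumps back to
 *  vn_open().  The string handling amounts to:
 *
 *    size_t len = strlen("/dev/") + strlen(name) + 1;
 *    char *full = malloc(len, M_XENBLOCKBACK, M_NOWAIT);
 *    if (full != NULL)
 *            snprintf(full, len, "/dev/%s", name);
 *  )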
2705 */ 2706 if (xbb->dev_name[0] != '/') { 2707 char *dev_path = "/dev/"; 2708 char *dev_name; 2709 2710 /* Try adding device path at beginning of name */ 2711 dev_name = malloc(strlen(xbb->dev_name) 2712 + strlen(dev_path) + 1, 2713 M_XENBLOCKBACK, M_NOWAIT); 2714 if (dev_name) { 2715 sprintf(dev_name, "%s%s", dev_path, 2716 xbb->dev_name); 2717 free(xbb->dev_name, M_XENBLOCKBACK); 2718 xbb->dev_name = dev_name; 2719 goto again; 2720 } 2721 } 2722 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2723 xbb->dev_name); 2724 return (error); 2725 } 2726 2727 NDFREE(&nd, NDF_ONLY_PNBUF); 2728 2729 xbb->vn = nd.ni_vp; 2730 2731 /* We only support disks and files. */ 2732 if (vn_isdisk(xbb->vn, &error)) { 2733 error = xbb_open_dev(xbb); 2734 } else if (xbb->vn->v_type == VREG) { 2735 error = xbb_open_file(xbb); 2736 } else { 2737 error = EINVAL; 2738 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2739 "or file", xbb->dev_name); 2740 } 2741 VOP_UNLOCK(xbb->vn, 0); 2742 2743 if (error != 0) { 2744 xbb_close_backend(xbb); 2745 return (error); 2746 } 2747 2748 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2749 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2750 2751 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2752 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2753 xbb->dev_name, xbb->sector_size, xbb->media_size); 2754 2755 return (0); 2756 } 2757 2758 /*------------------------ Inter-Domain Communication ------------------------*/ 2759 /** 2760 * Free dynamically allocated KVA or pseudo-physical address allocations. 2761 * 2762 * \param xbb Per-instance xbb configuration structure. 2763 */ 2764 static void 2765 xbb_free_communication_mem(struct xbb_softc *xbb) 2766 { 2767 if (xbb->kva != 0) { 2768 #ifndef XENHVM 2769 kva_free(xbb->kva, xbb->kva_size); 2770 #else 2771 if (xbb->pseudo_phys_res != NULL) { 2772 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2773 xbb->pseudo_phys_res_id, 2774 xbb->pseudo_phys_res); 2775 xbb->pseudo_phys_res = NULL; 2776 } 2777 #endif 2778 } 2779 xbb->kva = 0; 2780 xbb->gnt_base_addr = 0; 2781 if (xbb->kva_free != NULL) { 2782 free(xbb->kva_free, M_XENBLOCKBACK); 2783 xbb->kva_free = NULL; 2784 } 2785 } 2786 2787 /** 2788 * Cleanup all inter-domain communication mechanisms. 2789 * 2790 * \param xbb Per-instance xbb configuration structure. 2791 */ 2792 static int 2793 xbb_disconnect(struct xbb_softc *xbb) 2794 { 2795 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2796 struct gnttab_unmap_grant_ref *op; 2797 u_int ring_idx; 2798 int error; 2799 2800 DPRINTF("\n"); 2801 2802 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2803 return (0); 2804 2805 xen_intr_unbind(&xbb->xen_intr_handle); 2806 2807 mtx_unlock(&xbb->lock); 2808 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2809 mtx_lock(&xbb->lock); 2810 2811 /* 2812 * No new interrupts can generate work, but we must wait 2813 * for all currently active requests to drain. 
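 *
 * (If requests are still in flight we return EAGAIN below; the
 *  completion path re-invokes shutdown processing, and xbb_detach()
 *  sleeps in its msleep() loop until that retry finally succeeds.)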
2814 */ 2815 if (xbb->active_request_count != 0) 2816 return (EAGAIN); 2817 2818 for (ring_idx = 0, op = ops; 2819 ring_idx < xbb->ring_config.ring_pages; 2820 ring_idx++, op++) { 2821 2822 op->host_addr = xbb->ring_config.gnt_addr 2823 + (ring_idx * PAGE_SIZE); 2824 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2825 op->handle = xbb->ring_config.handle[ring_idx]; 2826 } 2827 2828 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2829 xbb->ring_config.ring_pages); 2830 if (error != 0) 2831 panic("Grant table op failed (%d)", error); 2832 2833 xbb_free_communication_mem(xbb); 2834 2835 if (xbb->requests != NULL) { 2836 free(xbb->requests, M_XENBLOCKBACK); 2837 xbb->requests = NULL; 2838 } 2839 2840 if (xbb->request_lists != NULL) { 2841 struct xbb_xen_reqlist *reqlist; 2842 int i; 2843 2844 /* There is one request list for ever allocated request. */ 2845 for (i = 0, reqlist = xbb->request_lists; 2846 i < xbb->max_requests; i++, reqlist++){ 2847 #ifdef XBB_USE_BOUNCE_BUFFERS 2848 if (reqlist->bounce != NULL) { 2849 free(reqlist->bounce, M_XENBLOCKBACK); 2850 reqlist->bounce = NULL; 2851 } 2852 #endif 2853 if (reqlist->gnt_handles != NULL) { 2854 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2855 reqlist->gnt_handles = NULL; 2856 } 2857 } 2858 free(xbb->request_lists, M_XENBLOCKBACK); 2859 xbb->request_lists = NULL; 2860 } 2861 2862 xbb->flags &= ~XBBF_RING_CONNECTED; 2863 return (0); 2864 } 2865 2866 /** 2867 * Map shared memory ring into domain local address space, initialize 2868 * ring control structures, and bind an interrupt to the event channel 2869 * used to notify us of ring changes. 2870 * 2871 * \param xbb Per-instance xbb configuration structure. 2872 */ 2873 static int 2874 xbb_connect_ring(struct xbb_softc *xbb) 2875 { 2876 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2877 struct gnttab_map_grant_ref *gnt; 2878 u_int ring_idx; 2879 int error; 2880 2881 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2882 return (0); 2883 2884 /* 2885 * Kva for our ring is at the tail of the region of kva allocated 2886 * by xbb_alloc_communication_mem(). 2887 */ 2888 xbb->ring_config.va = xbb->kva 2889 + (xbb->kva_size 2890 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2891 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2892 + (xbb->kva_size 2893 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2894 2895 for (ring_idx = 0, gnt = gnts; 2896 ring_idx < xbb->ring_config.ring_pages; 2897 ring_idx++, gnt++) { 2898 2899 gnt->host_addr = xbb->ring_config.gnt_addr 2900 + (ring_idx * PAGE_SIZE); 2901 gnt->flags = GNTMAP_host_map; 2902 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2903 gnt->dom = xbb->otherend_id; 2904 } 2905 2906 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2907 xbb->ring_config.ring_pages); 2908 if (error) 2909 panic("blkback: Ring page grant table op failed (%d)", error); 2910 2911 for (ring_idx = 0, gnt = gnts; 2912 ring_idx < xbb->ring_config.ring_pages; 2913 ring_idx++, gnt++) { 2914 if (gnt->status != 0) { 2915 xbb->ring_config.va = 0; 2916 xenbus_dev_fatal(xbb->dev, EACCES, 2917 "Ring shared page mapping failed. " 2918 "Status %d.", gnt->status); 2919 return (EACCES); 2920 } 2921 xbb->ring_config.handle[ring_idx] = gnt->handle; 2922 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2923 } 2924 2925 /* Initialize the ring based on ABI. 
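 * All three ABI variants share the shared-ring pages mapped above;
 * only the request/response element layout differs, so the matching
 * BACK_RING_INIT() form is simply applied to the same VA.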
*/ 2926 switch (xbb->abi) { 2927 case BLKIF_PROTOCOL_NATIVE: 2928 { 2929 blkif_sring_t *sring; 2930 sring = (blkif_sring_t *)xbb->ring_config.va; 2931 BACK_RING_INIT(&xbb->rings.native, sring, 2932 xbb->ring_config.ring_pages * PAGE_SIZE); 2933 break; 2934 } 2935 case BLKIF_PROTOCOL_X86_32: 2936 { 2937 blkif_x86_32_sring_t *sring_x86_32; 2938 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2939 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2940 xbb->ring_config.ring_pages * PAGE_SIZE); 2941 break; 2942 } 2943 case BLKIF_PROTOCOL_X86_64: 2944 { 2945 blkif_x86_64_sring_t *sring_x86_64; 2946 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2947 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2948 xbb->ring_config.ring_pages * PAGE_SIZE); 2949 break; 2950 } 2951 default: 2952 panic("Unexpected blkif protocol ABI."); 2953 } 2954 2955 xbb->flags |= XBBF_RING_CONNECTED; 2956 2957 error = xen_intr_bind_remote_port(xbb->dev, 2958 xbb->otherend_id, 2959 xbb->ring_config.evtchn, 2960 xbb_filter, 2961 /*ithread_handler*/NULL, 2962 /*arg*/xbb, 2963 INTR_TYPE_BIO | INTR_MPSAFE, 2964 &xbb->xen_intr_handle); 2965 if (error) { 2966 (void)xbb_disconnect(xbb); 2967 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2968 return (error); 2969 } 2970 2971 DPRINTF("rings connected!\n"); 2972 2973 return 0; 2974 } 2975 2976 /* Needed to make bit_alloc() macro work */ 2977 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ 2978 M_NOWAIT|M_ZERO); 2979 2980 /** 2981 * Size KVA and pseudo-physical address allocations based on negotiated 2982 * values for the size and number of I/O requests, and the size of our 2983 * communication ring. 2984 * 2985 * \param xbb Per-instance xbb configuration structure. 2986 * 2987 * These address spaces are used to dynamically map pages in the 2988 * front-end's domain into our own. 2989 */ 2990 static int 2991 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2992 { 2993 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2994 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2995 xbb->kva_size = xbb->reqlist_kva_size + 2996 (xbb->ring_config.ring_pages * PAGE_SIZE); 2997 2998 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); 2999 if (xbb->kva_free == NULL) 3000 return (ENOMEM); 3001 3002 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 3003 device_get_nameunit(xbb->dev), xbb->kva_size, 3004 xbb->reqlist_kva_size); 3005 #ifndef XENHVM 3006 xbb->kva = kva_alloc(xbb->kva_size); 3007 if (xbb->kva == 0) 3008 return (ENOMEM); 3009 xbb->gnt_base_addr = xbb->kva; 3010 #else /* XENHVM */ 3011 /* 3012 * Reserve a range of pseudo physical memory that we can map 3013 * into kva. These pages will only be backed by machine 3014 * pages ("real memory") during the lifetime of front-end requests 3015 * via grant table operations. 3016 */ 3017 xbb->pseudo_phys_res_id = 0; 3018 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 3019 &xbb->pseudo_phys_res_id, 3020 0, ~0, xbb->kva_size, 3021 RF_ACTIVE); 3022 if (xbb->pseudo_phys_res == NULL) { 3023 xbb->kva = 0; 3024 return (ENOMEM); 3025 } 3026 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3027 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3028 #endif /* XENHVM */ 3029 3030 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3031 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3032 (uintmax_t)xbb->gnt_base_addr); 3033 return (0); 3034 } 3035 3036 /** 3037 * Collect front-end information from the XenStore. 
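 *
 * As a simplified, self-contained sketch of the negotiation pattern
 * used below (local names are hypothetical; xs_scanf() leaves its
 * output variable untouched when the frontend has not published the
 * node, which is why pre-loading the legacy default makes each read
 * optional):
 *
 *   u_int order = 0;                     // legacy default: 1 page
 *   u_int ring_pages;
 *
 *   (void)xs_scanf(XST_NIL, path, "ring-page-order", NULL, "%u",
 *                  &order);
 *   ring_pages = 1u << order;
 *   if (ring_pages > XBB_MAX_RING_PAGES)
 *           return (EINVAL);             // reject rather than clamp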
3038 * 3039 * \param xbb Per-instance xbb configuration structure. 3040 */ 3041 static int 3042 xbb_collect_frontend_info(struct xbb_softc *xbb) 3043 { 3044 char protocol_abi[64]; 3045 const char *otherend_path; 3046 int error; 3047 u_int ring_idx; 3048 u_int ring_page_order; 3049 size_t ring_size; 3050 3051 otherend_path = xenbus_get_otherend_path(xbb->dev); 3052 3053 /* 3054 * Protocol defaults valid even if all negotiation fails. 3055 */ 3056 xbb->ring_config.ring_pages = 1; 3057 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; 3058 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3059 3060 /* 3061 * Mandatory data (used in all versions of the protocol) first. 3062 */ 3063 error = xs_scanf(XST_NIL, otherend_path, 3064 "event-channel", NULL, "%" PRIu32, 3065 &xbb->ring_config.evtchn); 3066 if (error != 0) { 3067 xenbus_dev_fatal(xbb->dev, error, 3068 "Unable to retrieve event-channel information " 3069 "from frontend %s. Unable to connect.", 3070 xenbus_get_otherend_path(xbb->dev)); 3071 return (error); 3072 } 3073 3074 /* 3075 * These fields are initialized to legacy protocol defaults 3076 * so we only need to fail if reading the updated value succeeds 3077 * and the new value is outside of its allowed range. 3078 * 3079 * \note xs_gather() returns on the first encountered error, so 3080 * we must use independant calls in order to guarantee 3081 * we don't miss information in a sparsly populated front-end 3082 * tree. 3083 * 3084 * \note xs_scanf() does not update variables for unmatched 3085 * fields. 3086 */ 3087 ring_page_order = 0; 3088 (void)xs_scanf(XST_NIL, otherend_path, 3089 "ring-page-order", NULL, "%u", 3090 &ring_page_order); 3091 xbb->ring_config.ring_pages = 1 << ring_page_order; 3092 (void)xs_scanf(XST_NIL, otherend_path, 3093 "num-ring-pages", NULL, "%u", 3094 &xbb->ring_config.ring_pages); 3095 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3096 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3097 3098 (void)xs_scanf(XST_NIL, otherend_path, 3099 "max-requests", NULL, "%u", 3100 &xbb->max_requests); 3101 3102 (void)xs_scanf(XST_NIL, otherend_path, 3103 "max-request-segments", NULL, "%u", 3104 &xbb->max_request_segments); 3105 3106 (void)xs_scanf(XST_NIL, otherend_path, 3107 "max-request-size", NULL, "%u", 3108 &xbb->max_request_size); 3109 3110 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3111 xenbus_dev_fatal(xbb->dev, EINVAL, 3112 "Front-end specified ring-pages of %u " 3113 "exceeds backend limit of %zu. " 3114 "Unable to connect.", 3115 xbb->ring_config.ring_pages, 3116 XBB_MAX_RING_PAGES); 3117 return (EINVAL); 3118 } else if (xbb->max_requests > XBB_MAX_REQUESTS) { 3119 xenbus_dev_fatal(xbb->dev, EINVAL, 3120 "Front-end specified max_requests of %u " 3121 "exceeds backend limit of %u. " 3122 "Unable to connect.", 3123 xbb->max_requests, 3124 XBB_MAX_REQUESTS); 3125 return (EINVAL); 3126 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) { 3127 xenbus_dev_fatal(xbb->dev, EINVAL, 3128 "Front-end specified max_requests_segments " 3129 "of %u exceeds backend limit of %u. " 3130 "Unable to connect.", 3131 xbb->max_request_segments, 3132 XBB_MAX_SEGMENTS_PER_REQUEST); 3133 return (EINVAL); 3134 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) { 3135 xenbus_dev_fatal(xbb->dev, EINVAL, 3136 "Front-end specified max_request_size " 3137 "of %u exceeds backend limit of %u. 
" 3138 "Unable to connect.", 3139 xbb->max_request_size, 3140 XBB_MAX_REQUEST_SIZE); 3141 return (EINVAL); 3142 } 3143 3144 if (xbb->ring_config.ring_pages == 1) { 3145 error = xs_gather(XST_NIL, otherend_path, 3146 "ring-ref", "%" PRIu32, 3147 &xbb->ring_config.ring_ref[0], 3148 NULL); 3149 if (error != 0) { 3150 xenbus_dev_fatal(xbb->dev, error, 3151 "Unable to retrieve ring information " 3152 "from frontend %s. Unable to " 3153 "connect.", 3154 xenbus_get_otherend_path(xbb->dev)); 3155 return (error); 3156 } 3157 } else { 3158 /* Multi-page ring format. */ 3159 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3160 ring_idx++) { 3161 char ring_ref_name[]= "ring_refXX"; 3162 3163 snprintf(ring_ref_name, sizeof(ring_ref_name), 3164 "ring-ref%u", ring_idx); 3165 error = xs_scanf(XST_NIL, otherend_path, 3166 ring_ref_name, NULL, "%" PRIu32, 3167 &xbb->ring_config.ring_ref[ring_idx]); 3168 if (error != 0) { 3169 xenbus_dev_fatal(xbb->dev, error, 3170 "Failed to retriev grant " 3171 "reference for page %u of " 3172 "shared ring. Unable " 3173 "to connect.", ring_idx); 3174 return (error); 3175 } 3176 } 3177 } 3178 3179 error = xs_gather(XST_NIL, otherend_path, 3180 "protocol", "%63s", protocol_abi, 3181 NULL); 3182 if (error != 0 3183 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3184 /* 3185 * Assume native if the frontend has not 3186 * published ABI data or it has published and 3187 * matches our own ABI. 3188 */ 3189 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3190 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3191 3192 xbb->abi = BLKIF_PROTOCOL_X86_32; 3193 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3194 3195 xbb->abi = BLKIF_PROTOCOL_X86_64; 3196 } else { 3197 3198 xenbus_dev_fatal(xbb->dev, EINVAL, 3199 "Unknown protocol ABI (%s) published by " 3200 "frontend. Unable to connect.", protocol_abi); 3201 return (EINVAL); 3202 } 3203 return (0); 3204 } 3205 3206 /** 3207 * Allocate per-request data structures given request size and number 3208 * information negotiated with the front-end. 3209 * 3210 * \param xbb Per-instance xbb configuration structure. 3211 */ 3212 static int 3213 xbb_alloc_requests(struct xbb_softc *xbb) 3214 { 3215 struct xbb_xen_req *req; 3216 struct xbb_xen_req *last_req; 3217 3218 /* 3219 * Allocate request book keeping datastructures. 3220 */ 3221 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3222 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3223 if (xbb->requests == NULL) { 3224 xenbus_dev_fatal(xbb->dev, ENOMEM, 3225 "Unable to allocate request structures"); 3226 return (ENOMEM); 3227 } 3228 3229 req = xbb->requests; 3230 last_req = &xbb->requests[xbb->max_requests - 1]; 3231 STAILQ_INIT(&xbb->request_free_stailq); 3232 while (req <= last_req) { 3233 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3234 req++; 3235 } 3236 return (0); 3237 } 3238 3239 static int 3240 xbb_alloc_request_lists(struct xbb_softc *xbb) 3241 { 3242 struct xbb_xen_reqlist *reqlist; 3243 int i; 3244 3245 /* 3246 * If no requests can be merged, we need 1 request list per 3247 * in flight request. 
3248 */ 3249 xbb->request_lists = malloc(xbb->max_requests * 3250 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3251 if (xbb->request_lists == NULL) { 3252 xenbus_dev_fatal(xbb->dev, ENOMEM, 3253 "Unable to allocate request list structures"); 3254 return (ENOMEM); 3255 } 3256 3257 STAILQ_INIT(&xbb->reqlist_free_stailq); 3258 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3259 for (i = 0; i < xbb->max_requests; i++) { 3260 int seg; 3261 3262 reqlist = &xbb->request_lists[i]; 3263 3264 reqlist->xbb = xbb; 3265 3266 #ifdef XBB_USE_BOUNCE_BUFFERS 3267 reqlist->bounce = malloc(xbb->max_reqlist_size, 3268 M_XENBLOCKBACK, M_NOWAIT); 3269 if (reqlist->bounce == NULL) { 3270 xenbus_dev_fatal(xbb->dev, ENOMEM, 3271 "Unable to allocate request " 3272 "bounce buffers"); 3273 return (ENOMEM); 3274 } 3275 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3276 3277 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3278 sizeof(*reqlist->gnt_handles), 3279 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3280 if (reqlist->gnt_handles == NULL) { 3281 xenbus_dev_fatal(xbb->dev, ENOMEM, 3282 "Unable to allocate request " 3283 "grant references"); 3284 return (ENOMEM); 3285 } 3286 3287 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3288 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3289 3290 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3291 } 3292 return (0); 3293 } 3294 3295 /** 3296 * Supply information about the physical device to the frontend 3297 * via XenBus. 3298 * 3299 * \param xbb Per-instance xbb configuration structure. 3300 */ 3301 static int 3302 xbb_publish_backend_info(struct xbb_softc *xbb) 3303 { 3304 struct xs_transaction xst; 3305 const char *our_path; 3306 const char *leaf; 3307 int error; 3308 3309 our_path = xenbus_get_node(xbb->dev); 3310 while (1) { 3311 error = xs_transaction_start(&xst); 3312 if (error != 0) { 3313 xenbus_dev_fatal(xbb->dev, error, 3314 "Error publishing backend info " 3315 "(start transaction)"); 3316 return (error); 3317 } 3318 3319 leaf = "sectors"; 3320 error = xs_printf(xst, our_path, leaf, 3321 "%"PRIu64, xbb->media_num_sectors); 3322 if (error != 0) 3323 break; 3324 3325 /* XXX Support all VBD attributes here. */ 3326 leaf = "info"; 3327 error = xs_printf(xst, our_path, leaf, "%u", 3328 xbb->flags & XBBF_READ_ONLY 3329 ? VDISK_READONLY : 0); 3330 if (error != 0) 3331 break; 3332 3333 leaf = "sector-size"; 3334 error = xs_printf(xst, our_path, leaf, "%u", 3335 xbb->sector_size); 3336 if (error != 0) 3337 break; 3338 3339 error = xs_transaction_end(xst, 0); 3340 if (error == 0) { 3341 return (0); 3342 } else if (error != EAGAIN) { 3343 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3344 return (error); 3345 } 3346 } 3347 3348 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3349 our_path, leaf); 3350 xs_transaction_end(xst, 1); 3351 return (error); 3352 } 3353 3354 /** 3355 * Connect to our blkfront peer now that it has completed publishing 3356 * its configuration into the XenStore. 3357 * 3358 * \param xbb Per-instance xbb configuration structure. 3359 */ 3360 static void 3361 xbb_connect(struct xbb_softc *xbb) 3362 { 3363 int error; 3364 3365 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 3366 return; 3367 3368 if (xbb_collect_frontend_info(xbb) != 0) 3369 return; 3370 3371 xbb->flags &= ~XBBF_SHUTDOWN; 3372 3373 /* 3374 * We limit the maximum number of reqlist segments to the maximum 3375 * number of segments in the ring, or our absolute maximum, 3376 * whichever is smaller. 
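 *
 * (Worked example, with assumed negotiated values only: 11 segments
 *  per request and 256 requests give 11 * 256 = 2816 ring segments,
 *  far above XBB_MAX_SEGMENTS_PER_REQLIST, so the compile-time cap
 *  wins; a frontend that negotiated, say, 1 segment per request and
 *  4 requests would instead be limited to 1 * 4 = 4 segments per
 *  request list.)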
3377 */ 3378 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3379 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3380 3381 /* 3382 * The maximum size is simply a function of the number of segments 3383 * we can handle. 3384 */ 3385 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3386 3387 /* Allocate resources whose size depends on front-end configuration. */ 3388 error = xbb_alloc_communication_mem(xbb); 3389 if (error != 0) { 3390 xenbus_dev_fatal(xbb->dev, error, 3391 "Unable to allocate communication memory"); 3392 return; 3393 } 3394 3395 error = xbb_alloc_requests(xbb); 3396 if (error != 0) { 3397 /* Specific errors are reported by xbb_alloc_requests(). */ 3398 return; 3399 } 3400 3401 error = xbb_alloc_request_lists(xbb); 3402 if (error != 0) { 3403 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3404 return; 3405 } 3406 3407 /* 3408 * Connect communication channel. 3409 */ 3410 error = xbb_connect_ring(xbb); 3411 if (error != 0) { 3412 /* Specific errors are reported by xbb_connect_ring(). */ 3413 return; 3414 } 3415 3416 if (xbb_publish_backend_info(xbb) != 0) { 3417 /* 3418 * If we can't publish our data, we cannot participate 3419 * in this connection, and waiting for a front-end state 3420 * change will not help the situation. 3421 */ 3422 (void)xbb_disconnect(xbb); 3423 return; 3424 } 3425 3426 /* Ready for I/O. */ 3427 xenbus_set_state(xbb->dev, XenbusStateConnected); 3428 } 3429 3430 /*-------------------------- Device Teardown Support -------------------------*/ 3431 /** 3432 * Perform device shutdown functions. 3433 * 3434 * \param xbb Per-instance xbb configuration structure. 3435 * 3436 * Mark this instance as shutting down, wait for any active I/O on the 3437 * backend device/file to drain, disconnect from the front-end, and notify 3438 * any waiters (e.g. a thread invoking our detach method) that detach can 3439 * now proceed. 3440 */ 3441 static int 3442 xbb_shutdown(struct xbb_softc *xbb) 3443 { 3444 XenbusState frontState; 3445 int error; 3446 3447 DPRINTF("\n"); 3448 3449 /* 3450 * Due to the need to drop our mutex during some 3451 * xenbus operations, it is possible for two threads 3452 * to attempt to close out shutdown processing at 3453 * the same time. Tell the caller that hits this 3454 * race to try back later. 3455 */ 3456 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3457 return (EAGAIN); 3458 3459 xbb->flags |= XBBF_IN_SHUTDOWN; 3460 mtx_unlock(&xbb->lock); 3461 3462 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3463 xenbus_set_state(xbb->dev, XenbusStateClosing); 3464 3465 frontState = xenbus_get_otherend_state(xbb->dev); 3466 mtx_lock(&xbb->lock); 3467 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3468 3469 /* The front can submit I/O until entering the closed state. */ 3470 if (frontState < XenbusStateClosed) 3471 return (EAGAIN); 3472 3473 DPRINTF("\n"); 3474 3475 /* Indicate shutdown is in progress. */ 3476 xbb->flags |= XBBF_SHUTDOWN; 3477 3478 /* Disconnect from the front-end. */ 3479 error = xbb_disconnect(xbb); 3480 if (error != 0) { 3481 /* 3482 * Requests still outstanding. We'll be called again 3483 * once they complete. 3484 */ 3485 KASSERT(error == EAGAIN, 3486 ("%s: Unexpected xbb_disconnect() failure %d", 3487 __func__, error)); 3488 3489 return (error); 3490 } 3491 3492 DPRINTF("\n"); 3493 3494 /* Indicate to xbb_detach() that is it safe to proceed. 
*/ 3495 wakeup(xbb); 3496 3497 return (0); 3498 } 3499 3500 /** 3501 * Report an attach time error to the console and Xen, and cleanup 3502 * this instance by forcing immediate detach processing. 3503 * 3504 * \param xbb Per-instance xbb configuration structure. 3505 * \param err Errno describing the error. 3506 * \param fmt Printf style format and arguments 3507 */ 3508 static void 3509 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3510 { 3511 va_list ap; 3512 va_list ap_hotplug; 3513 3514 va_start(ap, fmt); 3515 va_copy(ap_hotplug, ap); 3516 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3517 "hotplug-error", fmt, ap_hotplug); 3518 va_end(ap_hotplug); 3519 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3520 "hotplug-status", "error"); 3521 3522 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3523 va_end(ap); 3524 3525 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3526 "online", "0"); 3527 xbb_detach(xbb->dev); 3528 } 3529 3530 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3531 /** 3532 * Inspect a XenBus device and claim it if is of the appropriate type. 3533 * 3534 * \param dev NewBus device object representing a candidate XenBus device. 3535 * 3536 * \return 0 for success, errno codes for failure. 3537 */ 3538 static int 3539 xbb_probe(device_t dev) 3540 { 3541 3542 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3543 device_set_desc(dev, "Backend Virtual Block Device"); 3544 device_quiet(dev); 3545 return (0); 3546 } 3547 3548 return (ENXIO); 3549 } 3550 3551 /** 3552 * Setup sysctl variables to control various Block Back parameters. 3553 * 3554 * \param xbb Xen Block Back softc. 3555 * 3556 */ 3557 static void 3558 xbb_setup_sysctl(struct xbb_softc *xbb) 3559 { 3560 struct sysctl_ctx_list *sysctl_ctx = NULL; 3561 struct sysctl_oid *sysctl_tree = NULL; 3562 3563 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3564 if (sysctl_ctx == NULL) 3565 return; 3566 3567 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3568 if (sysctl_tree == NULL) 3569 return; 3570 3571 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3572 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3573 "fake the flush command"); 3574 3575 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3576 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3577 "send a real flush for N flush requests"); 3578 3579 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3580 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3581 "Don't coalesce contiguous requests"); 3582 3583 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3584 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3585 "how many I/O requests we have received"); 3586 3587 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3588 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3589 "how many I/O requests have been completed"); 3590 3591 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3592 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3593 "how many I/O dispatches were forced"); 3594 3595 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3596 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3597 "how many I/O dispatches were normal"); 3598 3599 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3600 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3601 "total number of I/O dispatches"); 3602 3603 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), 
OID_AUTO, 3604 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3605 "how many times we have run out of KVA"); 3606 3607 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3608 "request_shortages", CTLFLAG_RW, 3609 &xbb->request_shortages, 3610 "how many times we have run out of requests"); 3611 3612 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3613 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3614 "maximum outstanding requests (negotiated)"); 3615 3616 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3617 "max_request_segments", CTLFLAG_RD, 3618 &xbb->max_request_segments, 0, 3619 "maximum number of pages per request (negotiated)"); 3620 3621 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3622 "max_request_size", CTLFLAG_RD, 3623 &xbb->max_request_size, 0, 3624 "maximum size in bytes of a request (negotiated)"); 3625 3626 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3627 "ring_pages", CTLFLAG_RD, 3628 &xbb->ring_config.ring_pages, 0, 3629 "communication channel pages (negotiated)"); 3630 } 3631 3632 /** 3633 * Attach to a XenBus device that has been claimed by our probe routine. 3634 * 3635 * \param dev NewBus device object representing this Xen Block Back instance. 3636 * 3637 * \return 0 for success, errno codes for failure. 3638 */ 3639 static int 3640 xbb_attach(device_t dev) 3641 { 3642 struct xbb_softc *xbb; 3643 int error; 3644 u_int max_ring_page_order; 3645 3646 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3647 3648 /* 3649 * Basic initialization. 3650 * After this block it is safe to call xbb_detach() 3651 * to clean up any allocated data for this instance. 3652 */ 3653 xbb = device_get_softc(dev); 3654 xbb->dev = dev; 3655 xbb->otherend_id = xenbus_get_otherend_id(dev); 3656 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3657 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3658 3659 /* 3660 * Publish protocol capabilities for consumption by the 3661 * front-end. 3662 */ 3663 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3664 "feature-barrier", "1"); 3665 if (error) { 3666 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3667 xenbus_get_node(xbb->dev)); 3668 return (error); 3669 } 3670 3671 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3672 "feature-flush-cache", "1"); 3673 if (error) { 3674 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3675 xenbus_get_node(xbb->dev)); 3676 return (error); 3677 } 3678 3679 /* 3680 * Amazon EC2 client compatibility. They refer to max-ring-pages 3681 * instead of to max-ring-page-order.
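 *
 * (Both keys describe the same limit: "max-ring-pages" is the page
 *  count and "max-ring-page-order" its base-2 logarithm.  If, for
 *  example, XBB_MAX_RING_PAGES worked out to 32, we would publish
 *  max-ring-pages=32 and max-ring-page-order = flsl(32) - 1 = 5.)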
3682 */ 3683 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3684 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 3685 if (error) { 3686 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 3687 xenbus_get_node(xbb->dev)); 3688 return (error); 3689 } 3690 3691 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3692 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3693 "max-ring-page-order", "%u", max_ring_page_order); 3694 if (error) { 3695 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3696 xenbus_get_node(xbb->dev)); 3697 return (error); 3698 } 3699 3700 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3701 "max-requests", "%u", XBB_MAX_REQUESTS); 3702 if (error) { 3703 xbb_attach_failed(xbb, error, "writing %s/max-requests", 3704 xenbus_get_node(xbb->dev)); 3705 return (error); 3706 } 3707 3708 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3709 "max-request-segments", "%u", 3710 XBB_MAX_SEGMENTS_PER_REQUEST); 3711 if (error) { 3712 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 3713 xenbus_get_node(xbb->dev)); 3714 return (error); 3715 } 3716 3717 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3718 "max-request-size", "%u", 3719 XBB_MAX_REQUEST_SIZE); 3720 if (error) { 3721 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 3722 xenbus_get_node(xbb->dev)); 3723 return (error); 3724 } 3725 3726 /* Collect physical device information. */ 3727 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3728 "device-type", NULL, &xbb->dev_type, 3729 NULL); 3730 if (error != 0) 3731 xbb->dev_type = NULL; 3732 3733 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3734 "mode", NULL, &xbb->dev_mode, 3735 "params", NULL, &xbb->dev_name, 3736 NULL); 3737 if (error != 0) { 3738 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3739 xenbus_get_node(dev)); 3740 return (ENXIO); 3741 } 3742 3743 /* Parse fopen style mode flags. */ 3744 if (strchr(xbb->dev_mode, 'w') == NULL) 3745 xbb->flags |= XBBF_READ_ONLY; 3746 3747 /* 3748 * Verify the physical device is present and can support 3749 * the desired I/O mode. 3750 */ 3751 DROP_GIANT(); 3752 error = xbb_open_backend(xbb); 3753 PICKUP_GIANT(); 3754 if (error != 0) { 3755 xbb_attach_failed(xbb, error, "Unable to open %s", 3756 xbb->dev_name); 3757 return (ENXIO); 3758 } 3759 3760 /* Use devstat(9) for recording statistics. */ 3761 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3762 xbb->sector_size, 3763 DEVSTAT_ALL_SUPPORTED, 3764 DEVSTAT_TYPE_DIRECT 3765 | DEVSTAT_TYPE_IF_OTHER, 3766 DEVSTAT_PRIORITY_OTHER); 3767 3768 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3769 xbb->sector_size, 3770 DEVSTAT_ALL_SUPPORTED, 3771 DEVSTAT_TYPE_DIRECT 3772 | DEVSTAT_TYPE_IF_OTHER, 3773 DEVSTAT_PRIORITY_OTHER); 3774 /* 3775 * Setup sysctl variables. 3776 */ 3777 xbb_setup_sysctl(xbb); 3778 3779 /* 3780 * Create a taskqueue for doing work that must occur from a 3781 * thread context. 3782 */ 3783 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3784 M_NOWAIT, 3785 taskqueue_thread_enqueue, 3786 /*contxt*/&xbb->io_taskqueue); 3787 if (xbb->io_taskqueue == NULL) { 3788 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3789 return (ENOMEM); 3790 } 3791 3792 taskqueue_start_threads(&xbb->io_taskqueue, 3793 /*num threads*/1, 3794 /*priority*/PWAIT, 3795 /*thread name*/ 3796 "%s taskq", device_get_nameunit(dev)); 3797 3798 /* Update hot-plug status to satisfy xend. 
*/ 3799 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3800 "hotplug-status", "connected"); 3801 if (error) { 3802 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3803 xenbus_get_node(xbb->dev)); 3804 return (error); 3805 } 3806 3807 /* Tell the front end that we are ready to connect. */ 3808 xenbus_set_state(dev, XenbusStateInitWait); 3809 3810 return (0); 3811 } 3812 3813 /** 3814 * Detach from a block back device instance. 3815 * 3816 * \param dev NewBus device object representing this Xen Block Back instance. 3817 * 3818 * \return 0 for success, errno codes for failure. 3819 * 3820 * \note A block back device may be detached at any time in its life-cycle, 3821 * including part way through the attach process. For this reason, 3822 * initialization order and the intialization state checks in this 3823 * routine must be carefully coupled so that attach time failures 3824 * are gracefully handled. 3825 */ 3826 static int 3827 xbb_detach(device_t dev) 3828 { 3829 struct xbb_softc *xbb; 3830 3831 DPRINTF("\n"); 3832 3833 xbb = device_get_softc(dev); 3834 mtx_lock(&xbb->lock); 3835 while (xbb_shutdown(xbb) == EAGAIN) { 3836 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3837 "xbb_shutdown", 0); 3838 } 3839 mtx_unlock(&xbb->lock); 3840 3841 DPRINTF("\n"); 3842 3843 if (xbb->io_taskqueue != NULL) 3844 taskqueue_free(xbb->io_taskqueue); 3845 3846 if (xbb->xbb_stats != NULL) 3847 devstat_remove_entry(xbb->xbb_stats); 3848 3849 if (xbb->xbb_stats_in != NULL) 3850 devstat_remove_entry(xbb->xbb_stats_in); 3851 3852 xbb_close_backend(xbb); 3853 3854 if (xbb->dev_mode != NULL) { 3855 free(xbb->dev_mode, M_XENBUS); 3856 xbb->dev_mode = NULL; 3857 } 3858 3859 if (xbb->dev_type != NULL) { 3860 free(xbb->dev_type, M_XENBUS); 3861 xbb->dev_type = NULL; 3862 } 3863 3864 if (xbb->dev_name != NULL) { 3865 free(xbb->dev_name, M_XENBUS); 3866 xbb->dev_name = NULL; 3867 } 3868 3869 mtx_destroy(&xbb->lock); 3870 return (0); 3871 } 3872 3873 /** 3874 * Prepare this block back device for suspension of this VM. 3875 * 3876 * \param dev NewBus device object representing this Xen Block Back instance. 3877 * 3878 * \return 0 for success, errno codes for failure. 3879 */ 3880 static int 3881 xbb_suspend(device_t dev) 3882 { 3883 #ifdef NOT_YET 3884 struct xbb_softc *sc = device_get_softc(dev); 3885 3886 /* Prevent new requests being issued until we fix things up. */ 3887 mtx_lock(&sc->xb_io_lock); 3888 sc->connected = BLKIF_STATE_SUSPENDED; 3889 mtx_unlock(&sc->xb_io_lock); 3890 #endif 3891 3892 return (0); 3893 } 3894 3895 /** 3896 * Perform any processing required to recover from a suspended state. 3897 * 3898 * \param dev NewBus device object representing this Xen Block Back instance. 3899 * 3900 * \return 0 for success, errno codes for failure. 3901 */ 3902 static int 3903 xbb_resume(device_t dev) 3904 { 3905 return (0); 3906 } 3907 3908 /** 3909 * Handle state changes expressed via the XenStore by our front-end peer. 3910 * 3911 * \param dev NewBus device object representing this Xen 3912 * Block Back instance. 3913 * \param frontend_state The new state of the front-end. 3914 * 3915 * \return 0 for success, errno codes for failure. 
3916 */ 3917 static void 3918 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3919 { 3920 struct xbb_softc *xbb = device_get_softc(dev); 3921 3922 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3923 xenbus_strstate(frontend_state), 3924 xenbus_strstate(xenbus_get_state(xbb->dev))); 3925 3926 switch (frontend_state) { 3927 case XenbusStateInitialising: 3928 break; 3929 case XenbusStateInitialised: 3930 case XenbusStateConnected: 3931 xbb_connect(xbb); 3932 break; 3933 case XenbusStateClosing: 3934 case XenbusStateClosed: 3935 mtx_lock(&xbb->lock); 3936 xbb_shutdown(xbb); 3937 mtx_unlock(&xbb->lock); 3938 if (frontend_state == XenbusStateClosed) 3939 xenbus_set_state(xbb->dev, XenbusStateClosed); 3940 break; 3941 default: 3942 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3943 frontend_state); 3944 break; 3945 } 3946 } 3947 3948 /*---------------------------- NewBus Registration ---------------------------*/ 3949 static device_method_t xbb_methods[] = { 3950 /* Device interface */ 3951 DEVMETHOD(device_probe, xbb_probe), 3952 DEVMETHOD(device_attach, xbb_attach), 3953 DEVMETHOD(device_detach, xbb_detach), 3954 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3955 DEVMETHOD(device_suspend, xbb_suspend), 3956 DEVMETHOD(device_resume, xbb_resume), 3957 3958 /* Xenbus interface */ 3959 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3960 3961 { 0, 0 } 3962 }; 3963 3964 static driver_t xbb_driver = { 3965 "xbbd", 3966 xbb_methods, 3967 sizeof(struct xbb_softc), 3968 }; 3969 devclass_t xbb_devclass; 3970 3971 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3972