/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009-2012 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
 *          Ken Merry           (Spectra Logic Corporation)
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/**
 * \file blkback.c
 *
 * \brief Device driver supporting the vending of block storage from
 *        a FreeBSD domain to other domains.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
#include <sys/sdt.h>

#include <geom/geom.h>

#include <machine/_inttypes.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <xen/xen-os.h>
#include <xen/blkif.h>
#include <xen/gnttab.h>
#include <xen/xen_intr.h>

#include <xen/interface/event_channel.h>
#include <xen/interface/grant_table.h>

#include <xen/xenbus/xenbusvar.h>

/*--------------------------- Compile-time Tunables --------------------------*/
/**
 * The maximum number of shared memory ring pages we will allow in a
 * negotiated block-front/back communication channel.  Allow enough
 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
 */
#define	XBB_MAX_RING_PAGES		32

/**
 * The maximum number of outstanding request blocks (request headers plus
 * additional segment blocks) we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_REQUESTS					\
	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)

/**
 * \brief Define to force all I/O to be performed on memory owned by the
 *        backend device, with a copy-in/out to the remote domain's memory.
 *
 * \note  This option is currently required when this driver's domain is
 *        operating in HVM mode on a system using an IOMMU.
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves.  When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain.  This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API.  For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of ring pages that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
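
/*
 * Worked example (illustrative only; the actual values depend on the
 * platform and ABI headers): assuming PAGE_SIZE is 4096, MAXPHYS is 128KB,
 * and the classic blkif ABI value of BLKIF_MAX_SEGMENTS_PER_REQUEST == 11,
 *
 *   XBB_MAX_REQUEST_SIZE         = MIN(131072, 11 * 4096) = 45056 (44KB)
 *   XBB_MAX_SEGMENTS_PER_REQUEST = MIN(UIO_MAXIOV, MIN(11, (45056/4096)+1))
 *                                = MIN(UIO_MAXIOV, MIN(11, 12)) = 11
 *   XBB_MAX_SEGMENTS_PER_REQLIST = 11
 *
 * so a single front-end request maps at most 11 pages of remote memory.
 */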

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...) __attribute__((format(printf, 3, 4)));
static int  xbb_shutdown(struct xbb_softc *xbb);

/*------------------------------ Data Structures -----------------------------*/

STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);

typedef enum {
	XBB_REQLIST_NONE	= 0x00,
	XBB_REQLIST_MAPPED	= 0x01
} xbb_reqlist_flags;

struct xbb_xen_reqlist {
	/**
	 * Back reference to the parent block back instance for this
	 * request.  Used during bio_done handling.
	 */
	struct xbb_softc	*xbb;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			 operation;

	/**
	 * Set to BLKIF_RSP_* to indicate request status.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int			 status;

	/**
	 * Number of 512 byte sectors not transferred.
	 */
	int			 residual_512b_sectors;

	/**
	 * Starting sector number of the first request in the list.
	 */
	off_t			 starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t			 next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int			 num_children;

	/**
	 * Number of I/O requests still pending on the backend.
	 */
	int			 pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int			 nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags	 flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O into our domain's address space.
	 */
	uint8_t			*kva;

	/**
	 * Base, pseudo-physical address corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t		 gnt_base;

#ifdef XBB_USE_BOUNCE_BUFFERS
	/**
	 * Pre-allocated domain local memory used to proxy remote
	 * domain memory during I/O operations.
	 */
	uint8_t			*bounce;
#endif

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t		*gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type	 ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags	 ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime		 ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list	 contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t		  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int			  nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int			  nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			  operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t		  ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t		 *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX		  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime		  ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist	 *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t	evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (accessed via vnode operations). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};

/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access. */
	struct cdev   *cdev;

	/** Cdev switch used for device backend access. */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int	       dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred   *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS

	/**
	 * \brief Array of io vectors used to handle bouncing of file reads.
	 *
	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
	 * we need some of the data from the original uio in order to
	 * bounce-out the read data.  This array serves as the temporary
	 * storage for this saved data.
	 */
	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * \brief Array of memoized bounce buffer kva offsets used
	 *        in the file based backend.
	 *
	 * Due to the way that the mapping of the memory backing an
	 * I/O transaction is handled by Xen, a second pass through
	 * the request sg elements is unavoidable.  We memoize the computed
	 * bounce address here to reduce the cost of the second walk.
	 */
	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
#endif /* XBB_USE_BOUNCE_BUFFERS */
};

/**
 * Collection of backend type specific data.
 */
union xbb_backend_data {
	struct xbb_dev_data  dev;
	struct xbb_file_data file;
};

/**
 * Function signature of backend specific I/O handlers.
 */
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
			      struct xbb_xen_reqlist *reqlist, int operation,
			      int flags);

/**
 * Per-instance configuration data.
 */
struct xbb_softc {
	/**
	 * Task-queue used to process I/O requests.
	 */
	struct taskqueue	 *io_taskqueue;

	/**
	 * Single "run the request queue" task enqueued
	 * on io_taskqueue.
	 */
	struct task		  io_task;

	/** Device type for this instance. */
	xbb_type		  device_type;

	/** NewBus device corresponding to this instance. */
	device_t		  dev;

	/** Backend specific dispatch routine for this instance. */
	xbb_dispatch_t		  dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int			  active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list   request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req	 *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist	 *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t		  kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t		  gnt_base_addr;

	/** The size of the global kva pool. */
	int			  kva_size;

	/** The size of the KVA area used for request lists. */
	int			  reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int			  reqlist_kva_pages;

	/** Bitmap of free KVA pages. */
	bitstr_t		 *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t			  otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different native abi (e.g. Intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int			  abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *        that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int			  max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int			  max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t		  flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config	  ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t	  rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t	  xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char			 *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char			 *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char			 *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode		 *vn;

	union xbb_backend_data	  backend;

	/** The native sector size of the backend. */
	u_int			  sector_size;

	/** log2 of sector_size. */
	u_int			  sector_size_shift;

	/** Size in bytes of the backend device or file. */
	off_t			  media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *        sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t		  media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *        conversion of blkif ring requests to internal xbb_xen_req
	 *        structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io().  When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx		  lock;

	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource		 *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int			  pseudo_phys_res_id;

	/**
	 * I/O statistics from BlockBack dispatch down.  These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat		 *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack.  These are the requests as
	 * we get them from BlockFront.  They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat		 *xbb_stats_in;

	/** Disable sending flush to the backend. */
	int			  disable_flush;

	/** Send a real flush for every N flush requests. */
	int			  flush_interval;

	/** Count of flush requests in the interval. */
	int			  flush_count;

	/** Don't coalesce requests if this is set. */
	int			  no_coalesce_reqs;

	/** Number of requests we have received. */
	uint64_t		  reqs_received;

	/** Number of requests we have completed. */
	uint64_t		  reqs_completed;

	/** Number of requests we queued but not pushed. */
	uint64_t		  reqs_queued_for_completion;

	/** Number of requests we completed with an error status. */
	uint64_t		  reqs_completed_with_error;

	/** How many forced dispatches (i.e. without coalescing) have happened. */
	uint64_t		  forced_dispatch;

	/** How many normal dispatches have happened. */
	uint64_t		  normal_dispatch;

	/** How many total dispatches have happened. */
	uint64_t		  total_dispatch;

	/** How many times we have run out of KVA. */
	uint64_t		  kva_shortages;

	/** How many times we have run out of request structures. */
	uint64_t		  request_shortages;

	/** Watch to wait for hotplug script execution. */
	struct xs_watch		  hotplug_watch;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_req structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_req *
xbb_get_req(struct xbb_softc *xbb)
{
	struct xbb_xen_req *req;

	req = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
		STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
		xbb->active_request_count++;
	}

	return (req);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 * \param req  The request structure to free.
 */
static inline void
xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
	mtx_assert(&xbb->lock, MA_OWNED);

	STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
	xbb->active_request_count--;

	KASSERT(xbb->active_request_count >= 0,
		("xbb_release_req: negative active count"));
}

/**
 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param req_list  The list of requests to free.
 * \param nreqs     The number of items in the list.
 */
static inline void
xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
		 int nreqs)
{
	mtx_assert(&xbb->lock, MA_OWNED);

	STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
	xbb->active_request_count -= nreqs;

	KASSERT(xbb->active_request_count >= 0,
		("xbb_release_reqs: negative active count"));
}

/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's kva region.
 *
 * \param reqlist The request structure whose kva region will be accessed.
 * \param pagenr  The page index used to compute the kva offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                kva offset.
 *
 * \return  The computed global KVA offset.
 */
static inline uint8_t *
xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}
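
/*
 * Illustrative example of the addressing math above (not additional driver
 * logic): with a 4KB page, pagenr == 2 and sector == 3 resolve to a byte
 * offset of (2 * 4096) + (3 * 512) == 9728 from the start of the request
 * list's kva region, i.e. the fourth 512b chunk within the third mapped
 * page.  The same math is reused below for the bounce-buffer and
 * pseudo-physical variants.
 */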

#ifdef XBB_USE_BOUNCE_BUFFERS
/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's local bounce memory region.
 *
 * \param reqlist The request structure whose bounce region will be accessed.
 * \param pagenr  The page index used to compute the bounce offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                bounce offset.
 *
 * \return  The computed global bounce buffer address.
 */
static inline uint8_t *
xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
#endif

/**
 * Given a page number and 512b sector offset within that page,
 * calculate an offset into the request's memory region that the
 * underlying backend device/file should use for I/O.
 *
 * \param reqlist The request structure whose I/O region will be accessed.
 * \param pagenr  The page index used to compute the I/O offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                I/O offset.
 *
 * \return  The computed global I/O address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist The request list structure whose pseudo-physical region
 *                will be accessed.
 * \param pagenr  The page index used to compute the pseudo-physical offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	return ((uintptr_t)(xbb->gnt_base_addr +
	    (uintptr_t)(reqlist->kva - xbb->kva) +
	    (PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param nr_pages  Number of pages needed.
 *
 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
 *
 * Note:  This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio.  At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	int	 first_clear;
	int	 num_clear;
	uint8_t	*free_kva;
	int	 i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page.  If there are none, we're done.
	 */
	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);

	if (first_clear == -1)
		goto bailout;

	/*
	 * Starting at the first available page, look for consecutive free
	 * pages that will satisfy the user's request.
	 */
	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
		/*
		 * If this is true, the page is used, so we have to reset
		 * the number of clear pages and the first clear page
		 * (since it pointed to a region with an insufficient number
		 * of clear pages).
		 */
		if (bit_test(xbb->kva_free, i)) {
			num_clear = 0;
			first_clear = -1;
			continue;
		}

		if (first_clear == -1)
			first_clear = i;

		/*
		 * If this is true, we've found a large enough free region
		 * to satisfy the request.
		 */
		if (++num_clear == nr_pages) {
			bit_nset(xbb->kva_free, first_clear,
				 first_clear + nr_pages - 1);

			free_kva = xbb->kva +
				(uint8_t *)((intptr_t)first_clear * PAGE_SIZE);

			KASSERT(free_kva >= (uint8_t *)xbb->kva &&
				free_kva + (nr_pages * PAGE_SIZE) <=
				(uint8_t *)xbb->ring_config.va,
				("Free KVA %p len %d out of range, "
				 "kva = %#jx, ring VA = %#jx\n", free_kva,
				 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
				 (uintmax_t)xbb->ring_config.va));
			break;
		}
	}

bailout:

	if (free_kva == NULL) {
		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
		xbb->kva_shortages++;
	}

	mtx_unlock(&xbb->lock);

	return (free_kva);
}

/**
 * Free allocated KVA.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param kva_ptr   Pointer to allocated KVA region.
 * \param nr_pages  Number of pages in the KVA region.
 */
static void
xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
{
	intptr_t start_page;

	mtx_assert(&xbb->lock, MA_OWNED);

	start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
	bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
}

/**
 * Unmap the front-end pages associated with this I/O request.
 *
 * \param reqlist  The request list structure to unmap.
 */
static void
xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
{
	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
	u_int			      i;
	u_int			      invcount;
	int			      error;

	invcount = 0;
	for (i = 0; i < reqlist->nr_segments; i++) {
		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
			continue;

		unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
		unmap[invcount].dev_bus_addr = 0;
		unmap[invcount].handle       = reqlist->gnt_handles[i];
		reqlist->gnt_handles[i]	     = GRANT_REF_INVALID;
		invcount++;
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					  unmap, invcount);
	KASSERT(error == 0, ("Grant table operation failed"));
}

/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_reqlist *
xbb_get_reqlist(struct xbb_softc *xbb)
{
	struct xbb_xen_reqlist *reqlist;

	reqlist = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
		reqlist->flags = XBB_REQLIST_NONE;
		reqlist->kva = NULL;
		reqlist->status = BLKIF_RSP_OKAY;
		reqlist->residual_512b_sectors = 0;
		reqlist->num_children = 0;
		reqlist->nr_segments = 0;
		STAILQ_INIT(&reqlist->contig_req_list);
	}

	return (reqlist);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  The request list structure to free.
 * \param wakeup   If set, wakeup the work thread if freeing this reqlist
 *                 during a resource shortage condition.
 */
static inline void
xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
		    int wakeup)
{

	mtx_assert(&xbb->lock, MA_OWNED);

	if (wakeup) {
		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
	}

	if (reqlist->kva != NULL)
		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);

	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);

	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);

	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		/*
		 * Shutdown is in progress.  See if we can
		 * progress further now that one more request
		 * has completed and been returned to the
		 * free pool.
		 */
		xbb_shutdown(xbb);
	}

	if (wakeup != 0)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}

/**
 * Request resources and do basic request setup.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param reqlist   Pointer to reqlist pointer.
 * \param ring_req  Pointer to a block ring request.
 * \param ring_idx  The ring index of this request.
 *
 * \return  0 for success, non-zero for failure.
 */
static int
xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
		  blkif_request_t *ring_req, RING_IDX ring_idx)
{
	struct xbb_xen_reqlist *nreqlist;
	struct xbb_xen_req     *nreq;

	nreqlist = NULL;
	nreq	 = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * We don't allow new resources to be allocated if we're in the
	 * process of shutting down.
	 */
	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		mtx_unlock(&xbb->lock);
		return (1);
	}

	/*
	 * Allocate a reqlist if the caller doesn't have one already.
	 */
	if (*reqlist == NULL) {
		nreqlist = xbb_get_reqlist(xbb);
		if (nreqlist == NULL)
			goto bailout_error;
	}

	/* We always allocate a request. */
	nreq = xbb_get_req(xbb);
	if (nreq == NULL)
		goto bailout_error;

	mtx_unlock(&xbb->lock);

	if (*reqlist == NULL) {
		*reqlist = nreqlist;
		nreqlist->operation = ring_req->operation;
		nreqlist->starting_sector_number = ring_req->sector_number;
		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
				   links);
	}

	nreq->reqlist = *reqlist;
	nreq->req_ring_idx = ring_idx;
	nreq->id = ring_req->id;
	nreq->operation = ring_req->operation;

	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
		nreq->ring_req = &nreq->ring_req_storage;
	} else {
		nreq->ring_req = ring_req;
	}

	binuptime(&nreq->ds_t0);
	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
	(*reqlist)->num_children++;
	(*reqlist)->nr_segments += ring_req->nr_segments;

	return (0);

bailout_error:

	/*
	 * We're out of resources, so set the shortage flag.  The next time
	 * a request is released, we'll try waking up the work thread to
	 * see if we can allocate more resources.
	 */
	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
	xbb->request_shortages++;

	if (nreq != NULL)
		xbb_release_req(xbb, nreq);

	if (nreqlist != NULL)
		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);

	mtx_unlock(&xbb->lock);

	return (1);
}
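
/*
 * Illustrative summary of the resource-shortage protocol implemented by
 * xbb_get_resources() above together with xbb_get_kva() and
 * xbb_release_reqlist() (descriptive only, not additional driver logic):
 *
 *   1. A failed allocation sets XBBF_RESOURCE_SHORTAGE and bumps the
 *      request_shortages (or kva_shortages) counter; xbb_run_queue() then
 *      stops gathering and goes back to sleep.
 *   2. When a request list is later released with wakeup != 0 while the
 *      shortage flag is set, the flag is cleared and io_task is re-enqueued
 *      on io_taskqueue, so ring processing resumes automatically.
 */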

/**
 * Create and queue a response to a blkif request.
 *
 * \param xbb     Per-instance xbb configuration structure.
 * \param req     The request structure to which to respond.
 * \param status  The status code to report.  See BLKIF_RSP_*
 *                in sys/xen/interface/io/blkif.h.
 */
static void
xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
	blkif_response_t *resp;

	/*
	 * The mutex is required here, and should be held across this call
	 * until after the subsequent call to xbb_push_responses().  This
	 * is to guarantee that another context won't queue responses and
	 * push them while we're active.
	 *
	 * That could lead to the other end being notified of responses
	 * before the resources have been freed on this end.  The other end
	 * would then be able to queue additional I/O, and we may run out
	 * of resources because we haven't freed them all yet.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	/*
	 * Place on the response ring for the relevant domain.
	 * For now, only the spacing between entries is different
	 * in the different ABIs, not the response entry layout.
	 */
	switch (xbb->abi) {
	case BLKIF_PROTOCOL_NATIVE:
		resp = RING_GET_RESPONSE(&xbb->rings.native,
					 xbb->rings.native.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_32:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_32,
				      xbb->rings.x86_32.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_64:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_64,
				      xbb->rings.x86_64.rsp_prod_pvt);
		break;
	default:
		panic("Unexpected blkif protocol ABI.");
	}

	resp->id        = req->id;
	resp->operation = req->operation;
	resp->status    = status;

	if (status != BLKIF_RSP_OKAY)
		xbb->reqs_completed_with_error++;

	xbb->rings.common.rsp_prod_pvt++;

	xbb->reqs_queued_for_completion++;
}

/**
 * Send queued responses to blkif requests.
 *
 * \param xbb            Per-instance xbb configuration structure.
 * \param run_taskqueue  Flag that is set to 1 if the taskqueue
 *                       should be run, 0 if it does not need to be run.
 * \param notify         Flag that is set to 1 if the other end should be
 *                       notified via irq, 0 if the other end should not be
 *                       notified.
 */
static void
xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
{
	int more_to_do;

	/*
	 * The mutex is required here.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	more_to_do = 0;

	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);

	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
		/*
		 * Tail check for pending requests.  Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
		more_to_do = 1;
	}

	xbb->reqs_completed += xbb->reqs_queued_for_completion;
	xbb->reqs_queued_for_completion = 0;

	*run_taskqueue = more_to_do;
}

/**
 * Complete a request list.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 */
static void
xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_xen_req *nreq;
	off_t		    sectors_sent;
	int		    notify, run_taskqueue;

	sectors_sent = 0;

	if (reqlist->flags & XBB_REQLIST_MAPPED)
		xbb_unmap_reqlist(reqlist);

	mtx_lock(&xbb->lock);

	/*
	 * All I/O is done, send the response.  A lock is not necessary
	 * to protect the request list, because all requests have
	 * completed.  Therefore this is the only context accessing this
	 * reqlist right now.  However, in order to make sure that no one
	 * else queues responses onto the queue or pushes them to the other
	 * side while we're active, we need to hold the lock across the
	 * calls to xbb_queue_response() and xbb_push_responses().
	 */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		off_t cur_sectors_sent;

		/* Put this response on the ring, but don't push yet */
		xbb_queue_response(xbb, nreq, reqlist->status);

		/* We don't report bytes sent if there is an error. */
		if (reqlist->status == BLKIF_RSP_OKAY)
			cur_sectors_sent = nreq->nr_512b_sectors;
		else
			cur_sectors_sent = 0;

		sectors_sent += cur_sectors_sent;

		devstat_end_transaction(xbb->xbb_stats_in,
					/*bytes*/cur_sectors_sent << 9,
					reqlist->ds_tag_type,
					reqlist->ds_trans_type,
					/*now*/NULL,
					/*then*/&nreq->ds_t0);
	}

	/*
	 * Take out any sectors not sent.  If we wind up negative (which
	 * might happen if an error is reported as well as a residual), just
	 * report 0 sectors sent.
	 */
	sectors_sent -= reqlist->residual_512b_sectors;
	if (sectors_sent < 0)
		sectors_sent = 0;

	devstat_end_transaction(xbb->xbb_stats,
				/*bytes*/ sectors_sent << 9,
				reqlist->ds_tag_type,
				reqlist->ds_trans_type,
				/*now*/NULL,
				/*then*/&reqlist->ds_t0);

	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);

	xbb_push_responses(xbb, &run_taskqueue, &notify);

	mtx_unlock(&xbb->lock);

	if (run_taskqueue)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	if (notify)
		xen_intr_signal(xbb->xen_intr_handle);
}

/**
 * Completion handler for buffer I/O requests issued by the device
 * backend driver.
 *
 * \param bio  The buffer I/O request on which to perform completion
 *             processing.
 */
static void
xbb_bio_done(struct bio *bio)
{
	struct xbb_softc       *xbb;
	struct xbb_xen_reqlist *reqlist;

	reqlist = bio->bio_caller1;
	xbb     = reqlist->xbb;

	reqlist->residual_512b_sectors += bio->bio_resid >> 9;

	/*
	 * This is a bit imprecise.  With aggregated I/O a single
	 * request list can contain multiple front-end requests and
	 * multiple bios may point to a single request.  By carefully
	 * walking the request list, we could map residuals and errors
	 * back to the original front-end request, but the interface
	 * isn't sufficiently rich for us to properly report the error.
	 * So, we just treat the entire request list as having failed if an
	 * error occurs on any part.  And, if an error occurs, we treat
	 * the amount of data transferred as 0.
	 *
	 * For residuals, we report it on the overall aggregated device,
	 * but not on the individual requests, since we don't currently
	 * do the work to determine which front-end request the
	 * residual applies to.
	 */
	if (bio->bio_error) {
		DPRINTF("BIO returned error %d for operation on device %s\n",
			bio->bio_error, xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;

		if (bio->bio_error == ENXIO
		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
			/*
			 * Backend device has disappeared.  Signal the
			 * front-end that we (the device proxy) want to
			 * go away.
			 */
			xenbus_set_state(xbb->dev, XenbusStateClosing);
		}
	}

#ifdef XBB_USE_BOUNCE_BUFFERS
	if (bio->bio_cmd == BIO_READ) {
		vm_offset_t kva_offset;

		kva_offset = (vm_offset_t)bio->bio_data
			   - (vm_offset_t)reqlist->bounce;
		memcpy((uint8_t *)reqlist->kva + kva_offset,
		       bio->bio_data, bio->bio_bcount);
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

	/*
	 * Decrement the pending count for the request list.  When we're
	 * done with the requests, send status back for all of them.
	 */
	if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
		xbb_complete_reqlist(xbb, reqlist);

	g_destroy_bio(bio);
}

/**
 * Parse a blkif request into an internal request structure and send
 * it to the backend for processing.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 *
 * \return  On success, 0.  For resource shortages, non-zero.
 *
 * This routine performs the backend common aspects of request parsing
 * including compiling an internal request structure, parsing the S/G
 * list and any secondary ring requests in which they may reside, and
 * the mapping of front-end I/O pages into our domain.
 */
static int
xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_sg		     *xbb_sg;
	struct gnttab_map_grant_ref  *map;
	struct blkif_request_segment *sg;
	struct blkif_request_segment *last_block_sg;
	struct xbb_xen_req	     *nreq;
	u_int			      nseg;
	u_int			      seg_idx;
	u_int			      block_segs;
	int			      nr_sects;
	int			      total_sects;
	int			      operation;
	uint8_t			      bio_flags;
	int			      error;

	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
	bio_flags	     = 0;
	total_sects	     = 0;
	nr_sects	     = 0;

	/*
	 * First determine whether we have enough free KVA to satisfy this
	 * request list.  If not, tell xbb_run_queue() so it can go to
	 * sleep until we have more KVA.
	 */
	reqlist->kva = NULL;
	if (reqlist->nr_segments != 0) {
		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
		if (reqlist->kva == NULL) {
			/*
			 * If we're out of KVA, return ENOMEM.
			 */
			return (ENOMEM);
		}
	}

	binuptime(&reqlist->ds_t0);
	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);

	switch (reqlist->operation) {
	case BLKIF_OP_WRITE_BARRIER:
		bio_flags	     |= BIO_ORDERED;
		reqlist->ds_tag_type  = DEVSTAT_TAG_ORDERED;
		/* FALLTHROUGH */
	case BLKIF_OP_WRITE:
		operation = BIO_WRITE;
		reqlist->ds_trans_type = DEVSTAT_WRITE;
		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
			DPRINTF("Attempt to write to read only device %s\n",
				xbb->dev_name);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
		break;
	case BLKIF_OP_READ:
		operation = BIO_READ;
		reqlist->ds_trans_type = DEVSTAT_READ;
		break;
	case BLKIF_OP_FLUSH_DISKCACHE:
		/*
		 * If this is true, the user has requested that we disable
		 * flush support.  So we just complete the requests
		 * successfully.
		 */
		if (xbb->disable_flush != 0) {
			goto send_response;
		}

		/*
		 * The user has requested that we only send a real flush
		 * for every N flush requests.  So keep count, and either
		 * complete the request immediately or queue it for the
		 * backend.
		 */
		if (xbb->flush_interval != 0) {
			if (++(xbb->flush_count) < xbb->flush_interval) {
				goto send_response;
			} else
				xbb->flush_count = 0;
		}

		operation = BIO_FLUSH;
		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
		goto do_dispatch;
		/*NOTREACHED*/
	default:
		DPRINTF("error: unknown block io operation [%d]\n",
			reqlist->operation);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	reqlist->xbb = xbb;
	xbb_sg	     = xbb->xbb_sgs;
	map	     = xbb->maps;
	seg_idx	     = 0;

	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		blkif_request_t	*ring_req;
		RING_IDX	 req_ring_idx;
		u_int		 req_seg_idx;

		ring_req	      = nreq->ring_req;
		req_ring_idx	      = nreq->req_ring_idx;
		nr_sects	      = 0;
		nseg		      = ring_req->nr_segments;
		nreq->nr_pages	      = nseg;
		nreq->nr_512b_sectors = 0;
		req_seg_idx	      = 0;
		sg		      = NULL;

		/* Check that number of segments is sane. */
		if (__predict_false(nseg == 0)
		 || __predict_false(nseg > xbb->max_request_segments)) {
			DPRINTF("Bad number of segments in request (%d)\n",
				nseg);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		block_segs    = nseg;
		sg	      = ring_req->seg;
		last_block_sg = sg + block_segs;

		while (sg < last_block_sg) {
			KASSERT(seg_idx <
				XBB_MAX_SEGMENTS_PER_REQLIST,
				("seg_idx %d is too large, max "
				 "segs %d\n", seg_idx,
				 XBB_MAX_SEGMENTS_PER_REQLIST));

			xbb_sg->first_sect = sg->first_sect;
			xbb_sg->last_sect  = sg->last_sect;
			xbb_sg->nsect =
			    (int8_t)(sg->last_sect -
			    sg->first_sect + 1);

			if ((sg->last_sect >= (PAGE_SIZE >> 9))
			 || (xbb_sg->nsect <= 0)) {
				reqlist->status = BLKIF_RSP_ERROR;
				goto send_response;
			}

			nr_sects += xbb_sg->nsect;
			map->host_addr = xbb_get_gntaddr(reqlist,
							 seg_idx, /*sector*/0);
			KASSERT(map->host_addr + PAGE_SIZE <=
				xbb->ring_config.gnt_addr,
				("Host address %#jx len %d overlaps "
				 "ring address %#jx\n",
				 (uintmax_t)map->host_addr, PAGE_SIZE,
				 (uintmax_t)xbb->ring_config.gnt_addr));

			map->flags = GNTMAP_host_map;
			map->ref   = sg->gref;
			map->dom   = xbb->otherend_id;
			if (operation == BIO_WRITE)
				map->flags |= GNTMAP_readonly;
			sg++;
			map++;
			xbb_sg++;
			seg_idx++;
			req_seg_idx++;
		}

		/* Convert to the disk's sector size */
		nreq->nr_512b_sectors = nr_sects;
		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
		total_sects += nr_sects;

		if ((nreq->nr_512b_sectors &
		    ((xbb->sector_size >> 9) - 1)) != 0) {
			device_printf(xbb->dev, "%s: I/O size (%d) is not "
				      "a multiple of the backing store sector "
				      "size (%d)\n", __func__,
				      nreq->nr_512b_sectors << 9,
				      xbb->sector_size);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					  xbb->maps, reqlist->nr_segments);
	if (error != 0)
		panic("Grant table operation failed (%d)", error);

	reqlist->flags |= XBB_REQLIST_MAPPED;

	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
	     seg_idx++, map++) {
		if (__predict_false(map->status != 0)) {
			DPRINTF("invalid buffer -- could not remap "
				"it (%d)\n", map->status);
			DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
				"0x%x ref 0x%x, dom %d\n", seg_idx,
				map->host_addr, map->flags, map->ref,
				map->dom);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		reqlist->gnt_handles[seg_idx] = map->handle;
	}
	if (reqlist->starting_sector_number + total_sects >
	    xbb->media_num_sectors) {
		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
			"extends past end of device %s\n",
			operation == BIO_READ ? "read" : "write",
			reqlist->starting_sector_number,
			reqlist->starting_sector_number + total_sects,
			xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

do_dispatch:

	error = xbb->dispatch_io(xbb,
				 reqlist,
				 operation,
				 bio_flags);

	if (error != 0) {
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	return (0);

send_response:

	xbb_complete_reqlist(xbb, reqlist);

	return (0);
}

static __inline int
xbb_count_sects(blkif_request_t *ring_req)
{
	int i;
	int cur_size = 0;

	for (i = 0; i < ring_req->nr_segments; i++) {
		int nsect;

		nsect = (int8_t)(ring_req->seg[i].last_sect -
				 ring_req->seg[i].first_sect + 1);
		if (nsect <= 0)
			break;

		cur_size += nsect;
	}

	return (cur_size);
}

/**
 * Process incoming requests from the shared communication ring in response
 * to a signal on the ring's event channel.
 *
 * \param context  Callback argument registered during task initialization -
 *                 the xbb_softc for this instance.
 * \param pending  The number of taskqueue_enqueue events that have
 *                 occurred since this handler was last run.
 */
static void
xbb_run_queue(void *context, int pending)
{
	struct xbb_softc       *xbb;
	blkif_back_rings_t     *rings;
	RING_IDX		rp;
	uint64_t		cur_sector;
	int			cur_operation;
	struct xbb_xen_reqlist *reqlist;

	xbb   = (struct xbb_softc *)context;
	rings = &xbb->rings;

	/*
	 * Work gather and dispatch loop.  Note that we have a bias here
	 * towards gathering I/O sent by blockfront.  We first gather up
	 * everything in the ring, as long as we have resources.  Then we
	 * dispatch one request, and then attempt to gather up any
	 * additional requests that have come in while we were dispatching
	 * the request.
	 *
	 * This allows us to get a clearer picture (via devstat) of how
	 * many requests blockfront is queueing to us at any given time.
	 */
	for (;;) {
		int retval;

		/*
		 * Initialize reqlist to the last element in the pending
		 * queue, if there is one.  This allows us to add more
		 * requests to that request list, if we have room.
		 */
		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
				      xbb_xen_reqlist, links);
		if (reqlist != NULL) {
			cur_sector = reqlist->next_contig_sector;
			cur_operation = reqlist->operation;
		} else {
			cur_operation = 0;
			cur_sector    = 0;
		}

		/*
		 * Cache req_prod to avoid accessing a cache line shared
		 * with the frontend.
		 */
		rp = rings->common.sring->req_prod;

		/* Ensure we see queued requests up to 'rp'. */
		rmb();

		/**
		 * Run so long as there is work to consume and the generation
		 * of a response will not overflow the ring.
		 *
		 * @note There's a 1 to 1 relationship between requests and
		 *       responses, so an overflow should never occur.  This
		 *       test is to protect our domain from digesting bogus
		 *       data.  Shouldn't we log this?
		 */
		while (rings->common.req_cons != rp
		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
						  rings->common.req_cons) == 0) {
			blkif_request_t	 ring_req_storage;
			blkif_request_t	*ring_req;
			int		 cur_size;

			switch (xbb->abi) {
			case BLKIF_PROTOCOL_NATIVE:
				ring_req = RING_GET_REQUEST(&xbb->rings.native,
				    rings->common.req_cons);
				break;
			case BLKIF_PROTOCOL_X86_32:
			{
				struct blkif_x86_32_request *ring_req32;

				ring_req32 = RING_GET_REQUEST(
				    &xbb->rings.x86_32, rings->common.req_cons);
				blkif_get_x86_32_req(&ring_req_storage,
				    ring_req32);
				ring_req = &ring_req_storage;
				break;
			}
			case BLKIF_PROTOCOL_X86_64:
			{
				struct blkif_x86_64_request *ring_req64;

				ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
				    rings->common.req_cons);
				blkif_get_x86_64_req(&ring_req_storage,
				    ring_req64);
				ring_req = &ring_req_storage;
				break;
			}
			default:
				panic("Unexpected blkif protocol ABI.");
				/* NOTREACHED */
			}

			/*
			 * Check for situations that would require closing
			 * off this I/O for further coalescing:
			 *  - Coalescing is turned off.
			 *  - Current I/O is out of sequence with the previous
			 *    I/O.
			 *  - Coalesced I/O would be too large.
			 */
			if ((reqlist != NULL)
			 && ((xbb->no_coalesce_reqs != 0)
			  || ((xbb->no_coalesce_reqs == 0)
			   && ((ring_req->sector_number != cur_sector)
			    || (ring_req->operation != cur_operation)
			    || ((ring_req->nr_segments + reqlist->nr_segments) >
				 xbb->max_reqlist_segments))))) {
				reqlist = NULL;
			}

			/*
			 * Grab and check for all resources in one shot.
			 * If we can't get all of the resources we need,
			 * the shortage is noted and the thread will get
			 * woken up when more resources are available.
			 */
			retval = xbb_get_resources(xbb, &reqlist, ring_req,
						   xbb->rings.common.req_cons);

			if (retval != 0) {
				/*
				 * Resource shortage has been recorded.
				 * We'll be scheduled to run once a request
				 * object frees up due to a completion.
				 */
				break;
			}

			/*
			 * Signify that we can overwrite this request with
			 * a response by incrementing our consumer index.
			 * The response won't be generated until after
			 * we've already consumed all necessary data out
			 * of the version of the request in the ring buffer
			 * (for native mode).  We must update the consumer
			 * index before issuing back-end I/O so there is
			 * no possibility that it will complete and a
			 * response be generated before we make room in
			 * the queue for that response.
			 */
			xbb->rings.common.req_cons++;
			xbb->reqs_received++;

			cur_size = xbb_count_sects(ring_req);
			cur_sector = ring_req->sector_number + cur_size;
			reqlist->next_contig_sector = cur_sector;
			cur_operation = ring_req->operation;
		}

		/* Check for I/O to dispatch */
		reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
		if (reqlist == NULL) {
			/*
			 * We're out of work to do, put the task queue to
			 * sleep.
			 */
			break;
		}

		/*
		 * Grab the first request off the queue and attempt
		 * to dispatch it.
1991 */ 1992 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1993 1994 retval = xbb_dispatch_io(xbb, reqlist); 1995 if (retval != 0) { 1996 /* 1997 * xbb_dispatch_io() returns non-zero only when 1998 * there is a resource shortage. If that's the 1999 * case, re-queue this request on the head of the 2000 * queue, and go to sleep until we have more 2001 * resources. 2002 */ 2003 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 2004 reqlist, links); 2005 break; 2006 } else { 2007 /* 2008 * If we still have anything on the queue after 2009 * removing the head entry, that is because we 2010 * met one of the criteria to create a new 2011 * request list (outlined above), and we'll call 2012 * that a forced dispatch for statistical purposes. 2013 * 2014 * Otherwise, if there is only one element on the 2015 * queue, we coalesced everything available on 2016 * the ring and we'll call that a normal dispatch. 2017 */ 2018 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2019 2020 if (reqlist != NULL) 2021 xbb->forced_dispatch++; 2022 else 2023 xbb->normal_dispatch++; 2024 2025 xbb->total_dispatch++; 2026 } 2027 } 2028 } 2029 2030 /** 2031 * Interrupt handler bound to the shared ring's event channel. 2032 * 2033 * \param arg Callback argument registered during event channel 2034 * binding - the xbb_softc for this instance. 2035 */ 2036 static int 2037 xbb_filter(void *arg) 2038 { 2039 struct xbb_softc *xbb; 2040 2041 /* Defer to taskqueue thread. */ 2042 xbb = (struct xbb_softc *)arg; 2043 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2044 2045 return (FILTER_HANDLED); 2046 } 2047 2048 SDT_PROVIDER_DEFINE(xbb); 2049 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 2050 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 2051 "uint64_t"); 2052 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 2053 "uint64_t", "uint64_t"); 2054 2055 /*----------------------------- Backend Handlers -----------------------------*/ 2056 /** 2057 * Backend handler for character device access. 2058 * 2059 * \param xbb Per-instance xbb configuration structure. 2060 * \param reqlist Allocated internal request list structure. 2061 * \param operation BIO_* I/O operation code. 2062 * \param bio_flags Additional bio_flag data to pass to any generated 2063 * bios (e.g. BIO_ORDERED). 2064 * 2065 * \return 0 for success, errno codes for failure.
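 *
 * Illustrative arithmetic (values assumed for the example, not taken
 * from a real request): with a 512 byte sector size, sector_size_shift
 * is 9, so a request list starting at sector 2048 yields
 *
 *   bio_offset = (off_t)2048 << 9 = 1048576 bytes (1 MiB)
 *
 * which is the byte offset handed to the device's d_strategy routine.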
2066 */ 2067 static int 2068 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2069 int operation, int bio_flags) 2070 { 2071 struct xbb_dev_data *dev_data; 2072 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2073 off_t bio_offset; 2074 struct bio *bio; 2075 struct xbb_sg *xbb_sg; 2076 u_int nbio; 2077 u_int bio_idx; 2078 u_int nseg; 2079 u_int seg_idx; 2080 int error; 2081 2082 dev_data = &xbb->backend.dev; 2083 bio_offset = (off_t)reqlist->starting_sector_number 2084 << xbb->sector_size_shift; 2085 error = 0; 2086 nbio = 0; 2087 bio_idx = 0; 2088 2089 if (operation == BIO_FLUSH) { 2090 bio = g_new_bio(); 2091 if (__predict_false(bio == NULL)) { 2092 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2093 error = ENOMEM; 2094 return (error); 2095 } 2096 2097 bio->bio_cmd = BIO_FLUSH; 2098 bio->bio_flags |= BIO_ORDERED; 2099 bio->bio_dev = dev_data->cdev; 2100 bio->bio_offset = 0; 2101 bio->bio_data = 0; 2102 bio->bio_done = xbb_bio_done; 2103 bio->bio_caller1 = reqlist; 2104 bio->bio_pblkno = 0; 2105 2106 reqlist->pendcnt = 1; 2107 2108 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2109 device_get_unit(xbb->dev)); 2110 2111 (*dev_data->csw->d_strategy)(bio); 2112 2113 return (0); 2114 } 2115 2116 xbb_sg = xbb->xbb_sgs; 2117 bio = NULL; 2118 nseg = reqlist->nr_segments; 2119 2120 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2121 2122 /* 2123 * KVA will not be contiguous, so any additional 2124 * I/O will need to be represented in a new bio. 2125 */ 2126 if ((bio != NULL) 2127 && (xbb_sg->first_sect != 0)) { 2128 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2129 printf("%s: Discontiguous I/O request " 2130 "from domain %d ends on " 2131 "non-sector boundary\n", 2132 __func__, xbb->otherend_id); 2133 error = EINVAL; 2134 goto fail_free_bios; 2135 } 2136 bio = NULL; 2137 } 2138 2139 if (bio == NULL) { 2140 /* 2141 * Make sure that the start of this bio is 2142 * aligned to a device sector. 2143 */ 2144 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2145 printf("%s: Misaligned I/O request " 2146 "from domain %d\n", __func__, 2147 xbb->otherend_id); 2148 error = EINVAL; 2149 goto fail_free_bios; 2150 } 2151 2152 bio = bios[nbio++] = g_new_bio(); 2153 if (__predict_false(bio == NULL)) { 2154 error = ENOMEM; 2155 goto fail_free_bios; 2156 } 2157 bio->bio_cmd = operation; 2158 bio->bio_flags |= bio_flags; 2159 bio->bio_dev = dev_data->cdev; 2160 bio->bio_offset = bio_offset; 2161 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2162 xbb_sg->first_sect); 2163 bio->bio_done = xbb_bio_done; 2164 bio->bio_caller1 = reqlist; 2165 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2166 } 2167 2168 bio->bio_length += xbb_sg->nsect << 9; 2169 bio->bio_bcount = bio->bio_length; 2170 bio_offset += xbb_sg->nsect << 9; 2171 2172 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2173 2174 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2175 printf("%s: Discontiguous I/O request " 2176 "from domain %d ends on " 2177 "non-sector boundary\n", 2178 __func__, xbb->otherend_id); 2179 error = EINVAL; 2180 goto fail_free_bios; 2181 } 2182 /* 2183 * KVA will not be contiguous, so any additional 2184 * I/O will need to be represented in a new bio. 
2185 */ 2186 bio = NULL; 2187 } 2188 } 2189 2190 reqlist->pendcnt = nbio; 2191 2192 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2193 { 2194 #ifdef XBB_USE_BOUNCE_BUFFERS 2195 vm_offset_t kva_offset; 2196 2197 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2198 - (vm_offset_t)reqlist->bounce; 2199 if (operation == BIO_WRITE) { 2200 memcpy(bios[bio_idx]->bio_data, 2201 (uint8_t *)reqlist->kva + kva_offset, 2202 bios[bio_idx]->bio_bcount); 2203 } 2204 #endif 2205 if (operation == BIO_READ) { 2206 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2207 device_get_unit(xbb->dev), 2208 bios[bio_idx]->bio_offset, 2209 bios[bio_idx]->bio_length); 2210 } else if (operation == BIO_WRITE) { 2211 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2212 device_get_unit(xbb->dev), 2213 bios[bio_idx]->bio_offset, 2214 bios[bio_idx]->bio_length); 2215 } 2216 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2217 } 2218 2219 return (error); 2220 2221 fail_free_bios: 2222 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2223 g_destroy_bio(bios[bio_idx]); 2224 2225 return (error); 2226 } 2227 2228 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2229 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2230 "uint64_t"); 2231 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2232 "uint64_t", "uint64_t"); 2233 2234 /** 2235 * Backend handler for file access. 2236 * 2237 * \param xbb Per-instance xbb configuration structure. 2238 * \param reqlist Allocated internal request list. 2239 * \param operation BIO_* I/O operation code. 2240 * \param flags Additional bio_flag data to pass to any generated bios 2241 * (e.g. BIO_ORDERED).. 2242 * 2243 * \return 0 for success, errno codes for failure. 2244 */ 2245 static int 2246 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2247 int operation, int flags) 2248 { 2249 struct xbb_file_data *file_data; 2250 u_int seg_idx; 2251 u_int nseg; 2252 struct uio xuio; 2253 struct xbb_sg *xbb_sg; 2254 struct iovec *xiovec; 2255 #ifdef XBB_USE_BOUNCE_BUFFERS 2256 void **p_vaddr; 2257 int saved_uio_iovcnt; 2258 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2259 int error; 2260 2261 file_data = &xbb->backend.file; 2262 error = 0; 2263 bzero(&xuio, sizeof(xuio)); 2264 2265 switch (operation) { 2266 case BIO_READ: 2267 xuio.uio_rw = UIO_READ; 2268 break; 2269 case BIO_WRITE: 2270 xuio.uio_rw = UIO_WRITE; 2271 break; 2272 case BIO_FLUSH: { 2273 struct mount *mountpoint; 2274 2275 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2276 device_get_unit(xbb->dev)); 2277 2278 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2279 2280 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2281 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2282 VOP_UNLOCK(xbb->vn, 0); 2283 2284 vn_finished_write(mountpoint); 2285 2286 goto bailout_send_response; 2287 /* NOTREACHED */ 2288 } 2289 default: 2290 panic("invalid operation %d", operation); 2291 /* NOTREACHED */ 2292 } 2293 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2294 << xbb->sector_size_shift; 2295 xuio.uio_segflg = UIO_SYSSPACE; 2296 xuio.uio_iov = file_data->xiovecs; 2297 xuio.uio_iovcnt = 0; 2298 xbb_sg = xbb->xbb_sgs; 2299 nseg = reqlist->nr_segments; 2300 2301 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2302 2303 /* 2304 * If the first sector is not 0, the KVA will 2305 * not be contiguous and we'll need to go on 2306 * to another segment. 
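 *
 * (For example, a segment with first_sect == 1 starts 512 bytes
 * into its shared page, so it cannot extend the previous iovec's
 * KVA range and a new iovec must be started for it.)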
2307 */ 2308 if (xbb_sg->first_sect != 0) 2309 xiovec = NULL; 2310 2311 if (xiovec == NULL) { 2312 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2313 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2314 seg_idx, xbb_sg->first_sect); 2315 #ifdef XBB_USE_BOUNCE_BUFFERS 2316 /* 2317 * Store the address of the incoming 2318 * buffer at this particular offset 2319 * as well, so we can do the copy 2320 * later without having to do more 2321 * work to recalculate this address. 2322 */ 2323 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2324 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2325 xbb_sg->first_sect); 2326 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2327 xiovec->iov_len = 0; 2328 xuio.uio_iovcnt++; 2329 } 2330 2331 xiovec->iov_len += xbb_sg->nsect << 9; 2332 2333 xuio.uio_resid += xbb_sg->nsect << 9; 2334 2335 /* 2336 * If the last sector is not the full page 2337 * size count, the next segment will not be 2338 * contiguous in KVA and we need a new iovec. 2339 */ 2340 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2341 xiovec = NULL; 2342 } 2343 2344 xuio.uio_td = curthread; 2345 2346 #ifdef XBB_USE_BOUNCE_BUFFERS 2347 saved_uio_iovcnt = xuio.uio_iovcnt; 2348 2349 if (operation == BIO_WRITE) { 2350 /* Copy the write data to the local buffer. */ 2351 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2352 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2353 seg_idx++, xiovec++, p_vaddr++) { 2354 2355 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2356 } 2357 } else { 2358 /* 2359 * We only need to save off the iovecs in the case of a 2360 * read, because the copy for the read happens after the 2361 * VOP_READ(). (The uio will get modified in that call 2362 * sequence.) 2363 */ 2364 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2365 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2366 } 2367 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2368 2369 switch (operation) { 2370 case BIO_READ: 2371 2372 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2373 device_get_unit(xbb->dev), xuio.uio_offset, 2374 xuio.uio_resid); 2375 2376 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2377 2378 /* 2379 * UFS pays attention to IO_DIRECT for reads. If the 2380 * DIRECTIO option is configured into the kernel, it calls 2381 * ffs_rawread(). But that only works for single-segment 2382 * uios with user space addresses. In our case, with a 2383 * kernel uio, it still reads into the buffer cache, but it 2384 * will just try to release the buffer from the cache later 2385 * on in ffs_read(). 2386 * 2387 * ZFS does not pay attention to IO_DIRECT for reads. 2388 * 2389 * UFS does not pay attention to IO_SYNC for reads. 2390 * 2391 * ZFS pays attention to IO_SYNC (which translates into the 2392 * Solaris define FRSYNC for zfs_read()) for reads. It 2393 * attempts to sync the file before reading. 2394 * 2395 * So, to attempt to provide some barrier semantics in the 2396 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2397 */ 2398 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2399 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2400 2401 VOP_UNLOCK(xbb->vn, 0); 2402 break; 2403 case BIO_WRITE: { 2404 struct mount *mountpoint; 2405 2406 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2407 device_get_unit(xbb->dev), xuio.uio_offset, 2408 xuio.uio_resid); 2409 2410 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2411 2412 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2413 2414 /* 2415 * UFS pays attention to IO_DIRECT for writes. The write 2416 * is done asynchronously. 
(Normally the write would just 2417 * get put into cache. 2418 * 2419 * UFS pays attention to IO_SYNC for writes. It will 2420 * attempt to write the buffer out synchronously if that 2421 * flag is set. 2422 * 2423 * ZFS does not pay attention to IO_DIRECT for writes. 2424 * 2425 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2426 * for writes. It will flush the transaction from the 2427 * cache before returning. 2428 * 2429 * So if we've got the BIO_ORDERED flag set, we want 2430 * IO_SYNC in either the UFS or ZFS case. 2431 */ 2432 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2433 IO_SYNC : 0, file_data->cred); 2434 VOP_UNLOCK(xbb->vn, 0); 2435 2436 vn_finished_write(mountpoint); 2437 2438 break; 2439 } 2440 default: 2441 panic("invalid operation %d", operation); 2442 /* NOTREACHED */ 2443 } 2444 2445 #ifdef XBB_USE_BOUNCE_BUFFERS 2446 /* We only need to copy here for read operations */ 2447 if (operation == BIO_READ) { 2448 2449 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2450 xiovec = file_data->saved_xiovecs; 2451 seg_idx < saved_uio_iovcnt; seg_idx++, 2452 xiovec++, p_vaddr++) { 2453 2454 /* 2455 * Note that we have to use the copy of the 2456 * io vector we made above. uiomove() modifies 2457 * the uio and its referenced vector as uiomove 2458 * performs the copy, so we can't rely on any 2459 * state from the original uio. 2460 */ 2461 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2462 } 2463 } 2464 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2465 2466 bailout_send_response: 2467 2468 if (error != 0) 2469 reqlist->status = BLKIF_RSP_ERROR; 2470 2471 xbb_complete_reqlist(xbb, reqlist); 2472 2473 return (0); 2474 } 2475 2476 /*--------------------------- Backend Configuration --------------------------*/ 2477 /** 2478 * Close and cleanup any backend device/file specific state for this 2479 * block back instance. 2480 * 2481 * \param xbb Per-instance xbb configuration structure. 2482 */ 2483 static void 2484 xbb_close_backend(struct xbb_softc *xbb) 2485 { 2486 DROP_GIANT(); 2487 DPRINTF("closing dev=%s\n", xbb->dev_name); 2488 if (xbb->vn) { 2489 int flags = FREAD; 2490 2491 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2492 flags |= FWRITE; 2493 2494 switch (xbb->device_type) { 2495 case XBB_TYPE_DISK: 2496 if (xbb->backend.dev.csw) { 2497 dev_relthread(xbb->backend.dev.cdev, 2498 xbb->backend.dev.dev_ref); 2499 xbb->backend.dev.csw = NULL; 2500 xbb->backend.dev.cdev = NULL; 2501 } 2502 break; 2503 case XBB_TYPE_FILE: 2504 break; 2505 case XBB_TYPE_NONE: 2506 default: 2507 panic("Unexpected backend type."); 2508 break; 2509 } 2510 2511 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2512 xbb->vn = NULL; 2513 2514 switch (xbb->device_type) { 2515 case XBB_TYPE_DISK: 2516 break; 2517 case XBB_TYPE_FILE: 2518 if (xbb->backend.file.cred != NULL) { 2519 crfree(xbb->backend.file.cred); 2520 xbb->backend.file.cred = NULL; 2521 } 2522 break; 2523 case XBB_TYPE_NONE: 2524 default: 2525 panic("Unexpected backend type."); 2526 break; 2527 } 2528 } 2529 PICKUP_GIANT(); 2530 } 2531 2532 /** 2533 * Open a character device to be used for backend I/O. 2534 * 2535 * \param xbb Per-instance xbb configuration structure. 2536 * 2537 * \return 0 for success, errno codes for failure. 
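 *
 * In outline: the backing vnode's character device is taken from
 * v_rdev, its devsw is held via dev_refthread(), and the sector and
 * media sizes are queried with the DIOCGSECTORSIZE and
 * DIOCGMEDIASIZE ioctls.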
2538 */ 2539 static int 2540 xbb_open_dev(struct xbb_softc *xbb) 2541 { 2542 struct vattr vattr; 2543 struct cdev *dev; 2544 struct cdevsw *devsw; 2545 int error; 2546 2547 xbb->device_type = XBB_TYPE_DISK; 2548 xbb->dispatch_io = xbb_dispatch_dev; 2549 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2550 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2551 &xbb->backend.dev.dev_ref); 2552 if (xbb->backend.dev.csw == NULL) 2553 panic("Unable to retrieve device switch"); 2554 2555 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2556 if (error) { 2557 xenbus_dev_fatal(xbb->dev, error, "error getting " 2558 "vnode attributes for device %s", 2559 xbb->dev_name); 2560 return (error); 2561 } 2562 2563 2564 dev = xbb->vn->v_rdev; 2565 devsw = dev->si_devsw; 2566 if (!devsw->d_ioctl) { 2567 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2568 "device %s!", xbb->dev_name); 2569 return (ENODEV); 2570 } 2571 2572 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2573 (caddr_t)&xbb->sector_size, FREAD, 2574 curthread); 2575 if (error) { 2576 xenbus_dev_fatal(xbb->dev, error, 2577 "error calling ioctl DIOCGSECTORSIZE " 2578 "for device %s", xbb->dev_name); 2579 return (error); 2580 } 2581 2582 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2583 (caddr_t)&xbb->media_size, FREAD, 2584 curthread); 2585 if (error) { 2586 xenbus_dev_fatal(xbb->dev, error, 2587 "error calling ioctl DIOCGMEDIASIZE " 2588 "for device %s", xbb->dev_name); 2589 return (error); 2590 } 2591 2592 return (0); 2593 } 2594 2595 /** 2596 * Open a file to be used for backend I/O. 2597 * 2598 * \param xbb Per-instance xbb configuration structure. 2599 * 2600 * \return 0 for success, errno codes for failure. 2601 */ 2602 static int 2603 xbb_open_file(struct xbb_softc *xbb) 2604 { 2605 struct xbb_file_data *file_data; 2606 struct vattr vattr; 2607 int error; 2608 2609 file_data = &xbb->backend.file; 2610 xbb->device_type = XBB_TYPE_FILE; 2611 xbb->dispatch_io = xbb_dispatch_file; 2612 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2613 if (error != 0) { 2614 xenbus_dev_fatal(xbb->dev, error, 2615 "error calling VOP_GETATTR()" 2616 "for file %s", xbb->dev_name); 2617 return (error); 2618 } 2619 2620 /* 2621 * Verify that we have the ability to upgrade to exclusive 2622 * access on this file so we can trap errors at open instead 2623 * of reporting them during first access. 2624 */ 2625 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2626 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2627 if (xbb->vn->v_iflag & VI_DOOMED) { 2628 error = EBADF; 2629 xenbus_dev_fatal(xbb->dev, error, 2630 "error locking file %s", 2631 xbb->dev_name); 2632 2633 return (error); 2634 } 2635 } 2636 2637 file_data->cred = crhold(curthread->td_ucred); 2638 xbb->media_size = vattr.va_size; 2639 2640 /* 2641 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2642 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2643 * with disklabel and UFS on FreeBSD at least. Large block sizes 2644 * may not work with other OSes as well. So just export a sector 2645 * size of 512 bytes, which should work with any OS or 2646 * application. Since our backing is a file, any block size will 2647 * work fine for the backing store. 2648 */ 2649 #if 0 2650 xbb->sector_size = vattr.va_blocksize; 2651 #endif 2652 xbb->sector_size = 512; 2653 2654 /* 2655 * Sanity check. The media size has to be at least one 2656 * sector long. 
2657 */ 2658 if (xbb->media_size < xbb->sector_size) { 2659 error = EINVAL; 2660 xenbus_dev_fatal(xbb->dev, error, 2661 "file %s size %ju < block size %u", 2662 xbb->dev_name, 2663 (uintmax_t)xbb->media_size, 2664 xbb->sector_size); 2665 } 2666 return (error); 2667 } 2668 2669 /** 2670 * Open the backend provider for this connection. 2671 * 2672 * \param xbb Per-instance xbb configuration structure. 2673 * 2674 * \return 0 for success, errno codes for failure. 2675 */ 2676 static int 2677 xbb_open_backend(struct xbb_softc *xbb) 2678 { 2679 struct nameidata nd; 2680 int flags; 2681 int error; 2682 2683 flags = FREAD; 2684 error = 0; 2685 2686 DPRINTF("opening dev=%s\n", xbb->dev_name); 2687 2688 if (rootvnode == NULL) { 2689 xenbus_dev_fatal(xbb->dev, ENOENT, 2690 "Root file system not mounted"); 2691 return (ENOENT); 2692 } 2693 2694 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2695 flags |= FWRITE; 2696 2697 pwd_ensure_dirs(); 2698 2699 again: 2700 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2701 error = vn_open(&nd, &flags, 0, NULL); 2702 if (error) { 2703 /* 2704 * This is the only reasonable guess we can make as far as 2705 * path if the user doesn't give us a fully qualified path. 2706 * If they want to specify a file, they need to specify the 2707 * full path. 2708 */ 2709 if (xbb->dev_name[0] != '/') { 2710 char *dev_path = "/dev/"; 2711 char *dev_name; 2712 2713 /* Try adding device path at beginning of name */ 2714 dev_name = malloc(strlen(xbb->dev_name) 2715 + strlen(dev_path) + 1, 2716 M_XENBLOCKBACK, M_NOWAIT); 2717 if (dev_name) { 2718 sprintf(dev_name, "%s%s", dev_path, 2719 xbb->dev_name); 2720 free(xbb->dev_name, M_XENBLOCKBACK); 2721 xbb->dev_name = dev_name; 2722 goto again; 2723 } 2724 } 2725 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2726 xbb->dev_name); 2727 return (error); 2728 } 2729 2730 NDFREE(&nd, NDF_ONLY_PNBUF); 2731 2732 xbb->vn = nd.ni_vp; 2733 2734 /* We only support disks and files. */ 2735 if (vn_isdisk(xbb->vn, &error)) { 2736 error = xbb_open_dev(xbb); 2737 } else if (xbb->vn->v_type == VREG) { 2738 error = xbb_open_file(xbb); 2739 } else { 2740 error = EINVAL; 2741 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2742 "or file", xbb->dev_name); 2743 } 2744 VOP_UNLOCK(xbb->vn, 0); 2745 2746 if (error != 0) { 2747 xbb_close_backend(xbb); 2748 return (error); 2749 } 2750 2751 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2752 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2753 2754 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2755 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2756 xbb->dev_name, xbb->sector_size, xbb->media_size); 2757 2758 return (0); 2759 } 2760 2761 /*------------------------ Inter-Domain Communication ------------------------*/ 2762 /** 2763 * Free dynamically allocated KVA or pseudo-physical address allocations. 2764 * 2765 * \param xbb Per-instance xbb configuration structure. 2766 */ 2767 static void 2768 xbb_free_communication_mem(struct xbb_softc *xbb) 2769 { 2770 if (xbb->kva != 0) { 2771 if (xbb->pseudo_phys_res != NULL) { 2772 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2773 xbb->pseudo_phys_res); 2774 xbb->pseudo_phys_res = NULL; 2775 } 2776 } 2777 xbb->kva = 0; 2778 xbb->gnt_base_addr = 0; 2779 if (xbb->kva_free != NULL) { 2780 free(xbb->kva_free, M_XENBLOCKBACK); 2781 xbb->kva_free = NULL; 2782 } 2783 } 2784 2785 /** 2786 * Cleanup all inter-domain communication mechanisms. 
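 *
 * In outline: the event channel is unbound, the I/O taskqueue is
 * drained with the lock temporarily dropped, any still-active
 * requests cause an EAGAIN return so the caller can retry later,
 * the shared ring grants are unmapped, and the request and request
 * list bookkeeping is freed.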
2787 * 2788 * \param xbb Per-instance xbb configuration structure. 2789 */ 2790 static int 2791 xbb_disconnect(struct xbb_softc *xbb) 2792 { 2793 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2794 struct gnttab_unmap_grant_ref *op; 2795 u_int ring_idx; 2796 int error; 2797 2798 DPRINTF("\n"); 2799 2800 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2801 return (0); 2802 2803 xen_intr_unbind(&xbb->xen_intr_handle); 2804 2805 mtx_unlock(&xbb->lock); 2806 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2807 mtx_lock(&xbb->lock); 2808 2809 /* 2810 * No new interrupts can generate work, but we must wait 2811 * for all currently active requests to drain. 2812 */ 2813 if (xbb->active_request_count != 0) 2814 return (EAGAIN); 2815 2816 for (ring_idx = 0, op = ops; 2817 ring_idx < xbb->ring_config.ring_pages; 2818 ring_idx++, op++) { 2819 2820 op->host_addr = xbb->ring_config.gnt_addr 2821 + (ring_idx * PAGE_SIZE); 2822 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2823 op->handle = xbb->ring_config.handle[ring_idx]; 2824 } 2825 2826 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2827 xbb->ring_config.ring_pages); 2828 if (error != 0) 2829 panic("Grant table op failed (%d)", error); 2830 2831 xbb_free_communication_mem(xbb); 2832 2833 if (xbb->requests != NULL) { 2834 free(xbb->requests, M_XENBLOCKBACK); 2835 xbb->requests = NULL; 2836 } 2837 2838 if (xbb->request_lists != NULL) { 2839 struct xbb_xen_reqlist *reqlist; 2840 int i; 2841 2842 /* There is one request list for ever allocated request. */ 2843 for (i = 0, reqlist = xbb->request_lists; 2844 i < xbb->max_requests; i++, reqlist++){ 2845 #ifdef XBB_USE_BOUNCE_BUFFERS 2846 if (reqlist->bounce != NULL) { 2847 free(reqlist->bounce, M_XENBLOCKBACK); 2848 reqlist->bounce = NULL; 2849 } 2850 #endif 2851 if (reqlist->gnt_handles != NULL) { 2852 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2853 reqlist->gnt_handles = NULL; 2854 } 2855 } 2856 free(xbb->request_lists, M_XENBLOCKBACK); 2857 xbb->request_lists = NULL; 2858 } 2859 2860 xbb->flags &= ~XBBF_RING_CONNECTED; 2861 return (0); 2862 } 2863 2864 /** 2865 * Map shared memory ring into domain local address space, initialize 2866 * ring control structures, and bind an interrupt to the event channel 2867 * used to notify us of ring changes. 2868 * 2869 * \param xbb Per-instance xbb configuration structure. 2870 */ 2871 static int 2872 xbb_connect_ring(struct xbb_softc *xbb) 2873 { 2874 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2875 struct gnttab_map_grant_ref *gnt; 2876 u_int ring_idx; 2877 int error; 2878 2879 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2880 return (0); 2881 2882 /* 2883 * Kva for our ring is at the tail of the region of kva allocated 2884 * by xbb_alloc_communication_mem(). 
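 *
 * Layout sketch (sizes as computed in xbb_alloc_communication_mem()):
 *
 *   xbb->kva                                     xbb->kva + kva_size
 *   |<------ reqlist_kva_size ------>|<- ring_pages * PAGE_SIZE ->|
 *   [ per-request segment mappings   ][ shared ring pages         ]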
2885 */ 2886 xbb->ring_config.va = xbb->kva 2887 + (xbb->kva_size 2888 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2889 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2890 + (xbb->kva_size 2891 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2892 2893 for (ring_idx = 0, gnt = gnts; 2894 ring_idx < xbb->ring_config.ring_pages; 2895 ring_idx++, gnt++) { 2896 2897 gnt->host_addr = xbb->ring_config.gnt_addr 2898 + (ring_idx * PAGE_SIZE); 2899 gnt->flags = GNTMAP_host_map; 2900 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2901 gnt->dom = xbb->otherend_id; 2902 } 2903 2904 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2905 xbb->ring_config.ring_pages); 2906 if (error) 2907 panic("blkback: Ring page grant table op failed (%d)", error); 2908 2909 for (ring_idx = 0, gnt = gnts; 2910 ring_idx < xbb->ring_config.ring_pages; 2911 ring_idx++, gnt++) { 2912 if (gnt->status != 0) { 2913 xbb->ring_config.va = 0; 2914 xenbus_dev_fatal(xbb->dev, EACCES, 2915 "Ring shared page mapping failed. " 2916 "Status %d.", gnt->status); 2917 return (EACCES); 2918 } 2919 xbb->ring_config.handle[ring_idx] = gnt->handle; 2920 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2921 } 2922 2923 /* Initialize the ring based on ABI. */ 2924 switch (xbb->abi) { 2925 case BLKIF_PROTOCOL_NATIVE: 2926 { 2927 blkif_sring_t *sring; 2928 sring = (blkif_sring_t *)xbb->ring_config.va; 2929 BACK_RING_INIT(&xbb->rings.native, sring, 2930 xbb->ring_config.ring_pages * PAGE_SIZE); 2931 break; 2932 } 2933 case BLKIF_PROTOCOL_X86_32: 2934 { 2935 blkif_x86_32_sring_t *sring_x86_32; 2936 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2937 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2938 xbb->ring_config.ring_pages * PAGE_SIZE); 2939 break; 2940 } 2941 case BLKIF_PROTOCOL_X86_64: 2942 { 2943 blkif_x86_64_sring_t *sring_x86_64; 2944 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2945 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2946 xbb->ring_config.ring_pages * PAGE_SIZE); 2947 break; 2948 } 2949 default: 2950 panic("Unexpected blkif protocol ABI."); 2951 } 2952 2953 xbb->flags |= XBBF_RING_CONNECTED; 2954 2955 error = xen_intr_bind_remote_port(xbb->dev, 2956 xbb->otherend_id, 2957 xbb->ring_config.evtchn, 2958 xbb_filter, 2959 /*ithread_handler*/NULL, 2960 /*arg*/xbb, 2961 INTR_TYPE_BIO | INTR_MPSAFE, 2962 &xbb->xen_intr_handle); 2963 if (error) { 2964 (void)xbb_disconnect(xbb); 2965 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2966 return (error); 2967 } 2968 2969 DPRINTF("rings connected!\n"); 2970 2971 return 0; 2972 } 2973 2974 /** 2975 * Size KVA and pseudo-physical address allocations based on negotiated 2976 * values for the size and number of I/O requests, and the size of our 2977 * communication ring. 2978 * 2979 * \param xbb Per-instance xbb configuration structure. 2980 * 2981 * These address spaces are used to dynamically map pages in the 2982 * front-end's domain into our own. 
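 *
 * Rough worked example (the actual values are negotiated with the
 * front-end): with max_requests = 32 and max_request_segments = 11,
 * the request region needs 32 * 11 = 352 pages (1408 KiB with 4 KiB
 * pages), plus ring_pages further pages for the shared ring itself.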
2983 */ 2984 static int 2985 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2986 { 2987 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2988 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2989 xbb->kva_size = xbb->reqlist_kva_size + 2990 (xbb->ring_config.ring_pages * PAGE_SIZE); 2991 2992 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2993 if (xbb->kva_free == NULL) 2994 return (ENOMEM); 2995 2996 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2997 device_get_nameunit(xbb->dev), xbb->kva_size, 2998 xbb->reqlist_kva_size); 2999 /* 3000 * Reserve a range of pseudo physical memory that we can map 3001 * into kva. These pages will only be backed by machine 3002 * pages ("real memory") during the lifetime of front-end requests 3003 * via grant table operations. 3004 */ 3005 xbb->pseudo_phys_res_id = 0; 3006 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 3007 xbb->kva_size); 3008 if (xbb->pseudo_phys_res == NULL) { 3009 xbb->kva = 0; 3010 return (ENOMEM); 3011 } 3012 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3013 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3014 3015 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3016 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3017 (uintmax_t)xbb->gnt_base_addr); 3018 return (0); 3019 } 3020 3021 /** 3022 * Collect front-end information from the XenStore. 3023 * 3024 * \param xbb Per-instance xbb configuration structure. 3025 */ 3026 static int 3027 xbb_collect_frontend_info(struct xbb_softc *xbb) 3028 { 3029 char protocol_abi[64]; 3030 const char *otherend_path; 3031 int error; 3032 u_int ring_idx; 3033 u_int ring_page_order; 3034 size_t ring_size; 3035 3036 otherend_path = xenbus_get_otherend_path(xbb->dev); 3037 3038 /* 3039 * Protocol defaults valid even if all negotiation fails. 3040 */ 3041 xbb->ring_config.ring_pages = 1; 3042 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 3043 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3044 3045 /* 3046 * Mandatory data (used in all versions of the protocol) first. 3047 */ 3048 error = xs_scanf(XST_NIL, otherend_path, 3049 "event-channel", NULL, "%" PRIu32, 3050 &xbb->ring_config.evtchn); 3051 if (error != 0) { 3052 xenbus_dev_fatal(xbb->dev, error, 3053 "Unable to retrieve event-channel information " 3054 "from frontend %s. Unable to connect.", 3055 xenbus_get_otherend_path(xbb->dev)); 3056 return (error); 3057 } 3058 3059 /* 3060 * These fields are initialized to legacy protocol defaults 3061 * so we only need to fail if reading the updated value succeeds 3062 * and the new value is outside of its allowed range. 3063 * 3064 * \note xs_gather() returns on the first encountered error, so 3065 * we must use independent calls in order to guarantee 3066 * we don't miss information in a sparsly populated front-end 3067 * tree. 3068 * 3069 * \note xs_scanf() does not update variables for unmatched 3070 * fields. 3071 */ 3072 ring_page_order = 0; 3073 xbb->max_requests = 32; 3074 3075 (void)xs_scanf(XST_NIL, otherend_path, 3076 "ring-page-order", NULL, "%u", 3077 &ring_page_order); 3078 xbb->ring_config.ring_pages = 1 << ring_page_order; 3079 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3080 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3081 3082 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3083 xenbus_dev_fatal(xbb->dev, EINVAL, 3084 "Front-end specified ring-pages of %u " 3085 "exceeds backend limit of %u. 
" 3086 "Unable to connect.", 3087 xbb->ring_config.ring_pages, 3088 XBB_MAX_RING_PAGES); 3089 return (EINVAL); 3090 } 3091 3092 if (xbb->ring_config.ring_pages == 1) { 3093 error = xs_gather(XST_NIL, otherend_path, 3094 "ring-ref", "%" PRIu32, 3095 &xbb->ring_config.ring_ref[0], 3096 NULL); 3097 if (error != 0) { 3098 xenbus_dev_fatal(xbb->dev, error, 3099 "Unable to retrieve ring information " 3100 "from frontend %s. Unable to " 3101 "connect.", 3102 xenbus_get_otherend_path(xbb->dev)); 3103 return (error); 3104 } 3105 } else { 3106 /* Multi-page ring format. */ 3107 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3108 ring_idx++) { 3109 char ring_ref_name[]= "ring_refXX"; 3110 3111 snprintf(ring_ref_name, sizeof(ring_ref_name), 3112 "ring-ref%u", ring_idx); 3113 error = xs_scanf(XST_NIL, otherend_path, 3114 ring_ref_name, NULL, "%" PRIu32, 3115 &xbb->ring_config.ring_ref[ring_idx]); 3116 if (error != 0) { 3117 xenbus_dev_fatal(xbb->dev, error, 3118 "Failed to retriev grant " 3119 "reference for page %u of " 3120 "shared ring. Unable " 3121 "to connect.", ring_idx); 3122 return (error); 3123 } 3124 } 3125 } 3126 3127 error = xs_gather(XST_NIL, otherend_path, 3128 "protocol", "%63s", protocol_abi, 3129 NULL); 3130 if (error != 0 3131 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3132 /* 3133 * Assume native if the frontend has not 3134 * published ABI data or it has published and 3135 * matches our own ABI. 3136 */ 3137 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3138 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3139 3140 xbb->abi = BLKIF_PROTOCOL_X86_32; 3141 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3142 3143 xbb->abi = BLKIF_PROTOCOL_X86_64; 3144 } else { 3145 3146 xenbus_dev_fatal(xbb->dev, EINVAL, 3147 "Unknown protocol ABI (%s) published by " 3148 "frontend. Unable to connect.", protocol_abi); 3149 return (EINVAL); 3150 } 3151 return (0); 3152 } 3153 3154 /** 3155 * Allocate per-request data structures given request size and number 3156 * information negotiated with the front-end. 3157 * 3158 * \param xbb Per-instance xbb configuration structure. 3159 */ 3160 static int 3161 xbb_alloc_requests(struct xbb_softc *xbb) 3162 { 3163 struct xbb_xen_req *req; 3164 struct xbb_xen_req *last_req; 3165 3166 /* 3167 * Allocate request book keeping datastructures. 3168 */ 3169 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3170 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3171 if (xbb->requests == NULL) { 3172 xenbus_dev_fatal(xbb->dev, ENOMEM, 3173 "Unable to allocate request structures"); 3174 return (ENOMEM); 3175 } 3176 3177 req = xbb->requests; 3178 last_req = &xbb->requests[xbb->max_requests - 1]; 3179 STAILQ_INIT(&xbb->request_free_stailq); 3180 while (req <= last_req) { 3181 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3182 req++; 3183 } 3184 return (0); 3185 } 3186 3187 static int 3188 xbb_alloc_request_lists(struct xbb_softc *xbb) 3189 { 3190 struct xbb_xen_reqlist *reqlist; 3191 int i; 3192 3193 /* 3194 * If no requests can be merged, we need 1 request list per 3195 * in flight request. 
3196 */ 3197 xbb->request_lists = malloc(xbb->max_requests * 3198 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3199 if (xbb->request_lists == NULL) { 3200 xenbus_dev_fatal(xbb->dev, ENOMEM, 3201 "Unable to allocate request list structures"); 3202 return (ENOMEM); 3203 } 3204 3205 STAILQ_INIT(&xbb->reqlist_free_stailq); 3206 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3207 for (i = 0; i < xbb->max_requests; i++) { 3208 int seg; 3209 3210 reqlist = &xbb->request_lists[i]; 3211 3212 reqlist->xbb = xbb; 3213 3214 #ifdef XBB_USE_BOUNCE_BUFFERS 3215 reqlist->bounce = malloc(xbb->max_reqlist_size, 3216 M_XENBLOCKBACK, M_NOWAIT); 3217 if (reqlist->bounce == NULL) { 3218 xenbus_dev_fatal(xbb->dev, ENOMEM, 3219 "Unable to allocate request " 3220 "bounce buffers"); 3221 return (ENOMEM); 3222 } 3223 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3224 3225 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3226 sizeof(*reqlist->gnt_handles), 3227 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3228 if (reqlist->gnt_handles == NULL) { 3229 xenbus_dev_fatal(xbb->dev, ENOMEM, 3230 "Unable to allocate request " 3231 "grant references"); 3232 return (ENOMEM); 3233 } 3234 3235 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3236 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3237 3238 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3239 } 3240 return (0); 3241 } 3242 3243 /** 3244 * Supply information about the physical device to the frontend 3245 * via XenBus. 3246 * 3247 * \param xbb Per-instance xbb configuration structure. 3248 */ 3249 static int 3250 xbb_publish_backend_info(struct xbb_softc *xbb) 3251 { 3252 struct xs_transaction xst; 3253 const char *our_path; 3254 const char *leaf; 3255 int error; 3256 3257 our_path = xenbus_get_node(xbb->dev); 3258 while (1) { 3259 error = xs_transaction_start(&xst); 3260 if (error != 0) { 3261 xenbus_dev_fatal(xbb->dev, error, 3262 "Error publishing backend info " 3263 "(start transaction)"); 3264 return (error); 3265 } 3266 3267 leaf = "sectors"; 3268 error = xs_printf(xst, our_path, leaf, 3269 "%"PRIu64, xbb->media_num_sectors); 3270 if (error != 0) 3271 break; 3272 3273 /* XXX Support all VBD attributes here. */ 3274 leaf = "info"; 3275 error = xs_printf(xst, our_path, leaf, "%u", 3276 xbb->flags & XBBF_READ_ONLY 3277 ? VDISK_READONLY : 0); 3278 if (error != 0) 3279 break; 3280 3281 leaf = "sector-size"; 3282 error = xs_printf(xst, our_path, leaf, "%u", 3283 xbb->sector_size); 3284 if (error != 0) 3285 break; 3286 3287 error = xs_transaction_end(xst, 0); 3288 if (error == 0) { 3289 return (0); 3290 } else if (error != EAGAIN) { 3291 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3292 return (error); 3293 } 3294 } 3295 3296 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3297 our_path, leaf); 3298 xs_transaction_end(xst, 1); 3299 return (error); 3300 } 3301 3302 /** 3303 * Connect to our blkfront peer now that it has completed publishing 3304 * its configuration into the XenStore. 3305 * 3306 * \param xbb Per-instance xbb configuration structure. 3307 */ 3308 static void 3309 xbb_connect(struct xbb_softc *xbb) 3310 { 3311 int error; 3312 3313 if (xenbus_get_state(xbb->dev) != XenbusStateInitialised) 3314 return; 3315 3316 if (xbb_collect_frontend_info(xbb) != 0) 3317 return; 3318 3319 xbb->flags &= ~XBBF_SHUTDOWN; 3320 3321 /* 3322 * We limit the maximum number of reqlist segments to the maximum 3323 * number of segments in the ring, or our absolute maximum, 3324 * whichever is smaller. 
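 *
 * (Illustrative only: a ring holding 64 requests of up to 11
 * segments each could in principle supply 704 segments, but the
 * clamp below keeps a single request list at or under
 * XBB_MAX_SEGMENTS_PER_REQLIST.)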
3325 */ 3326 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3327 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3328 3329 /* 3330 * The maximum size is simply a function of the number of segments 3331 * we can handle. 3332 */ 3333 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3334 3335 /* Allocate resources whose size depends on front-end configuration. */ 3336 error = xbb_alloc_communication_mem(xbb); 3337 if (error != 0) { 3338 xenbus_dev_fatal(xbb->dev, error, 3339 "Unable to allocate communication memory"); 3340 return; 3341 } 3342 3343 error = xbb_alloc_requests(xbb); 3344 if (error != 0) { 3345 /* Specific errors are reported by xbb_alloc_requests(). */ 3346 return; 3347 } 3348 3349 error = xbb_alloc_request_lists(xbb); 3350 if (error != 0) { 3351 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3352 return; 3353 } 3354 3355 /* 3356 * Connect communication channel. 3357 */ 3358 error = xbb_connect_ring(xbb); 3359 if (error != 0) { 3360 /* Specific errors are reported by xbb_connect_ring(). */ 3361 return; 3362 } 3363 3364 if (xbb_publish_backend_info(xbb) != 0) { 3365 /* 3366 * If we can't publish our data, we cannot participate 3367 * in this connection, and waiting for a front-end state 3368 * change will not help the situation. 3369 */ 3370 (void)xbb_disconnect(xbb); 3371 return; 3372 } 3373 3374 /* Ready for I/O. */ 3375 xenbus_set_state(xbb->dev, XenbusStateConnected); 3376 } 3377 3378 /*-------------------------- Device Teardown Support -------------------------*/ 3379 /** 3380 * Perform device shutdown functions. 3381 * 3382 * \param xbb Per-instance xbb configuration structure. 3383 * 3384 * Mark this instance as shutting down, wait for any active I/O on the 3385 * backend device/file to drain, disconnect from the front-end, and notify 3386 * any waiters (e.g. a thread invoking our detach method) that detach can 3387 * now proceed. 3388 */ 3389 static int 3390 xbb_shutdown(struct xbb_softc *xbb) 3391 { 3392 XenbusState frontState; 3393 int error; 3394 3395 DPRINTF("\n"); 3396 3397 /* 3398 * Due to the need to drop our mutex during some 3399 * xenbus operations, it is possible for two threads 3400 * to attempt to close out shutdown processing at 3401 * the same time. Tell the caller that hits this 3402 * race to try back later. 3403 */ 3404 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3405 return (EAGAIN); 3406 3407 xbb->flags |= XBBF_IN_SHUTDOWN; 3408 mtx_unlock(&xbb->lock); 3409 3410 if (xbb->hotplug_watch.node != NULL) { 3411 xs_unregister_watch(&xbb->hotplug_watch); 3412 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3413 xbb->hotplug_watch.node = NULL; 3414 } 3415 3416 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3417 xenbus_set_state(xbb->dev, XenbusStateClosing); 3418 3419 frontState = xenbus_get_otherend_state(xbb->dev); 3420 mtx_lock(&xbb->lock); 3421 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3422 3423 /* Wait for the frontend to disconnect (if it's connected). */ 3424 if (frontState == XenbusStateConnected) 3425 return (EAGAIN); 3426 3427 DPRINTF("\n"); 3428 3429 /* Indicate shutdown is in progress. */ 3430 xbb->flags |= XBBF_SHUTDOWN; 3431 3432 /* Disconnect from the front-end. */ 3433 error = xbb_disconnect(xbb); 3434 if (error != 0) { 3435 /* 3436 * Requests still outstanding. We'll be called again 3437 * once they complete. 
3438 */ 3439 KASSERT(error == EAGAIN, 3440 ("%s: Unexpected xbb_disconnect() failure %d", 3441 __func__, error)); 3442 3443 return (error); 3444 } 3445 3446 DPRINTF("\n"); 3447 3448 /* Indicate to xbb_detach() that it is safe to proceed. */ 3449 wakeup(xbb); 3450 3451 return (0); 3452 } 3453 3454 /** 3455 * Report an attach time error to the console and Xen, and clean up 3456 * this instance by forcing immediate detach processing. 3457 * 3458 * \param xbb Per-instance xbb configuration structure. 3459 * \param err Errno describing the error. 3460 * \param fmt Printf style format and arguments. 3461 */ 3462 static void 3463 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3464 { 3465 va_list ap; 3466 va_list ap_hotplug; 3467 3468 va_start(ap, fmt); 3469 va_copy(ap_hotplug, ap); 3470 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3471 "hotplug-error", fmt, ap_hotplug); 3472 va_end(ap_hotplug); 3473 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3474 "hotplug-status", "error"); 3475 3476 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3477 va_end(ap); 3478 3479 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3480 "online", "0"); 3481 mtx_lock(&xbb->lock); 3482 xbb_shutdown(xbb); 3483 mtx_unlock(&xbb->lock); 3484 } 3485 3486 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3487 /** 3488 * Inspect a XenBus device and claim it if it is of the appropriate type. 3489 * 3490 * \param dev NewBus device object representing a candidate XenBus device. 3491 * 3492 * \return 0 for success, errno codes for failure. 3493 */ 3494 static int 3495 xbb_probe(device_t dev) 3496 { 3497 3498 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3499 device_set_desc(dev, "Backend Virtual Block Device"); 3500 device_quiet(dev); 3501 return (0); 3502 } 3503 3504 return (ENXIO); 3505 } 3506 3507 /** 3508 * Set up sysctl variables to control various Block Back parameters. 3509 * 3510 * \param xbb Xen Block Back softc.
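 *
 * Example usage from userland (the node names below assume the
 * usual dev.<driver>.<unit> layout for this instance; the exact
 * path depends on the unit number):
 *
 *   sysctl dev.xbbd.0.no_coalesce_reqs=1
 *   sysctl dev.xbbd.0.reqs_completed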
3511 * 3512 */ 3513 static void 3514 xbb_setup_sysctl(struct xbb_softc *xbb) 3515 { 3516 struct sysctl_ctx_list *sysctl_ctx = NULL; 3517 struct sysctl_oid *sysctl_tree = NULL; 3518 3519 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3520 if (sysctl_ctx == NULL) 3521 return; 3522 3523 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3524 if (sysctl_tree == NULL) 3525 return; 3526 3527 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3528 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3529 "fake the flush command"); 3530 3531 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3532 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3533 "send a real flush for N flush requests"); 3534 3535 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3536 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3537 "Don't coalesce contiguous requests"); 3538 3539 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3540 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3541 "how many I/O requests we have received"); 3542 3543 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3544 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3545 "how many I/O requests have been completed"); 3546 3547 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3548 "reqs_queued_for_completion", CTLFLAG_RW, 3549 &xbb->reqs_queued_for_completion, 3550 "how many I/O requests queued but not yet pushed"); 3551 3552 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3553 "reqs_completed_with_error", CTLFLAG_RW, 3554 &xbb->reqs_completed_with_error, 3555 "how many I/O requests completed with error status"); 3556 3557 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3558 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3559 "how many I/O dispatches were forced"); 3560 3561 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3562 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3563 "how many I/O dispatches were normal"); 3564 3565 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3566 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3567 "total number of I/O dispatches"); 3568 3569 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3570 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3571 "how many times we have run out of KVA"); 3572 3573 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3574 "request_shortages", CTLFLAG_RW, 3575 &xbb->request_shortages, 3576 "how many times we have run out of requests"); 3577 3578 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3579 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3580 "maximum outstanding requests (negotiated)"); 3581 3582 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3583 "max_request_segments", CTLFLAG_RD, 3584 &xbb->max_request_segments, 0, 3585 "maximum number of pages per requests (negotiated)"); 3586 3587 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3588 "max_request_size", CTLFLAG_RD, 3589 &xbb->max_request_size, 0, 3590 "maximum size in bytes of a request (negotiated)"); 3591 3592 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3593 "ring_pages", CTLFLAG_RD, 3594 &xbb->ring_config.ring_pages, 0, 3595 "communication channel pages (negotiated)"); 3596 } 3597 3598 static void 3599 xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int 
len) 3600 { 3601 device_t dev; 3602 struct xbb_softc *xbb; 3603 int error; 3604 3605 dev = (device_t) watch->callback_data; 3606 xbb = device_get_softc(dev); 3607 3608 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3609 NULL, &xbb->dev_name, NULL); 3610 if (error != 0) 3611 return; 3612 3613 xs_unregister_watch(watch); 3614 free(watch->node, M_XENBLOCKBACK); 3615 watch->node = NULL; 3616 3617 /* Collect physical device information. */ 3618 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3619 "device-type", NULL, &xbb->dev_type, 3620 NULL); 3621 if (error != 0) 3622 xbb->dev_type = NULL; 3623 3624 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3625 "mode", NULL, &xbb->dev_mode, 3626 NULL); 3627 if (error != 0) { 3628 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3629 xenbus_get_node(dev)); 3630 return; 3631 } 3632 3633 /* Parse fopen style mode flags. */ 3634 if (strchr(xbb->dev_mode, 'w') == NULL) 3635 xbb->flags |= XBBF_READ_ONLY; 3636 3637 /* 3638 * Verify the physical device is present and can support 3639 * the desired I/O mode. 3640 */ 3641 error = xbb_open_backend(xbb); 3642 if (error != 0) { 3643 xbb_attach_failed(xbb, error, "Unable to open %s", 3644 xbb->dev_name); 3645 return; 3646 } 3647 3648 /* Use devstat(9) for recording statistics. */ 3649 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3650 xbb->sector_size, 3651 DEVSTAT_ALL_SUPPORTED, 3652 DEVSTAT_TYPE_DIRECT 3653 | DEVSTAT_TYPE_IF_OTHER, 3654 DEVSTAT_PRIORITY_OTHER); 3655 3656 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3657 xbb->sector_size, 3658 DEVSTAT_ALL_SUPPORTED, 3659 DEVSTAT_TYPE_DIRECT 3660 | DEVSTAT_TYPE_IF_OTHER, 3661 DEVSTAT_PRIORITY_OTHER); 3662 /* 3663 * Setup sysctl variables. 3664 */ 3665 xbb_setup_sysctl(xbb); 3666 3667 /* 3668 * Create a taskqueue for doing work that must occur from a 3669 * thread context. 3670 */ 3671 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3672 M_NOWAIT, 3673 taskqueue_thread_enqueue, 3674 /*contxt*/&xbb->io_taskqueue); 3675 if (xbb->io_taskqueue == NULL) { 3676 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3677 return; 3678 } 3679 3680 taskqueue_start_threads(&xbb->io_taskqueue, 3681 /*num threads*/1, 3682 /*priority*/PWAIT, 3683 /*thread name*/ 3684 "%s taskq", device_get_nameunit(dev)); 3685 3686 /* Update hot-plug status to satisfy xend. */ 3687 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3688 "hotplug-status", "connected"); 3689 if (error) { 3690 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3691 xenbus_get_node(xbb->dev)); 3692 return; 3693 } 3694 3695 /* Tell the front end that we are ready to connect. */ 3696 xenbus_set_state(dev, XenbusStateInitialised); 3697 } 3698 3699 /** 3700 * Attach to a XenBus device that has been claimed by our probe routine. 3701 * 3702 * \param dev NewBus device object representing this Xen Block Back instance. 3703 * 3704 * \return 0 for success, errno codes for failure. 3705 */ 3706 static int 3707 xbb_attach(device_t dev) 3708 { 3709 struct xbb_softc *xbb; 3710 int error; 3711 u_int max_ring_page_order; 3712 struct sbuf *watch_path; 3713 3714 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3715 3716 /* 3717 * Basic initialization. 3718 * After this block it is safe to call xbb_detach() 3719 * to clean up any allocated data for this instance. 
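 *
 * (After this, attach publishes feature-barrier, feature-flush-cache
 * and max-ring-page-order, registers a watch on the
 * "physical-device-path" node so that xbb_attach_disk() runs once the
 * hotplug script has populated it, and then moves the device to
 * XenbusStateInitWait.)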
3720 */ 3721 xbb = device_get_softc(dev); 3722 xbb->dev = dev; 3723 xbb->otherend_id = xenbus_get_otherend_id(dev); 3724 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3725 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3726 3727 /* 3728 * Publish protocol capabilities for consumption by the 3729 * front-end. 3730 */ 3731 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3732 "feature-barrier", "1"); 3733 if (error) { 3734 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3735 xenbus_get_node(xbb->dev)); 3736 return (error); 3737 } 3738 3739 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3740 "feature-flush-cache", "1"); 3741 if (error) { 3742 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3743 xenbus_get_node(xbb->dev)); 3744 return (error); 3745 } 3746 3747 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3748 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3749 "max-ring-page-order", "%u", max_ring_page_order); 3750 if (error) { 3751 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3752 xenbus_get_node(xbb->dev)); 3753 return (error); 3754 } 3755 3756 /* 3757 * We need to wait for hotplug script execution before 3758 * moving forward. 3759 */ 3760 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3761 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3762 xbb->hotplug_watch.callback = xbb_attach_disk; 3763 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3764 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3765 sbuf_delete(watch_path); 3766 error = xs_register_watch(&xbb->hotplug_watch); 3767 if (error != 0) { 3768 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3769 xbb->hotplug_watch.node); 3770 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3771 return (error); 3772 } 3773 3774 /* Tell the toolstack blkback has attached. */ 3775 xenbus_set_state(dev, XenbusStateInitWait); 3776 3777 return (0); 3778 } 3779 3780 /** 3781 * Detach from a block back device instance. 3782 * 3783 * \param dev NewBus device object representing this Xen Block Back instance. 3784 * 3785 * \return 0 for success, errno codes for failure. 3786 * 3787 * \note A block back device may be detached at any time in its life-cycle, 3788 * including part way through the attach process. For this reason, 3789 * initialization order and the initialization state checks in this 3790 * routine must be carefully coupled so that attach time failures 3791 * are gracefully handled. 
3792 */ 3793 static int 3794 xbb_detach(device_t dev) 3795 { 3796 struct xbb_softc *xbb; 3797 3798 DPRINTF("\n"); 3799 3800 xbb = device_get_softc(dev); 3801 mtx_lock(&xbb->lock); 3802 while (xbb_shutdown(xbb) == EAGAIN) { 3803 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3804 "xbb_shutdown", 0); 3805 } 3806 mtx_unlock(&xbb->lock); 3807 3808 DPRINTF("\n"); 3809 3810 if (xbb->io_taskqueue != NULL) 3811 taskqueue_free(xbb->io_taskqueue); 3812 3813 if (xbb->xbb_stats != NULL) 3814 devstat_remove_entry(xbb->xbb_stats); 3815 3816 if (xbb->xbb_stats_in != NULL) 3817 devstat_remove_entry(xbb->xbb_stats_in); 3818 3819 xbb_close_backend(xbb); 3820 3821 if (xbb->dev_mode != NULL) { 3822 free(xbb->dev_mode, M_XENSTORE); 3823 xbb->dev_mode = NULL; 3824 } 3825 3826 if (xbb->dev_type != NULL) { 3827 free(xbb->dev_type, M_XENSTORE); 3828 xbb->dev_type = NULL; 3829 } 3830 3831 if (xbb->dev_name != NULL) { 3832 free(xbb->dev_name, M_XENSTORE); 3833 xbb->dev_name = NULL; 3834 } 3835 3836 mtx_destroy(&xbb->lock); 3837 return (0); 3838 } 3839 3840 /** 3841 * Prepare this block back device for suspension of this VM. 3842 * 3843 * \param dev NewBus device object representing this Xen Block Back instance. 3844 * 3845 * \return 0 for success, errno codes for failure. 3846 */ 3847 static int 3848 xbb_suspend(device_t dev) 3849 { 3850 #ifdef NOT_YET 3851 struct xbb_softc *sc = device_get_softc(dev); 3852 3853 /* Prevent new requests being issued until we fix things up. */ 3854 mtx_lock(&sc->xb_io_lock); 3855 sc->connected = BLKIF_STATE_SUSPENDED; 3856 mtx_unlock(&sc->xb_io_lock); 3857 #endif 3858 3859 return (0); 3860 } 3861 3862 /** 3863 * Perform any processing required to recover from a suspended state. 3864 * 3865 * \param dev NewBus device object representing this Xen Block Back instance. 3866 * 3867 * \return 0 for success, errno codes for failure. 3868 */ 3869 static int 3870 xbb_resume(device_t dev) 3871 { 3872 return (0); 3873 } 3874 3875 /** 3876 * Handle state changes expressed via the XenStore by our front-end peer. 3877 * 3878 * \param dev NewBus device object representing this Xen 3879 * Block Back instance. 3880 * \param frontend_state The new state of the front-end. 3881 * 3882 * \return 0 for success, errno codes for failure. 
3883 */ 3884 static void 3885 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3886 { 3887 struct xbb_softc *xbb = device_get_softc(dev); 3888 3889 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3890 xenbus_strstate(frontend_state), 3891 xenbus_strstate(xenbus_get_state(xbb->dev))); 3892 3893 switch (frontend_state) { 3894 case XenbusStateInitialising: 3895 break; 3896 case XenbusStateInitialised: 3897 case XenbusStateConnected: 3898 xbb_connect(xbb); 3899 break; 3900 case XenbusStateClosing: 3901 case XenbusStateClosed: 3902 mtx_lock(&xbb->lock); 3903 xbb_shutdown(xbb); 3904 mtx_unlock(&xbb->lock); 3905 if (frontend_state == XenbusStateClosed) 3906 xenbus_set_state(xbb->dev, XenbusStateClosed); 3907 break; 3908 default: 3909 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3910 frontend_state); 3911 break; 3912 } 3913 } 3914 3915 /*---------------------------- NewBus Registration ---------------------------*/ 3916 static device_method_t xbb_methods[] = { 3917 /* Device interface */ 3918 DEVMETHOD(device_probe, xbb_probe), 3919 DEVMETHOD(device_attach, xbb_attach), 3920 DEVMETHOD(device_detach, xbb_detach), 3921 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3922 DEVMETHOD(device_suspend, xbb_suspend), 3923 DEVMETHOD(device_resume, xbb_resume), 3924 3925 /* Xenbus interface */ 3926 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3927 3928 { 0, 0 } 3929 }; 3930 3931 static driver_t xbb_driver = { 3932 "xbbd", 3933 xbb_methods, 3934 sizeof(struct xbb_softc), 3935 }; 3936 devclass_t xbb_devclass; 3937 3938 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3939