1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 /** 39 * \file blkback.c 40 * 41 * \brief Device driver supporting the vending of block storage from 42 * a FreeBSD domain to other domains. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 50 #include <sys/bio.h> 51 #include <sys/bus.h> 52 #include <sys/conf.h> 53 #include <sys/devicestat.h> 54 #include <sys/disk.h> 55 #include <sys/fcntl.h> 56 #include <sys/filedesc.h> 57 #include <sys/kdb.h> 58 #include <sys/module.h> 59 #include <sys/namei.h> 60 #include <sys/proc.h> 61 #include <sys/rman.h> 62 #include <sys/taskqueue.h> 63 #include <sys/types.h> 64 #include <sys/vnode.h> 65 #include <sys/mount.h> 66 #include <sys/sysctl.h> 67 #include <sys/bitstring.h> 68 #include <sys/sdt.h> 69 70 #include <geom/geom.h> 71 72 #include <machine/_inttypes.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 78 #include <xen/xen-os.h> 79 #include <xen/blkif.h> 80 #include <xen/gnttab.h> 81 #include <xen/xen_intr.h> 82 83 #include <xen/interface/event_channel.h> 84 #include <xen/interface/grant_table.h> 85 86 #include <xen/xenbus/xenbusvar.h> 87 88 /*--------------------------- Compile-time Tunables --------------------------*/ 89 /** 90 * The maximum number of shared memory ring pages we will allow in a 91 * negotiated block-front/back communication channel. Allow enough 92 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 93 */ 94 #define XBB_MAX_RING_PAGES 32 95 96 /** 97 * The maximum number of outstanding request blocks (request headers plus 98 * additional segment blocks) we will allow in a negotiated block-front/back 99 * communication channel. 100 */ 101 #define XBB_MAX_REQUESTS \ 102 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 103 104 /** 105 * \brief Define to force all I/O to be performed on memory owned by the 106 * backend device, with a copy-in/out to the remote domain's memory. 107 * 108 * \note This option is currently required when this driver's domain is 109 * operating in HVM mode on a system using an IOMMU. 110 * 111 * This driver uses Xen's grant table API to gain access to the memory of 112 * the remote domains it serves. When our domain is operating in PV mode, 113 * the grant table mechanism directly updates our domain's page table entries 114 * to point to the physical pages of the remote domain. This scheme guarantees 115 * that blkback and the backing devices it uses can safely perform DMA 116 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 117 * insure that our domain cannot DMA to pages owned by another domain. As 118 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 119 * table API. For this reason, in HVM mode, we must bounce all requests into 120 * memory that is mapped into our domain at domain startup and thus has 121 * valid IOMMU mappings. 122 */ 123 #define XBB_USE_BOUNCE_BUFFERS 124 125 /** 126 * \brief Define to enable rudimentary request logging to the console. 127 */ 128 #undef XBB_DEBUG 129 130 /*---------------------------------- Macros ----------------------------------*/ 131 /** 132 * Custom malloc type for all driver allocations. 133 */ 134 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 135 136 #ifdef XBB_DEBUG 137 #define DPRINTF(fmt, args...) \ 138 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 139 #else 140 #define DPRINTF(fmt, args...) do {} while(0) 141 #endif 142 143 /** 144 * The maximum mapped region size per request we will allow in a negotiated 145 * block-front/back communication channel. 146 */ 147 #define XBB_MAX_REQUEST_SIZE \ 148 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 149 150 /** 151 * The maximum number of segments (within a request header and accompanying 152 * segment blocks) per request we will allow in a negotiated block-front/back 153 * communication channel. 154 */ 155 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 156 (MIN(UIO_MAXIOV, \ 157 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 158 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 159 160 /** 161 * The maximum number of ring pages that we can allow per request list. 162 * We limit this to the maximum number of segments per request, because 163 * that is already a reasonable number of segments to aggregate. This 164 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 165 * because that would leave situations where we can't dispatch even one 166 * large request. 167 */ 168 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 169 170 /*--------------------------- Forward Declarations ---------------------------*/ 171 struct xbb_softc; 172 struct xbb_xen_req; 173 174 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 175 ...) __attribute__((format(printf, 3, 4))); 176 static int xbb_shutdown(struct xbb_softc *xbb); 177 178 /*------------------------------ Data Structures -----------------------------*/ 179 180 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 181 182 typedef enum { 183 XBB_REQLIST_NONE = 0x00, 184 XBB_REQLIST_MAPPED = 0x01 185 } xbb_reqlist_flags; 186 187 struct xbb_xen_reqlist { 188 /** 189 * Back reference to the parent block back instance for this 190 * request. Used during bio_done handling. 191 */ 192 struct xbb_softc *xbb; 193 194 /** 195 * BLKIF_OP code for this request. 196 */ 197 int operation; 198 199 /** 200 * Set to BLKIF_RSP_* to indicate request status. 201 * 202 * This field allows an error status to be recorded even if the 203 * delivery of this status must be deferred. Deferred reporting 204 * is necessary, for example, when an error is detected during 205 * completion processing of one bio when other bios for this 206 * request are still outstanding. 207 */ 208 int status; 209 210 /** 211 * Number of 512 byte sectors not transferred. 212 */ 213 int residual_512b_sectors; 214 215 /** 216 * Starting sector number of the first request in the list. 217 */ 218 off_t starting_sector_number; 219 220 /** 221 * If we're going to coalesce, the next contiguous sector would be 222 * this one. 223 */ 224 off_t next_contig_sector; 225 226 /** 227 * Number of child requests in the list. 228 */ 229 int num_children; 230 231 /** 232 * Number of I/O requests still pending on the backend. 233 */ 234 int pendcnt; 235 236 /** 237 * Total number of segments for requests in the list. 238 */ 239 int nr_segments; 240 241 /** 242 * Flags for this particular request list. 243 */ 244 xbb_reqlist_flags flags; 245 246 /** 247 * Kernel virtual address space reserved for this request 248 * list structure and used to map the remote domain's pages for 249 * this I/O, into our domain's address space. 250 */ 251 uint8_t *kva; 252 253 /** 254 * Base, pseudo-physical address, corresponding to the start 255 * of this request's kva region. 256 */ 257 uint64_t gnt_base; 258 259 #ifdef XBB_USE_BOUNCE_BUFFERS 260 /** 261 * Pre-allocated domain local memory used to proxy remote 262 * domain memory during I/O operations. 263 */ 264 uint8_t *bounce; 265 #endif 266 267 /** 268 * Array of grant handles (one per page) used to map this request. 269 */ 270 grant_handle_t *gnt_handles; 271 272 /** 273 * Device statistics request ordering type (ordered or simple). 274 */ 275 devstat_tag_type ds_tag_type; 276 277 /** 278 * Device statistics request type (read, write, no_data). 279 */ 280 devstat_trans_flags ds_trans_type; 281 282 /** 283 * The start time for this request. 284 */ 285 struct bintime ds_t0; 286 287 /** 288 * Linked list of contiguous requests with the same operation type. 289 */ 290 struct xbb_xen_req_list contig_req_list; 291 292 /** 293 * Linked list links used to aggregate idle requests in the 294 * request list free pool (xbb->reqlist_free_stailq) and pending 295 * requests waiting for execution (xbb->reqlist_pending_stailq). 296 */ 297 STAILQ_ENTRY(xbb_xen_reqlist) links; 298 }; 299 300 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 301 302 /** 303 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 304 */ 305 struct xbb_xen_req { 306 /** 307 * Linked list links used to aggregate requests into a reqlist 308 * and to store them in the request free pool. 309 */ 310 STAILQ_ENTRY(xbb_xen_req) links; 311 312 /** 313 * The remote domain's identifier for this I/O request. 314 */ 315 uint64_t id; 316 317 /** 318 * The number of pages currently mapped for this request. 319 */ 320 int nr_pages; 321 322 /** 323 * The number of 512 byte sectors comprising this requests. 324 */ 325 int nr_512b_sectors; 326 327 /** 328 * BLKIF_OP code for this request. 329 */ 330 int operation; 331 332 /** 333 * Storage used for non-native ring requests. 334 */ 335 blkif_request_t ring_req_storage; 336 337 /** 338 * Pointer to the Xen request in the ring. 339 */ 340 blkif_request_t *ring_req; 341 342 /** 343 * Consumer index for this request. 344 */ 345 RING_IDX req_ring_idx; 346 347 /** 348 * The start time for this request. 349 */ 350 struct bintime ds_t0; 351 352 /** 353 * Pointer back to our parent request list. 354 */ 355 struct xbb_xen_reqlist *reqlist; 356 }; 357 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 358 359 /** 360 * \brief Configuration data for the shared memory request ring 361 * used to communicate with the front-end client of this 362 * this driver. 363 */ 364 struct xbb_ring_config { 365 /** KVA address where ring memory is mapped. */ 366 vm_offset_t va; 367 368 /** The pseudo-physical address where ring memory is mapped.*/ 369 uint64_t gnt_addr; 370 371 /** 372 * Grant table handles, one per-ring page, returned by the 373 * hyperpervisor upon mapping of the ring and required to 374 * unmap it when a connection is torn down. 375 */ 376 grant_handle_t handle[XBB_MAX_RING_PAGES]; 377 378 /** 379 * The device bus address returned by the hypervisor when 380 * mapping the ring and required to unmap it when a connection 381 * is torn down. 382 */ 383 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 384 385 /** The number of ring pages mapped for the current connection. */ 386 u_int ring_pages; 387 388 /** 389 * The grant references, one per-ring page, supplied by the 390 * front-end, allowing us to reference the ring pages in the 391 * front-end's domain and to map these pages into our own domain. 392 */ 393 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 394 395 /** The interrupt driven even channel used to signal ring events. */ 396 evtchn_port_t evtchn; 397 }; 398 399 /** 400 * Per-instance connection state flags. 401 */ 402 typedef enum 403 { 404 /** 405 * The front-end requested a read-only mount of the 406 * back-end device/file. 407 */ 408 XBBF_READ_ONLY = 0x01, 409 410 /** Communication with the front-end has been established. */ 411 XBBF_RING_CONNECTED = 0x02, 412 413 /** 414 * Front-end requests exist in the ring and are waiting for 415 * xbb_xen_req objects to free up. 416 */ 417 XBBF_RESOURCE_SHORTAGE = 0x04, 418 419 /** Connection teardown in progress. */ 420 XBBF_SHUTDOWN = 0x08, 421 422 /** A thread is already performing shutdown processing. */ 423 XBBF_IN_SHUTDOWN = 0x10 424 } xbb_flag_t; 425 426 /** Backend device type. */ 427 typedef enum { 428 /** Backend type unknown. */ 429 XBB_TYPE_NONE = 0x00, 430 431 /** 432 * Backend type disk (access via cdev switch 433 * strategy routine). 434 */ 435 XBB_TYPE_DISK = 0x01, 436 437 /** Backend type file (access vnode operations.). */ 438 XBB_TYPE_FILE = 0x02 439 } xbb_type; 440 441 /** 442 * \brief Structure used to memoize information about a per-request 443 * scatter-gather list. 444 * 445 * The chief benefit of using this data structure is it avoids having 446 * to reparse the possibly discontiguous S/G list in the original 447 * request. Due to the way that the mapping of the memory backing an 448 * I/O transaction is handled by Xen, a second pass is unavoidable. 449 * At least this way the second walk is a simple array traversal. 450 * 451 * \note A single Scatter/Gather element in the block interface covers 452 * at most 1 machine page. In this context a sector (blkif 453 * nomenclature, not what I'd choose) is a 512b aligned unit 454 * of mapping within the machine page referenced by an S/G 455 * element. 456 */ 457 struct xbb_sg { 458 /** The number of 512b data chunks mapped in this S/G element. */ 459 int16_t nsect; 460 461 /** 462 * The index (0 based) of the first 512b data chunk mapped 463 * in this S/G element. 464 */ 465 uint8_t first_sect; 466 467 /** 468 * The index (0 based) of the last 512b data chunk mapped 469 * in this S/G element. 470 */ 471 uint8_t last_sect; 472 }; 473 474 /** 475 * Character device backend specific configuration data. 476 */ 477 struct xbb_dev_data { 478 /** Cdev used for device backend access. */ 479 struct cdev *cdev; 480 481 /** Cdev switch used for device backend access. */ 482 struct cdevsw *csw; 483 484 /** Used to hold a reference on opened cdev backend devices. */ 485 int dev_ref; 486 }; 487 488 /** 489 * File backend specific configuration data. 490 */ 491 struct xbb_file_data { 492 /** Credentials to use for vnode backed (file based) I/O. */ 493 struct ucred *cred; 494 495 /** 496 * \brief Array of io vectors used to process file based I/O. 497 * 498 * Only a single file based request is outstanding per-xbb instance, 499 * so we only need one of these. 500 */ 501 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 502 #ifdef XBB_USE_BOUNCE_BUFFERS 503 504 /** 505 * \brief Array of io vectors used to handle bouncing of file reads. 506 * 507 * Vnode operations are free to modify uio data during their 508 * exectuion. In the case of a read with bounce buffering active, 509 * we need some of the data from the original uio in order to 510 * bounce-out the read data. This array serves as the temporary 511 * storage for this saved data. 512 */ 513 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 514 515 /** 516 * \brief Array of memoized bounce buffer kva offsets used 517 * in the file based backend. 518 * 519 * Due to the way that the mapping of the memory backing an 520 * I/O transaction is handled by Xen, a second pass through 521 * the request sg elements is unavoidable. We memoize the computed 522 * bounce address here to reduce the cost of the second walk. 523 */ 524 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 525 #endif /* XBB_USE_BOUNCE_BUFFERS */ 526 }; 527 528 /** 529 * Collection of backend type specific data. 530 */ 531 union xbb_backend_data { 532 struct xbb_dev_data dev; 533 struct xbb_file_data file; 534 }; 535 536 /** 537 * Function signature of backend specific I/O handlers. 538 */ 539 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 540 struct xbb_xen_reqlist *reqlist, int operation, 541 int flags); 542 543 /** 544 * Per-instance configuration data. 545 */ 546 struct xbb_softc { 547 /** 548 * Task-queue used to process I/O requests. 549 */ 550 struct taskqueue *io_taskqueue; 551 552 /** 553 * Single "run the request queue" task enqueued 554 * on io_taskqueue. 555 */ 556 struct task io_task; 557 558 /** Device type for this instance. */ 559 xbb_type device_type; 560 561 /** NewBus device corresponding to this instance. */ 562 device_t dev; 563 564 /** Backend specific dispatch routine for this instance. */ 565 xbb_dispatch_t dispatch_io; 566 567 /** The number of requests outstanding on the backend device/file. */ 568 int active_request_count; 569 570 /** Free pool of request tracking structures. */ 571 struct xbb_xen_req_list request_free_stailq; 572 573 /** Array, sized at connection time, of request tracking structures. */ 574 struct xbb_xen_req *requests; 575 576 /** Free pool of request list structures. */ 577 struct xbb_xen_reqlist_list reqlist_free_stailq; 578 579 /** List of pending request lists awaiting execution. */ 580 struct xbb_xen_reqlist_list reqlist_pending_stailq; 581 582 /** Array, sized at connection time, of request list structures. */ 583 struct xbb_xen_reqlist *request_lists; 584 585 /** 586 * Global pool of kva used for mapping remote domain ring 587 * and I/O transaction data. 588 */ 589 vm_offset_t kva; 590 591 /** Pseudo-physical address corresponding to kva. */ 592 uint64_t gnt_base_addr; 593 594 /** The size of the global kva pool. */ 595 int kva_size; 596 597 /** The size of the KVA area used for request lists. */ 598 int reqlist_kva_size; 599 600 /** The number of pages of KVA used for request lists */ 601 int reqlist_kva_pages; 602 603 /** Bitmap of free KVA pages */ 604 bitstr_t *kva_free; 605 606 /** 607 * \brief Cached value of the front-end's domain id. 608 * 609 * This value is used at once for each mapped page in 610 * a transaction. We cache it to avoid incuring the 611 * cost of an ivar access every time this is needed. 612 */ 613 domid_t otherend_id; 614 615 /** 616 * \brief The blkif protocol abi in effect. 617 * 618 * There are situations where the back and front ends can 619 * have a different, native abi (e.g. intel x86_64 and 620 * 32bit x86 domains on the same machine). The back-end 621 * always accommodates the front-end's native abi. That 622 * value is pulled from the XenStore and recorded here. 623 */ 624 int abi; 625 626 /** 627 * \brief The maximum number of requests and request lists allowed 628 * to be in flight at a time. 629 * 630 * This value is negotiated via the XenStore. 631 */ 632 u_int max_requests; 633 634 /** 635 * \brief The maximum number of segments (1 page per segment) 636 * that can be mapped by a request. 637 * 638 * This value is negotiated via the XenStore. 639 */ 640 u_int max_request_segments; 641 642 /** 643 * \brief Maximum number of segments per request list. 644 * 645 * This value is derived from and will generally be larger than 646 * max_request_segments. 647 */ 648 u_int max_reqlist_segments; 649 650 /** 651 * The maximum size of any request to this back-end 652 * device. 653 * 654 * This value is negotiated via the XenStore. 655 */ 656 u_int max_request_size; 657 658 /** 659 * The maximum size of any request list. This is derived directly 660 * from max_reqlist_segments. 661 */ 662 u_int max_reqlist_size; 663 664 /** Various configuration and state bit flags. */ 665 xbb_flag_t flags; 666 667 /** Ring mapping and interrupt configuration data. */ 668 struct xbb_ring_config ring_config; 669 670 /** Runtime, cross-abi safe, structures for ring access. */ 671 blkif_back_rings_t rings; 672 673 /** IRQ mapping for the communication ring event channel. */ 674 xen_intr_handle_t xen_intr_handle; 675 676 /** 677 * \brief Backend access mode flags (e.g. write, or read-only). 678 * 679 * This value is passed to us by the front-end via the XenStore. 680 */ 681 char *dev_mode; 682 683 /** 684 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 685 * 686 * This value is passed to us by the front-end via the XenStore. 687 * Currently unused. 688 */ 689 char *dev_type; 690 691 /** 692 * \brief Backend device/file identifier. 693 * 694 * This value is passed to us by the front-end via the XenStore. 695 * We expect this to be a POSIX path indicating the file or 696 * device to open. 697 */ 698 char *dev_name; 699 700 /** 701 * Vnode corresponding to the backend device node or file 702 * we are acessing. 703 */ 704 struct vnode *vn; 705 706 union xbb_backend_data backend; 707 708 /** The native sector size of the backend. */ 709 u_int sector_size; 710 711 /** log2 of sector_size. */ 712 u_int sector_size_shift; 713 714 /** Size in bytes of the backend device or file. */ 715 off_t media_size; 716 717 /** 718 * \brief media_size expressed in terms of the backend native 719 * sector size. 720 * 721 * (e.g. xbb->media_size >> xbb->sector_size_shift). 722 */ 723 uint64_t media_num_sectors; 724 725 /** 726 * \brief Array of memoized scatter gather data computed during the 727 * conversion of blkif ring requests to internal xbb_xen_req 728 * structures. 729 * 730 * Ring processing is serialized so we only need one of these. 731 */ 732 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 733 734 /** 735 * Temporary grant table map used in xbb_dispatch_io(). When 736 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 737 * stack could cause a stack overflow. 738 */ 739 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 740 741 /** Mutex protecting per-instance data. */ 742 struct mtx lock; 743 744 /** 745 * Resource representing allocated physical address space 746 * associated with our per-instance kva region. 747 */ 748 struct resource *pseudo_phys_res; 749 750 /** Resource id for allocated physical address space. */ 751 int pseudo_phys_res_id; 752 753 /** 754 * I/O statistics from BlockBack dispatch down. These are 755 * coalesced requests, and we start them right before execution. 756 */ 757 struct devstat *xbb_stats; 758 759 /** 760 * I/O statistics coming into BlockBack. These are the requests as 761 * we get them from BlockFront. They are started as soon as we 762 * receive a request, and completed when the I/O is complete. 763 */ 764 struct devstat *xbb_stats_in; 765 766 /** Disable sending flush to the backend */ 767 int disable_flush; 768 769 /** Send a real flush for every N flush requests */ 770 int flush_interval; 771 772 /** Count of flush requests in the interval */ 773 int flush_count; 774 775 /** Don't coalesce requests if this is set */ 776 int no_coalesce_reqs; 777 778 /** Number of requests we have received */ 779 uint64_t reqs_received; 780 781 /** Number of requests we have completed*/ 782 uint64_t reqs_completed; 783 784 /** Number of requests we queued but not pushed*/ 785 uint64_t reqs_queued_for_completion; 786 787 /** Number of requests we completed with an error status*/ 788 uint64_t reqs_completed_with_error; 789 790 /** How many forced dispatches (i.e. without coalescing) have happened */ 791 uint64_t forced_dispatch; 792 793 /** How many normal dispatches have happened */ 794 uint64_t normal_dispatch; 795 796 /** How many total dispatches have happened */ 797 uint64_t total_dispatch; 798 799 /** How many times we have run out of KVA */ 800 uint64_t kva_shortages; 801 802 /** How many times we have run out of request structures */ 803 uint64_t request_shortages; 804 805 /** Watch to wait for hotplug script execution */ 806 struct xs_watch hotplug_watch; 807 808 /** Got the needed data from hotplug scripts? */ 809 bool hotplug_done; 810 }; 811 812 /*---------------------------- Request Processing ----------------------------*/ 813 /** 814 * Allocate an internal transaction tracking structure from the free pool. 815 * 816 * \param xbb Per-instance xbb configuration structure. 817 * 818 * \return On success, a pointer to the allocated xbb_xen_req structure. 819 * Otherwise NULL. 820 */ 821 static inline struct xbb_xen_req * 822 xbb_get_req(struct xbb_softc *xbb) 823 { 824 struct xbb_xen_req *req; 825 826 req = NULL; 827 828 mtx_assert(&xbb->lock, MA_OWNED); 829 830 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 831 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 832 xbb->active_request_count++; 833 } 834 835 return (req); 836 } 837 838 /** 839 * Return an allocated transaction tracking structure to the free pool. 840 * 841 * \param xbb Per-instance xbb configuration structure. 842 * \param req The request structure to free. 843 */ 844 static inline void 845 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 846 { 847 mtx_assert(&xbb->lock, MA_OWNED); 848 849 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 850 xbb->active_request_count--; 851 852 KASSERT(xbb->active_request_count >= 0, 853 ("xbb_release_req: negative active count")); 854 } 855 856 /** 857 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 858 * 859 * \param xbb Per-instance xbb configuration structure. 860 * \param req_list The list of requests to free. 861 * \param nreqs The number of items in the list. 862 */ 863 static inline void 864 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 865 int nreqs) 866 { 867 mtx_assert(&xbb->lock, MA_OWNED); 868 869 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 870 xbb->active_request_count -= nreqs; 871 872 KASSERT(xbb->active_request_count >= 0, 873 ("xbb_release_reqs: negative active count")); 874 } 875 876 /** 877 * Given a page index and 512b sector offset within that page, 878 * calculate an offset into a request's kva region. 879 * 880 * \param reqlist The request structure whose kva region will be accessed. 881 * \param pagenr The page index used to compute the kva offset. 882 * \param sector The 512b sector index used to compute the page relative 883 * kva offset. 884 * 885 * \return The computed global KVA offset. 886 */ 887 static inline uint8_t * 888 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 889 { 890 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 891 } 892 893 #ifdef XBB_USE_BOUNCE_BUFFERS 894 /** 895 * Given a page index and 512b sector offset within that page, 896 * calculate an offset into a request's local bounce memory region. 897 * 898 * \param reqlist The request structure whose bounce region will be accessed. 899 * \param pagenr The page index used to compute the bounce offset. 900 * \param sector The 512b sector index used to compute the page relative 901 * bounce offset. 902 * 903 * \return The computed global bounce buffer address. 904 */ 905 static inline uint8_t * 906 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 907 { 908 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 909 } 910 #endif 911 912 /** 913 * Given a page number and 512b sector offset within that page, 914 * calculate an offset into the request's memory region that the 915 * underlying backend device/file should use for I/O. 916 * 917 * \param reqlist The request structure whose I/O region will be accessed. 918 * \param pagenr The page index used to compute the I/O offset. 919 * \param sector The 512b sector index used to compute the page relative 920 * I/O offset. 921 * 922 * \return The computed global I/O address. 923 * 924 * Depending on configuration, this will either be a local bounce buffer 925 * or a pointer to the memory mapped in from the front-end domain for 926 * this request. 927 */ 928 static inline uint8_t * 929 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 930 { 931 #ifdef XBB_USE_BOUNCE_BUFFERS 932 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 933 #else 934 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 935 #endif 936 } 937 938 /** 939 * Given a page index and 512b sector offset within that page, calculate 940 * an offset into the local pseudo-physical address space used to map a 941 * front-end's request data into a request. 942 * 943 * \param reqlist The request list structure whose pseudo-physical region 944 * will be accessed. 945 * \param pagenr The page index used to compute the pseudo-physical offset. 946 * \param sector The 512b sector index used to compute the page relative 947 * pseudo-physical offset. 948 * 949 * \return The computed global pseudo-phsyical address. 950 * 951 * Depending on configuration, this will either be a local bounce buffer 952 * or a pointer to the memory mapped in from the front-end domain for 953 * this request. 954 */ 955 static inline uintptr_t 956 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 957 { 958 struct xbb_softc *xbb; 959 960 xbb = reqlist->xbb; 961 962 return ((uintptr_t)(xbb->gnt_base_addr + 963 (uintptr_t)(reqlist->kva - xbb->kva) + 964 (PAGE_SIZE * pagenr) + (sector << 9))); 965 } 966 967 /** 968 * Get Kernel Virtual Address space for mapping requests. 969 * 970 * \param xbb Per-instance xbb configuration structure. 971 * \param nr_pages Number of pages needed. 972 * \param check_only If set, check for free KVA but don't allocate it. 973 * \param have_lock If set, xbb lock is already held. 974 * 975 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 976 * 977 * Note: This should be unnecessary once we have either chaining or 978 * scatter/gather support for struct bio. At that point we'll be able to 979 * put multiple addresses and lengths in one bio/bio chain and won't need 980 * to map everything into one virtual segment. 981 */ 982 static uint8_t * 983 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 984 { 985 int first_clear; 986 int num_clear; 987 uint8_t *free_kva; 988 int i; 989 990 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 991 992 first_clear = 0; 993 free_kva = NULL; 994 995 mtx_lock(&xbb->lock); 996 997 /* 998 * Look for the first available page. If there are none, we're done. 999 */ 1000 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 1001 1002 if (first_clear == -1) 1003 goto bailout; 1004 1005 /* 1006 * Starting at the first available page, look for consecutive free 1007 * pages that will satisfy the user's request. 1008 */ 1009 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1010 /* 1011 * If this is true, the page is used, so we have to reset 1012 * the number of clear pages and the first clear page 1013 * (since it pointed to a region with an insufficient number 1014 * of clear pages). 1015 */ 1016 if (bit_test(xbb->kva_free, i)) { 1017 num_clear = 0; 1018 first_clear = -1; 1019 continue; 1020 } 1021 1022 if (first_clear == -1) 1023 first_clear = i; 1024 1025 /* 1026 * If this is true, we've found a large enough free region 1027 * to satisfy the request. 1028 */ 1029 if (++num_clear == nr_pages) { 1030 bit_nset(xbb->kva_free, first_clear, 1031 first_clear + nr_pages - 1); 1032 1033 free_kva = xbb->kva + 1034 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 1035 1036 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1037 free_kva + (nr_pages * PAGE_SIZE) <= 1038 (uint8_t *)xbb->ring_config.va, 1039 ("Free KVA %p len %d out of range, " 1040 "kva = %#jx, ring VA = %#jx\n", free_kva, 1041 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1042 (uintmax_t)xbb->ring_config.va)); 1043 break; 1044 } 1045 } 1046 1047 bailout: 1048 1049 if (free_kva == NULL) { 1050 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1051 xbb->kva_shortages++; 1052 } 1053 1054 mtx_unlock(&xbb->lock); 1055 1056 return (free_kva); 1057 } 1058 1059 /** 1060 * Free allocated KVA. 1061 * 1062 * \param xbb Per-instance xbb configuration structure. 1063 * \param kva_ptr Pointer to allocated KVA region. 1064 * \param nr_pages Number of pages in the KVA region. 1065 */ 1066 static void 1067 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1068 { 1069 intptr_t start_page; 1070 1071 mtx_assert(&xbb->lock, MA_OWNED); 1072 1073 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1074 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1075 1076 } 1077 1078 /** 1079 * Unmap the front-end pages associated with this I/O request. 1080 * 1081 * \param req The request structure to unmap. 1082 */ 1083 static void 1084 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1085 { 1086 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1087 u_int i; 1088 u_int invcount; 1089 int error; 1090 1091 invcount = 0; 1092 for (i = 0; i < reqlist->nr_segments; i++) { 1093 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1094 continue; 1095 1096 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1097 unmap[invcount].dev_bus_addr = 0; 1098 unmap[invcount].handle = reqlist->gnt_handles[i]; 1099 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1100 invcount++; 1101 } 1102 1103 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1104 unmap, invcount); 1105 KASSERT(error == 0, ("Grant table operation failed")); 1106 } 1107 1108 /** 1109 * Allocate an internal transaction tracking structure from the free pool. 1110 * 1111 * \param xbb Per-instance xbb configuration structure. 1112 * 1113 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1114 * Otherwise NULL. 1115 */ 1116 static inline struct xbb_xen_reqlist * 1117 xbb_get_reqlist(struct xbb_softc *xbb) 1118 { 1119 struct xbb_xen_reqlist *reqlist; 1120 1121 reqlist = NULL; 1122 1123 mtx_assert(&xbb->lock, MA_OWNED); 1124 1125 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1126 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1127 reqlist->flags = XBB_REQLIST_NONE; 1128 reqlist->kva = NULL; 1129 reqlist->status = BLKIF_RSP_OKAY; 1130 reqlist->residual_512b_sectors = 0; 1131 reqlist->num_children = 0; 1132 reqlist->nr_segments = 0; 1133 STAILQ_INIT(&reqlist->contig_req_list); 1134 } 1135 1136 return (reqlist); 1137 } 1138 1139 /** 1140 * Return an allocated transaction tracking structure to the free pool. 1141 * 1142 * \param xbb Per-instance xbb configuration structure. 1143 * \param req The request list structure to free. 1144 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1145 * during a resource shortage condition. 1146 */ 1147 static inline void 1148 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1149 int wakeup) 1150 { 1151 1152 mtx_assert(&xbb->lock, MA_OWNED); 1153 1154 if (wakeup) { 1155 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1156 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1157 } 1158 1159 if (reqlist->kva != NULL) 1160 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1161 1162 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1163 1164 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1165 1166 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1167 /* 1168 * Shutdown is in progress. See if we can 1169 * progress further now that one more request 1170 * has completed and been returned to the 1171 * free pool. 1172 */ 1173 xbb_shutdown(xbb); 1174 } 1175 1176 if (wakeup != 0) 1177 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1178 } 1179 1180 /** 1181 * Request resources and do basic request setup. 1182 * 1183 * \param xbb Per-instance xbb configuration structure. 1184 * \param reqlist Pointer to reqlist pointer. 1185 * \param ring_req Pointer to a block ring request. 1186 * \param ring_index The ring index of this request. 1187 * 1188 * \return 0 for success, non-zero for failure. 1189 */ 1190 static int 1191 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1192 blkif_request_t *ring_req, RING_IDX ring_idx) 1193 { 1194 struct xbb_xen_reqlist *nreqlist; 1195 struct xbb_xen_req *nreq; 1196 1197 nreqlist = NULL; 1198 nreq = NULL; 1199 1200 mtx_lock(&xbb->lock); 1201 1202 /* 1203 * We don't allow new resources to be allocated if we're in the 1204 * process of shutting down. 1205 */ 1206 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1207 mtx_unlock(&xbb->lock); 1208 return (1); 1209 } 1210 1211 /* 1212 * Allocate a reqlist if the caller doesn't have one already. 1213 */ 1214 if (*reqlist == NULL) { 1215 nreqlist = xbb_get_reqlist(xbb); 1216 if (nreqlist == NULL) 1217 goto bailout_error; 1218 } 1219 1220 /* We always allocate a request. */ 1221 nreq = xbb_get_req(xbb); 1222 if (nreq == NULL) 1223 goto bailout_error; 1224 1225 mtx_unlock(&xbb->lock); 1226 1227 if (*reqlist == NULL) { 1228 *reqlist = nreqlist; 1229 nreqlist->operation = ring_req->operation; 1230 nreqlist->starting_sector_number = ring_req->sector_number; 1231 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1232 links); 1233 } 1234 1235 nreq->reqlist = *reqlist; 1236 nreq->req_ring_idx = ring_idx; 1237 nreq->id = ring_req->id; 1238 nreq->operation = ring_req->operation; 1239 1240 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1241 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1242 nreq->ring_req = &nreq->ring_req_storage; 1243 } else { 1244 nreq->ring_req = ring_req; 1245 } 1246 1247 binuptime(&nreq->ds_t0); 1248 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1249 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1250 (*reqlist)->num_children++; 1251 (*reqlist)->nr_segments += ring_req->nr_segments; 1252 1253 return (0); 1254 1255 bailout_error: 1256 1257 /* 1258 * We're out of resources, so set the shortage flag. The next time 1259 * a request is released, we'll try waking up the work thread to 1260 * see if we can allocate more resources. 1261 */ 1262 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1263 xbb->request_shortages++; 1264 1265 if (nreq != NULL) 1266 xbb_release_req(xbb, nreq); 1267 1268 if (nreqlist != NULL) 1269 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1270 1271 mtx_unlock(&xbb->lock); 1272 1273 return (1); 1274 } 1275 1276 /** 1277 * Create and queue a response to a blkif request. 1278 * 1279 * \param xbb Per-instance xbb configuration structure. 1280 * \param req The request structure to which to respond. 1281 * \param status The status code to report. See BLKIF_RSP_* 1282 * in sys/xen/interface/io/blkif.h. 1283 */ 1284 static void 1285 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1286 { 1287 blkif_response_t *resp; 1288 1289 /* 1290 * The mutex is required here, and should be held across this call 1291 * until after the subsequent call to xbb_push_responses(). This 1292 * is to guarantee that another context won't queue responses and 1293 * push them while we're active. 1294 * 1295 * That could lead to the other end being notified of responses 1296 * before the resources have been freed on this end. The other end 1297 * would then be able to queue additional I/O, and we may run out 1298 * of resources because we haven't freed them all yet. 1299 */ 1300 mtx_assert(&xbb->lock, MA_OWNED); 1301 1302 /* 1303 * Place on the response ring for the relevant domain. 1304 * For now, only the spacing between entries is different 1305 * in the different ABIs, not the response entry layout. 1306 */ 1307 switch (xbb->abi) { 1308 case BLKIF_PROTOCOL_NATIVE: 1309 resp = RING_GET_RESPONSE(&xbb->rings.native, 1310 xbb->rings.native.rsp_prod_pvt); 1311 break; 1312 case BLKIF_PROTOCOL_X86_32: 1313 resp = (blkif_response_t *) 1314 RING_GET_RESPONSE(&xbb->rings.x86_32, 1315 xbb->rings.x86_32.rsp_prod_pvt); 1316 break; 1317 case BLKIF_PROTOCOL_X86_64: 1318 resp = (blkif_response_t *) 1319 RING_GET_RESPONSE(&xbb->rings.x86_64, 1320 xbb->rings.x86_64.rsp_prod_pvt); 1321 break; 1322 default: 1323 panic("Unexpected blkif protocol ABI."); 1324 } 1325 1326 resp->id = req->id; 1327 resp->operation = req->operation; 1328 resp->status = status; 1329 1330 if (status != BLKIF_RSP_OKAY) 1331 xbb->reqs_completed_with_error++; 1332 1333 xbb->rings.common.rsp_prod_pvt++; 1334 1335 xbb->reqs_queued_for_completion++; 1336 1337 } 1338 1339 /** 1340 * Send queued responses to blkif requests. 1341 * 1342 * \param xbb Per-instance xbb configuration structure. 1343 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1344 * should be run, 0 if it does not need to be run. 1345 * \param notify Flag that is set to 1 if the other end should be 1346 * notified via irq, 0 if the other end should not be 1347 * notified. 1348 */ 1349 static void 1350 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1351 { 1352 int more_to_do; 1353 1354 /* 1355 * The mutex is required here. 1356 */ 1357 mtx_assert(&xbb->lock, MA_OWNED); 1358 1359 more_to_do = 0; 1360 1361 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1362 1363 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1364 /* 1365 * Tail check for pending requests. Allows frontend to avoid 1366 * notifications if requests are already in flight (lower 1367 * overheads and promotes batching). 1368 */ 1369 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1370 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1371 more_to_do = 1; 1372 } 1373 1374 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1375 xbb->reqs_queued_for_completion = 0; 1376 1377 *run_taskqueue = more_to_do; 1378 } 1379 1380 /** 1381 * Complete a request list. 1382 * 1383 * \param xbb Per-instance xbb configuration structure. 1384 * \param reqlist Allocated internal request list structure. 1385 */ 1386 static void 1387 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1388 { 1389 struct xbb_xen_req *nreq; 1390 off_t sectors_sent; 1391 int notify, run_taskqueue; 1392 1393 sectors_sent = 0; 1394 1395 if (reqlist->flags & XBB_REQLIST_MAPPED) 1396 xbb_unmap_reqlist(reqlist); 1397 1398 mtx_lock(&xbb->lock); 1399 1400 /* 1401 * All I/O is done, send the response. A lock is not necessary 1402 * to protect the request list, because all requests have 1403 * completed. Therefore this is the only context accessing this 1404 * reqlist right now. However, in order to make sure that no one 1405 * else queues responses onto the queue or pushes them to the other 1406 * side while we're active, we need to hold the lock across the 1407 * calls to xbb_queue_response() and xbb_push_responses(). 1408 */ 1409 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1410 off_t cur_sectors_sent; 1411 1412 /* Put this response on the ring, but don't push yet */ 1413 xbb_queue_response(xbb, nreq, reqlist->status); 1414 1415 /* We don't report bytes sent if there is an error. */ 1416 if (reqlist->status == BLKIF_RSP_OKAY) 1417 cur_sectors_sent = nreq->nr_512b_sectors; 1418 else 1419 cur_sectors_sent = 0; 1420 1421 sectors_sent += cur_sectors_sent; 1422 1423 devstat_end_transaction(xbb->xbb_stats_in, 1424 /*bytes*/cur_sectors_sent << 9, 1425 reqlist->ds_tag_type, 1426 reqlist->ds_trans_type, 1427 /*now*/NULL, 1428 /*then*/&nreq->ds_t0); 1429 } 1430 1431 /* 1432 * Take out any sectors not sent. If we wind up negative (which 1433 * might happen if an error is reported as well as a residual), just 1434 * report 0 sectors sent. 1435 */ 1436 sectors_sent -= reqlist->residual_512b_sectors; 1437 if (sectors_sent < 0) 1438 sectors_sent = 0; 1439 1440 devstat_end_transaction(xbb->xbb_stats, 1441 /*bytes*/ sectors_sent << 9, 1442 reqlist->ds_tag_type, 1443 reqlist->ds_trans_type, 1444 /*now*/NULL, 1445 /*then*/&reqlist->ds_t0); 1446 1447 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1448 1449 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1450 1451 mtx_unlock(&xbb->lock); 1452 1453 if (run_taskqueue) 1454 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1455 1456 if (notify) 1457 xen_intr_signal(xbb->xen_intr_handle); 1458 } 1459 1460 /** 1461 * Completion handler for buffer I/O requests issued by the device 1462 * backend driver. 1463 * 1464 * \param bio The buffer I/O request on which to perform completion 1465 * processing. 1466 */ 1467 static void 1468 xbb_bio_done(struct bio *bio) 1469 { 1470 struct xbb_softc *xbb; 1471 struct xbb_xen_reqlist *reqlist; 1472 1473 reqlist = bio->bio_caller1; 1474 xbb = reqlist->xbb; 1475 1476 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1477 1478 /* 1479 * This is a bit imprecise. With aggregated I/O a single 1480 * request list can contain multiple front-end requests and 1481 * a multiple bios may point to a single request. By carefully 1482 * walking the request list, we could map residuals and errors 1483 * back to the original front-end request, but the interface 1484 * isn't sufficiently rich for us to properly report the error. 1485 * So, we just treat the entire request list as having failed if an 1486 * error occurs on any part. And, if an error occurs, we treat 1487 * the amount of data transferred as 0. 1488 * 1489 * For residuals, we report it on the overall aggregated device, 1490 * but not on the individual requests, since we don't currently 1491 * do the work to determine which front-end request to which the 1492 * residual applies. 1493 */ 1494 if (bio->bio_error) { 1495 DPRINTF("BIO returned error %d for operation on device %s\n", 1496 bio->bio_error, xbb->dev_name); 1497 reqlist->status = BLKIF_RSP_ERROR; 1498 1499 if (bio->bio_error == ENXIO 1500 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1501 /* 1502 * Backend device has disappeared. Signal the 1503 * front-end that we (the device proxy) want to 1504 * go away. 1505 */ 1506 xenbus_set_state(xbb->dev, XenbusStateClosing); 1507 } 1508 } 1509 1510 #ifdef XBB_USE_BOUNCE_BUFFERS 1511 if (bio->bio_cmd == BIO_READ) { 1512 vm_offset_t kva_offset; 1513 1514 kva_offset = (vm_offset_t)bio->bio_data 1515 - (vm_offset_t)reqlist->bounce; 1516 memcpy((uint8_t *)reqlist->kva + kva_offset, 1517 bio->bio_data, bio->bio_bcount); 1518 } 1519 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1520 1521 /* 1522 * Decrement the pending count for the request list. When we're 1523 * done with the requests, send status back for all of them. 1524 */ 1525 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1526 xbb_complete_reqlist(xbb, reqlist); 1527 1528 g_destroy_bio(bio); 1529 } 1530 1531 /** 1532 * Parse a blkif request into an internal request structure and send 1533 * it to the backend for processing. 1534 * 1535 * \param xbb Per-instance xbb configuration structure. 1536 * \param reqlist Allocated internal request list structure. 1537 * 1538 * \return On success, 0. For resource shortages, non-zero. 1539 * 1540 * This routine performs the backend common aspects of request parsing 1541 * including compiling an internal request structure, parsing the S/G 1542 * list and any secondary ring requests in which they may reside, and 1543 * the mapping of front-end I/O pages into our domain. 1544 */ 1545 static int 1546 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1547 { 1548 struct xbb_sg *xbb_sg; 1549 struct gnttab_map_grant_ref *map; 1550 struct blkif_request_segment *sg; 1551 struct blkif_request_segment *last_block_sg; 1552 struct xbb_xen_req *nreq; 1553 u_int nseg; 1554 u_int seg_idx; 1555 u_int block_segs; 1556 int nr_sects; 1557 int total_sects; 1558 int operation; 1559 uint8_t bio_flags; 1560 int error; 1561 1562 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1563 bio_flags = 0; 1564 total_sects = 0; 1565 nr_sects = 0; 1566 1567 /* 1568 * First determine whether we have enough free KVA to satisfy this 1569 * request list. If not, tell xbb_run_queue() so it can go to 1570 * sleep until we have more KVA. 1571 */ 1572 reqlist->kva = NULL; 1573 if (reqlist->nr_segments != 0) { 1574 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1575 if (reqlist->kva == NULL) { 1576 /* 1577 * If we're out of KVA, return ENOMEM. 1578 */ 1579 return (ENOMEM); 1580 } 1581 } 1582 1583 binuptime(&reqlist->ds_t0); 1584 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1585 1586 switch (reqlist->operation) { 1587 case BLKIF_OP_WRITE_BARRIER: 1588 bio_flags |= BIO_ORDERED; 1589 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1590 /* FALLTHROUGH */ 1591 case BLKIF_OP_WRITE: 1592 operation = BIO_WRITE; 1593 reqlist->ds_trans_type = DEVSTAT_WRITE; 1594 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1595 DPRINTF("Attempt to write to read only device %s\n", 1596 xbb->dev_name); 1597 reqlist->status = BLKIF_RSP_ERROR; 1598 goto send_response; 1599 } 1600 break; 1601 case BLKIF_OP_READ: 1602 operation = BIO_READ; 1603 reqlist->ds_trans_type = DEVSTAT_READ; 1604 break; 1605 case BLKIF_OP_FLUSH_DISKCACHE: 1606 /* 1607 * If this is true, the user has requested that we disable 1608 * flush support. So we just complete the requests 1609 * successfully. 1610 */ 1611 if (xbb->disable_flush != 0) { 1612 goto send_response; 1613 } 1614 1615 /* 1616 * The user has requested that we only send a real flush 1617 * for every N flush requests. So keep count, and either 1618 * complete the request immediately or queue it for the 1619 * backend. 1620 */ 1621 if (xbb->flush_interval != 0) { 1622 if (++(xbb->flush_count) < xbb->flush_interval) { 1623 goto send_response; 1624 } else 1625 xbb->flush_count = 0; 1626 } 1627 1628 operation = BIO_FLUSH; 1629 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1630 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1631 goto do_dispatch; 1632 /*NOTREACHED*/ 1633 default: 1634 DPRINTF("error: unknown block io operation [%d]\n", 1635 reqlist->operation); 1636 reqlist->status = BLKIF_RSP_ERROR; 1637 goto send_response; 1638 } 1639 1640 reqlist->xbb = xbb; 1641 xbb_sg = xbb->xbb_sgs; 1642 map = xbb->maps; 1643 seg_idx = 0; 1644 1645 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1646 blkif_request_t *ring_req; 1647 RING_IDX req_ring_idx; 1648 u_int req_seg_idx; 1649 1650 ring_req = nreq->ring_req; 1651 req_ring_idx = nreq->req_ring_idx; 1652 nr_sects = 0; 1653 nseg = ring_req->nr_segments; 1654 nreq->nr_pages = nseg; 1655 nreq->nr_512b_sectors = 0; 1656 req_seg_idx = 0; 1657 sg = NULL; 1658 1659 /* Check that number of segments is sane. */ 1660 if (__predict_false(nseg == 0) 1661 || __predict_false(nseg > xbb->max_request_segments)) { 1662 DPRINTF("Bad number of segments in request (%d)\n", 1663 nseg); 1664 reqlist->status = BLKIF_RSP_ERROR; 1665 goto send_response; 1666 } 1667 1668 block_segs = nseg; 1669 sg = ring_req->seg; 1670 last_block_sg = sg + block_segs; 1671 1672 while (sg < last_block_sg) { 1673 KASSERT(seg_idx < 1674 XBB_MAX_SEGMENTS_PER_REQLIST, 1675 ("seg_idx %d is too large, max " 1676 "segs %d\n", seg_idx, 1677 XBB_MAX_SEGMENTS_PER_REQLIST)); 1678 1679 xbb_sg->first_sect = sg->first_sect; 1680 xbb_sg->last_sect = sg->last_sect; 1681 xbb_sg->nsect = 1682 (int8_t)(sg->last_sect - 1683 sg->first_sect + 1); 1684 1685 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1686 || (xbb_sg->nsect <= 0)) { 1687 reqlist->status = BLKIF_RSP_ERROR; 1688 goto send_response; 1689 } 1690 1691 nr_sects += xbb_sg->nsect; 1692 map->host_addr = xbb_get_gntaddr(reqlist, 1693 seg_idx, /*sector*/0); 1694 KASSERT(map->host_addr + PAGE_SIZE <= 1695 xbb->ring_config.gnt_addr, 1696 ("Host address %#jx len %d overlaps " 1697 "ring address %#jx\n", 1698 (uintmax_t)map->host_addr, PAGE_SIZE, 1699 (uintmax_t)xbb->ring_config.gnt_addr)); 1700 1701 map->flags = GNTMAP_host_map; 1702 map->ref = sg->gref; 1703 map->dom = xbb->otherend_id; 1704 if (operation == BIO_WRITE) 1705 map->flags |= GNTMAP_readonly; 1706 sg++; 1707 map++; 1708 xbb_sg++; 1709 seg_idx++; 1710 req_seg_idx++; 1711 } 1712 1713 /* Convert to the disk's sector size */ 1714 nreq->nr_512b_sectors = nr_sects; 1715 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1716 total_sects += nr_sects; 1717 1718 if ((nreq->nr_512b_sectors & 1719 ((xbb->sector_size >> 9) - 1)) != 0) { 1720 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1721 "a multiple of the backing store sector " 1722 "size (%d)\n", __func__, 1723 nreq->nr_512b_sectors << 9, 1724 xbb->sector_size); 1725 reqlist->status = BLKIF_RSP_ERROR; 1726 goto send_response; 1727 } 1728 } 1729 1730 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1731 xbb->maps, reqlist->nr_segments); 1732 if (error != 0) 1733 panic("Grant table operation failed (%d)", error); 1734 1735 reqlist->flags |= XBB_REQLIST_MAPPED; 1736 1737 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1738 seg_idx++, map++){ 1739 if (__predict_false(map->status != 0)) { 1740 DPRINTF("invalid buffer -- could not remap " 1741 "it (%d)\n", map->status); 1742 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1743 "0x%x ref 0x%x, dom %d\n", seg_idx, 1744 map->host_addr, map->flags, map->ref, 1745 map->dom); 1746 reqlist->status = BLKIF_RSP_ERROR; 1747 goto send_response; 1748 } 1749 1750 reqlist->gnt_handles[seg_idx] = map->handle; 1751 } 1752 if (reqlist->starting_sector_number + total_sects > 1753 xbb->media_num_sectors) { 1754 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1755 "extends past end of device %s\n", 1756 operation == BIO_READ ? "read" : "write", 1757 reqlist->starting_sector_number, 1758 reqlist->starting_sector_number + total_sects, 1759 xbb->dev_name); 1760 reqlist->status = BLKIF_RSP_ERROR; 1761 goto send_response; 1762 } 1763 1764 do_dispatch: 1765 1766 error = xbb->dispatch_io(xbb, 1767 reqlist, 1768 operation, 1769 bio_flags); 1770 1771 if (error != 0) { 1772 reqlist->status = BLKIF_RSP_ERROR; 1773 goto send_response; 1774 } 1775 1776 return (0); 1777 1778 send_response: 1779 1780 xbb_complete_reqlist(xbb, reqlist); 1781 1782 return (0); 1783 } 1784 1785 static __inline int 1786 xbb_count_sects(blkif_request_t *ring_req) 1787 { 1788 int i; 1789 int cur_size = 0; 1790 1791 for (i = 0; i < ring_req->nr_segments; i++) { 1792 int nsect; 1793 1794 nsect = (int8_t)(ring_req->seg[i].last_sect - 1795 ring_req->seg[i].first_sect + 1); 1796 if (nsect <= 0) 1797 break; 1798 1799 cur_size += nsect; 1800 } 1801 1802 return (cur_size); 1803 } 1804 1805 /** 1806 * Process incoming requests from the shared communication ring in response 1807 * to a signal on the ring's event channel. 1808 * 1809 * \param context Callback argument registerd during task initialization - 1810 * the xbb_softc for this instance. 1811 * \param pending The number of taskqueue_enqueue events that have 1812 * occurred since this handler was last run. 1813 */ 1814 static void 1815 xbb_run_queue(void *context, int pending) 1816 { 1817 struct xbb_softc *xbb; 1818 blkif_back_rings_t *rings; 1819 RING_IDX rp; 1820 uint64_t cur_sector; 1821 int cur_operation; 1822 struct xbb_xen_reqlist *reqlist; 1823 1824 xbb = (struct xbb_softc *)context; 1825 rings = &xbb->rings; 1826 1827 /* 1828 * Work gather and dispatch loop. Note that we have a bias here 1829 * towards gathering I/O sent by blockfront. We first gather up 1830 * everything in the ring, as long as we have resources. Then we 1831 * dispatch one request, and then attempt to gather up any 1832 * additional requests that have come in while we were dispatching 1833 * the request. 1834 * 1835 * This allows us to get a clearer picture (via devstat) of how 1836 * many requests blockfront is queueing to us at any given time. 1837 */ 1838 for (;;) { 1839 int retval; 1840 1841 /* 1842 * Initialize reqlist to the last element in the pending 1843 * queue, if there is one. This allows us to add more 1844 * requests to that request list, if we have room. 1845 */ 1846 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1847 xbb_xen_reqlist, links); 1848 if (reqlist != NULL) { 1849 cur_sector = reqlist->next_contig_sector; 1850 cur_operation = reqlist->operation; 1851 } else { 1852 cur_operation = 0; 1853 cur_sector = 0; 1854 } 1855 1856 /* 1857 * Cache req_prod to avoid accessing a cache line shared 1858 * with the frontend. 1859 */ 1860 rp = rings->common.sring->req_prod; 1861 1862 /* Ensure we see queued requests up to 'rp'. */ 1863 rmb(); 1864 1865 /** 1866 * Run so long as there is work to consume and the generation 1867 * of a response will not overflow the ring. 1868 * 1869 * @note There's a 1 to 1 relationship between requests and 1870 * responses, so an overflow should never occur. This 1871 * test is to protect our domain from digesting bogus 1872 * data. Shouldn't we log this? 1873 */ 1874 while (rings->common.req_cons != rp 1875 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1876 rings->common.req_cons) == 0){ 1877 blkif_request_t ring_req_storage; 1878 blkif_request_t *ring_req; 1879 int cur_size; 1880 1881 switch (xbb->abi) { 1882 case BLKIF_PROTOCOL_NATIVE: 1883 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1884 rings->common.req_cons); 1885 break; 1886 case BLKIF_PROTOCOL_X86_32: 1887 { 1888 struct blkif_x86_32_request *ring_req32; 1889 1890 ring_req32 = RING_GET_REQUEST( 1891 &xbb->rings.x86_32, rings->common.req_cons); 1892 blkif_get_x86_32_req(&ring_req_storage, 1893 ring_req32); 1894 ring_req = &ring_req_storage; 1895 break; 1896 } 1897 case BLKIF_PROTOCOL_X86_64: 1898 { 1899 struct blkif_x86_64_request *ring_req64; 1900 1901 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1902 rings->common.req_cons); 1903 blkif_get_x86_64_req(&ring_req_storage, 1904 ring_req64); 1905 ring_req = &ring_req_storage; 1906 break; 1907 } 1908 default: 1909 panic("Unexpected blkif protocol ABI."); 1910 /* NOTREACHED */ 1911 } 1912 1913 /* 1914 * Check for situations that would require closing 1915 * off this I/O for further coalescing: 1916 * - Coalescing is turned off. 1917 * - Current I/O is out of sequence with the previous 1918 * I/O. 1919 * - Coalesced I/O would be too large. 1920 */ 1921 if ((reqlist != NULL) 1922 && ((xbb->no_coalesce_reqs != 0) 1923 || ((xbb->no_coalesce_reqs == 0) 1924 && ((ring_req->sector_number != cur_sector) 1925 || (ring_req->operation != cur_operation) 1926 || ((ring_req->nr_segments + reqlist->nr_segments) > 1927 xbb->max_reqlist_segments))))) { 1928 reqlist = NULL; 1929 } 1930 1931 /* 1932 * Grab and check for all resources in one shot. 1933 * If we can't get all of the resources we need, 1934 * the shortage is noted and the thread will get 1935 * woken up when more resources are available. 1936 */ 1937 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1938 xbb->rings.common.req_cons); 1939 1940 if (retval != 0) { 1941 /* 1942 * Resource shortage has been recorded. 1943 * We'll be scheduled to run once a request 1944 * object frees up due to a completion. 1945 */ 1946 break; 1947 } 1948 1949 /* 1950 * Signify that we can overwrite this request with 1951 * a response by incrementing our consumer index. 1952 * The response won't be generated until after 1953 * we've already consumed all necessary data out 1954 * of the version of the request in the ring buffer 1955 * (for native mode). We must update the consumer 1956 * index before issuing back-end I/O so there is 1957 * no possibility that it will complete and a 1958 * response be generated before we make room in 1959 * the queue for that response. 1960 */ 1961 xbb->rings.common.req_cons++; 1962 xbb->reqs_received++; 1963 1964 cur_size = xbb_count_sects(ring_req); 1965 cur_sector = ring_req->sector_number + cur_size; 1966 reqlist->next_contig_sector = cur_sector; 1967 cur_operation = ring_req->operation; 1968 } 1969 1970 /* Check for I/O to dispatch */ 1971 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1972 if (reqlist == NULL) { 1973 /* 1974 * We're out of work to do, put the task queue to 1975 * sleep. 1976 */ 1977 break; 1978 } 1979 1980 /* 1981 * Grab the first request off the queue and attempt 1982 * to dispatch it. 1983 */ 1984 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1985 1986 retval = xbb_dispatch_io(xbb, reqlist); 1987 if (retval != 0) { 1988 /* 1989 * xbb_dispatch_io() returns non-zero only when 1990 * there is a resource shortage. If that's the 1991 * case, re-queue this request on the head of the 1992 * queue, and go to sleep until we have more 1993 * resources. 1994 */ 1995 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1996 reqlist, links); 1997 break; 1998 } else { 1999 /* 2000 * If we still have anything on the queue after 2001 * removing the head entry, that is because we 2002 * met one of the criteria to create a new 2003 * request list (outlined above), and we'll call 2004 * that a forced dispatch for statistical purposes. 2005 * 2006 * Otherwise, if there is only one element on the 2007 * queue, we coalesced everything available on 2008 * the ring and we'll call that a normal dispatch. 2009 */ 2010 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2011 2012 if (reqlist != NULL) 2013 xbb->forced_dispatch++; 2014 else 2015 xbb->normal_dispatch++; 2016 2017 xbb->total_dispatch++; 2018 } 2019 } 2020 } 2021 2022 /** 2023 * Interrupt handler bound to the shared ring's event channel. 2024 * 2025 * \param arg Callback argument registerd during event channel 2026 * binding - the xbb_softc for this instance. 2027 */ 2028 static int 2029 xbb_filter(void *arg) 2030 { 2031 struct xbb_softc *xbb; 2032 2033 /* Defer to taskqueue thread. */ 2034 xbb = (struct xbb_softc *)arg; 2035 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2036 2037 return (FILTER_HANDLED); 2038 } 2039 2040 SDT_PROVIDER_DEFINE(xbb); 2041 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 2042 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 2043 "uint64_t"); 2044 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 2045 "uint64_t", "uint64_t"); 2046 2047 /*----------------------------- Backend Handlers -----------------------------*/ 2048 /** 2049 * Backend handler for character device access. 2050 * 2051 * \param xbb Per-instance xbb configuration structure. 2052 * \param reqlist Allocated internal request list structure. 2053 * \param operation BIO_* I/O operation code. 2054 * \param bio_flags Additional bio_flag data to pass to any generated 2055 * bios (e.g. BIO_ORDERED).. 2056 * 2057 * \return 0 for success, errno codes for failure. 2058 */ 2059 static int 2060 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2061 int operation, int bio_flags) 2062 { 2063 struct xbb_dev_data *dev_data; 2064 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2065 off_t bio_offset; 2066 struct bio *bio; 2067 struct xbb_sg *xbb_sg; 2068 u_int nbio; 2069 u_int bio_idx; 2070 u_int nseg; 2071 u_int seg_idx; 2072 int error; 2073 2074 dev_data = &xbb->backend.dev; 2075 bio_offset = (off_t)reqlist->starting_sector_number 2076 << xbb->sector_size_shift; 2077 error = 0; 2078 nbio = 0; 2079 bio_idx = 0; 2080 2081 if (operation == BIO_FLUSH) { 2082 bio = g_new_bio(); 2083 if (__predict_false(bio == NULL)) { 2084 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2085 error = ENOMEM; 2086 return (error); 2087 } 2088 2089 bio->bio_cmd = BIO_FLUSH; 2090 bio->bio_flags |= BIO_ORDERED; 2091 bio->bio_dev = dev_data->cdev; 2092 bio->bio_offset = 0; 2093 bio->bio_data = 0; 2094 bio->bio_done = xbb_bio_done; 2095 bio->bio_caller1 = reqlist; 2096 bio->bio_pblkno = 0; 2097 2098 reqlist->pendcnt = 1; 2099 2100 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2101 device_get_unit(xbb->dev)); 2102 2103 (*dev_data->csw->d_strategy)(bio); 2104 2105 return (0); 2106 } 2107 2108 xbb_sg = xbb->xbb_sgs; 2109 bio = NULL; 2110 nseg = reqlist->nr_segments; 2111 2112 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2113 /* 2114 * KVA will not be contiguous, so any additional 2115 * I/O will need to be represented in a new bio. 2116 */ 2117 if ((bio != NULL) 2118 && (xbb_sg->first_sect != 0)) { 2119 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2120 printf("%s: Discontiguous I/O request " 2121 "from domain %d ends on " 2122 "non-sector boundary\n", 2123 __func__, xbb->otherend_id); 2124 error = EINVAL; 2125 goto fail_free_bios; 2126 } 2127 bio = NULL; 2128 } 2129 2130 if (bio == NULL) { 2131 /* 2132 * Make sure that the start of this bio is 2133 * aligned to a device sector. 2134 */ 2135 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2136 printf("%s: Misaligned I/O request " 2137 "from domain %d\n", __func__, 2138 xbb->otherend_id); 2139 error = EINVAL; 2140 goto fail_free_bios; 2141 } 2142 2143 bio = bios[nbio++] = g_new_bio(); 2144 if (__predict_false(bio == NULL)) { 2145 error = ENOMEM; 2146 goto fail_free_bios; 2147 } 2148 bio->bio_cmd = operation; 2149 bio->bio_flags |= bio_flags; 2150 bio->bio_dev = dev_data->cdev; 2151 bio->bio_offset = bio_offset; 2152 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2153 xbb_sg->first_sect); 2154 bio->bio_done = xbb_bio_done; 2155 bio->bio_caller1 = reqlist; 2156 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2157 } 2158 2159 bio->bio_length += xbb_sg->nsect << 9; 2160 bio->bio_bcount = bio->bio_length; 2161 bio_offset += xbb_sg->nsect << 9; 2162 2163 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2164 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2165 printf("%s: Discontiguous I/O request " 2166 "from domain %d ends on " 2167 "non-sector boundary\n", 2168 __func__, xbb->otherend_id); 2169 error = EINVAL; 2170 goto fail_free_bios; 2171 } 2172 /* 2173 * KVA will not be contiguous, so any additional 2174 * I/O will need to be represented in a new bio. 2175 */ 2176 bio = NULL; 2177 } 2178 } 2179 2180 reqlist->pendcnt = nbio; 2181 2182 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2183 { 2184 #ifdef XBB_USE_BOUNCE_BUFFERS 2185 vm_offset_t kva_offset; 2186 2187 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2188 - (vm_offset_t)reqlist->bounce; 2189 if (operation == BIO_WRITE) { 2190 memcpy(bios[bio_idx]->bio_data, 2191 (uint8_t *)reqlist->kva + kva_offset, 2192 bios[bio_idx]->bio_bcount); 2193 } 2194 #endif 2195 if (operation == BIO_READ) { 2196 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2197 device_get_unit(xbb->dev), 2198 bios[bio_idx]->bio_offset, 2199 bios[bio_idx]->bio_length); 2200 } else if (operation == BIO_WRITE) { 2201 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2202 device_get_unit(xbb->dev), 2203 bios[bio_idx]->bio_offset, 2204 bios[bio_idx]->bio_length); 2205 } 2206 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2207 } 2208 2209 return (error); 2210 2211 fail_free_bios: 2212 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2213 g_destroy_bio(bios[bio_idx]); 2214 2215 return (error); 2216 } 2217 2218 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2219 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2220 "uint64_t"); 2221 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2222 "uint64_t", "uint64_t"); 2223 2224 /** 2225 * Backend handler for file access. 2226 * 2227 * \param xbb Per-instance xbb configuration structure. 2228 * \param reqlist Allocated internal request list. 2229 * \param operation BIO_* I/O operation code. 2230 * \param flags Additional bio_flag data to pass to any generated bios 2231 * (e.g. BIO_ORDERED).. 2232 * 2233 * \return 0 for success, errno codes for failure. 2234 */ 2235 static int 2236 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2237 int operation, int flags) 2238 { 2239 struct xbb_file_data *file_data; 2240 u_int seg_idx; 2241 u_int nseg; 2242 struct uio xuio; 2243 struct xbb_sg *xbb_sg; 2244 struct iovec *xiovec; 2245 #ifdef XBB_USE_BOUNCE_BUFFERS 2246 void **p_vaddr; 2247 int saved_uio_iovcnt; 2248 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2249 int error; 2250 2251 file_data = &xbb->backend.file; 2252 error = 0; 2253 bzero(&xuio, sizeof(xuio)); 2254 2255 switch (operation) { 2256 case BIO_READ: 2257 xuio.uio_rw = UIO_READ; 2258 break; 2259 case BIO_WRITE: 2260 xuio.uio_rw = UIO_WRITE; 2261 break; 2262 case BIO_FLUSH: { 2263 struct mount *mountpoint; 2264 2265 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2266 device_get_unit(xbb->dev)); 2267 2268 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2269 2270 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2271 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2272 VOP_UNLOCK(xbb->vn); 2273 2274 vn_finished_write(mountpoint); 2275 2276 goto bailout_send_response; 2277 /* NOTREACHED */ 2278 } 2279 default: 2280 panic("invalid operation %d", operation); 2281 /* NOTREACHED */ 2282 } 2283 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2284 << xbb->sector_size_shift; 2285 xuio.uio_segflg = UIO_SYSSPACE; 2286 xuio.uio_iov = file_data->xiovecs; 2287 xuio.uio_iovcnt = 0; 2288 xbb_sg = xbb->xbb_sgs; 2289 nseg = reqlist->nr_segments; 2290 2291 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2292 /* 2293 * If the first sector is not 0, the KVA will 2294 * not be contiguous and we'll need to go on 2295 * to another segment. 2296 */ 2297 if (xbb_sg->first_sect != 0) 2298 xiovec = NULL; 2299 2300 if (xiovec == NULL) { 2301 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2302 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2303 seg_idx, xbb_sg->first_sect); 2304 #ifdef XBB_USE_BOUNCE_BUFFERS 2305 /* 2306 * Store the address of the incoming 2307 * buffer at this particular offset 2308 * as well, so we can do the copy 2309 * later without having to do more 2310 * work to recalculate this address. 2311 */ 2312 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2313 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2314 xbb_sg->first_sect); 2315 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2316 xiovec->iov_len = 0; 2317 xuio.uio_iovcnt++; 2318 } 2319 2320 xiovec->iov_len += xbb_sg->nsect << 9; 2321 2322 xuio.uio_resid += xbb_sg->nsect << 9; 2323 2324 /* 2325 * If the last sector is not the full page 2326 * size count, the next segment will not be 2327 * contiguous in KVA and we need a new iovec. 2328 */ 2329 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2330 xiovec = NULL; 2331 } 2332 2333 xuio.uio_td = curthread; 2334 2335 #ifdef XBB_USE_BOUNCE_BUFFERS 2336 saved_uio_iovcnt = xuio.uio_iovcnt; 2337 2338 if (operation == BIO_WRITE) { 2339 /* Copy the write data to the local buffer. */ 2340 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2341 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2342 seg_idx++, xiovec++, p_vaddr++) { 2343 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2344 } 2345 } else { 2346 /* 2347 * We only need to save off the iovecs in the case of a 2348 * read, because the copy for the read happens after the 2349 * VOP_READ(). (The uio will get modified in that call 2350 * sequence.) 2351 */ 2352 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2353 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2354 } 2355 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2356 2357 switch (operation) { 2358 case BIO_READ: 2359 2360 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2361 device_get_unit(xbb->dev), xuio.uio_offset, 2362 xuio.uio_resid); 2363 2364 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2365 2366 /* 2367 * UFS pays attention to IO_DIRECT for reads. If the 2368 * DIRECTIO option is configured into the kernel, it calls 2369 * ffs_rawread(). But that only works for single-segment 2370 * uios with user space addresses. In our case, with a 2371 * kernel uio, it still reads into the buffer cache, but it 2372 * will just try to release the buffer from the cache later 2373 * on in ffs_read(). 2374 * 2375 * ZFS does not pay attention to IO_DIRECT for reads. 2376 * 2377 * UFS does not pay attention to IO_SYNC for reads. 2378 * 2379 * ZFS pays attention to IO_SYNC (which translates into the 2380 * Solaris define FRSYNC for zfs_read()) for reads. It 2381 * attempts to sync the file before reading. 2382 * 2383 * So, to attempt to provide some barrier semantics in the 2384 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2385 */ 2386 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2387 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2388 2389 VOP_UNLOCK(xbb->vn); 2390 break; 2391 case BIO_WRITE: { 2392 struct mount *mountpoint; 2393 2394 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2395 device_get_unit(xbb->dev), xuio.uio_offset, 2396 xuio.uio_resid); 2397 2398 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2399 2400 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2401 2402 /* 2403 * UFS pays attention to IO_DIRECT for writes. The write 2404 * is done asynchronously. (Normally the write would just 2405 * get put into cache. 2406 * 2407 * UFS pays attention to IO_SYNC for writes. It will 2408 * attempt to write the buffer out synchronously if that 2409 * flag is set. 2410 * 2411 * ZFS does not pay attention to IO_DIRECT for writes. 2412 * 2413 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2414 * for writes. It will flush the transaction from the 2415 * cache before returning. 2416 * 2417 * So if we've got the BIO_ORDERED flag set, we want 2418 * IO_SYNC in either the UFS or ZFS case. 2419 */ 2420 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2421 IO_SYNC : 0, file_data->cred); 2422 VOP_UNLOCK(xbb->vn); 2423 2424 vn_finished_write(mountpoint); 2425 2426 break; 2427 } 2428 default: 2429 panic("invalid operation %d", operation); 2430 /* NOTREACHED */ 2431 } 2432 2433 #ifdef XBB_USE_BOUNCE_BUFFERS 2434 /* We only need to copy here for read operations */ 2435 if (operation == BIO_READ) { 2436 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2437 xiovec = file_data->saved_xiovecs; 2438 seg_idx < saved_uio_iovcnt; seg_idx++, 2439 xiovec++, p_vaddr++) { 2440 /* 2441 * Note that we have to use the copy of the 2442 * io vector we made above. uiomove() modifies 2443 * the uio and its referenced vector as uiomove 2444 * performs the copy, so we can't rely on any 2445 * state from the original uio. 2446 */ 2447 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2448 } 2449 } 2450 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2451 2452 bailout_send_response: 2453 2454 if (error != 0) 2455 reqlist->status = BLKIF_RSP_ERROR; 2456 2457 xbb_complete_reqlist(xbb, reqlist); 2458 2459 return (0); 2460 } 2461 2462 /*--------------------------- Backend Configuration --------------------------*/ 2463 /** 2464 * Close and cleanup any backend device/file specific state for this 2465 * block back instance. 2466 * 2467 * \param xbb Per-instance xbb configuration structure. 2468 */ 2469 static void 2470 xbb_close_backend(struct xbb_softc *xbb) 2471 { 2472 DROP_GIANT(); 2473 DPRINTF("closing dev=%s\n", xbb->dev_name); 2474 if (xbb->vn) { 2475 int flags = FREAD; 2476 2477 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2478 flags |= FWRITE; 2479 2480 switch (xbb->device_type) { 2481 case XBB_TYPE_DISK: 2482 if (xbb->backend.dev.csw) { 2483 dev_relthread(xbb->backend.dev.cdev, 2484 xbb->backend.dev.dev_ref); 2485 xbb->backend.dev.csw = NULL; 2486 xbb->backend.dev.cdev = NULL; 2487 } 2488 break; 2489 case XBB_TYPE_FILE: 2490 break; 2491 case XBB_TYPE_NONE: 2492 default: 2493 panic("Unexpected backend type."); 2494 break; 2495 } 2496 2497 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2498 xbb->vn = NULL; 2499 2500 switch (xbb->device_type) { 2501 case XBB_TYPE_DISK: 2502 break; 2503 case XBB_TYPE_FILE: 2504 if (xbb->backend.file.cred != NULL) { 2505 crfree(xbb->backend.file.cred); 2506 xbb->backend.file.cred = NULL; 2507 } 2508 break; 2509 case XBB_TYPE_NONE: 2510 default: 2511 panic("Unexpected backend type."); 2512 break; 2513 } 2514 } 2515 PICKUP_GIANT(); 2516 } 2517 2518 /** 2519 * Open a character device to be used for backend I/O. 2520 * 2521 * \param xbb Per-instance xbb configuration structure. 2522 * 2523 * \return 0 for success, errno codes for failure. 2524 */ 2525 static int 2526 xbb_open_dev(struct xbb_softc *xbb) 2527 { 2528 struct vattr vattr; 2529 struct cdev *dev; 2530 struct cdevsw *devsw; 2531 int error; 2532 2533 xbb->device_type = XBB_TYPE_DISK; 2534 xbb->dispatch_io = xbb_dispatch_dev; 2535 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2536 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2537 &xbb->backend.dev.dev_ref); 2538 if (xbb->backend.dev.csw == NULL) 2539 panic("Unable to retrieve device switch"); 2540 2541 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2542 if (error) { 2543 xenbus_dev_fatal(xbb->dev, error, "error getting " 2544 "vnode attributes for device %s", 2545 xbb->dev_name); 2546 return (error); 2547 } 2548 2549 dev = xbb->vn->v_rdev; 2550 devsw = dev->si_devsw; 2551 if (!devsw->d_ioctl) { 2552 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2553 "device %s!", xbb->dev_name); 2554 return (ENODEV); 2555 } 2556 2557 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2558 (caddr_t)&xbb->sector_size, FREAD, 2559 curthread); 2560 if (error) { 2561 xenbus_dev_fatal(xbb->dev, error, 2562 "error calling ioctl DIOCGSECTORSIZE " 2563 "for device %s", xbb->dev_name); 2564 return (error); 2565 } 2566 2567 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2568 (caddr_t)&xbb->media_size, FREAD, 2569 curthread); 2570 if (error) { 2571 xenbus_dev_fatal(xbb->dev, error, 2572 "error calling ioctl DIOCGMEDIASIZE " 2573 "for device %s", xbb->dev_name); 2574 return (error); 2575 } 2576 2577 return (0); 2578 } 2579 2580 /** 2581 * Open a file to be used for backend I/O. 2582 * 2583 * \param xbb Per-instance xbb configuration structure. 2584 * 2585 * \return 0 for success, errno codes for failure. 2586 */ 2587 static int 2588 xbb_open_file(struct xbb_softc *xbb) 2589 { 2590 struct xbb_file_data *file_data; 2591 struct vattr vattr; 2592 int error; 2593 2594 file_data = &xbb->backend.file; 2595 xbb->device_type = XBB_TYPE_FILE; 2596 xbb->dispatch_io = xbb_dispatch_file; 2597 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2598 if (error != 0) { 2599 xenbus_dev_fatal(xbb->dev, error, 2600 "error calling VOP_GETATTR()" 2601 "for file %s", xbb->dev_name); 2602 return (error); 2603 } 2604 2605 /* 2606 * Verify that we have the ability to upgrade to exclusive 2607 * access on this file so we can trap errors at open instead 2608 * of reporting them during first access. 2609 */ 2610 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2611 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2612 if (VN_IS_DOOMED(xbb->vn)) { 2613 error = EBADF; 2614 xenbus_dev_fatal(xbb->dev, error, 2615 "error locking file %s", 2616 xbb->dev_name); 2617 2618 return (error); 2619 } 2620 } 2621 2622 file_data->cred = crhold(curthread->td_ucred); 2623 xbb->media_size = vattr.va_size; 2624 2625 /* 2626 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2627 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2628 * with disklabel and UFS on FreeBSD at least. Large block sizes 2629 * may not work with other OSes as well. So just export a sector 2630 * size of 512 bytes, which should work with any OS or 2631 * application. Since our backing is a file, any block size will 2632 * work fine for the backing store. 2633 */ 2634 #if 0 2635 xbb->sector_size = vattr.va_blocksize; 2636 #endif 2637 xbb->sector_size = 512; 2638 2639 /* 2640 * Sanity check. The media size has to be at least one 2641 * sector long. 2642 */ 2643 if (xbb->media_size < xbb->sector_size) { 2644 error = EINVAL; 2645 xenbus_dev_fatal(xbb->dev, error, 2646 "file %s size %ju < block size %u", 2647 xbb->dev_name, 2648 (uintmax_t)xbb->media_size, 2649 xbb->sector_size); 2650 } 2651 return (error); 2652 } 2653 2654 /** 2655 * Open the backend provider for this connection. 2656 * 2657 * \param xbb Per-instance xbb configuration structure. 2658 * 2659 * \return 0 for success, errno codes for failure. 2660 */ 2661 static int 2662 xbb_open_backend(struct xbb_softc *xbb) 2663 { 2664 struct nameidata nd; 2665 int flags; 2666 int error; 2667 2668 flags = FREAD; 2669 error = 0; 2670 2671 DPRINTF("opening dev=%s\n", xbb->dev_name); 2672 2673 if (rootvnode == NULL) { 2674 xenbus_dev_fatal(xbb->dev, ENOENT, 2675 "Root file system not mounted"); 2676 return (ENOENT); 2677 } 2678 2679 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2680 flags |= FWRITE; 2681 2682 pwd_ensure_dirs(); 2683 2684 again: 2685 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2686 error = vn_open(&nd, &flags, 0, NULL); 2687 if (error) { 2688 /* 2689 * This is the only reasonable guess we can make as far as 2690 * path if the user doesn't give us a fully qualified path. 2691 * If they want to specify a file, they need to specify the 2692 * full path. 2693 */ 2694 if (xbb->dev_name[0] != '/') { 2695 char *dev_path = "/dev/"; 2696 char *dev_name; 2697 2698 /* Try adding device path at beginning of name */ 2699 dev_name = malloc(strlen(xbb->dev_name) 2700 + strlen(dev_path) + 1, 2701 M_XENBLOCKBACK, M_NOWAIT); 2702 if (dev_name) { 2703 sprintf(dev_name, "%s%s", dev_path, 2704 xbb->dev_name); 2705 free(xbb->dev_name, M_XENBLOCKBACK); 2706 xbb->dev_name = dev_name; 2707 goto again; 2708 } 2709 } 2710 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2711 xbb->dev_name); 2712 return (error); 2713 } 2714 2715 NDFREE(&nd, NDF_ONLY_PNBUF); 2716 2717 xbb->vn = nd.ni_vp; 2718 2719 /* We only support disks and files. */ 2720 if (vn_isdisk_error(xbb->vn, &error)) { 2721 error = xbb_open_dev(xbb); 2722 } else if (xbb->vn->v_type == VREG) { 2723 error = xbb_open_file(xbb); 2724 } else { 2725 error = EINVAL; 2726 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2727 "or file", xbb->dev_name); 2728 } 2729 VOP_UNLOCK(xbb->vn); 2730 2731 if (error != 0) { 2732 xbb_close_backend(xbb); 2733 return (error); 2734 } 2735 2736 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2737 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2738 2739 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2740 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2741 xbb->dev_name, xbb->sector_size, xbb->media_size); 2742 2743 return (0); 2744 } 2745 2746 /*------------------------ Inter-Domain Communication ------------------------*/ 2747 /** 2748 * Free dynamically allocated KVA or pseudo-physical address allocations. 2749 * 2750 * \param xbb Per-instance xbb configuration structure. 2751 */ 2752 static void 2753 xbb_free_communication_mem(struct xbb_softc *xbb) 2754 { 2755 if (xbb->kva != 0) { 2756 if (xbb->pseudo_phys_res != NULL) { 2757 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2758 xbb->pseudo_phys_res); 2759 xbb->pseudo_phys_res = NULL; 2760 } 2761 } 2762 xbb->kva = 0; 2763 xbb->gnt_base_addr = 0; 2764 if (xbb->kva_free != NULL) { 2765 free(xbb->kva_free, M_XENBLOCKBACK); 2766 xbb->kva_free = NULL; 2767 } 2768 } 2769 2770 /** 2771 * Cleanup all inter-domain communication mechanisms. 2772 * 2773 * \param xbb Per-instance xbb configuration structure. 2774 */ 2775 static int 2776 xbb_disconnect(struct xbb_softc *xbb) 2777 { 2778 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2779 struct gnttab_unmap_grant_ref *op; 2780 u_int ring_idx; 2781 int error; 2782 2783 DPRINTF("\n"); 2784 2785 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2786 return (0); 2787 2788 mtx_unlock(&xbb->lock); 2789 xen_intr_unbind(&xbb->xen_intr_handle); 2790 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2791 mtx_lock(&xbb->lock); 2792 2793 /* 2794 * No new interrupts can generate work, but we must wait 2795 * for all currently active requests to drain. 2796 */ 2797 if (xbb->active_request_count != 0) 2798 return (EAGAIN); 2799 2800 for (ring_idx = 0, op = ops; 2801 ring_idx < xbb->ring_config.ring_pages; 2802 ring_idx++, op++) { 2803 op->host_addr = xbb->ring_config.gnt_addr 2804 + (ring_idx * PAGE_SIZE); 2805 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2806 op->handle = xbb->ring_config.handle[ring_idx]; 2807 } 2808 2809 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2810 xbb->ring_config.ring_pages); 2811 if (error != 0) 2812 panic("Grant table op failed (%d)", error); 2813 2814 xbb_free_communication_mem(xbb); 2815 2816 if (xbb->requests != NULL) { 2817 free(xbb->requests, M_XENBLOCKBACK); 2818 xbb->requests = NULL; 2819 } 2820 2821 if (xbb->request_lists != NULL) { 2822 struct xbb_xen_reqlist *reqlist; 2823 int i; 2824 2825 /* There is one request list for ever allocated request. */ 2826 for (i = 0, reqlist = xbb->request_lists; 2827 i < xbb->max_requests; i++, reqlist++){ 2828 #ifdef XBB_USE_BOUNCE_BUFFERS 2829 if (reqlist->bounce != NULL) { 2830 free(reqlist->bounce, M_XENBLOCKBACK); 2831 reqlist->bounce = NULL; 2832 } 2833 #endif 2834 if (reqlist->gnt_handles != NULL) { 2835 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2836 reqlist->gnt_handles = NULL; 2837 } 2838 } 2839 free(xbb->request_lists, M_XENBLOCKBACK); 2840 xbb->request_lists = NULL; 2841 } 2842 2843 xbb->flags &= ~XBBF_RING_CONNECTED; 2844 return (0); 2845 } 2846 2847 /** 2848 * Map shared memory ring into domain local address space, initialize 2849 * ring control structures, and bind an interrupt to the event channel 2850 * used to notify us of ring changes. 2851 * 2852 * \param xbb Per-instance xbb configuration structure. 2853 */ 2854 static int 2855 xbb_connect_ring(struct xbb_softc *xbb) 2856 { 2857 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2858 struct gnttab_map_grant_ref *gnt; 2859 u_int ring_idx; 2860 int error; 2861 2862 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2863 return (0); 2864 2865 /* 2866 * Kva for our ring is at the tail of the region of kva allocated 2867 * by xbb_alloc_communication_mem(). 2868 */ 2869 xbb->ring_config.va = xbb->kva 2870 + (xbb->kva_size 2871 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2872 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2873 + (xbb->kva_size 2874 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2875 2876 for (ring_idx = 0, gnt = gnts; 2877 ring_idx < xbb->ring_config.ring_pages; 2878 ring_idx++, gnt++) { 2879 gnt->host_addr = xbb->ring_config.gnt_addr 2880 + (ring_idx * PAGE_SIZE); 2881 gnt->flags = GNTMAP_host_map; 2882 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2883 gnt->dom = xbb->otherend_id; 2884 } 2885 2886 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2887 xbb->ring_config.ring_pages); 2888 if (error) 2889 panic("blkback: Ring page grant table op failed (%d)", error); 2890 2891 for (ring_idx = 0, gnt = gnts; 2892 ring_idx < xbb->ring_config.ring_pages; 2893 ring_idx++, gnt++) { 2894 if (gnt->status != 0) { 2895 xbb->ring_config.va = 0; 2896 xenbus_dev_fatal(xbb->dev, EACCES, 2897 "Ring shared page mapping failed. " 2898 "Status %d.", gnt->status); 2899 return (EACCES); 2900 } 2901 xbb->ring_config.handle[ring_idx] = gnt->handle; 2902 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2903 } 2904 2905 /* Initialize the ring based on ABI. */ 2906 switch (xbb->abi) { 2907 case BLKIF_PROTOCOL_NATIVE: 2908 { 2909 blkif_sring_t *sring; 2910 sring = (blkif_sring_t *)xbb->ring_config.va; 2911 BACK_RING_INIT(&xbb->rings.native, sring, 2912 xbb->ring_config.ring_pages * PAGE_SIZE); 2913 break; 2914 } 2915 case BLKIF_PROTOCOL_X86_32: 2916 { 2917 blkif_x86_32_sring_t *sring_x86_32; 2918 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2919 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2920 xbb->ring_config.ring_pages * PAGE_SIZE); 2921 break; 2922 } 2923 case BLKIF_PROTOCOL_X86_64: 2924 { 2925 blkif_x86_64_sring_t *sring_x86_64; 2926 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2927 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2928 xbb->ring_config.ring_pages * PAGE_SIZE); 2929 break; 2930 } 2931 default: 2932 panic("Unexpected blkif protocol ABI."); 2933 } 2934 2935 xbb->flags |= XBBF_RING_CONNECTED; 2936 2937 error = xen_intr_bind_remote_port(xbb->dev, 2938 xbb->otherend_id, 2939 xbb->ring_config.evtchn, 2940 xbb_filter, 2941 /*ithread_handler*/NULL, 2942 /*arg*/xbb, 2943 INTR_TYPE_BIO | INTR_MPSAFE, 2944 &xbb->xen_intr_handle); 2945 if (error) { 2946 (void)xbb_disconnect(xbb); 2947 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2948 return (error); 2949 } 2950 2951 DPRINTF("rings connected!\n"); 2952 2953 return 0; 2954 } 2955 2956 /** 2957 * Size KVA and pseudo-physical address allocations based on negotiated 2958 * values for the size and number of I/O requests, and the size of our 2959 * communication ring. 2960 * 2961 * \param xbb Per-instance xbb configuration structure. 2962 * 2963 * These address spaces are used to dynamically map pages in the 2964 * front-end's domain into our own. 2965 */ 2966 static int 2967 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2968 { 2969 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2970 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2971 xbb->kva_size = xbb->reqlist_kva_size + 2972 (xbb->ring_config.ring_pages * PAGE_SIZE); 2973 2974 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2975 if (xbb->kva_free == NULL) 2976 return (ENOMEM); 2977 2978 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2979 device_get_nameunit(xbb->dev), xbb->kva_size, 2980 xbb->reqlist_kva_size); 2981 /* 2982 * Reserve a range of pseudo physical memory that we can map 2983 * into kva. These pages will only be backed by machine 2984 * pages ("real memory") during the lifetime of front-end requests 2985 * via grant table operations. 2986 */ 2987 xbb->pseudo_phys_res_id = 0; 2988 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 2989 xbb->kva_size); 2990 if (xbb->pseudo_phys_res == NULL) { 2991 xbb->kva = 0; 2992 return (ENOMEM); 2993 } 2994 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2995 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2996 2997 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 2998 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 2999 (uintmax_t)xbb->gnt_base_addr); 3000 return (0); 3001 } 3002 3003 /** 3004 * Collect front-end information from the XenStore. 3005 * 3006 * \param xbb Per-instance xbb configuration structure. 3007 */ 3008 static int 3009 xbb_collect_frontend_info(struct xbb_softc *xbb) 3010 { 3011 char protocol_abi[64]; 3012 const char *otherend_path; 3013 int error; 3014 u_int ring_idx; 3015 u_int ring_page_order; 3016 size_t ring_size; 3017 3018 otherend_path = xenbus_get_otherend_path(xbb->dev); 3019 3020 /* 3021 * Protocol defaults valid even if all negotiation fails. 3022 */ 3023 xbb->ring_config.ring_pages = 1; 3024 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 3025 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3026 3027 /* 3028 * Mandatory data (used in all versions of the protocol) first. 3029 */ 3030 error = xs_scanf(XST_NIL, otherend_path, 3031 "event-channel", NULL, "%" PRIu32, 3032 &xbb->ring_config.evtchn); 3033 if (error != 0) { 3034 xenbus_dev_fatal(xbb->dev, error, 3035 "Unable to retrieve event-channel information " 3036 "from frontend %s. Unable to connect.", 3037 xenbus_get_otherend_path(xbb->dev)); 3038 return (error); 3039 } 3040 3041 /* 3042 * These fields are initialized to legacy protocol defaults 3043 * so we only need to fail if reading the updated value succeeds 3044 * and the new value is outside of its allowed range. 3045 * 3046 * \note xs_gather() returns on the first encountered error, so 3047 * we must use independent calls in order to guarantee 3048 * we don't miss information in a sparsly populated front-end 3049 * tree. 3050 * 3051 * \note xs_scanf() does not update variables for unmatched 3052 * fields. 3053 */ 3054 ring_page_order = 0; 3055 xbb->max_requests = 32; 3056 3057 (void)xs_scanf(XST_NIL, otherend_path, 3058 "ring-page-order", NULL, "%u", 3059 &ring_page_order); 3060 xbb->ring_config.ring_pages = 1 << ring_page_order; 3061 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3062 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3063 3064 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3065 xenbus_dev_fatal(xbb->dev, EINVAL, 3066 "Front-end specified ring-pages of %u " 3067 "exceeds backend limit of %u. " 3068 "Unable to connect.", 3069 xbb->ring_config.ring_pages, 3070 XBB_MAX_RING_PAGES); 3071 return (EINVAL); 3072 } 3073 3074 if (xbb->ring_config.ring_pages == 1) { 3075 error = xs_gather(XST_NIL, otherend_path, 3076 "ring-ref", "%" PRIu32, 3077 &xbb->ring_config.ring_ref[0], 3078 NULL); 3079 if (error != 0) { 3080 xenbus_dev_fatal(xbb->dev, error, 3081 "Unable to retrieve ring information " 3082 "from frontend %s. Unable to " 3083 "connect.", 3084 xenbus_get_otherend_path(xbb->dev)); 3085 return (error); 3086 } 3087 } else { 3088 /* Multi-page ring format. */ 3089 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3090 ring_idx++) { 3091 char ring_ref_name[]= "ring_refXX"; 3092 3093 snprintf(ring_ref_name, sizeof(ring_ref_name), 3094 "ring-ref%u", ring_idx); 3095 error = xs_scanf(XST_NIL, otherend_path, 3096 ring_ref_name, NULL, "%" PRIu32, 3097 &xbb->ring_config.ring_ref[ring_idx]); 3098 if (error != 0) { 3099 xenbus_dev_fatal(xbb->dev, error, 3100 "Failed to retriev grant " 3101 "reference for page %u of " 3102 "shared ring. Unable " 3103 "to connect.", ring_idx); 3104 return (error); 3105 } 3106 } 3107 } 3108 3109 error = xs_gather(XST_NIL, otherend_path, 3110 "protocol", "%63s", protocol_abi, 3111 NULL); 3112 if (error != 0 3113 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3114 /* 3115 * Assume native if the frontend has not 3116 * published ABI data or it has published and 3117 * matches our own ABI. 3118 */ 3119 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3120 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3121 xbb->abi = BLKIF_PROTOCOL_X86_32; 3122 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3123 xbb->abi = BLKIF_PROTOCOL_X86_64; 3124 } else { 3125 xenbus_dev_fatal(xbb->dev, EINVAL, 3126 "Unknown protocol ABI (%s) published by " 3127 "frontend. Unable to connect.", protocol_abi); 3128 return (EINVAL); 3129 } 3130 return (0); 3131 } 3132 3133 /** 3134 * Allocate per-request data structures given request size and number 3135 * information negotiated with the front-end. 3136 * 3137 * \param xbb Per-instance xbb configuration structure. 3138 */ 3139 static int 3140 xbb_alloc_requests(struct xbb_softc *xbb) 3141 { 3142 struct xbb_xen_req *req; 3143 struct xbb_xen_req *last_req; 3144 3145 /* 3146 * Allocate request book keeping datastructures. 3147 */ 3148 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3149 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3150 if (xbb->requests == NULL) { 3151 xenbus_dev_fatal(xbb->dev, ENOMEM, 3152 "Unable to allocate request structures"); 3153 return (ENOMEM); 3154 } 3155 3156 req = xbb->requests; 3157 last_req = &xbb->requests[xbb->max_requests - 1]; 3158 STAILQ_INIT(&xbb->request_free_stailq); 3159 while (req <= last_req) { 3160 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3161 req++; 3162 } 3163 return (0); 3164 } 3165 3166 static int 3167 xbb_alloc_request_lists(struct xbb_softc *xbb) 3168 { 3169 struct xbb_xen_reqlist *reqlist; 3170 int i; 3171 3172 /* 3173 * If no requests can be merged, we need 1 request list per 3174 * in flight request. 3175 */ 3176 xbb->request_lists = malloc(xbb->max_requests * 3177 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3178 if (xbb->request_lists == NULL) { 3179 xenbus_dev_fatal(xbb->dev, ENOMEM, 3180 "Unable to allocate request list structures"); 3181 return (ENOMEM); 3182 } 3183 3184 STAILQ_INIT(&xbb->reqlist_free_stailq); 3185 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3186 for (i = 0; i < xbb->max_requests; i++) { 3187 int seg; 3188 3189 reqlist = &xbb->request_lists[i]; 3190 3191 reqlist->xbb = xbb; 3192 3193 #ifdef XBB_USE_BOUNCE_BUFFERS 3194 reqlist->bounce = malloc(xbb->max_reqlist_size, 3195 M_XENBLOCKBACK, M_NOWAIT); 3196 if (reqlist->bounce == NULL) { 3197 xenbus_dev_fatal(xbb->dev, ENOMEM, 3198 "Unable to allocate request " 3199 "bounce buffers"); 3200 return (ENOMEM); 3201 } 3202 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3203 3204 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3205 sizeof(*reqlist->gnt_handles), 3206 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3207 if (reqlist->gnt_handles == NULL) { 3208 xenbus_dev_fatal(xbb->dev, ENOMEM, 3209 "Unable to allocate request " 3210 "grant references"); 3211 return (ENOMEM); 3212 } 3213 3214 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3215 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3216 3217 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3218 } 3219 return (0); 3220 } 3221 3222 /** 3223 * Supply information about the physical device to the frontend 3224 * via XenBus. 3225 * 3226 * \param xbb Per-instance xbb configuration structure. 3227 */ 3228 static int 3229 xbb_publish_backend_info(struct xbb_softc *xbb) 3230 { 3231 struct xs_transaction xst; 3232 const char *our_path; 3233 const char *leaf; 3234 int error; 3235 3236 our_path = xenbus_get_node(xbb->dev); 3237 while (1) { 3238 error = xs_transaction_start(&xst); 3239 if (error != 0) { 3240 xenbus_dev_fatal(xbb->dev, error, 3241 "Error publishing backend info " 3242 "(start transaction)"); 3243 return (error); 3244 } 3245 3246 leaf = "sectors"; 3247 error = xs_printf(xst, our_path, leaf, 3248 "%"PRIu64, xbb->media_num_sectors); 3249 if (error != 0) 3250 break; 3251 3252 /* XXX Support all VBD attributes here. */ 3253 leaf = "info"; 3254 error = xs_printf(xst, our_path, leaf, "%u", 3255 xbb->flags & XBBF_READ_ONLY 3256 ? VDISK_READONLY : 0); 3257 if (error != 0) 3258 break; 3259 3260 leaf = "sector-size"; 3261 error = xs_printf(xst, our_path, leaf, "%u", 3262 xbb->sector_size); 3263 if (error != 0) 3264 break; 3265 3266 error = xs_transaction_end(xst, 0); 3267 if (error == 0) { 3268 return (0); 3269 } else if (error != EAGAIN) { 3270 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3271 return (error); 3272 } 3273 } 3274 3275 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3276 our_path, leaf); 3277 xs_transaction_end(xst, 1); 3278 return (error); 3279 } 3280 3281 /** 3282 * Connect to our blkfront peer now that it has completed publishing 3283 * its configuration into the XenStore. 3284 * 3285 * \param xbb Per-instance xbb configuration structure. 3286 */ 3287 static void 3288 xbb_connect(struct xbb_softc *xbb) 3289 { 3290 int error; 3291 3292 if (!xbb->hotplug_done || 3293 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3294 (xbb_collect_frontend_info(xbb) != 0)) 3295 return; 3296 3297 xbb->flags &= ~XBBF_SHUTDOWN; 3298 3299 /* 3300 * We limit the maximum number of reqlist segments to the maximum 3301 * number of segments in the ring, or our absolute maximum, 3302 * whichever is smaller. 3303 */ 3304 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3305 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3306 3307 /* 3308 * The maximum size is simply a function of the number of segments 3309 * we can handle. 3310 */ 3311 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3312 3313 /* Allocate resources whose size depends on front-end configuration. */ 3314 error = xbb_alloc_communication_mem(xbb); 3315 if (error != 0) { 3316 xenbus_dev_fatal(xbb->dev, error, 3317 "Unable to allocate communication memory"); 3318 return; 3319 } 3320 3321 error = xbb_alloc_requests(xbb); 3322 if (error != 0) { 3323 /* Specific errors are reported by xbb_alloc_requests(). */ 3324 return; 3325 } 3326 3327 error = xbb_alloc_request_lists(xbb); 3328 if (error != 0) { 3329 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3330 return; 3331 } 3332 3333 /* 3334 * Connect communication channel. 3335 */ 3336 error = xbb_connect_ring(xbb); 3337 if (error != 0) { 3338 /* Specific errors are reported by xbb_connect_ring(). */ 3339 return; 3340 } 3341 3342 if (xbb_publish_backend_info(xbb) != 0) { 3343 /* 3344 * If we can't publish our data, we cannot participate 3345 * in this connection, and waiting for a front-end state 3346 * change will not help the situation. 3347 */ 3348 (void)xbb_disconnect(xbb); 3349 return; 3350 } 3351 3352 /* Ready for I/O. */ 3353 xenbus_set_state(xbb->dev, XenbusStateConnected); 3354 } 3355 3356 /*-------------------------- Device Teardown Support -------------------------*/ 3357 /** 3358 * Perform device shutdown functions. 3359 * 3360 * \param xbb Per-instance xbb configuration structure. 3361 * 3362 * Mark this instance as shutting down, wait for any active I/O on the 3363 * backend device/file to drain, disconnect from the front-end, and notify 3364 * any waiters (e.g. a thread invoking our detach method) that detach can 3365 * now proceed. 3366 */ 3367 static int 3368 xbb_shutdown(struct xbb_softc *xbb) 3369 { 3370 XenbusState frontState; 3371 int error; 3372 3373 DPRINTF("\n"); 3374 3375 /* 3376 * Due to the need to drop our mutex during some 3377 * xenbus operations, it is possible for two threads 3378 * to attempt to close out shutdown processing at 3379 * the same time. Tell the caller that hits this 3380 * race to try back later. 3381 */ 3382 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3383 return (EAGAIN); 3384 3385 xbb->flags |= XBBF_IN_SHUTDOWN; 3386 mtx_unlock(&xbb->lock); 3387 3388 if (xbb->hotplug_watch.node != NULL) { 3389 xs_unregister_watch(&xbb->hotplug_watch); 3390 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3391 xbb->hotplug_watch.node = NULL; 3392 } 3393 xbb->hotplug_done = false; 3394 3395 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3396 xenbus_set_state(xbb->dev, XenbusStateClosing); 3397 3398 frontState = xenbus_get_otherend_state(xbb->dev); 3399 mtx_lock(&xbb->lock); 3400 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3401 3402 /* Wait for the frontend to disconnect (if it's connected). */ 3403 if (frontState == XenbusStateConnected) 3404 return (EAGAIN); 3405 3406 DPRINTF("\n"); 3407 3408 /* Indicate shutdown is in progress. */ 3409 xbb->flags |= XBBF_SHUTDOWN; 3410 3411 /* Disconnect from the front-end. */ 3412 error = xbb_disconnect(xbb); 3413 if (error != 0) { 3414 /* 3415 * Requests still outstanding. We'll be called again 3416 * once they complete. 3417 */ 3418 KASSERT(error == EAGAIN, 3419 ("%s: Unexpected xbb_disconnect() failure %d", 3420 __func__, error)); 3421 3422 return (error); 3423 } 3424 3425 DPRINTF("\n"); 3426 3427 /* Indicate to xbb_detach() that is it safe to proceed. */ 3428 wakeup(xbb); 3429 3430 return (0); 3431 } 3432 3433 /** 3434 * Report an attach time error to the console and Xen, and cleanup 3435 * this instance by forcing immediate detach processing. 3436 * 3437 * \param xbb Per-instance xbb configuration structure. 3438 * \param err Errno describing the error. 3439 * \param fmt Printf style format and arguments 3440 */ 3441 static void 3442 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3443 { 3444 va_list ap; 3445 va_list ap_hotplug; 3446 3447 va_start(ap, fmt); 3448 va_copy(ap_hotplug, ap); 3449 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3450 "hotplug-error", fmt, ap_hotplug); 3451 va_end(ap_hotplug); 3452 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3453 "hotplug-status", "error"); 3454 3455 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3456 va_end(ap); 3457 3458 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3459 "online", "0"); 3460 mtx_lock(&xbb->lock); 3461 xbb_shutdown(xbb); 3462 mtx_unlock(&xbb->lock); 3463 } 3464 3465 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3466 /** 3467 * Inspect a XenBus device and claim it if is of the appropriate type. 3468 * 3469 * \param dev NewBus device object representing a candidate XenBus device. 3470 * 3471 * \return 0 for success, errno codes for failure. 3472 */ 3473 static int 3474 xbb_probe(device_t dev) 3475 { 3476 3477 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3478 device_set_desc(dev, "Backend Virtual Block Device"); 3479 device_quiet(dev); 3480 return (0); 3481 } 3482 3483 return (ENXIO); 3484 } 3485 3486 /** 3487 * Setup sysctl variables to control various Block Back parameters. 3488 * 3489 * \param xbb Xen Block Back softc. 3490 * 3491 */ 3492 static void 3493 xbb_setup_sysctl(struct xbb_softc *xbb) 3494 { 3495 struct sysctl_ctx_list *sysctl_ctx = NULL; 3496 struct sysctl_oid *sysctl_tree = NULL; 3497 3498 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3499 if (sysctl_ctx == NULL) 3500 return; 3501 3502 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3503 if (sysctl_tree == NULL) 3504 return; 3505 3506 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3507 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3508 "fake the flush command"); 3509 3510 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3511 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3512 "send a real flush for N flush requests"); 3513 3514 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3515 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3516 "Don't coalesce contiguous requests"); 3517 3518 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3519 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3520 "how many I/O requests we have received"); 3521 3522 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3523 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3524 "how many I/O requests have been completed"); 3525 3526 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3527 "reqs_queued_for_completion", CTLFLAG_RW, 3528 &xbb->reqs_queued_for_completion, 3529 "how many I/O requests queued but not yet pushed"); 3530 3531 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3532 "reqs_completed_with_error", CTLFLAG_RW, 3533 &xbb->reqs_completed_with_error, 3534 "how many I/O requests completed with error status"); 3535 3536 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3537 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3538 "how many I/O dispatches were forced"); 3539 3540 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3541 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3542 "how many I/O dispatches were normal"); 3543 3544 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3545 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3546 "total number of I/O dispatches"); 3547 3548 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3549 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3550 "how many times we have run out of KVA"); 3551 3552 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3553 "request_shortages", CTLFLAG_RW, 3554 &xbb->request_shortages, 3555 "how many times we have run out of requests"); 3556 3557 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3558 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3559 "maximum outstanding requests (negotiated)"); 3560 3561 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3562 "max_request_segments", CTLFLAG_RD, 3563 &xbb->max_request_segments, 0, 3564 "maximum number of pages per requests (negotiated)"); 3565 3566 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3567 "max_request_size", CTLFLAG_RD, 3568 &xbb->max_request_size, 0, 3569 "maximum size in bytes of a request (negotiated)"); 3570 3571 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3572 "ring_pages", CTLFLAG_RD, 3573 &xbb->ring_config.ring_pages, 0, 3574 "communication channel pages (negotiated)"); 3575 } 3576 3577 static void 3578 xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len) 3579 { 3580 device_t dev; 3581 struct xbb_softc *xbb; 3582 int error; 3583 3584 dev = (device_t) watch->callback_data; 3585 xbb = device_get_softc(dev); 3586 3587 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3588 NULL, &xbb->dev_name, NULL); 3589 if (error != 0) 3590 return; 3591 3592 xs_unregister_watch(watch); 3593 free(watch->node, M_XENBLOCKBACK); 3594 watch->node = NULL; 3595 3596 /* Collect physical device information. */ 3597 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3598 "device-type", NULL, &xbb->dev_type, 3599 NULL); 3600 if (error != 0) 3601 xbb->dev_type = NULL; 3602 3603 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3604 "mode", NULL, &xbb->dev_mode, 3605 NULL); 3606 if (error != 0) { 3607 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3608 xenbus_get_node(dev)); 3609 return; 3610 } 3611 3612 /* Parse fopen style mode flags. */ 3613 if (strchr(xbb->dev_mode, 'w') == NULL) 3614 xbb->flags |= XBBF_READ_ONLY; 3615 3616 /* 3617 * Verify the physical device is present and can support 3618 * the desired I/O mode. 3619 */ 3620 error = xbb_open_backend(xbb); 3621 if (error != 0) { 3622 xbb_attach_failed(xbb, error, "Unable to open %s", 3623 xbb->dev_name); 3624 return; 3625 } 3626 3627 /* Use devstat(9) for recording statistics. */ 3628 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3629 xbb->sector_size, 3630 DEVSTAT_ALL_SUPPORTED, 3631 DEVSTAT_TYPE_DIRECT 3632 | DEVSTAT_TYPE_IF_OTHER, 3633 DEVSTAT_PRIORITY_OTHER); 3634 3635 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3636 xbb->sector_size, 3637 DEVSTAT_ALL_SUPPORTED, 3638 DEVSTAT_TYPE_DIRECT 3639 | DEVSTAT_TYPE_IF_OTHER, 3640 DEVSTAT_PRIORITY_OTHER); 3641 /* 3642 * Setup sysctl variables. 3643 */ 3644 xbb_setup_sysctl(xbb); 3645 3646 /* 3647 * Create a taskqueue for doing work that must occur from a 3648 * thread context. 3649 */ 3650 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3651 M_NOWAIT, 3652 taskqueue_thread_enqueue, 3653 /*contxt*/&xbb->io_taskqueue); 3654 if (xbb->io_taskqueue == NULL) { 3655 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3656 return; 3657 } 3658 3659 taskqueue_start_threads(&xbb->io_taskqueue, 3660 /*num threads*/1, 3661 /*priority*/PWAIT, 3662 /*thread name*/ 3663 "%s taskq", device_get_nameunit(dev)); 3664 3665 /* Update hot-plug status to satisfy xend. */ 3666 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3667 "hotplug-status", "connected"); 3668 if (error) { 3669 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3670 xenbus_get_node(xbb->dev)); 3671 return; 3672 } 3673 3674 xbb->hotplug_done = true; 3675 3676 /* The front end might be waiting for the backend, attach if so. */ 3677 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3678 xbb_connect(xbb); 3679 } 3680 3681 /** 3682 * Attach to a XenBus device that has been claimed by our probe routine. 3683 * 3684 * \param dev NewBus device object representing this Xen Block Back instance. 3685 * 3686 * \return 0 for success, errno codes for failure. 3687 */ 3688 static int 3689 xbb_attach(device_t dev) 3690 { 3691 struct xbb_softc *xbb; 3692 int error; 3693 u_int max_ring_page_order; 3694 struct sbuf *watch_path; 3695 3696 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3697 3698 /* 3699 * Basic initialization. 3700 * After this block it is safe to call xbb_detach() 3701 * to clean up any allocated data for this instance. 3702 */ 3703 xbb = device_get_softc(dev); 3704 xbb->dev = dev; 3705 xbb->otherend_id = xenbus_get_otherend_id(dev); 3706 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3707 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3708 3709 /* 3710 * Publish protocol capabilities for consumption by the 3711 * front-end. 3712 */ 3713 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3714 "feature-barrier", "1"); 3715 if (error) { 3716 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3717 xenbus_get_node(xbb->dev)); 3718 return (error); 3719 } 3720 3721 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3722 "feature-flush-cache", "1"); 3723 if (error) { 3724 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3725 xenbus_get_node(xbb->dev)); 3726 return (error); 3727 } 3728 3729 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3730 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3731 "max-ring-page-order", "%u", max_ring_page_order); 3732 if (error) { 3733 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3734 xenbus_get_node(xbb->dev)); 3735 return (error); 3736 } 3737 3738 /* 3739 * We need to wait for hotplug script execution before 3740 * moving forward. 3741 */ 3742 KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed")); 3743 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3744 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3745 xbb->hotplug_watch.callback = xbb_attach_disk; 3746 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3747 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3748 sbuf_delete(watch_path); 3749 error = xs_register_watch(&xbb->hotplug_watch); 3750 if (error != 0) { 3751 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3752 xbb->hotplug_watch.node); 3753 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3754 return (error); 3755 } 3756 3757 /* Tell the toolstack blkback has attached. */ 3758 xenbus_set_state(dev, XenbusStateInitWait); 3759 3760 return (0); 3761 } 3762 3763 /** 3764 * Detach from a block back device instance. 3765 * 3766 * \param dev NewBus device object representing this Xen Block Back instance. 3767 * 3768 * \return 0 for success, errno codes for failure. 3769 * 3770 * \note A block back device may be detached at any time in its life-cycle, 3771 * including part way through the attach process. For this reason, 3772 * initialization order and the initialization state checks in this 3773 * routine must be carefully coupled so that attach time failures 3774 * are gracefully handled. 3775 */ 3776 static int 3777 xbb_detach(device_t dev) 3778 { 3779 struct xbb_softc *xbb; 3780 3781 DPRINTF("\n"); 3782 3783 xbb = device_get_softc(dev); 3784 mtx_lock(&xbb->lock); 3785 while (xbb_shutdown(xbb) == EAGAIN) { 3786 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3787 "xbb_shutdown", 0); 3788 } 3789 mtx_unlock(&xbb->lock); 3790 3791 DPRINTF("\n"); 3792 3793 if (xbb->io_taskqueue != NULL) 3794 taskqueue_free(xbb->io_taskqueue); 3795 3796 if (xbb->xbb_stats != NULL) 3797 devstat_remove_entry(xbb->xbb_stats); 3798 3799 if (xbb->xbb_stats_in != NULL) 3800 devstat_remove_entry(xbb->xbb_stats_in); 3801 3802 xbb_close_backend(xbb); 3803 3804 if (xbb->dev_mode != NULL) { 3805 free(xbb->dev_mode, M_XENSTORE); 3806 xbb->dev_mode = NULL; 3807 } 3808 3809 if (xbb->dev_type != NULL) { 3810 free(xbb->dev_type, M_XENSTORE); 3811 xbb->dev_type = NULL; 3812 } 3813 3814 if (xbb->dev_name != NULL) { 3815 free(xbb->dev_name, M_XENSTORE); 3816 xbb->dev_name = NULL; 3817 } 3818 3819 mtx_destroy(&xbb->lock); 3820 return (0); 3821 } 3822 3823 /** 3824 * Prepare this block back device for suspension of this VM. 3825 * 3826 * \param dev NewBus device object representing this Xen Block Back instance. 3827 * 3828 * \return 0 for success, errno codes for failure. 3829 */ 3830 static int 3831 xbb_suspend(device_t dev) 3832 { 3833 #ifdef NOT_YET 3834 struct xbb_softc *sc = device_get_softc(dev); 3835 3836 /* Prevent new requests being issued until we fix things up. */ 3837 mtx_lock(&sc->xb_io_lock); 3838 sc->connected = BLKIF_STATE_SUSPENDED; 3839 mtx_unlock(&sc->xb_io_lock); 3840 #endif 3841 3842 return (0); 3843 } 3844 3845 /** 3846 * Perform any processing required to recover from a suspended state. 3847 * 3848 * \param dev NewBus device object representing this Xen Block Back instance. 3849 * 3850 * \return 0 for success, errno codes for failure. 3851 */ 3852 static int 3853 xbb_resume(device_t dev) 3854 { 3855 return (0); 3856 } 3857 3858 /** 3859 * Handle state changes expressed via the XenStore by our front-end peer. 3860 * 3861 * \param dev NewBus device object representing this Xen 3862 * Block Back instance. 3863 * \param frontend_state The new state of the front-end. 3864 * 3865 * \return 0 for success, errno codes for failure. 3866 */ 3867 static void 3868 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3869 { 3870 struct xbb_softc *xbb = device_get_softc(dev); 3871 3872 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3873 xenbus_strstate(frontend_state), 3874 xenbus_strstate(xenbus_get_state(xbb->dev))); 3875 3876 switch (frontend_state) { 3877 case XenbusStateInitialising: 3878 break; 3879 case XenbusStateInitialised: 3880 case XenbusStateConnected: 3881 xbb_connect(xbb); 3882 break; 3883 case XenbusStateClosing: 3884 case XenbusStateClosed: 3885 mtx_lock(&xbb->lock); 3886 xbb_shutdown(xbb); 3887 mtx_unlock(&xbb->lock); 3888 if (frontend_state == XenbusStateClosed) 3889 xenbus_set_state(xbb->dev, XenbusStateClosed); 3890 break; 3891 default: 3892 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3893 frontend_state); 3894 break; 3895 } 3896 } 3897 3898 /*---------------------------- NewBus Registration ---------------------------*/ 3899 static device_method_t xbb_methods[] = { 3900 /* Device interface */ 3901 DEVMETHOD(device_probe, xbb_probe), 3902 DEVMETHOD(device_attach, xbb_attach), 3903 DEVMETHOD(device_detach, xbb_detach), 3904 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3905 DEVMETHOD(device_suspend, xbb_suspend), 3906 DEVMETHOD(device_resume, xbb_resume), 3907 3908 /* Xenbus interface */ 3909 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3910 { 0, 0 } 3911 }; 3912 3913 static driver_t xbb_driver = { 3914 "xbbd", 3915 xbb_methods, 3916 sizeof(struct xbb_softc), 3917 }; 3918 devclass_t xbb_devclass; 3919 3920 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3921