1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 /** 39 * \file blkback.c 40 * 41 * \brief Device driver supporting the vending of block storage from 42 * a FreeBSD domain to other domains. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
#include <sys/sdt.h>

#include <geom/geom.h>

#include <machine/_inttypes.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <xen/xen-os.h>
#include <xen/blkif.h>
#include <xen/gnttab.h>
#include <xen/xen_intr.h>

#include <contrib/xen/event_channel.h>
#include <contrib/xen/grant_table.h>

#include <xen/xenbus/xenbusvar.h>

/*--------------------------- Compile-time Tunables --------------------------*/
/**
 * The maximum number of shared memory ring pages we will allow in a
 * negotiated block-front/back communication channel.  Allow enough
 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
 */
#define	XBB_MAX_RING_PAGES	32

/**
 * The maximum number of outstanding request blocks (request headers plus
 * additional segment blocks) we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_REQUESTS					\
	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)

/**
 * \brief Define to force all I/O to be performed on memory owned by the
 *        backend device, with a copy-in/out to the remote domain's memory.
 *
 * \note  This option is currently required when this driver's domain is
 *        operating in HVM mode on a system using an IOMMU.
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves.  When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain.  This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API.  For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 * Use old default of MAXPHYS == 128K.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST			\
	(MIN(UIO_MAXIOV,				\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,	\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of ring pages that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...) __attribute__((format(printf, 3, 4)));
static int  xbb_shutdown(struct xbb_softc *xbb);

/*------------------------------ Data Structures -----------------------------*/

STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);

/** Per-request-list state flags. */
typedef enum {
	XBB_REQLIST_NONE	= 0x00,
	XBB_REQLIST_MAPPED	= 0x01
} xbb_reqlist_flags;

/**
 * \brief A list of coalesced front-end requests that is dispatched to the
 *        backend device/file as a single I/O.
 */
struct xbb_xen_reqlist {
	/**
	 * Back reference to the parent block back instance for this
	 * request.  Used during bio_done handling.
	 */
	struct xbb_softc        *xbb;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			 operation;

	/**
	 * Set to BLKIF_RSP_* to indicate request status.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int			 status;

	/**
	 * Number of 512 byte sectors not transferred.
	 */
	int			 residual_512b_sectors;

	/**
	 * Starting sector number of the first request in the list.
	 */
	off_t			 starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t			 next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int			 num_children;

	/**
	 * Number of I/O requests still pending on the backend.
	 */
	int			 pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int			 nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags	 flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
	 */
	uint8_t			*kva;

	/**
	 * Base, pseudo-physical address, corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t		 gnt_base;

#ifdef XBB_USE_BOUNCE_BUFFERS
	/**
	 * Pre-allocated domain local memory used to proxy remote
	 * domain memory during I/O operations.
	 */
	uint8_t			*bounce;
#endif

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t		*gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type	 ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags	 ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime		 ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list	 contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t		  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int			  nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int			  nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			  operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t		  ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t		 *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX		  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime		  ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist	 *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped.*/
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t   evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (access vnode operations.). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};

/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access.  */
	struct cdev   *cdev;

	/** Cdev switch used for device backend access.  */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int	       dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred   *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS

	/**
	 * \brief Array of io vectors used to handle bouncing of file reads.
	 *
	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
	 * we need some of the data from the original uio in order to
	 * bounce-out the read data.  This array serves as the temporary
	 * storage for this saved data.
	 */
	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * \brief Array of memoized bounce buffer kva offsets used
	 *        in the file based backend.
	 *
	 * Due to the way that the mapping of the memory backing an
	 * I/O transaction is handled by Xen, a second pass through
	 * the request sg elements is unavoidable.  We memoize the computed
	 * bounce address here to reduce the cost of the second walk.
	 */
	void	       *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
#endif /* XBB_USE_BOUNCE_BUFFERS */
};

/**
 * Collection of backend type specific data.
 */
union xbb_backend_data {
	struct xbb_dev_data  dev;
	struct xbb_file_data file;
};

/**
 * Function signature of backend specific I/O handlers.
 */
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
			      struct xbb_xen_reqlist *reqlist, int operation,
			      int flags);

/**
 * Per-instance configuration data.
 */
struct xbb_softc {
	/**
	 * Task-queue used to process I/O requests.
	 */
	struct taskqueue	 *io_taskqueue;

	/**
	 * Single "run the request queue" task enqueued
	 * on io_taskqueue.
	 */
	struct task		  io_task;

	/** Device type for this instance. */
	xbb_type		  device_type;

	/** NewBus device corresponding to this instance. */
	device_t		  dev;

	/** Backend specific dispatch routine for this instance. */
	xbb_dispatch_t		  dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int			  active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list   request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req	 *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist	 *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t		  kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t		  gnt_base_addr;

	/** The size of the global kva pool. */
	int			  kva_size;

	/** The size of the KVA area used for request lists. */
	int			  reqlist_kva_size;

	/** The number of pages of KVA used for request lists */
	int			  reqlist_kva_pages;

	/** Bitmap of free KVA pages */
	bitstr_t		 *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used at once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t			  otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int			  abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *	  that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int			  max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int			  max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t		  flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config	  ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t	  rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t	  xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char			 *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char			 *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char			 *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode		 *vn;

	union xbb_backend_data	  backend;

	/** The native sector size of the backend. */
	u_int			  sector_size;

	/** log2 of sector_size.  */
	u_int			  sector_size_shift;

	/** Size in bytes of the backend device or file.  */
	off_t			  media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *	  sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t		  media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *	  conversion of blkif ring requests to internal xbb_xen_req
	 *	  structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io().  When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx		  lock;

	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource		 *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int			  pseudo_phys_res_id;

	/**
	 * I/O statistics from BlockBack dispatch down.  These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat		 *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack.  These are the requests as
	 * we get them from BlockFront.  They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat		 *xbb_stats_in;

	/** Disable sending flush to the backend */
	int			  disable_flush;

	/** Send a real flush for every N flush requests */
	int			  flush_interval;

	/** Count of flush requests in the interval */
	int			  flush_count;

	/** Don't coalesce requests if this is set */
	int			  no_coalesce_reqs;

	/** Number of requests we have received */
	uint64_t		  reqs_received;

	/** Number of requests we have completed*/
	uint64_t		  reqs_completed;

	/** Number of requests we queued but not pushed*/
	uint64_t		  reqs_queued_for_completion;

	/** Number of requests we completed with an error status*/
	uint64_t		  reqs_completed_with_error;

	/** How many forced dispatches (i.e. without coalescing) have happened */
	uint64_t		  forced_dispatch;

	/** How many normal dispatches have happened */
	uint64_t		  normal_dispatch;

	/** How many total dispatches have happened */
	uint64_t		  total_dispatch;

	/** How many times we have run out of KVA */
	uint64_t		  kva_shortages;

	/** How many times we have run out of request structures */
	uint64_t		  request_shortages;

	/** Watch to wait for hotplug script execution */
	struct xs_watch		  hotplug_watch;

	/** Got the needed data from hotplug scripts? */
	bool			  hotplug_done;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_req structure.
 *          Otherwise NULL.
821 */ 822 static inline struct xbb_xen_req * 823 xbb_get_req(struct xbb_softc *xbb) 824 { 825 struct xbb_xen_req *req; 826 827 req = NULL; 828 829 mtx_assert(&xbb->lock, MA_OWNED); 830 831 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 832 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 833 xbb->active_request_count++; 834 } 835 836 return (req); 837 } 838 839 /** 840 * Return an allocated transaction tracking structure to the free pool. 841 * 842 * \param xbb Per-instance xbb configuration structure. 843 * \param req The request structure to free. 844 */ 845 static inline void 846 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 847 { 848 mtx_assert(&xbb->lock, MA_OWNED); 849 850 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 851 xbb->active_request_count--; 852 853 KASSERT(xbb->active_request_count >= 0, 854 ("xbb_release_req: negative active count")); 855 } 856 857 /** 858 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 859 * 860 * \param xbb Per-instance xbb configuration structure. 861 * \param req_list The list of requests to free. 862 * \param nreqs The number of items in the list. 863 */ 864 static inline void 865 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 866 int nreqs) 867 { 868 mtx_assert(&xbb->lock, MA_OWNED); 869 870 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 871 xbb->active_request_count -= nreqs; 872 873 KASSERT(xbb->active_request_count >= 0, 874 ("xbb_release_reqs: negative active count")); 875 } 876 877 /** 878 * Given a page index and 512b sector offset within that page, 879 * calculate an offset into a request's kva region. 880 * 881 * \param reqlist The request structure whose kva region will be accessed. 882 * \param pagenr The page index used to compute the kva offset. 883 * \param sector The 512b sector index used to compute the page relative 884 * kva offset. 885 * 886 * \return The computed global KVA offset. 
887 */ 888 static inline uint8_t * 889 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 890 { 891 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 892 } 893 894 #ifdef XBB_USE_BOUNCE_BUFFERS 895 /** 896 * Given a page index and 512b sector offset within that page, 897 * calculate an offset into a request's local bounce memory region. 898 * 899 * \param reqlist The request structure whose bounce region will be accessed. 900 * \param pagenr The page index used to compute the bounce offset. 901 * \param sector The 512b sector index used to compute the page relative 902 * bounce offset. 903 * 904 * \return The computed global bounce buffer address. 905 */ 906 static inline uint8_t * 907 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 908 { 909 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 910 } 911 #endif 912 913 /** 914 * Given a page number and 512b sector offset within that page, 915 * calculate an offset into the request's memory region that the 916 * underlying backend device/file should use for I/O. 917 * 918 * \param reqlist The request structure whose I/O region will be accessed. 919 * \param pagenr The page index used to compute the I/O offset. 920 * \param sector The 512b sector index used to compute the page relative 921 * I/O offset. 922 * 923 * \return The computed global I/O address. 924 * 925 * Depending on configuration, this will either be a local bounce buffer 926 * or a pointer to the memory mapped in from the front-end domain for 927 * this request. 
928 */ 929 static inline uint8_t * 930 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 931 { 932 #ifdef XBB_USE_BOUNCE_BUFFERS 933 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 934 #else 935 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 936 #endif 937 } 938 939 /** 940 * Given a page index and 512b sector offset within that page, calculate 941 * an offset into the local pseudo-physical address space used to map a 942 * front-end's request data into a request. 943 * 944 * \param reqlist The request list structure whose pseudo-physical region 945 * will be accessed. 946 * \param pagenr The page index used to compute the pseudo-physical offset. 947 * \param sector The 512b sector index used to compute the page relative 948 * pseudo-physical offset. 949 * 950 * \return The computed global pseudo-phsyical address. 951 * 952 * Depending on configuration, this will either be a local bounce buffer 953 * or a pointer to the memory mapped in from the front-end domain for 954 * this request. 955 */ 956 static inline uintptr_t 957 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 958 { 959 struct xbb_softc *xbb; 960 961 xbb = reqlist->xbb; 962 963 return ((uintptr_t)(xbb->gnt_base_addr + 964 (uintptr_t)(reqlist->kva - xbb->kva) + 965 (PAGE_SIZE * pagenr) + (sector << 9))); 966 } 967 968 /** 969 * Get Kernel Virtual Address space for mapping requests. 970 * 971 * \param xbb Per-instance xbb configuration structure. 972 * \param nr_pages Number of pages needed. 973 * \param check_only If set, check for free KVA but don't allocate it. 974 * \param have_lock If set, xbb lock is already held. 975 * 976 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 977 * 978 * Note: This should be unnecessary once we have either chaining or 979 * scatter/gather support for struct bio. 
At that point we'll be able to 980 * put multiple addresses and lengths in one bio/bio chain and won't need 981 * to map everything into one virtual segment. 982 */ 983 static uint8_t * 984 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 985 { 986 int first_clear; 987 int num_clear; 988 uint8_t *free_kva; 989 int i; 990 991 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 992 993 first_clear = 0; 994 free_kva = NULL; 995 996 mtx_lock(&xbb->lock); 997 998 /* 999 * Look for the first available page. If there are none, we're done. 1000 */ 1001 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 1002 1003 if (first_clear == -1) 1004 goto bailout; 1005 1006 /* 1007 * Starting at the first available page, look for consecutive free 1008 * pages that will satisfy the user's request. 1009 */ 1010 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1011 /* 1012 * If this is true, the page is used, so we have to reset 1013 * the number of clear pages and the first clear page 1014 * (since it pointed to a region with an insufficient number 1015 * of clear pages). 1016 */ 1017 if (bit_test(xbb->kva_free, i)) { 1018 num_clear = 0; 1019 first_clear = -1; 1020 continue; 1021 } 1022 1023 if (first_clear == -1) 1024 first_clear = i; 1025 1026 /* 1027 * If this is true, we've found a large enough free region 1028 * to satisfy the request. 
1029 */ 1030 if (++num_clear == nr_pages) { 1031 bit_nset(xbb->kva_free, first_clear, 1032 first_clear + nr_pages - 1); 1033 1034 free_kva = xbb->kva + 1035 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 1036 1037 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1038 free_kva + (nr_pages * PAGE_SIZE) <= 1039 (uint8_t *)xbb->ring_config.va, 1040 ("Free KVA %p len %d out of range, " 1041 "kva = %#jx, ring VA = %#jx\n", free_kva, 1042 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1043 (uintmax_t)xbb->ring_config.va)); 1044 break; 1045 } 1046 } 1047 1048 bailout: 1049 1050 if (free_kva == NULL) { 1051 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1052 xbb->kva_shortages++; 1053 } 1054 1055 mtx_unlock(&xbb->lock); 1056 1057 return (free_kva); 1058 } 1059 1060 /** 1061 * Free allocated KVA. 1062 * 1063 * \param xbb Per-instance xbb configuration structure. 1064 * \param kva_ptr Pointer to allocated KVA region. 1065 * \param nr_pages Number of pages in the KVA region. 1066 */ 1067 static void 1068 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1069 { 1070 intptr_t start_page; 1071 1072 mtx_assert(&xbb->lock, MA_OWNED); 1073 1074 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1075 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1076 1077 } 1078 1079 /** 1080 * Unmap the front-end pages associated with this I/O request. 1081 * 1082 * \param req The request structure to unmap. 
 */
static void
xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
{
	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
	u_int i;
	u_int invcount;
	int error __diagused;

	/*
	 * Batch all still-mapped segments into a single unmap hypercall.
	 * Handles already marked invalid (never mapped, or already
	 * unmapped) are skipped.
	 */
	invcount = 0;
	for (i = 0; i < reqlist->nr_segments; i++) {
		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
			continue;

		unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
		unmap[invcount].dev_bus_addr = 0;
		unmap[invcount].handle = reqlist->gnt_handles[i];
		reqlist->gnt_handles[i] = GRANT_REF_INVALID;
		invcount++;
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					  unmap, invcount);
	KASSERT(error == 0, ("Grant table operation failed"));
}

/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_reqlist *
xbb_get_reqlist(struct xbb_softc *xbb)
{
	struct xbb_xen_reqlist *reqlist;

	reqlist = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
		/* Reset the reqlist to a pristine state for reuse. */
		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
		reqlist->flags = XBB_REQLIST_NONE;
		reqlist->kva = NULL;
		reqlist->status = BLKIF_RSP_OKAY;
		reqlist->residual_512b_sectors = 0;
		reqlist->num_children = 0;
		reqlist->nr_segments = 0;
		STAILQ_INIT(&reqlist->contig_req_list);
	}

	return (reqlist);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb	   Per-instance xbb configuration structure.
 * \param reqlist  The request list structure to free.
 * \param wakeup   If set, wakeup the work thread if freeing this reqlist
 *		   during a resource shortage condition.
 */
static inline void
xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
		    int wakeup)
{

	mtx_assert(&xbb->lock, MA_OWNED);

	if (wakeup) {
		/* Only wake the work thread if a shortage was recorded. */
		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
	}

	if (reqlist->kva != NULL)
		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);

	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);

	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);

	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		/*
		 * Shutdown is in progress.  See if we can
		 * progress further now that one more request
		 * has completed and been returned to the
		 * free pool.
		 */
		xbb_shutdown(xbb);
	}

	if (wakeup != 0)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}

/**
 * Request resources and do basic request setup.
 *
 * \param xbb	    Per-instance xbb configuration structure.
 * \param reqlist   Pointer to reqlist pointer.
 * \param ring_req  Pointer to a block ring request.
 * \param ring_idx  The ring index of this request.
 *
 * \return  0 for success, non-zero for failure.
 */
static int
xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
		  blkif_request_t *ring_req, RING_IDX ring_idx)
{
	struct xbb_xen_reqlist *nreqlist;
	struct xbb_xen_req     *nreq;

	nreqlist = NULL;
	nreq     = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * We don't allow new resources to be allocated if we're in the
	 * process of shutting down.
	 */
	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		mtx_unlock(&xbb->lock);
		return (1);
	}

	/*
	 * Allocate a reqlist if the caller doesn't have one already.
	 */
	if (*reqlist == NULL) {
		nreqlist = xbb_get_reqlist(xbb);
		if (nreqlist == NULL)
			goto bailout_error;
	}

	/* We always allocate a request. */
	nreq = xbb_get_req(xbb);
	if (nreq == NULL)
		goto bailout_error;

	mtx_unlock(&xbb->lock);

	if (*reqlist == NULL) {
		*reqlist = nreqlist;
		nreqlist->operation = ring_req->operation;
		nreqlist->starting_sector_number = ring_req->sector_number;
		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
				   links);
	}

	nreq->reqlist = *reqlist;
	nreq->req_ring_idx = ring_idx;
	nreq->id = ring_req->id;
	nreq->operation = ring_req->operation;

	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
		/*
		 * Foreign ABI: snapshot the request so later parsing does
		 * not depend on the (differently laid out) ring entry.
		 */
		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
		nreq->ring_req = &nreq->ring_req_storage;
	} else {
		nreq->ring_req = ring_req;
	}

	binuptime(&nreq->ds_t0);
	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
	(*reqlist)->num_children++;
	(*reqlist)->nr_segments += ring_req->nr_segments;

	return (0);

bailout_error:

	/*
	 * We're out of resources, so set the shortage flag.  The next time
	 * a request is released, we'll try waking up the work thread to
	 * see if we can allocate more resources.
	 */
	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
	xbb->request_shortages++;

	if (nreq != NULL)
		xbb_release_req(xbb, nreq);

	if (nreqlist != NULL)
		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);

	mtx_unlock(&xbb->lock);

	return (1);
}

/**
 * Create and queue a response to a blkif request.
 *
 * \param xbb	  Per-instance xbb configuration structure.
 * \param req	  The request structure to which to respond.
 * \param status  The status code to report.
 *		  See BLKIF_RSP_* in sys/contrib/xen/io/blkif.h.
 */
static void
xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
	blkif_response_t *resp;

	/*
	 * The mutex is required here, and should be held across this call
	 * until after the subsequent call to xbb_push_responses().  This
	 * is to guarantee that another context won't queue responses and
	 * push them while we're active.
	 *
	 * That could lead to the other end being notified of responses
	 * before the resources have been freed on this end.  The other end
	 * would then be able to queue additional I/O, and we may run out
	 * of resources because we haven't freed them all yet.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	/*
	 * Place on the response ring for the relevant domain.
	 * For now, only the spacing between entries is different
	 * in the different ABIs, not the response entry layout.
	 */
	switch (xbb->abi) {
	case BLKIF_PROTOCOL_NATIVE:
		resp = RING_GET_RESPONSE(&xbb->rings.native,
					 xbb->rings.native.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_32:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_32,
				      xbb->rings.x86_32.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_64:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_64,
				      xbb->rings.x86_64.rsp_prod_pvt);
		break;
	default:
		panic("Unexpected blkif protocol ABI.");
	}

	resp->id        = req->id;
	resp->operation = req->operation;
	resp->status    = status;

	if (status != BLKIF_RSP_OKAY)
		xbb->reqs_completed_with_error++;

	/* The response is published by xbb_push_responses(), not here. */
	xbb->rings.common.rsp_prod_pvt++;

	xbb->reqs_queued_for_completion++;

}

/**
 * Send queued responses to blkif requests.
 *
 * \param xbb	Per-instance xbb configuration structure.
 * \param run_taskqueue  Flag that is set to 1 if the taskqueue
 *			 should be run, 0 if it does not need to be run.
 * \param notify	 Flag that is set to 1 if the other end should be
 *			 notified via irq, 0 if the other end should not be
 *			 notified.
 */
static void
xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
{
	int more_to_do;

	/*
	 * The mutex is required here.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	more_to_do = 0;

	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);

	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
		/*
		 * Tail check for pending requests. Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
		more_to_do = 1;
	}

	xbb->reqs_completed += xbb->reqs_queued_for_completion;
	xbb->reqs_queued_for_completion = 0;

	*run_taskqueue = more_to_do;
}

/**
 * Complete a request list.
 *
 * \param xbb	   Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 */
static void
xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_xen_req *nreq;
	off_t		    sectors_sent;
	int		    notify, run_taskqueue;

	sectors_sent = 0;

	if (reqlist->flags & XBB_REQLIST_MAPPED)
		xbb_unmap_reqlist(reqlist);

	mtx_lock(&xbb->lock);

	/*
	 * All I/O is done, send the response.  A lock is not necessary
	 * to protect the request list, because all requests have
	 * completed.  Therefore this is the only context accessing this
	 * reqlist right now.  However, in order to make sure that no one
	 * else queues responses onto the queue or pushes them to the other
	 * side while we're active, we need to hold the lock across the
	 * calls to xbb_queue_response() and xbb_push_responses().
	 */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		off_t cur_sectors_sent;

		/* Put this response on the ring, but don't push yet */
		xbb_queue_response(xbb, nreq, reqlist->status);

		/* We don't report bytes sent if there is an error. */
		if (reqlist->status == BLKIF_RSP_OKAY)
			cur_sectors_sent = nreq->nr_512b_sectors;
		else
			cur_sectors_sent = 0;

		sectors_sent += cur_sectors_sent;

		devstat_end_transaction(xbb->xbb_stats_in,
					/*bytes*/cur_sectors_sent << 9,
					reqlist->ds_tag_type,
					reqlist->ds_trans_type,
					/*now*/NULL,
					/*then*/&nreq->ds_t0);
	}

	/*
	 * Take out any sectors not sent.  If we wind up negative (which
	 * might happen if an error is reported as well as a residual), just
	 * report 0 sectors sent.
	 */
	sectors_sent -= reqlist->residual_512b_sectors;
	if (sectors_sent < 0)
		sectors_sent = 0;

	devstat_end_transaction(xbb->xbb_stats,
				/*bytes*/ sectors_sent << 9,
				reqlist->ds_tag_type,
				reqlist->ds_trans_type,
				/*now*/NULL,
				/*then*/&reqlist->ds_t0);

	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);

	xbb_push_responses(xbb, &run_taskqueue, &notify);

	mtx_unlock(&xbb->lock);

	if (run_taskqueue)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	if (notify)
		xen_intr_signal(xbb->xen_intr_handle);
}

/**
 * Completion handler for buffer I/O requests issued by the device
 * backend driver.
 *
 * \param bio  The buffer I/O request on which to perform completion
 *	       processing.
 */
static void
xbb_bio_done(struct bio *bio)
{
	struct xbb_softc       *xbb;
	struct xbb_xen_reqlist *reqlist;

	reqlist = bio->bio_caller1;
	xbb     = reqlist->xbb;

	reqlist->residual_512b_sectors += bio->bio_resid >> 9;

	/*
	 * This is a bit imprecise.  With aggregated I/O a single
	 * request list can contain multiple front-end requests and
	 * a multiple bios may point to a single request.  By carefully
	 * walking the request list, we could map residuals and errors
	 * back to the original front-end request, but the interface
	 * isn't sufficiently rich for us to properly report the error.
	 * So, we just treat the entire request list as having failed if an
	 * error occurs on any part.  And, if an error occurs, we treat
	 * the amount of data transferred as 0.
	 *
	 * For residuals, we report it on the overall aggregated device,
	 * but not on the individual requests, since we don't currently
	 * do the work to determine which front-end request to which the
	 * residual applies.
	 */
	if (bio->bio_error) {
		DPRINTF("BIO returned error %d for operation on device %s\n",
			bio->bio_error, xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;

		if (bio->bio_error == ENXIO
		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
			/*
			 * Backend device has disappeared.  Signal the
			 * front-end that we (the device proxy) want to
			 * go away.
			 */
			xenbus_set_state(xbb->dev, XenbusStateClosing);
		}
	}

#ifdef XBB_USE_BOUNCE_BUFFERS
	if (bio->bio_cmd == BIO_READ) {
		vm_offset_t kva_offset;

		/* Copy read data from the bounce buffer back to the KVA. */
		kva_offset = (vm_offset_t)bio->bio_data
			   - (vm_offset_t)reqlist->bounce;
		memcpy((uint8_t *)reqlist->kva + kva_offset,
		       bio->bio_data, bio->bio_bcount);
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

	/*
	 * Decrement the pending count for the request list.  When we're
	 * done with the requests, send status back for all of them.
	 */
	if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
		xbb_complete_reqlist(xbb, reqlist);

	g_destroy_bio(bio);
}

/**
 * Parse a blkif request into an internal request structure and send
 * it to the backend for processing.
 *
 * \param xbb	   Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 *
 * \return  On success, 0.  For resource shortages, non-zero.
 *
 * This routine performs the backend common aspects of request parsing
 * including compiling an internal request structure, parsing the S/G
 * list and any secondary ring requests in which they may reside, and
 * the mapping of front-end I/O pages into our domain.
 */
static int
xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_sg		     *xbb_sg;
	struct gnttab_map_grant_ref  *map;
	struct blkif_request_segment *sg;
	struct blkif_request_segment *last_block_sg;
	struct xbb_xen_req	     *nreq;
	u_int			      nseg;
	u_int			      seg_idx;
	u_int			      block_segs;
	int			      nr_sects;
	int			      total_sects;
	int			      operation;
	uint8_t			      bio_flags;
	int			      error;

	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
	bio_flags	     = 0;
	total_sects	     = 0;
	nr_sects	     = 0;

	/*
	 * First determine whether we have enough free KVA to satisfy this
	 * request list.  If not, tell xbb_run_queue() so it can go to
	 * sleep until we have more KVA.
	 */
	reqlist->kva = NULL;
	if (reqlist->nr_segments != 0) {
		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
		if (reqlist->kva == NULL) {
			/*
			 * If we're out of KVA, return ENOMEM.
			 */
			return (ENOMEM);
		}
	}

	binuptime(&reqlist->ds_t0);
	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);

	switch (reqlist->operation) {
	case BLKIF_OP_WRITE_BARRIER:
		bio_flags	     |= BIO_ORDERED;
		reqlist->ds_tag_type  = DEVSTAT_TAG_ORDERED;
		/* FALLTHROUGH */
	case BLKIF_OP_WRITE:
		operation = BIO_WRITE;
		reqlist->ds_trans_type = DEVSTAT_WRITE;
		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
			DPRINTF("Attempt to write to read only device %s\n",
				xbb->dev_name);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
		break;
	case BLKIF_OP_READ:
		operation = BIO_READ;
		reqlist->ds_trans_type = DEVSTAT_READ;
		break;
	case BLKIF_OP_FLUSH_DISKCACHE:
		/*
		 * If this is true, the user has requested that we disable
		 * flush support.  So we just complete the requests
		 * successfully.
		 */
		if (xbb->disable_flush != 0) {
			goto send_response;
		}

		/*
		 * The user has requested that we only send a real flush
		 * for every N flush requests.  So keep count, and either
		 * complete the request immediately or queue it for the
		 * backend.
		 */
		if (xbb->flush_interval != 0) {
			if (++(xbb->flush_count) < xbb->flush_interval) {
				goto send_response;
			} else
				xbb->flush_count = 0;
		}

		operation = BIO_FLUSH;
		reqlist->ds_tag_type   = DEVSTAT_TAG_ORDERED;
		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
		goto do_dispatch;
		/*NOTREACHED*/
	default:
		DPRINTF("error: unknown block io operation [%d]\n",
			reqlist->operation);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	reqlist->xbb = xbb;
	xbb_sg	     = xbb->xbb_sgs;
	map	     = xbb->maps;
	seg_idx	     = 0;

	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		blkif_request_t *ring_req;
		u_int		 req_seg_idx;

		ring_req	      = nreq->ring_req;
		nr_sects	      = 0;
		nseg		      = ring_req->nr_segments;
		nreq->nr_pages	      = nseg;
		nreq->nr_512b_sectors = 0;
		req_seg_idx	      = 0;
		sg		      = NULL;

		/* Check that number of segments is sane. */
		if (__predict_false(nseg == 0)
		 || __predict_false(nseg > xbb->max_request_segments)) {
			DPRINTF("Bad number of segments in request (%d)\n",
				nseg);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		block_segs    = nseg;
		sg	      = ring_req->seg;
		last_block_sg = sg + block_segs;

		while (sg < last_block_sg) {
			KASSERT(seg_idx <
				XBB_MAX_SEGMENTS_PER_REQLIST,
				("seg_idx %d is too large, max "
				 "segs %d\n", seg_idx,
				 XBB_MAX_SEGMENTS_PER_REQLIST));

			xbb_sg->first_sect = sg->first_sect;
			xbb_sg->last_sect  = sg->last_sect;
			xbb_sg->nsect =
			    (int8_t)(sg->last_sect -
			    sg->first_sect + 1);

			/* Reject segments that exceed a page or are empty. */
			if ((sg->last_sect >= (PAGE_SIZE >> 9))
			 || (xbb_sg->nsect <= 0)) {
				reqlist->status = BLKIF_RSP_ERROR;
				goto send_response;
			}

			nr_sects += xbb_sg->nsect;
			map->host_addr = xbb_get_gntaddr(reqlist,
							 seg_idx, /*sector*/0);
			KASSERT(map->host_addr + PAGE_SIZE <=
				xbb->ring_config.gnt_addr,
				("Host address %#jx len %d overlaps "
				 "ring address %#jx\n",
				 (uintmax_t)map->host_addr, PAGE_SIZE,
				 (uintmax_t)xbb->ring_config.gnt_addr));

			map->flags = GNTMAP_host_map;
			map->ref   = sg->gref;
			map->dom   = xbb->otherend_id;
			if (operation == BIO_WRITE)
				map->flags |= GNTMAP_readonly;
			sg++;
			map++;
			xbb_sg++;
			seg_idx++;
			req_seg_idx++;
		}

		/* Convert to the disk's sector size */
		nreq->nr_512b_sectors = nr_sects;
		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
		total_sects += nr_sects;

		if ((nreq->nr_512b_sectors &
		    ((xbb->sector_size >> 9) - 1)) != 0) {
			device_printf(xbb->dev, "%s: I/O size (%d) is not "
				      "a multiple of the backing store sector "
				      "size (%d)\n", __func__,
				      nreq->nr_512b_sectors << 9,
				      xbb->sector_size);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					  xbb->maps, reqlist->nr_segments);
	if (error != 0)
		panic("Grant table operation failed (%d)", error);

	reqlist->flags |= XBB_REQLIST_MAPPED;

	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
	     seg_idx++, map++){
		if (__predict_false(map->status != 0)) {
			DPRINTF("invalid buffer -- could not remap "
				"it (%d)\n", map->status);
			DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
				"0x%x ref 0x%x, dom %d\n", seg_idx,
				map->host_addr, map->flags, map->ref,
				map->dom);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		reqlist->gnt_handles[seg_idx] = map->handle;
	}
	if (reqlist->starting_sector_number + total_sects >
	    xbb->media_num_sectors) {
		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
			"extends past end of device %s\n",
			operation == BIO_READ ? "read" : "write",
			reqlist->starting_sector_number,
			reqlist->starting_sector_number + total_sects,
			xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

do_dispatch:

	error = xbb->dispatch_io(xbb,
				 reqlist,
				 operation,
				 bio_flags);

	if (error != 0) {
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	return (0);

send_response:

	xbb_complete_reqlist(xbb, reqlist);

	return (0);
}

/**
 * Count the number of 512 byte sectors spanned by a request's segments,
 * stopping at the first malformed segment.
 */
static __inline int
xbb_count_sects(blkif_request_t *ring_req)
{
	int i;
	int cur_size = 0;

	for (i = 0; i < ring_req->nr_segments; i++) {
		int nsect;

		nsect = (int8_t)(ring_req->seg[i].last_sect -
			ring_req->seg[i].first_sect + 1);
		if (nsect <= 0)
			break;

		cur_size += nsect;
	}

	return (cur_size);
}

/**
 * Process incoming requests from the shared communication ring in response
 * to a signal on the ring's event channel.
 *
 * \param context  Callback argument registered during task initialization -
 *		   the xbb_softc for this instance.
 * \param pending  The number of taskqueue_enqueue events that have
 *		   occurred since this handler was last run.
 */
static void
xbb_run_queue(void *context, int pending)
{
	struct xbb_softc       *xbb;
	blkif_back_rings_t     *rings;
	RING_IDX		rp;
	uint64_t		cur_sector;
	int			cur_operation;
	struct xbb_xen_reqlist *reqlist;

	xbb   = (struct xbb_softc *)context;
	rings = &xbb->rings;

	/*
	 * Work gather and dispatch loop.  Note that we have a bias here
	 * towards gathering I/O sent by blockfront.  We first gather up
	 * everything in the ring, as long as we have resources.  Then we
	 * dispatch one request, and then attempt to gather up any
	 * additional requests that have come in while we were dispatching
	 * the request.
	 *
	 * This allows us to get a clearer picture (via devstat) of how
	 * many requests blockfront is queueing to us at any given time.
	 */
	for (;;) {
		int retval;

		/*
		 * Initialize reqlist to the last element in the pending
		 * queue, if there is one.  This allows us to add more
		 * requests to that request list, if we have room.
		 */
		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
				      xbb_xen_reqlist, links);
		if (reqlist != NULL) {
			cur_sector = reqlist->next_contig_sector;
			cur_operation = reqlist->operation;
		} else {
			cur_operation = 0;
			cur_sector    = 0;
		}

		/*
		 * Cache req_prod to avoid accessing a cache line shared
		 * with the frontend.
		 */
		rp = rings->common.sring->req_prod;

		/* Ensure we see queued requests up to 'rp'. */
		rmb();

		/**
		 * Run so long as there is work to consume and the generation
		 * of a response will not overflow the ring.
		 *
		 * @note There's a 1 to 1 relationship between requests and
		 *       responses, so an overflow should never occur.  This
		 *       test is to protect our domain from digesting bogus
		 *       data.  Shouldn't we log this?
		 */
		while (rings->common.req_cons != rp
		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
						  rings->common.req_cons) == 0){
			blkif_request_t	        ring_req_storage;
			blkif_request_t	       *ring_req;
			int			cur_size;

			switch (xbb->abi) {
			case BLKIF_PROTOCOL_NATIVE:
				ring_req = RING_GET_REQUEST(&xbb->rings.native,
				    rings->common.req_cons);
				break;
			case BLKIF_PROTOCOL_X86_32:
			{
				struct blkif_x86_32_request *ring_req32;

				ring_req32 = RING_GET_REQUEST(
				    &xbb->rings.x86_32, rings->common.req_cons);
				blkif_get_x86_32_req(&ring_req_storage,
				    ring_req32);
				ring_req = &ring_req_storage;
				break;
			}
			case BLKIF_PROTOCOL_X86_64:
			{
				struct blkif_x86_64_request *ring_req64;

				ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64,
				    rings->common.req_cons);
				blkif_get_x86_64_req(&ring_req_storage,
				    ring_req64);
				ring_req = &ring_req_storage;
				break;
			}
			default:
				panic("Unexpected blkif protocol ABI.");
				/* NOTREACHED */
			}

			/*
			 * Check for situations that would require closing
			 * off this I/O for further coalescing:
			 *  - Coalescing is turned off.
			 *  - Current I/O is out of sequence with the previous
			 *    I/O.
			 *  - Coalesced I/O would be too large.
			 */
			if ((reqlist != NULL)
			 && ((xbb->no_coalesce_reqs != 0)
			  || ((xbb->no_coalesce_reqs == 0)
			   && ((ring_req->sector_number != cur_sector)
			    || (ring_req->operation != cur_operation)
			    || ((ring_req->nr_segments + reqlist->nr_segments) >
				 xbb->max_reqlist_segments))))) {
				reqlist = NULL;
			}

			/*
			 * Grab and check for all resources in one shot.
			 * If we can't get all of the resources we need,
			 * the shortage is noted and the thread will get
			 * woken up when more resources are available.
			 */
			retval = xbb_get_resources(xbb, &reqlist, ring_req,
						   xbb->rings.common.req_cons);

			if (retval != 0) {
				/*
				 * Resource shortage has been recorded.
				 * We'll be scheduled to run once a request
				 * object frees up due to a completion.
				 */
				break;
			}

			/*
			 * Signify that we can overwrite this request with
			 * a response by incrementing our consumer index.
			 * The response won't be generated until after
			 * we've already consumed all necessary data out
			 * of the version of the request in the ring buffer
			 * (for native mode).  We must update the consumer
			 * index before issuing back-end I/O so there is
			 * no possibility that it will complete and a
			 * response be generated before we make room in
			 * the queue for that response.
			 */
			xbb->rings.common.req_cons++;
			xbb->reqs_received++;

			cur_size = xbb_count_sects(ring_req);
			cur_sector = ring_req->sector_number + cur_size;
			reqlist->next_contig_sector = cur_sector;
			cur_operation = ring_req->operation;
		}

		/* Check for I/O to dispatch */
		reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
		if (reqlist == NULL) {
			/*
			 * We're out of work to do, put the task queue to
			 * sleep.
			 */
			break;
		}

		/*
		 * Grab the first request off the queue and attempt
		 * to dispatch it.
		 */
		STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);

		retval = xbb_dispatch_io(xbb, reqlist);
		if (retval != 0) {
			/*
			 * xbb_dispatch_io() returns non-zero only when
			 * there is a resource shortage.  If that's the
			 * case, re-queue this request on the head of the
			 * queue, and go to sleep until we have more
			 * resources.
			 */
			STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
					   reqlist, links);
			break;
		} else {
			/*
			 * If we still have anything on the queue after
			 * removing the head entry, that is because we
			 * met one of the criteria to create a new
			 * request list (outlined above), and we'll call
			 * that a forced dispatch for statistical purposes.
			 *
			 * Otherwise, if there is only one element on the
			 * queue, we coalesced everything available on
			 * the ring and we'll call that a normal dispatch.
			 */
			reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);

			if (reqlist != NULL)
				xbb->forced_dispatch++;
			else
				xbb->normal_dispatch++;

			xbb->total_dispatch++;
		}
	}
}

/**
 * Interrupt handler bound to the shared ring's event channel.
 *
 * \param arg  Callback argument registered during event channel
 *	       binding - the xbb_softc for this instance.
 */
static int
xbb_filter(void *arg)
{
	struct xbb_softc *xbb;

	/* Defer to taskqueue thread. */
	xbb = (struct xbb_softc *)arg;
	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	return (FILTER_HANDLED);
}

SDT_PROVIDER_DEFINE(xbb);
SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
		  "uint64_t");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
		  "uint64_t", "uint64_t");

/*----------------------------- Backend Handlers -----------------------------*/
/**
 * Backend handler for character device access.
 *
 * \param xbb	     Per-instance xbb configuration structure.
 * \param reqlist    Allocated internal request list structure.
 * \param operation  BIO_* I/O operation code.
 * \param bio_flags  Additional bio_flag data to pass to any generated
 *		     bios (e.g. BIO_ORDERED)..
2055 * 2056 * \return 0 for success, errno codes for failure. 2057 */ 2058 static int 2059 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2060 int operation, int bio_flags) 2061 { 2062 struct xbb_dev_data *dev_data; 2063 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2064 off_t bio_offset; 2065 struct bio *bio; 2066 struct xbb_sg *xbb_sg; 2067 u_int nbio; 2068 u_int bio_idx; 2069 u_int nseg; 2070 u_int seg_idx; 2071 int error; 2072 2073 dev_data = &xbb->backend.dev; 2074 bio_offset = (off_t)reqlist->starting_sector_number 2075 << xbb->sector_size_shift; 2076 error = 0; 2077 nbio = 0; 2078 bio_idx = 0; 2079 2080 if (operation == BIO_FLUSH) { 2081 bio = g_new_bio(); 2082 if (__predict_false(bio == NULL)) { 2083 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2084 error = ENOMEM; 2085 return (error); 2086 } 2087 2088 bio->bio_cmd = BIO_FLUSH; 2089 bio->bio_flags |= BIO_ORDERED; 2090 bio->bio_dev = dev_data->cdev; 2091 bio->bio_offset = 0; 2092 bio->bio_data = 0; 2093 bio->bio_done = xbb_bio_done; 2094 bio->bio_caller1 = reqlist; 2095 bio->bio_pblkno = 0; 2096 2097 reqlist->pendcnt = 1; 2098 2099 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2100 device_get_unit(xbb->dev)); 2101 2102 (*dev_data->csw->d_strategy)(bio); 2103 2104 return (0); 2105 } 2106 2107 xbb_sg = xbb->xbb_sgs; 2108 bio = NULL; 2109 nseg = reqlist->nr_segments; 2110 2111 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2112 /* 2113 * KVA will not be contiguous, so any additional 2114 * I/O will need to be represented in a new bio. 
2115 */ 2116 if ((bio != NULL) 2117 && (xbb_sg->first_sect != 0)) { 2118 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2119 printf("%s: Discontiguous I/O request " 2120 "from domain %d ends on " 2121 "non-sector boundary\n", 2122 __func__, xbb->otherend_id); 2123 error = EINVAL; 2124 goto fail_free_bios; 2125 } 2126 bio = NULL; 2127 } 2128 2129 if (bio == NULL) { 2130 /* 2131 * Make sure that the start of this bio is 2132 * aligned to a device sector. 2133 */ 2134 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2135 printf("%s: Misaligned I/O request " 2136 "from domain %d\n", __func__, 2137 xbb->otherend_id); 2138 error = EINVAL; 2139 goto fail_free_bios; 2140 } 2141 2142 bio = bios[nbio++] = g_new_bio(); 2143 if (__predict_false(bio == NULL)) { 2144 error = ENOMEM; 2145 goto fail_free_bios; 2146 } 2147 bio->bio_cmd = operation; 2148 bio->bio_flags |= bio_flags; 2149 bio->bio_dev = dev_data->cdev; 2150 bio->bio_offset = bio_offset; 2151 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2152 xbb_sg->first_sect); 2153 bio->bio_done = xbb_bio_done; 2154 bio->bio_caller1 = reqlist; 2155 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2156 } 2157 2158 bio->bio_length += xbb_sg->nsect << 9; 2159 bio->bio_bcount = bio->bio_length; 2160 bio_offset += xbb_sg->nsect << 9; 2161 2162 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2163 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2164 printf("%s: Discontiguous I/O request " 2165 "from domain %d ends on " 2166 "non-sector boundary\n", 2167 __func__, xbb->otherend_id); 2168 error = EINVAL; 2169 goto fail_free_bios; 2170 } 2171 /* 2172 * KVA will not be contiguous, so any additional 2173 * I/O will need to be represented in a new bio. 
2174 */ 2175 bio = NULL; 2176 } 2177 } 2178 2179 reqlist->pendcnt = nbio; 2180 2181 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2182 { 2183 #ifdef XBB_USE_BOUNCE_BUFFERS 2184 vm_offset_t kva_offset; 2185 2186 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2187 - (vm_offset_t)reqlist->bounce; 2188 if (operation == BIO_WRITE) { 2189 memcpy(bios[bio_idx]->bio_data, 2190 (uint8_t *)reqlist->kva + kva_offset, 2191 bios[bio_idx]->bio_bcount); 2192 } 2193 #endif 2194 if (operation == BIO_READ) { 2195 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2196 device_get_unit(xbb->dev), 2197 bios[bio_idx]->bio_offset, 2198 bios[bio_idx]->bio_length); 2199 } else if (operation == BIO_WRITE) { 2200 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2201 device_get_unit(xbb->dev), 2202 bios[bio_idx]->bio_offset, 2203 bios[bio_idx]->bio_length); 2204 } 2205 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2206 } 2207 2208 return (error); 2209 2210 fail_free_bios: 2211 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2212 g_destroy_bio(bios[bio_idx]); 2213 2214 return (error); 2215 } 2216 2217 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2218 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2219 "uint64_t"); 2220 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2221 "uint64_t", "uint64_t"); 2222 2223 /** 2224 * Backend handler for file access. 2225 * 2226 * \param xbb Per-instance xbb configuration structure. 2227 * \param reqlist Allocated internal request list. 2228 * \param operation BIO_* I/O operation code. 2229 * \param flags Additional bio_flag data to pass to any generated bios 2230 * (e.g. BIO_ORDERED).. 2231 * 2232 * \return 0 for success, errno codes for failure. 
2233 */ 2234 static int 2235 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2236 int operation, int flags) 2237 { 2238 struct xbb_file_data *file_data; 2239 u_int seg_idx; 2240 u_int nseg; 2241 struct uio xuio; 2242 struct xbb_sg *xbb_sg; 2243 struct iovec *xiovec; 2244 #ifdef XBB_USE_BOUNCE_BUFFERS 2245 void **p_vaddr; 2246 int saved_uio_iovcnt; 2247 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2248 int error; 2249 2250 file_data = &xbb->backend.file; 2251 error = 0; 2252 bzero(&xuio, sizeof(xuio)); 2253 2254 switch (operation) { 2255 case BIO_READ: 2256 xuio.uio_rw = UIO_READ; 2257 break; 2258 case BIO_WRITE: 2259 xuio.uio_rw = UIO_WRITE; 2260 break; 2261 case BIO_FLUSH: { 2262 struct mount *mountpoint; 2263 2264 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2265 device_get_unit(xbb->dev)); 2266 2267 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2268 2269 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2270 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2271 VOP_UNLOCK(xbb->vn); 2272 2273 vn_finished_write(mountpoint); 2274 2275 goto bailout_send_response; 2276 /* NOTREACHED */ 2277 } 2278 default: 2279 panic("invalid operation %d", operation); 2280 /* NOTREACHED */ 2281 } 2282 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2283 << xbb->sector_size_shift; 2284 xuio.uio_segflg = UIO_SYSSPACE; 2285 xuio.uio_iov = file_data->xiovecs; 2286 xuio.uio_iovcnt = 0; 2287 xbb_sg = xbb->xbb_sgs; 2288 nseg = reqlist->nr_segments; 2289 2290 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2291 /* 2292 * If the first sector is not 0, the KVA will 2293 * not be contiguous and we'll need to go on 2294 * to another segment. 
2295 */ 2296 if (xbb_sg->first_sect != 0) 2297 xiovec = NULL; 2298 2299 if (xiovec == NULL) { 2300 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2301 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2302 seg_idx, xbb_sg->first_sect); 2303 #ifdef XBB_USE_BOUNCE_BUFFERS 2304 /* 2305 * Store the address of the incoming 2306 * buffer at this particular offset 2307 * as well, so we can do the copy 2308 * later without having to do more 2309 * work to recalculate this address. 2310 */ 2311 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2312 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2313 xbb_sg->first_sect); 2314 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2315 xiovec->iov_len = 0; 2316 xuio.uio_iovcnt++; 2317 } 2318 2319 xiovec->iov_len += xbb_sg->nsect << 9; 2320 2321 xuio.uio_resid += xbb_sg->nsect << 9; 2322 2323 /* 2324 * If the last sector is not the full page 2325 * size count, the next segment will not be 2326 * contiguous in KVA and we need a new iovec. 2327 */ 2328 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2329 xiovec = NULL; 2330 } 2331 2332 xuio.uio_td = curthread; 2333 2334 #ifdef XBB_USE_BOUNCE_BUFFERS 2335 saved_uio_iovcnt = xuio.uio_iovcnt; 2336 2337 if (operation == BIO_WRITE) { 2338 /* Copy the write data to the local buffer. */ 2339 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2340 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2341 seg_idx++, xiovec++, p_vaddr++) { 2342 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2343 } 2344 } else { 2345 /* 2346 * We only need to save off the iovecs in the case of a 2347 * read, because the copy for the read happens after the 2348 * VOP_READ(). (The uio will get modified in that call 2349 * sequence.) 
2350 */ 2351 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2352 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2353 } 2354 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2355 2356 switch (operation) { 2357 case BIO_READ: 2358 2359 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2360 device_get_unit(xbb->dev), xuio.uio_offset, 2361 xuio.uio_resid); 2362 2363 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2364 2365 /* 2366 * UFS pays attention to IO_DIRECT for reads. If the 2367 * DIRECTIO option is configured into the kernel, it calls 2368 * ffs_rawread(). But that only works for single-segment 2369 * uios with user space addresses. In our case, with a 2370 * kernel uio, it still reads into the buffer cache, but it 2371 * will just try to release the buffer from the cache later 2372 * on in ffs_read(). 2373 * 2374 * ZFS does not pay attention to IO_DIRECT for reads. 2375 * 2376 * UFS does not pay attention to IO_SYNC for reads. 2377 * 2378 * ZFS pays attention to IO_SYNC (which translates into the 2379 * Solaris define FRSYNC for zfs_read()) for reads. It 2380 * attempts to sync the file before reading. 2381 * 2382 * So, to attempt to provide some barrier semantics in the 2383 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2384 */ 2385 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2386 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2387 2388 VOP_UNLOCK(xbb->vn); 2389 break; 2390 case BIO_WRITE: { 2391 struct mount *mountpoint; 2392 2393 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2394 device_get_unit(xbb->dev), xuio.uio_offset, 2395 xuio.uio_resid); 2396 2397 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2398 2399 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2400 2401 /* 2402 * UFS pays attention to IO_DIRECT for writes. The write 2403 * is done asynchronously. (Normally the write would just 2404 * get put into cache. 2405 * 2406 * UFS pays attention to IO_SYNC for writes. It will 2407 * attempt to write the buffer out synchronously if that 2408 * flag is set. 
2409 * 2410 * ZFS does not pay attention to IO_DIRECT for writes. 2411 * 2412 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2413 * for writes. It will flush the transaction from the 2414 * cache before returning. 2415 * 2416 * So if we've got the BIO_ORDERED flag set, we want 2417 * IO_SYNC in either the UFS or ZFS case. 2418 */ 2419 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2420 IO_SYNC : 0, file_data->cred); 2421 VOP_UNLOCK(xbb->vn); 2422 2423 vn_finished_write(mountpoint); 2424 2425 break; 2426 } 2427 default: 2428 panic("invalid operation %d", operation); 2429 /* NOTREACHED */ 2430 } 2431 2432 #ifdef XBB_USE_BOUNCE_BUFFERS 2433 /* We only need to copy here for read operations */ 2434 if (operation == BIO_READ) { 2435 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2436 xiovec = file_data->saved_xiovecs; 2437 seg_idx < saved_uio_iovcnt; seg_idx++, 2438 xiovec++, p_vaddr++) { 2439 /* 2440 * Note that we have to use the copy of the 2441 * io vector we made above. uiomove() modifies 2442 * the uio and its referenced vector as uiomove 2443 * performs the copy, so we can't rely on any 2444 * state from the original uio. 2445 */ 2446 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2447 } 2448 } 2449 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2450 2451 bailout_send_response: 2452 2453 if (error != 0) 2454 reqlist->status = BLKIF_RSP_ERROR; 2455 2456 xbb_complete_reqlist(xbb, reqlist); 2457 2458 return (0); 2459 } 2460 2461 /*--------------------------- Backend Configuration --------------------------*/ 2462 /** 2463 * Close and cleanup any backend device/file specific state for this 2464 * block back instance. 2465 * 2466 * \param xbb Per-instance xbb configuration structure. 
 */
static void
xbb_close_backend(struct xbb_softc *xbb)
{
	/* Giant is dropped around the vnode teardown and re-taken below. */
	DROP_GIANT();
	DPRINTF("closing dev=%s\n", xbb->dev_name);
	if (xbb->vn) {
		int flags = FREAD;

		/* Match the access mode used when the backend was opened. */
		if ((xbb->flags & XBBF_READ_ONLY) == 0)
			flags |= FWRITE;

		switch (xbb->device_type) {
		case XBB_TYPE_DISK:
			/*
			 * Release the threaded reference taken via
			 * dev_refthread() in xbb_open_dev() before the
			 * vnode itself is closed.
			 */
			if (xbb->backend.dev.csw) {
				dev_relthread(xbb->backend.dev.cdev,
					      xbb->backend.dev.dev_ref);
				xbb->backend.dev.csw  = NULL;
				xbb->backend.dev.cdev = NULL;
			}
			break;
		case XBB_TYPE_FILE:
			break;
		case XBB_TYPE_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}

		/* Close the vnode opened in xbb_open_backend(). */
		(void)vn_close(xbb->vn, flags, NOCRED, curthread);
		xbb->vn = NULL;

		switch (xbb->device_type) {
		case XBB_TYPE_DISK:
			break;
		case XBB_TYPE_FILE:
			/*
			 * Drop the credential reference taken via crhold()
			 * in xbb_open_file().
			 */
			if (xbb->backend.file.cred != NULL) {
				crfree(xbb->backend.file.cred);
				xbb->backend.file.cred = NULL;
			}
			break;
		case XBB_TYPE_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}
	}
	PICKUP_GIANT();
}

/**
 * Open a character device to be used for backend I/O.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  0 for success, errno codes for failure.
2523 */ 2524 static int 2525 xbb_open_dev(struct xbb_softc *xbb) 2526 { 2527 struct vattr vattr; 2528 struct cdev *dev; 2529 struct cdevsw *devsw; 2530 int error; 2531 2532 xbb->device_type = XBB_TYPE_DISK; 2533 xbb->dispatch_io = xbb_dispatch_dev; 2534 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2535 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2536 &xbb->backend.dev.dev_ref); 2537 if (xbb->backend.dev.csw == NULL) 2538 panic("Unable to retrieve device switch"); 2539 2540 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2541 if (error) { 2542 xenbus_dev_fatal(xbb->dev, error, "error getting " 2543 "vnode attributes for device %s", 2544 xbb->dev_name); 2545 return (error); 2546 } 2547 2548 dev = xbb->vn->v_rdev; 2549 devsw = dev->si_devsw; 2550 if (!devsw->d_ioctl) { 2551 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2552 "device %s!", xbb->dev_name); 2553 return (ENODEV); 2554 } 2555 2556 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2557 (caddr_t)&xbb->sector_size, FREAD, 2558 curthread); 2559 if (error) { 2560 xenbus_dev_fatal(xbb->dev, error, 2561 "error calling ioctl DIOCGSECTORSIZE " 2562 "for device %s", xbb->dev_name); 2563 return (error); 2564 } 2565 2566 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2567 (caddr_t)&xbb->media_size, FREAD, 2568 curthread); 2569 if (error) { 2570 xenbus_dev_fatal(xbb->dev, error, 2571 "error calling ioctl DIOCGMEDIASIZE " 2572 "for device %s", xbb->dev_name); 2573 return (error); 2574 } 2575 2576 return (0); 2577 } 2578 2579 /** 2580 * Open a file to be used for backend I/O. 2581 * 2582 * \param xbb Per-instance xbb configuration structure. 2583 * 2584 * \return 0 for success, errno codes for failure. 
2585 */ 2586 static int 2587 xbb_open_file(struct xbb_softc *xbb) 2588 { 2589 struct xbb_file_data *file_data; 2590 struct vattr vattr; 2591 int error; 2592 2593 file_data = &xbb->backend.file; 2594 xbb->device_type = XBB_TYPE_FILE; 2595 xbb->dispatch_io = xbb_dispatch_file; 2596 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2597 if (error != 0) { 2598 xenbus_dev_fatal(xbb->dev, error, 2599 "error calling VOP_GETATTR()" 2600 "for file %s", xbb->dev_name); 2601 return (error); 2602 } 2603 2604 /* 2605 * Verify that we have the ability to upgrade to exclusive 2606 * access on this file so we can trap errors at open instead 2607 * of reporting them during first access. 2608 */ 2609 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2610 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2611 if (VN_IS_DOOMED(xbb->vn)) { 2612 error = EBADF; 2613 xenbus_dev_fatal(xbb->dev, error, 2614 "error locking file %s", 2615 xbb->dev_name); 2616 2617 return (error); 2618 } 2619 } 2620 2621 file_data->cred = crhold(curthread->td_ucred); 2622 xbb->media_size = vattr.va_size; 2623 2624 /* 2625 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2626 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2627 * with disklabel and UFS on FreeBSD at least. Large block sizes 2628 * may not work with other OSes as well. So just export a sector 2629 * size of 512 bytes, which should work with any OS or 2630 * application. Since our backing is a file, any block size will 2631 * work fine for the backing store. 2632 */ 2633 #if 0 2634 xbb->sector_size = vattr.va_blocksize; 2635 #endif 2636 xbb->sector_size = 512; 2637 2638 /* 2639 * Sanity check. The media size has to be at least one 2640 * sector long. 
2641 */ 2642 if (xbb->media_size < xbb->sector_size) { 2643 error = EINVAL; 2644 xenbus_dev_fatal(xbb->dev, error, 2645 "file %s size %ju < block size %u", 2646 xbb->dev_name, 2647 (uintmax_t)xbb->media_size, 2648 xbb->sector_size); 2649 } 2650 return (error); 2651 } 2652 2653 /** 2654 * Open the backend provider for this connection. 2655 * 2656 * \param xbb Per-instance xbb configuration structure. 2657 * 2658 * \return 0 for success, errno codes for failure. 2659 */ 2660 static int 2661 xbb_open_backend(struct xbb_softc *xbb) 2662 { 2663 struct nameidata nd; 2664 int flags; 2665 int error; 2666 2667 flags = FREAD; 2668 error = 0; 2669 2670 DPRINTF("opening dev=%s\n", xbb->dev_name); 2671 2672 if (rootvnode == NULL) { 2673 xenbus_dev_fatal(xbb->dev, ENOENT, 2674 "Root file system not mounted"); 2675 return (ENOENT); 2676 } 2677 2678 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2679 flags |= FWRITE; 2680 2681 pwd_ensure_dirs(); 2682 2683 again: 2684 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); 2685 error = vn_open(&nd, &flags, 0, NULL); 2686 if (error) { 2687 /* 2688 * This is the only reasonable guess we can make as far as 2689 * path if the user doesn't give us a fully qualified path. 2690 * If they want to specify a file, they need to specify the 2691 * full path. 2692 */ 2693 if (xbb->dev_name[0] != '/') { 2694 char *dev_path = "/dev/"; 2695 char *dev_name; 2696 2697 /* Try adding device path at beginning of name */ 2698 dev_name = malloc(strlen(xbb->dev_name) 2699 + strlen(dev_path) + 1, 2700 M_XENBLOCKBACK, M_NOWAIT); 2701 if (dev_name) { 2702 sprintf(dev_name, "%s%s", dev_path, 2703 xbb->dev_name); 2704 free(xbb->dev_name, M_XENBLOCKBACK); 2705 xbb->dev_name = dev_name; 2706 goto again; 2707 } 2708 } 2709 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2710 xbb->dev_name); 2711 return (error); 2712 } 2713 2714 NDFREE_PNBUF(&nd); 2715 2716 xbb->vn = nd.ni_vp; 2717 2718 /* We only support disks and files. 
*/ 2719 if (vn_isdisk_error(xbb->vn, &error)) { 2720 error = xbb_open_dev(xbb); 2721 } else if (xbb->vn->v_type == VREG) { 2722 error = xbb_open_file(xbb); 2723 } else { 2724 error = EINVAL; 2725 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2726 "or file", xbb->dev_name); 2727 } 2728 VOP_UNLOCK(xbb->vn); 2729 2730 if (error != 0) { 2731 xbb_close_backend(xbb); 2732 return (error); 2733 } 2734 2735 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2736 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2737 2738 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2739 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2740 xbb->dev_name, xbb->sector_size, xbb->media_size); 2741 2742 return (0); 2743 } 2744 2745 /*------------------------ Inter-Domain Communication ------------------------*/ 2746 /** 2747 * Free dynamically allocated KVA or pseudo-physical address allocations. 2748 * 2749 * \param xbb Per-instance xbb configuration structure. 2750 */ 2751 static void 2752 xbb_free_communication_mem(struct xbb_softc *xbb) 2753 { 2754 if (xbb->kva != 0) { 2755 if (xbb->pseudo_phys_res != NULL) { 2756 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2757 xbb->pseudo_phys_res); 2758 xbb->pseudo_phys_res = NULL; 2759 } 2760 } 2761 xbb->kva = 0; 2762 xbb->gnt_base_addr = 0; 2763 if (xbb->kva_free != NULL) { 2764 free(xbb->kva_free, M_XENBLOCKBACK); 2765 xbb->kva_free = NULL; 2766 } 2767 } 2768 2769 /** 2770 * Cleanup all inter-domain communication mechanisms. 2771 * 2772 * \param xbb Per-instance xbb configuration structure. 
2773 */ 2774 static int 2775 xbb_disconnect(struct xbb_softc *xbb) 2776 { 2777 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2778 struct gnttab_unmap_grant_ref *op; 2779 u_int ring_idx; 2780 int error; 2781 2782 DPRINTF("\n"); 2783 2784 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2785 return (0); 2786 2787 mtx_unlock(&xbb->lock); 2788 xen_intr_unbind(&xbb->xen_intr_handle); 2789 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2790 mtx_lock(&xbb->lock); 2791 2792 /* 2793 * No new interrupts can generate work, but we must wait 2794 * for all currently active requests to drain. 2795 */ 2796 if (xbb->active_request_count != 0) 2797 return (EAGAIN); 2798 2799 for (ring_idx = 0, op = ops; 2800 ring_idx < xbb->ring_config.ring_pages; 2801 ring_idx++, op++) { 2802 op->host_addr = xbb->ring_config.gnt_addr 2803 + (ring_idx * PAGE_SIZE); 2804 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2805 op->handle = xbb->ring_config.handle[ring_idx]; 2806 } 2807 2808 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2809 xbb->ring_config.ring_pages); 2810 if (error != 0) 2811 panic("Grant table op failed (%d)", error); 2812 2813 xbb_free_communication_mem(xbb); 2814 2815 if (xbb->requests != NULL) { 2816 free(xbb->requests, M_XENBLOCKBACK); 2817 xbb->requests = NULL; 2818 } 2819 2820 if (xbb->request_lists != NULL) { 2821 struct xbb_xen_reqlist *reqlist; 2822 int i; 2823 2824 /* There is one request list for ever allocated request. 
*/ 2825 for (i = 0, reqlist = xbb->request_lists; 2826 i < xbb->max_requests; i++, reqlist++){ 2827 #ifdef XBB_USE_BOUNCE_BUFFERS 2828 if (reqlist->bounce != NULL) { 2829 free(reqlist->bounce, M_XENBLOCKBACK); 2830 reqlist->bounce = NULL; 2831 } 2832 #endif 2833 if (reqlist->gnt_handles != NULL) { 2834 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2835 reqlist->gnt_handles = NULL; 2836 } 2837 } 2838 free(xbb->request_lists, M_XENBLOCKBACK); 2839 xbb->request_lists = NULL; 2840 } 2841 2842 xbb->flags &= ~XBBF_RING_CONNECTED; 2843 return (0); 2844 } 2845 2846 /** 2847 * Map shared memory ring into domain local address space, initialize 2848 * ring control structures, and bind an interrupt to the event channel 2849 * used to notify us of ring changes. 2850 * 2851 * \param xbb Per-instance xbb configuration structure. 2852 */ 2853 static int 2854 xbb_connect_ring(struct xbb_softc *xbb) 2855 { 2856 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2857 struct gnttab_map_grant_ref *gnt; 2858 u_int ring_idx; 2859 int error; 2860 2861 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2862 return (0); 2863 2864 /* 2865 * Kva for our ring is at the tail of the region of kva allocated 2866 * by xbb_alloc_communication_mem(). 
2867 */ 2868 xbb->ring_config.va = xbb->kva 2869 + (xbb->kva_size 2870 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2871 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2872 + (xbb->kva_size 2873 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2874 2875 for (ring_idx = 0, gnt = gnts; 2876 ring_idx < xbb->ring_config.ring_pages; 2877 ring_idx++, gnt++) { 2878 gnt->host_addr = xbb->ring_config.gnt_addr 2879 + (ring_idx * PAGE_SIZE); 2880 gnt->flags = GNTMAP_host_map; 2881 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2882 gnt->dom = xbb->otherend_id; 2883 } 2884 2885 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2886 xbb->ring_config.ring_pages); 2887 if (error) 2888 panic("blkback: Ring page grant table op failed (%d)", error); 2889 2890 for (ring_idx = 0, gnt = gnts; 2891 ring_idx < xbb->ring_config.ring_pages; 2892 ring_idx++, gnt++) { 2893 if (gnt->status != 0) { 2894 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; 2895 unsigned int i, j; 2896 2897 xbb->ring_config.va = 0; 2898 xenbus_dev_fatal(xbb->dev, EACCES, 2899 "Ring shared page mapping failed. " 2900 "Status %d.", gnt->status); 2901 2902 /* Unmap everything to avoid leaking grant table maps */ 2903 for (i = 0, j = 0; i < xbb->ring_config.ring_pages; 2904 i++) { 2905 if (gnts[i].status != GNTST_okay) 2906 continue; 2907 2908 unmap[j].host_addr = gnts[i].host_addr; 2909 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; 2910 unmap[j++].handle = gnts[i].handle; 2911 } 2912 if (j != 0) { 2913 error = HYPERVISOR_grant_table_op( 2914 GNTTABOP_unmap_grant_ref, unmap, j); 2915 if (error != 0) 2916 panic("Unable to unmap grants (%d)", 2917 error); 2918 } 2919 return (EACCES); 2920 } 2921 xbb->ring_config.handle[ring_idx] = gnt->handle; 2922 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2923 } 2924 2925 /* Initialize the ring based on ABI. 
*/ 2926 switch (xbb->abi) { 2927 case BLKIF_PROTOCOL_NATIVE: 2928 { 2929 blkif_sring_t *sring; 2930 sring = (blkif_sring_t *)xbb->ring_config.va; 2931 BACK_RING_INIT(&xbb->rings.native, sring, 2932 xbb->ring_config.ring_pages * PAGE_SIZE); 2933 break; 2934 } 2935 case BLKIF_PROTOCOL_X86_32: 2936 { 2937 blkif_x86_32_sring_t *sring_x86_32; 2938 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2939 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2940 xbb->ring_config.ring_pages * PAGE_SIZE); 2941 break; 2942 } 2943 case BLKIF_PROTOCOL_X86_64: 2944 { 2945 blkif_x86_64_sring_t *sring_x86_64; 2946 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2947 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2948 xbb->ring_config.ring_pages * PAGE_SIZE); 2949 break; 2950 } 2951 default: 2952 panic("Unexpected blkif protocol ABI."); 2953 } 2954 2955 xbb->flags |= XBBF_RING_CONNECTED; 2956 2957 error = xen_intr_bind_remote_port(xbb->dev, 2958 xbb->otherend_id, 2959 xbb->ring_config.evtchn, 2960 xbb_filter, 2961 /*ithread_handler*/NULL, 2962 /*arg*/xbb, 2963 INTR_TYPE_BIO | INTR_MPSAFE, 2964 &xbb->xen_intr_handle); 2965 if (error) { 2966 (void)xbb_disconnect(xbb); 2967 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2968 return (error); 2969 } 2970 2971 DPRINTF("rings connected!\n"); 2972 2973 return 0; 2974 } 2975 2976 /** 2977 * Size KVA and pseudo-physical address allocations based on negotiated 2978 * values for the size and number of I/O requests, and the size of our 2979 * communication ring. 2980 * 2981 * \param xbb Per-instance xbb configuration structure. 2982 * 2983 * These address spaces are used to dynamically map pages in the 2984 * front-end's domain into our own. 
2985 */ 2986 static int 2987 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2988 { 2989 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2990 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2991 xbb->kva_size = xbb->reqlist_kva_size + 2992 (xbb->ring_config.ring_pages * PAGE_SIZE); 2993 2994 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2995 if (xbb->kva_free == NULL) 2996 return (ENOMEM); 2997 2998 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2999 device_get_nameunit(xbb->dev), xbb->kva_size, 3000 xbb->reqlist_kva_size); 3001 /* 3002 * Reserve a range of pseudo physical memory that we can map 3003 * into kva. These pages will only be backed by machine 3004 * pages ("real memory") during the lifetime of front-end requests 3005 * via grant table operations. 3006 */ 3007 xbb->pseudo_phys_res_id = 0; 3008 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 3009 xbb->kva_size); 3010 if (xbb->pseudo_phys_res == NULL) { 3011 xbb->kva = 0; 3012 return (ENOMEM); 3013 } 3014 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3015 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3016 3017 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3018 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3019 (uintmax_t)xbb->gnt_base_addr); 3020 return (0); 3021 } 3022 3023 /** 3024 * Collect front-end information from the XenStore. 3025 * 3026 * \param xbb Per-instance xbb configuration structure. 3027 */ 3028 static int 3029 xbb_collect_frontend_info(struct xbb_softc *xbb) 3030 { 3031 char protocol_abi[64]; 3032 const char *otherend_path; 3033 int error; 3034 u_int ring_idx; 3035 u_int ring_page_order; 3036 size_t ring_size; 3037 3038 otherend_path = xenbus_get_otherend_path(xbb->dev); 3039 3040 /* 3041 * Protocol defaults valid even if all negotiation fails. 
3042 */ 3043 xbb->ring_config.ring_pages = 1; 3044 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 3045 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3046 3047 /* 3048 * Mandatory data (used in all versions of the protocol) first. 3049 */ 3050 error = xs_scanf(XST_NIL, otherend_path, 3051 "event-channel", NULL, "%" PRIu32, 3052 &xbb->ring_config.evtchn); 3053 if (error != 0) { 3054 xenbus_dev_fatal(xbb->dev, error, 3055 "Unable to retrieve event-channel information " 3056 "from frontend %s. Unable to connect.", 3057 xenbus_get_otherend_path(xbb->dev)); 3058 return (error); 3059 } 3060 3061 /* 3062 * These fields are initialized to legacy protocol defaults 3063 * so we only need to fail if reading the updated value succeeds 3064 * and the new value is outside of its allowed range. 3065 * 3066 * \note xs_gather() returns on the first encountered error, so 3067 * we must use independent calls in order to guarantee 3068 * we don't miss information in a sparsly populated front-end 3069 * tree. 3070 * 3071 * \note xs_scanf() does not update variables for unmatched 3072 * fields. 3073 */ 3074 ring_page_order = 0; 3075 xbb->max_requests = 32; 3076 3077 (void)xs_scanf(XST_NIL, otherend_path, 3078 "ring-page-order", NULL, "%u", 3079 &ring_page_order); 3080 xbb->ring_config.ring_pages = 1 << ring_page_order; 3081 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3082 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3083 3084 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3085 xenbus_dev_fatal(xbb->dev, EINVAL, 3086 "Front-end specified ring-pages of %u " 3087 "exceeds backend limit of %u. 
" 3088 "Unable to connect.", 3089 xbb->ring_config.ring_pages, 3090 XBB_MAX_RING_PAGES); 3091 return (EINVAL); 3092 } 3093 3094 if (xbb->ring_config.ring_pages == 1) { 3095 error = xs_gather(XST_NIL, otherend_path, 3096 "ring-ref", "%" PRIu32, 3097 &xbb->ring_config.ring_ref[0], 3098 NULL); 3099 if (error != 0) { 3100 xenbus_dev_fatal(xbb->dev, error, 3101 "Unable to retrieve ring information " 3102 "from frontend %s. Unable to " 3103 "connect.", 3104 xenbus_get_otherend_path(xbb->dev)); 3105 return (error); 3106 } 3107 } else { 3108 /* Multi-page ring format. */ 3109 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3110 ring_idx++) { 3111 char ring_ref_name[]= "ring_refXX"; 3112 3113 snprintf(ring_ref_name, sizeof(ring_ref_name), 3114 "ring-ref%u", ring_idx); 3115 error = xs_scanf(XST_NIL, otherend_path, 3116 ring_ref_name, NULL, "%" PRIu32, 3117 &xbb->ring_config.ring_ref[ring_idx]); 3118 if (error != 0) { 3119 xenbus_dev_fatal(xbb->dev, error, 3120 "Failed to retriev grant " 3121 "reference for page %u of " 3122 "shared ring. Unable " 3123 "to connect.", ring_idx); 3124 return (error); 3125 } 3126 } 3127 } 3128 3129 error = xs_gather(XST_NIL, otherend_path, 3130 "protocol", "%63s", protocol_abi, 3131 NULL); 3132 if (error != 0 3133 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3134 /* 3135 * Assume native if the frontend has not 3136 * published ABI data or it has published and 3137 * matches our own ABI. 3138 */ 3139 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3140 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3141 xbb->abi = BLKIF_PROTOCOL_X86_32; 3142 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3143 xbb->abi = BLKIF_PROTOCOL_X86_64; 3144 } else { 3145 xenbus_dev_fatal(xbb->dev, EINVAL, 3146 "Unknown protocol ABI (%s) published by " 3147 "frontend. 
Unable to connect.", protocol_abi); 3148 return (EINVAL); 3149 } 3150 return (0); 3151 } 3152 3153 /** 3154 * Allocate per-request data structures given request size and number 3155 * information negotiated with the front-end. 3156 * 3157 * \param xbb Per-instance xbb configuration structure. 3158 */ 3159 static int 3160 xbb_alloc_requests(struct xbb_softc *xbb) 3161 { 3162 struct xbb_xen_req *req; 3163 struct xbb_xen_req *last_req; 3164 3165 /* 3166 * Allocate request book keeping datastructures. 3167 */ 3168 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3169 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3170 if (xbb->requests == NULL) { 3171 xenbus_dev_fatal(xbb->dev, ENOMEM, 3172 "Unable to allocate request structures"); 3173 return (ENOMEM); 3174 } 3175 3176 req = xbb->requests; 3177 last_req = &xbb->requests[xbb->max_requests - 1]; 3178 STAILQ_INIT(&xbb->request_free_stailq); 3179 while (req <= last_req) { 3180 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3181 req++; 3182 } 3183 return (0); 3184 } 3185 3186 static int 3187 xbb_alloc_request_lists(struct xbb_softc *xbb) 3188 { 3189 struct xbb_xen_reqlist *reqlist; 3190 int i; 3191 3192 /* 3193 * If no requests can be merged, we need 1 request list per 3194 * in flight request. 
3195 */ 3196 xbb->request_lists = malloc(xbb->max_requests * 3197 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3198 if (xbb->request_lists == NULL) { 3199 xenbus_dev_fatal(xbb->dev, ENOMEM, 3200 "Unable to allocate request list structures"); 3201 return (ENOMEM); 3202 } 3203 3204 STAILQ_INIT(&xbb->reqlist_free_stailq); 3205 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3206 for (i = 0; i < xbb->max_requests; i++) { 3207 int seg; 3208 3209 reqlist = &xbb->request_lists[i]; 3210 3211 reqlist->xbb = xbb; 3212 3213 #ifdef XBB_USE_BOUNCE_BUFFERS 3214 reqlist->bounce = malloc(xbb->max_reqlist_size, 3215 M_XENBLOCKBACK, M_NOWAIT); 3216 if (reqlist->bounce == NULL) { 3217 xenbus_dev_fatal(xbb->dev, ENOMEM, 3218 "Unable to allocate request " 3219 "bounce buffers"); 3220 return (ENOMEM); 3221 } 3222 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3223 3224 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3225 sizeof(*reqlist->gnt_handles), 3226 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3227 if (reqlist->gnt_handles == NULL) { 3228 xenbus_dev_fatal(xbb->dev, ENOMEM, 3229 "Unable to allocate request " 3230 "grant references"); 3231 return (ENOMEM); 3232 } 3233 3234 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3235 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3236 3237 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3238 } 3239 return (0); 3240 } 3241 3242 /** 3243 * Supply information about the physical device to the frontend 3244 * via XenBus. 3245 * 3246 * \param xbb Per-instance xbb configuration structure. 
 */
static int
xbb_publish_backend_info(struct xbb_softc *xbb)
{
	struct xs_transaction xst;
	const char	     *our_path;
	const char	     *leaf;
	int		      error;

	our_path = xenbus_get_node(xbb->dev);
	/*
	 * All leaves are written inside a single XenStore transaction.
	 * The transaction is retried from the top whenever
	 * xs_transaction_end() reports EAGAIN; any other failure breaks
	 * out to the fatal-error path below.  "leaf" tracks the last key
	 * attempted so the error message can name it.
	 */
	while (1) {
		error = xs_transaction_start(&xst);
		if (error != 0) {
			xenbus_dev_fatal(xbb->dev, error,
					 "Error publishing backend info "
					 "(start transaction)");
			return (error);
		}

		/* Media size in sectors, as computed in xbb_open_backend(). */
		leaf = "sectors";
		error = xs_printf(xst, our_path, leaf,
				  "%"PRIu64, xbb->media_num_sectors);
		if (error != 0)
			break;

		/* XXX Support all VBD attributes here. */
		leaf = "info";
		error = xs_printf(xst, our_path, leaf, "%u",
				  xbb->flags & XBBF_READ_ONLY
				? VDISK_READONLY : 0);
		if (error != 0)
			break;

		leaf = "sector-size";
		error = xs_printf(xst, our_path, leaf, "%u",
				  xbb->sector_size);
		if (error != 0)
			break;

		error = xs_transaction_end(xst, 0);
		if (error == 0) {
			return (0);
		} else if (error != EAGAIN) {
			xenbus_dev_fatal(xbb->dev, error, "ending transaction");
			return (error);
		}
	}

	/* A leaf write failed; abort the transaction and report the key. */
	xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
			 our_path, leaf);
	xs_transaction_end(xst, 1);
	return (error);
}

/**
 * Connect to our blkfront peer now that it has completed publishing
 * its configuration into the XenStore.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static void
xbb_connect(struct xbb_softc *xbb)
{
	int error;

	/*
	 * Nothing to do unless hotplug processing has finished, we are
	 * in the InitWait state, and the front-end's configuration can
	 * be collected successfully.
	 */
	if (!xbb->hotplug_done ||
	    (xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
	    (xbb_collect_frontend_info(xbb) != 0))
		return;

	xbb->flags &= ~XBBF_SHUTDOWN;

	/*
	 * We limit the maximum number of reqlist segments to the maximum
	 * number of segments in the ring, or our absolute maximum,
	 * whichever is smaller.
	 */
	xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
		xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);

	/*
	 * The maximum size is simply a function of the number of segments
	 * we can handle.
	 */
	xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;

	/* Allocate resources whose size depends on front-end configuration. */
	error = xbb_alloc_communication_mem(xbb);
	if (error != 0) {
		xenbus_dev_fatal(xbb->dev, error,
				 "Unable to allocate communication memory");
		return;
	}

	error = xbb_alloc_requests(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_alloc_requests(). */
		return;
	}

	error = xbb_alloc_request_lists(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_alloc_request_lists(). */
		return;
	}

	/*
	 * Connect communication channel.
	 */
	error = xbb_connect_ring(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_connect_ring(). */
		return;
	}

	if (xbb_publish_backend_info(xbb) != 0) {
		/*
		 * If we can't publish our data, we cannot participate
		 * in this connection, and waiting for a front-end state
		 * change will not help the situation.
		 */
		(void)xbb_disconnect(xbb);
		return;
	}

	/* Ready for I/O. */
	xenbus_set_state(xbb->dev, XenbusStateConnected);
}

/*-------------------------- Device Teardown Support -------------------------*/
/**
 * Perform device shutdown functions.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * Mark this instance as shutting down, wait for any active I/O on the
 * backend device/file to drain, disconnect from the front-end, and notify
 * any waiters (e.g. a thread invoking our detach method) that detach can
 * now proceed.
3386 */ 3387 static int 3388 xbb_shutdown(struct xbb_softc *xbb) 3389 { 3390 XenbusState frontState; 3391 int error; 3392 3393 DPRINTF("\n"); 3394 3395 /* 3396 * Due to the need to drop our mutex during some 3397 * xenbus operations, it is possible for two threads 3398 * to attempt to close out shutdown processing at 3399 * the same time. Tell the caller that hits this 3400 * race to try back later. 3401 */ 3402 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3403 return (EAGAIN); 3404 3405 xbb->flags |= XBBF_IN_SHUTDOWN; 3406 mtx_unlock(&xbb->lock); 3407 3408 if (xbb->hotplug_watch.node != NULL) { 3409 xs_unregister_watch(&xbb->hotplug_watch); 3410 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3411 xbb->hotplug_watch.node = NULL; 3412 } 3413 3414 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3415 xenbus_set_state(xbb->dev, XenbusStateClosing); 3416 3417 frontState = xenbus_get_otherend_state(xbb->dev); 3418 mtx_lock(&xbb->lock); 3419 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3420 3421 /* Wait for the frontend to disconnect (if it's connected). */ 3422 if (frontState == XenbusStateConnected) 3423 return (EAGAIN); 3424 3425 DPRINTF("\n"); 3426 3427 /* Indicate shutdown is in progress. */ 3428 xbb->flags |= XBBF_SHUTDOWN; 3429 3430 /* Disconnect from the front-end. */ 3431 error = xbb_disconnect(xbb); 3432 if (error != 0) { 3433 /* 3434 * Requests still outstanding. We'll be called again 3435 * once they complete. 3436 */ 3437 KASSERT(error == EAGAIN, 3438 ("%s: Unexpected xbb_disconnect() failure %d", 3439 __func__, error)); 3440 3441 return (error); 3442 } 3443 3444 DPRINTF("\n"); 3445 3446 /* Indicate to xbb_detach() that is it safe to proceed. */ 3447 wakeup(xbb); 3448 3449 return (0); 3450 } 3451 3452 /** 3453 * Report an attach time error to the console and Xen, and cleanup 3454 * this instance by forcing immediate detach processing. 3455 * 3456 * \param xbb Per-instance xbb configuration structure. 3457 * \param err Errno describing the error. 
3458 * \param fmt Printf style format and arguments 3459 */ 3460 static void 3461 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3462 { 3463 va_list ap; 3464 va_list ap_hotplug; 3465 3466 va_start(ap, fmt); 3467 va_copy(ap_hotplug, ap); 3468 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3469 "hotplug-error", fmt, ap_hotplug); 3470 va_end(ap_hotplug); 3471 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3472 "hotplug-status", "error"); 3473 3474 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3475 va_end(ap); 3476 3477 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3478 "online", "0"); 3479 mtx_lock(&xbb->lock); 3480 xbb_shutdown(xbb); 3481 mtx_unlock(&xbb->lock); 3482 } 3483 3484 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3485 /** 3486 * Inspect a XenBus device and claim it if is of the appropriate type. 3487 * 3488 * \param dev NewBus device object representing a candidate XenBus device. 3489 * 3490 * \return 0 for success, errno codes for failure. 3491 */ 3492 static int 3493 xbb_probe(device_t dev) 3494 { 3495 3496 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3497 device_set_desc(dev, "Backend Virtual Block Device"); 3498 device_quiet(dev); 3499 return (0); 3500 } 3501 3502 return (ENXIO); 3503 } 3504 3505 /** 3506 * Setup sysctl variables to control various Block Back parameters. 3507 * 3508 * \param xbb Xen Block Back softc. 
3509 * 3510 */ 3511 static void 3512 xbb_setup_sysctl(struct xbb_softc *xbb) 3513 { 3514 struct sysctl_ctx_list *sysctl_ctx = NULL; 3515 struct sysctl_oid *sysctl_tree = NULL; 3516 3517 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3518 if (sysctl_ctx == NULL) 3519 return; 3520 3521 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3522 if (sysctl_tree == NULL) 3523 return; 3524 3525 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3526 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3527 "fake the flush command"); 3528 3529 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3530 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3531 "send a real flush for N flush requests"); 3532 3533 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3534 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3535 "Don't coalesce contiguous requests"); 3536 3537 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3538 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3539 "how many I/O requests we have received"); 3540 3541 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3542 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3543 "how many I/O requests have been completed"); 3544 3545 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3546 "reqs_queued_for_completion", CTLFLAG_RW, 3547 &xbb->reqs_queued_for_completion, 3548 "how many I/O requests queued but not yet pushed"); 3549 3550 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3551 "reqs_completed_with_error", CTLFLAG_RW, 3552 &xbb->reqs_completed_with_error, 3553 "how many I/O requests completed with error status"); 3554 3555 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3556 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3557 "how many I/O dispatches were forced"); 3558 3559 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3560 
"normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3561 "how many I/O dispatches were normal"); 3562 3563 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3564 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3565 "total number of I/O dispatches"); 3566 3567 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3568 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3569 "how many times we have run out of KVA"); 3570 3571 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3572 "request_shortages", CTLFLAG_RW, 3573 &xbb->request_shortages, 3574 "how many times we have run out of requests"); 3575 3576 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3577 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3578 "maximum outstanding requests (negotiated)"); 3579 3580 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3581 "max_request_segments", CTLFLAG_RD, 3582 &xbb->max_request_segments, 0, 3583 "maximum number of pages per requests (negotiated)"); 3584 3585 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3586 "max_request_size", CTLFLAG_RD, 3587 &xbb->max_request_size, 0, 3588 "maximum size in bytes of a request (negotiated)"); 3589 3590 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3591 "ring_pages", CTLFLAG_RD, 3592 &xbb->ring_config.ring_pages, 0, 3593 "communication channel pages (negotiated)"); 3594 } 3595 3596 static void 3597 xbb_attach_disk(device_t dev) 3598 { 3599 struct xbb_softc *xbb; 3600 int error; 3601 3602 xbb = device_get_softc(dev); 3603 3604 KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); 3605 3606 /* Parse fopen style mode flags. */ 3607 if (strchr(xbb->dev_mode, 'w') == NULL) 3608 xbb->flags |= XBBF_READ_ONLY; 3609 3610 /* 3611 * Verify the physical device is present and can support 3612 * the desired I/O mode. 
3613 */ 3614 error = xbb_open_backend(xbb); 3615 if (error != 0) { 3616 xbb_attach_failed(xbb, error, "Unable to open %s", 3617 xbb->dev_name); 3618 return; 3619 } 3620 3621 /* Use devstat(9) for recording statistics. */ 3622 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3623 xbb->sector_size, 3624 DEVSTAT_ALL_SUPPORTED, 3625 DEVSTAT_TYPE_DIRECT 3626 | DEVSTAT_TYPE_IF_OTHER, 3627 DEVSTAT_PRIORITY_OTHER); 3628 3629 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3630 xbb->sector_size, 3631 DEVSTAT_ALL_SUPPORTED, 3632 DEVSTAT_TYPE_DIRECT 3633 | DEVSTAT_TYPE_IF_OTHER, 3634 DEVSTAT_PRIORITY_OTHER); 3635 /* 3636 * Setup sysctl variables. 3637 */ 3638 xbb_setup_sysctl(xbb); 3639 3640 /* 3641 * Create a taskqueue for doing work that must occur from a 3642 * thread context. 3643 */ 3644 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3645 M_NOWAIT, 3646 taskqueue_thread_enqueue, 3647 /*contxt*/&xbb->io_taskqueue); 3648 if (xbb->io_taskqueue == NULL) { 3649 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3650 return; 3651 } 3652 3653 taskqueue_start_threads(&xbb->io_taskqueue, 3654 /*num threads*/1, 3655 /*priority*/PWAIT, 3656 /*thread name*/ 3657 "%s taskq", device_get_nameunit(dev)); 3658 3659 /* Update hot-plug status to satisfy xend. */ 3660 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3661 "hotplug-status", "connected"); 3662 if (error) { 3663 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3664 xenbus_get_node(xbb->dev)); 3665 return; 3666 } 3667 3668 /* The front end might be waiting for the backend, attach if so. 
*/ 3669 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3670 xbb_connect(xbb); 3671 } 3672 3673 static void 3674 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) 3675 { 3676 device_t dev; 3677 struct xbb_softc *xbb; 3678 int error; 3679 3680 dev = (device_t)watch->callback_data; 3681 xbb = device_get_softc(dev); 3682 3683 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3684 NULL, &xbb->dev_name, NULL); 3685 if (error != 0) 3686 return; 3687 3688 xs_unregister_watch(watch); 3689 free(watch->node, M_XENBLOCKBACK); 3690 watch->node = NULL; 3691 xbb->hotplug_done = true; 3692 3693 /* Collect physical device information. */ 3694 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", 3695 NULL, &xbb->dev_type, NULL); 3696 if (error != 0) 3697 xbb->dev_type = NULL; 3698 3699 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, 3700 &xbb->dev_mode, NULL); 3701 if (error != 0) { 3702 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3703 xenbus_get_node(dev)); 3704 return; 3705 } 3706 3707 xbb_attach_disk(dev); 3708 } 3709 3710 /** 3711 * Attach to a XenBus device that has been claimed by our probe routine. 3712 * 3713 * \param dev NewBus device object representing this Xen Block Back instance. 3714 * 3715 * \return 0 for success, errno codes for failure. 3716 */ 3717 static int 3718 xbb_attach(device_t dev) 3719 { 3720 struct xbb_softc *xbb; 3721 int error; 3722 u_int max_ring_page_order; 3723 struct sbuf *watch_path; 3724 3725 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3726 3727 /* 3728 * Basic initialization. 3729 * After this block it is safe to call xbb_detach() 3730 * to clean up any allocated data for this instance. 
3731 */ 3732 xbb = device_get_softc(dev); 3733 xbb->dev = dev; 3734 xbb->otherend_id = xenbus_get_otherend_id(dev); 3735 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3736 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3737 3738 /* 3739 * Publish protocol capabilities for consumption by the 3740 * front-end. 3741 */ 3742 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3743 "feature-barrier", "1"); 3744 if (error) { 3745 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3746 xenbus_get_node(xbb->dev)); 3747 return (error); 3748 } 3749 3750 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3751 "feature-flush-cache", "1"); 3752 if (error) { 3753 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3754 xenbus_get_node(xbb->dev)); 3755 return (error); 3756 } 3757 3758 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3759 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3760 "max-ring-page-order", "%u", max_ring_page_order); 3761 if (error) { 3762 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3763 xenbus_get_node(xbb->dev)); 3764 return (error); 3765 } 3766 3767 /* Tell the toolstack blkback has attached. */ 3768 xenbus_set_state(dev, XenbusStateInitWait); 3769 3770 if (xbb->hotplug_done) { 3771 xbb_attach_disk(dev); 3772 return (0); 3773 } 3774 3775 /* 3776 * We need to wait for hotplug script execution before 3777 * moving forward. 3778 */ 3779 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3780 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3781 xbb->hotplug_watch.callback = xbb_attach_cb; 3782 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3783 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3784 /* 3785 * We don't care about the path updated, just about the value changes 3786 * on that single node, hence there's no need to queue more that one 3787 * event. 
3788 */ 3789 xbb->hotplug_watch.max_pending = 1; 3790 sbuf_delete(watch_path); 3791 error = xs_register_watch(&xbb->hotplug_watch); 3792 if (error != 0) { 3793 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3794 xbb->hotplug_watch.node); 3795 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3796 return (error); 3797 } 3798 3799 return (0); 3800 } 3801 3802 /** 3803 * Detach from a block back device instance. 3804 * 3805 * \param dev NewBus device object representing this Xen Block Back instance. 3806 * 3807 * \return 0 for success, errno codes for failure. 3808 * 3809 * \note A block back device may be detached at any time in its life-cycle, 3810 * including part way through the attach process. For this reason, 3811 * initialization order and the initialization state checks in this 3812 * routine must be carefully coupled so that attach time failures 3813 * are gracefully handled. 3814 */ 3815 static int 3816 xbb_detach(device_t dev) 3817 { 3818 struct xbb_softc *xbb; 3819 3820 DPRINTF("\n"); 3821 3822 xbb = device_get_softc(dev); 3823 mtx_lock(&xbb->lock); 3824 while (xbb_shutdown(xbb) == EAGAIN) { 3825 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3826 "xbb_shutdown", 0); 3827 } 3828 mtx_unlock(&xbb->lock); 3829 3830 DPRINTF("\n"); 3831 3832 if (xbb->io_taskqueue != NULL) 3833 taskqueue_free(xbb->io_taskqueue); 3834 3835 if (xbb->xbb_stats != NULL) 3836 devstat_remove_entry(xbb->xbb_stats); 3837 3838 if (xbb->xbb_stats_in != NULL) 3839 devstat_remove_entry(xbb->xbb_stats_in); 3840 3841 xbb_close_backend(xbb); 3842 3843 if (xbb->dev_mode != NULL) { 3844 free(xbb->dev_mode, M_XENSTORE); 3845 xbb->dev_mode = NULL; 3846 } 3847 3848 if (xbb->dev_type != NULL) { 3849 free(xbb->dev_type, M_XENSTORE); 3850 xbb->dev_type = NULL; 3851 } 3852 3853 if (xbb->dev_name != NULL) { 3854 free(xbb->dev_name, M_XENSTORE); 3855 xbb->dev_name = NULL; 3856 } 3857 3858 mtx_destroy(&xbb->lock); 3859 return (0); 3860 } 3861 3862 /** 3863 * Prepare this block back device 
for suspension of this VM. 3864 * 3865 * \param dev NewBus device object representing this Xen Block Back instance. 3866 * 3867 * \return 0 for success, errno codes for failure. 3868 */ 3869 static int 3870 xbb_suspend(device_t dev) 3871 { 3872 #ifdef NOT_YET 3873 struct xbb_softc *sc = device_get_softc(dev); 3874 3875 /* Prevent new requests being issued until we fix things up. */ 3876 mtx_lock(&sc->xb_io_lock); 3877 sc->connected = BLKIF_STATE_SUSPENDED; 3878 mtx_unlock(&sc->xb_io_lock); 3879 #endif 3880 3881 return (0); 3882 } 3883 3884 /** 3885 * Perform any processing required to recover from a suspended state. 3886 * 3887 * \param dev NewBus device object representing this Xen Block Back instance. 3888 * 3889 * \return 0 for success, errno codes for failure. 3890 */ 3891 static int 3892 xbb_resume(device_t dev) 3893 { 3894 return (0); 3895 } 3896 3897 /** 3898 * Handle state changes expressed via the XenStore by our front-end peer. 3899 * 3900 * \param dev NewBus device object representing this Xen 3901 * Block Back instance. 3902 * \param frontend_state The new state of the front-end. 3903 * 3904 * \return 0 for success, errno codes for failure. 
3905 */ 3906 static void 3907 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3908 { 3909 struct xbb_softc *xbb = device_get_softc(dev); 3910 3911 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3912 xenbus_strstate(frontend_state), 3913 xenbus_strstate(xenbus_get_state(xbb->dev))); 3914 3915 switch (frontend_state) { 3916 case XenbusStateInitialising: 3917 break; 3918 case XenbusStateInitialised: 3919 case XenbusStateConnected: 3920 xbb_connect(xbb); 3921 break; 3922 case XenbusStateClosing: 3923 case XenbusStateClosed: 3924 mtx_lock(&xbb->lock); 3925 xbb_shutdown(xbb); 3926 mtx_unlock(&xbb->lock); 3927 if (frontend_state == XenbusStateClosed) 3928 xenbus_set_state(xbb->dev, XenbusStateClosed); 3929 break; 3930 default: 3931 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3932 frontend_state); 3933 break; 3934 } 3935 } 3936 3937 /*---------------------------- NewBus Registration ---------------------------*/ 3938 static device_method_t xbb_methods[] = { 3939 /* Device interface */ 3940 DEVMETHOD(device_probe, xbb_probe), 3941 DEVMETHOD(device_attach, xbb_attach), 3942 DEVMETHOD(device_detach, xbb_detach), 3943 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3944 DEVMETHOD(device_suspend, xbb_suspend), 3945 DEVMETHOD(device_resume, xbb_resume), 3946 3947 /* Xenbus interface */ 3948 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3949 { 0, 0 } 3950 }; 3951 3952 static driver_t xbb_driver = { 3953 "xbbd", 3954 xbb_methods, 3955 sizeof(struct xbb_softc), 3956 }; 3957 devclass_t xbb_devclass; 3958 3959 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3960