1 /*- 2 * Copyright (c) 2009-2011 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include "opt_kdtrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 50 #include <sys/bio.h> 51 #include <sys/bus.h> 52 #include <sys/conf.h> 53 #include <sys/devicestat.h> 54 #include <sys/disk.h> 55 #include <sys/fcntl.h> 56 #include <sys/filedesc.h> 57 #include <sys/kdb.h> 58 #include <sys/module.h> 59 #include <sys/namei.h> 60 #include <sys/proc.h> 61 #include <sys/rman.h> 62 #include <sys/taskqueue.h> 63 #include <sys/types.h> 64 #include <sys/vnode.h> 65 #include <sys/mount.h> 66 #include <sys/sysctl.h> 67 #include <sys/bitstring.h> 68 #include <sys/sdt.h> 69 70 #include <geom/geom.h> 71 72 #include <machine/_inttypes.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 78 #include <xen/xen-os.h> 79 #include <xen/blkif.h> 80 #include <xen/gnttab.h> 81 #include <xen/xen_intr.h> 82 83 #include <xen/interface/event_channel.h> 84 #include <xen/interface/grant_table.h> 85 86 #include <xen/xenbus/xenbusvar.h> 87 88 /*--------------------------- Compile-time Tunables --------------------------*/ 89 /** 90 * The maximum number of outstanding request blocks (request headers plus 91 * additional segment blocks) we will allow in a negotiated block-front/back 92 * communication channel. 93 */ 94 #define XBB_MAX_REQUESTS 256 95 96 /** 97 * \brief Define to force all I/O to be performed on memory owned by the 98 * backend device, with a copy-in/out to the remote domain's memory. 99 * 100 * \note This option is currently required when this driver's domain is 101 * operating in HVM mode on a system using an IOMMU. 
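 *
 * When bouncing is enabled, backend I/O targets the request list's
 * local bounce pages rather than the grant-mapped kva: write data is
 * copied from the mapped front-end pages into the bounce region
 * before dispatch, and completed read data is copied from the bounce
 * region back into the mapped pages (see xbb_dispatch_dev() and
 * xbb_bio_done() below).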
102 * 103 * This driver uses Xen's grant table API to gain access to the memory of 104 * the remote domains it serves. When our domain is operating in PV mode, 105 * the grant table mechanism directly updates our domain's page table entries 106 * to point to the physical pages of the remote domain. This scheme guarantees 107 * that blkback and the backing devices it uses can safely perform DMA 108 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 109 * insure that our domain cannot DMA to pages owned by another domain. As 110 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 111 * table API. For this reason, in HVM mode, we must bounce all requests into 112 * memory that is mapped into our domain at domain startup and thus has 113 * valid IOMMU mappings. 114 */ 115 #define XBB_USE_BOUNCE_BUFFERS 116 117 /** 118 * \brief Define to enable rudimentary request logging to the console. 119 */ 120 #undef XBB_DEBUG 121 122 /*---------------------------------- Macros ----------------------------------*/ 123 /** 124 * Custom malloc type for all driver allocations. 125 */ 126 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 127 128 #ifdef XBB_DEBUG 129 #define DPRINTF(fmt, args...) \ 130 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 131 #else 132 #define DPRINTF(fmt, args...) do {} while(0) 133 #endif 134 135 /** 136 * The maximum mapped region size per request we will allow in a negotiated 137 * block-front/back communication channel. 138 */ 139 #define XBB_MAX_REQUEST_SIZE \ 140 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 141 142 /** 143 * The maximum number of segments (within a request header and accompanying 144 * segment blocks) per request we will allow in a negotiated block-front/back 145 * communication channel. 146 */ 147 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 148 (MIN(UIO_MAXIOV, \ 149 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 150 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 151 152 /** 153 * The maximum number of shared memory ring pages we will allow in a 154 * negotiated block-front/back communication channel. Allow enough 155 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 156 */ 157 #define XBB_MAX_RING_PAGES \ 158 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ 159 * XBB_MAX_REQUESTS) 160 /** 161 * The maximum number of ring pages that we can allow per request list. 162 * We limit this to the maximum number of segments per request, because 163 * that is already a reasonable number of segments to aggregate. This 164 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 165 * because that would leave situations where we can't dispatch even one 166 * large request. 167 */ 168 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 169 170 /*--------------------------- Forward Declarations ---------------------------*/ 171 struct xbb_softc; 172 struct xbb_xen_req; 173 174 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 175 ...) 
__attribute__((format(printf, 3, 4))); 176 static int xbb_shutdown(struct xbb_softc *xbb); 177 static int xbb_detach(device_t dev); 178 179 /*------------------------------ Data Structures -----------------------------*/ 180 181 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 182 183 typedef enum { 184 XBB_REQLIST_NONE = 0x00, 185 XBB_REQLIST_MAPPED = 0x01 186 } xbb_reqlist_flags; 187 188 struct xbb_xen_reqlist { 189 /** 190 * Back reference to the parent block back instance for this 191 * request. Used during bio_done handling. 192 */ 193 struct xbb_softc *xbb; 194 195 /** 196 * BLKIF_OP code for this request. 197 */ 198 int operation; 199 200 /** 201 * Set to BLKIF_RSP_* to indicate request status. 202 * 203 * This field allows an error status to be recorded even if the 204 * delivery of this status must be deferred. Deferred reporting 205 * is necessary, for example, when an error is detected during 206 * completion processing of one bio when other bios for this 207 * request are still outstanding. 208 */ 209 int status; 210 211 /** 212 * Number of 512 byte sectors not transferred. 213 */ 214 int residual_512b_sectors; 215 216 /** 217 * Starting sector number of the first request in the list. 218 */ 219 off_t starting_sector_number; 220 221 /** 222 * If we're going to coalesce, the next contiguous sector would be 223 * this one. 224 */ 225 off_t next_contig_sector; 226 227 /** 228 * Number of child requests in the list. 229 */ 230 int num_children; 231 232 /** 233 * Number of I/O requests still pending on the backend. 234 */ 235 int pendcnt; 236 237 /** 238 * Total number of segments for requests in the list. 239 */ 240 int nr_segments; 241 242 /** 243 * Flags for this particular request list. 244 */ 245 xbb_reqlist_flags flags; 246 247 /** 248 * Kernel virtual address space reserved for this request 249 * list structure and used to map the remote domain's pages for 250 * this I/O, into our domain's address space. 251 */ 252 uint8_t *kva; 253 254 /** 255 * Base, psuedo-physical address, corresponding to the start 256 * of this request's kva region. 257 */ 258 uint64_t gnt_base; 259 260 261 #ifdef XBB_USE_BOUNCE_BUFFERS 262 /** 263 * Pre-allocated domain local memory used to proxy remote 264 * domain memory during I/O operations. 265 */ 266 uint8_t *bounce; 267 #endif 268 269 /** 270 * Array of grant handles (one per page) used to map this request. 271 */ 272 grant_handle_t *gnt_handles; 273 274 /** 275 * Device statistics request ordering type (ordered or simple). 276 */ 277 devstat_tag_type ds_tag_type; 278 279 /** 280 * Device statistics request type (read, write, no_data). 281 */ 282 devstat_trans_flags ds_trans_type; 283 284 /** 285 * The start time for this request. 286 */ 287 struct bintime ds_t0; 288 289 /** 290 * Linked list of contiguous requests with the same operation type. 291 */ 292 struct xbb_xen_req_list contig_req_list; 293 294 /** 295 * Linked list links used to aggregate idle requests in the 296 * request list free pool (xbb->reqlist_free_stailq) and pending 297 * requests waiting for execution (xbb->reqlist_pending_stailq). 298 */ 299 STAILQ_ENTRY(xbb_xen_reqlist) links; 300 }; 301 302 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 303 304 /** 305 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 306 */ 307 struct xbb_xen_req { 308 /** 309 * Linked list links used to aggregate requests into a reqlist 310 * and to store them in the request free pool. 
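 * (e.g. STAILQ_INSERT_TAIL(&reqlist->contig_req_list, nreq, links)
 * when aggregating, and STAILQ_INSERT_HEAD(&xbb->request_free_stailq,
 * req, links) when returning a request to the free pool).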
311 */ 312 STAILQ_ENTRY(xbb_xen_req) links; 313 314 /** 315 * The remote domain's identifier for this I/O request. 316 */ 317 uint64_t id; 318 319 /** 320 * The number of pages currently mapped for this request. 321 */ 322 int nr_pages; 323 324 /** 325 * The number of 512 byte sectors comprising this requests. 326 */ 327 int nr_512b_sectors; 328 329 /** 330 * BLKIF_OP code for this request. 331 */ 332 int operation; 333 334 /** 335 * Storage used for non-native ring requests. 336 */ 337 blkif_request_t ring_req_storage; 338 339 /** 340 * Pointer to the Xen request in the ring. 341 */ 342 blkif_request_t *ring_req; 343 344 /** 345 * Consumer index for this request. 346 */ 347 RING_IDX req_ring_idx; 348 349 /** 350 * The start time for this request. 351 */ 352 struct bintime ds_t0; 353 354 /** 355 * Pointer back to our parent request list. 356 */ 357 struct xbb_xen_reqlist *reqlist; 358 }; 359 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 360 361 /** 362 * \brief Configuration data for the shared memory request ring 363 * used to communicate with the front-end client of this 364 * this driver. 365 */ 366 struct xbb_ring_config { 367 /** KVA address where ring memory is mapped. */ 368 vm_offset_t va; 369 370 /** The pseudo-physical address where ring memory is mapped.*/ 371 uint64_t gnt_addr; 372 373 /** 374 * Grant table handles, one per-ring page, returned by the 375 * hyperpervisor upon mapping of the ring and required to 376 * unmap it when a connection is torn down. 377 */ 378 grant_handle_t handle[XBB_MAX_RING_PAGES]; 379 380 /** 381 * The device bus address returned by the hypervisor when 382 * mapping the ring and required to unmap it when a connection 383 * is torn down. 384 */ 385 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 386 387 /** The number of ring pages mapped for the current connection. */ 388 u_int ring_pages; 389 390 /** 391 * The grant references, one per-ring page, supplied by the 392 * front-end, allowing us to reference the ring pages in the 393 * front-end's domain and to map these pages into our own domain. 394 */ 395 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 396 397 /** The interrupt driven even channel used to signal ring events. */ 398 evtchn_port_t evtchn; 399 }; 400 401 /** 402 * Per-instance connection state flags. 403 */ 404 typedef enum 405 { 406 /** 407 * The front-end requested a read-only mount of the 408 * back-end device/file. 409 */ 410 XBBF_READ_ONLY = 0x01, 411 412 /** Communication with the front-end has been established. */ 413 XBBF_RING_CONNECTED = 0x02, 414 415 /** 416 * Front-end requests exist in the ring and are waiting for 417 * xbb_xen_req objects to free up. 418 */ 419 XBBF_RESOURCE_SHORTAGE = 0x04, 420 421 /** Connection teardown in progress. */ 422 XBBF_SHUTDOWN = 0x08, 423 424 /** A thread is already performing shutdown processing. */ 425 XBBF_IN_SHUTDOWN = 0x10 426 } xbb_flag_t; 427 428 /** Backend device type. */ 429 typedef enum { 430 /** Backend type unknown. */ 431 XBB_TYPE_NONE = 0x00, 432 433 /** 434 * Backend type disk (access via cdev switch 435 * strategy routine). 436 */ 437 XBB_TYPE_DISK = 0x01, 438 439 /** Backend type file (access vnode operations.). */ 440 XBB_TYPE_FILE = 0x02 441 } xbb_type; 442 443 /** 444 * \brief Structure used to memoize information about a per-request 445 * scatter-gather list. 446 * 447 * The chief benefit of using this data structure is it avoids having 448 * to reparse the possibly discontiguous S/G list in the original 449 * request. 
Due to the way that the mapping of the memory backing an 450 * I/O transaction is handled by Xen, a second pass is unavoidable. 451 * At least this way the second walk is a simple array traversal. 452 * 453 * \note A single Scatter/Gather element in the block interface covers 454 * at most 1 machine page. In this context a sector (blkif 455 * nomenclature, not what I'd choose) is a 512b aligned unit 456 * of mapping within the machine page referenced by an S/G 457 * element. 458 */ 459 struct xbb_sg { 460 /** The number of 512b data chunks mapped in this S/G element. */ 461 int16_t nsect; 462 463 /** 464 * The index (0 based) of the first 512b data chunk mapped 465 * in this S/G element. 466 */ 467 uint8_t first_sect; 468 469 /** 470 * The index (0 based) of the last 512b data chunk mapped 471 * in this S/G element. 472 */ 473 uint8_t last_sect; 474 }; 475 476 /** 477 * Character device backend specific configuration data. 478 */ 479 struct xbb_dev_data { 480 /** Cdev used for device backend access. */ 481 struct cdev *cdev; 482 483 /** Cdev switch used for device backend access. */ 484 struct cdevsw *csw; 485 486 /** Used to hold a reference on opened cdev backend devices. */ 487 int dev_ref; 488 }; 489 490 /** 491 * File backend specific configuration data. 492 */ 493 struct xbb_file_data { 494 /** Credentials to use for vnode backed (file based) I/O. */ 495 struct ucred *cred; 496 497 /** 498 * \brief Array of io vectors used to process file based I/O. 499 * 500 * Only a single file based request is outstanding per-xbb instance, 501 * so we only need one of these. 502 */ 503 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 504 #ifdef XBB_USE_BOUNCE_BUFFERS 505 506 /** 507 * \brief Array of io vectors used to handle bouncing of file reads. 508 * 509 * Vnode operations are free to modify uio data during their 510 * exectuion. In the case of a read with bounce buffering active, 511 * we need some of the data from the original uio in order to 512 * bounce-out the read data. This array serves as the temporary 513 * storage for this saved data. 514 */ 515 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 516 517 /** 518 * \brief Array of memoized bounce buffer kva offsets used 519 * in the file based backend. 520 * 521 * Due to the way that the mapping of the memory backing an 522 * I/O transaction is handled by Xen, a second pass through 523 * the request sg elements is unavoidable. We memoize the computed 524 * bounce address here to reduce the cost of the second walk. 525 */ 526 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 527 #endif /* XBB_USE_BOUNCE_BUFFERS */ 528 }; 529 530 /** 531 * Collection of backend type specific data. 532 */ 533 union xbb_backend_data { 534 struct xbb_dev_data dev; 535 struct xbb_file_data file; 536 }; 537 538 /** 539 * Function signature of backend specific I/O handlers. 540 */ 541 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 542 struct xbb_xen_reqlist *reqlist, int operation, 543 int flags); 544 545 /** 546 * Per-instance configuration data. 547 */ 548 struct xbb_softc { 549 550 /** 551 * Task-queue used to process I/O requests. 552 */ 553 struct taskqueue *io_taskqueue; 554 555 /** 556 * Single "run the request queue" task enqueued 557 * on io_taskqueue. 558 */ 559 struct task io_task; 560 561 /** Device type for this instance. */ 562 xbb_type device_type; 563 564 /** NewBus device corresponding to this instance. */ 565 device_t dev; 566 567 /** Backend specific dispatch routine for this instance. 
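 *
 * Expected to point at xbb_dispatch_dev() or xbb_dispatch_file()
 * below, depending on the backend type, and invoked from
 * xbb_dispatch_io() as
 * \code
 *	error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags);
 * \endcode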
*/ 568 xbb_dispatch_t dispatch_io; 569 570 /** The number of requests outstanding on the backend device/file. */ 571 int active_request_count; 572 573 /** Free pool of request tracking structures. */ 574 struct xbb_xen_req_list request_free_stailq; 575 576 /** Array, sized at connection time, of request tracking structures. */ 577 struct xbb_xen_req *requests; 578 579 /** Free pool of request list structures. */ 580 struct xbb_xen_reqlist_list reqlist_free_stailq; 581 582 /** List of pending request lists awaiting execution. */ 583 struct xbb_xen_reqlist_list reqlist_pending_stailq; 584 585 /** Array, sized at connection time, of request list structures. */ 586 struct xbb_xen_reqlist *request_lists; 587 588 /** 589 * Global pool of kva used for mapping remote domain ring 590 * and I/O transaction data. 591 */ 592 vm_offset_t kva; 593 594 /** Psuedo-physical address corresponding to kva. */ 595 uint64_t gnt_base_addr; 596 597 /** The size of the global kva pool. */ 598 int kva_size; 599 600 /** The size of the KVA area used for request lists. */ 601 int reqlist_kva_size; 602 603 /** The number of pages of KVA used for request lists */ 604 int reqlist_kva_pages; 605 606 /** Bitmap of free KVA pages */ 607 bitstr_t *kva_free; 608 609 /** 610 * \brief Cached value of the front-end's domain id. 611 * 612 * This value is used at once for each mapped page in 613 * a transaction. We cache it to avoid incuring the 614 * cost of an ivar access every time this is needed. 615 */ 616 domid_t otherend_id; 617 618 /** 619 * \brief The blkif protocol abi in effect. 620 * 621 * There are situations where the back and front ends can 622 * have a different, native abi (e.g. intel x86_64 and 623 * 32bit x86 domains on the same machine). The back-end 624 * always accomodates the front-end's native abi. That 625 * value is pulled from the XenStore and recorded here. 626 */ 627 int abi; 628 629 /** 630 * \brief The maximum number of requests and request lists allowed 631 * to be in flight at a time. 632 * 633 * This value is negotiated via the XenStore. 634 */ 635 u_int max_requests; 636 637 /** 638 * \brief The maximum number of segments (1 page per segment) 639 * that can be mapped by a request. 640 * 641 * This value is negotiated via the XenStore. 642 */ 643 u_int max_request_segments; 644 645 /** 646 * \brief Maximum number of segments per request list. 647 * 648 * This value is derived from and will generally be larger than 649 * max_request_segments. 650 */ 651 u_int max_reqlist_segments; 652 653 /** 654 * The maximum size of any request to this back-end 655 * device. 656 * 657 * This value is negotiated via the XenStore. 658 */ 659 u_int max_request_size; 660 661 /** 662 * The maximum size of any request list. This is derived directly 663 * from max_reqlist_segments. 664 */ 665 u_int max_reqlist_size; 666 667 /** Various configuration and state bit flags. */ 668 xbb_flag_t flags; 669 670 /** Ring mapping and interrupt configuration data. */ 671 struct xbb_ring_config ring_config; 672 673 /** Runtime, cross-abi safe, structures for ring access. */ 674 blkif_back_rings_t rings; 675 676 /** IRQ mapping for the communication ring event channel. */ 677 xen_intr_handle_t xen_intr_handle; 678 679 /** 680 * \brief Backend access mode flags (e.g. write, or read-only). 681 * 682 * This value is passed to us by the front-end via the XenStore. 683 */ 684 char *dev_mode; 685 686 /** 687 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 
688 * 689 * This value is passed to us by the front-end via the XenStore. 690 * Currently unused. 691 */ 692 char *dev_type; 693 694 /** 695 * \brief Backend device/file identifier. 696 * 697 * This value is passed to us by the front-end via the XenStore. 698 * We expect this to be a POSIX path indicating the file or 699 * device to open. 700 */ 701 char *dev_name; 702 703 /** 704 * Vnode corresponding to the backend device node or file 705 * we are acessing. 706 */ 707 struct vnode *vn; 708 709 union xbb_backend_data backend; 710 711 /** The native sector size of the backend. */ 712 u_int sector_size; 713 714 /** log2 of sector_size. */ 715 u_int sector_size_shift; 716 717 /** Size in bytes of the backend device or file. */ 718 off_t media_size; 719 720 /** 721 * \brief media_size expressed in terms of the backend native 722 * sector size. 723 * 724 * (e.g. xbb->media_size >> xbb->sector_size_shift). 725 */ 726 uint64_t media_num_sectors; 727 728 /** 729 * \brief Array of memoized scatter gather data computed during the 730 * conversion of blkif ring requests to internal xbb_xen_req 731 * structures. 732 * 733 * Ring processing is serialized so we only need one of these. 734 */ 735 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 736 737 /** 738 * Temporary grant table map used in xbb_dispatch_io(). When 739 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 740 * stack could cause a stack overflow. 741 */ 742 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 743 744 /** Mutex protecting per-instance data. */ 745 struct mtx lock; 746 747 #ifdef XENHVM 748 /** 749 * Resource representing allocated physical address space 750 * associated with our per-instance kva region. 751 */ 752 struct resource *pseudo_phys_res; 753 754 /** Resource id for allocated physical address space. */ 755 int pseudo_phys_res_id; 756 #endif 757 758 /** 759 * I/O statistics from BlockBack dispatch down. These are 760 * coalesced requests, and we start them right before execution. 761 */ 762 struct devstat *xbb_stats; 763 764 /** 765 * I/O statistics coming into BlockBack. These are the requests as 766 * we get them from BlockFront. They are started as soon as we 767 * receive a request, and completed when the I/O is complete. 768 */ 769 struct devstat *xbb_stats_in; 770 771 /** Disable sending flush to the backend */ 772 int disable_flush; 773 774 /** Send a real flush for every N flush requests */ 775 int flush_interval; 776 777 /** Count of flush requests in the interval */ 778 int flush_count; 779 780 /** Don't coalesce requests if this is set */ 781 int no_coalesce_reqs; 782 783 /** Number of requests we have received */ 784 uint64_t reqs_received; 785 786 /** Number of requests we have completed*/ 787 uint64_t reqs_completed; 788 789 /** How many forced dispatches (i.e. without coalescing) have happend */ 790 uint64_t forced_dispatch; 791 792 /** How many normal dispatches have happend */ 793 uint64_t normal_dispatch; 794 795 /** How many total dispatches have happend */ 796 uint64_t total_dispatch; 797 798 /** How many times we have run out of KVA */ 799 uint64_t kva_shortages; 800 801 /** How many times we have run out of request structures */ 802 uint64_t request_shortages; 803 }; 804 805 /*---------------------------- Request Processing ----------------------------*/ 806 /** 807 * Allocate an internal transaction tracking structure from the free pool. 808 * 809 * \param xbb Per-instance xbb configuration structure. 
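 *
 * The caller must hold xbb->lock (asserted below). A minimal usage
 * sketch, with the eventual xbb_release_req() call and shortage
 * handling elided:
 * \code
 *	struct xbb_xen_req *req;
 *
 *	mtx_lock(&xbb->lock);
 *	req = xbb_get_req(xbb);
 *	mtx_unlock(&xbb->lock);
 * \endcode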
810 * 811 * \return On success, a pointer to the allocated xbb_xen_req structure. 812 * Otherwise NULL. 813 */ 814 static inline struct xbb_xen_req * 815 xbb_get_req(struct xbb_softc *xbb) 816 { 817 struct xbb_xen_req *req; 818 819 req = NULL; 820 821 mtx_assert(&xbb->lock, MA_OWNED); 822 823 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 824 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 825 xbb->active_request_count++; 826 } 827 828 return (req); 829 } 830 831 /** 832 * Return an allocated transaction tracking structure to the free pool. 833 * 834 * \param xbb Per-instance xbb configuration structure. 835 * \param req The request structure to free. 836 */ 837 static inline void 838 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 839 { 840 mtx_assert(&xbb->lock, MA_OWNED); 841 842 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 843 xbb->active_request_count--; 844 845 KASSERT(xbb->active_request_count >= 0, 846 ("xbb_release_req: negative active count")); 847 } 848 849 /** 850 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 851 * 852 * \param xbb Per-instance xbb configuration structure. 853 * \param req_list The list of requests to free. 854 * \param nreqs The number of items in the list. 855 */ 856 static inline void 857 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 858 int nreqs) 859 { 860 mtx_assert(&xbb->lock, MA_OWNED); 861 862 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 863 xbb->active_request_count -= nreqs; 864 865 KASSERT(xbb->active_request_count >= 0, 866 ("xbb_release_reqs: negative active count")); 867 } 868 869 /** 870 * Given a page index and 512b sector offset within that page, 871 * calculate an offset into a request's kva region. 872 * 873 * \param reqlist The request structure whose kva region will be accessed. 874 * \param pagenr The page index used to compute the kva offset. 875 * \param sector The 512b sector index used to compute the page relative 876 * kva offset. 877 * 878 * \return The computed global KVA offset. 879 */ 880 static inline uint8_t * 881 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 882 { 883 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 884 } 885 886 #ifdef XBB_USE_BOUNCE_BUFFERS 887 /** 888 * Given a page index and 512b sector offset within that page, 889 * calculate an offset into a request's local bounce memory region. 890 * 891 * \param reqlist The request structure whose bounce region will be accessed. 892 * \param pagenr The page index used to compute the bounce offset. 893 * \param sector The 512b sector index used to compute the page relative 894 * bounce offset. 895 * 896 * \return The computed global bounce buffer address. 897 */ 898 static inline uint8_t * 899 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 900 { 901 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 902 } 903 #endif 904 905 /** 906 * Given a page number and 512b sector offset within that page, 907 * calculate an offset into the request's memory region that the 908 * underlying backend device/file should use for I/O. 909 * 910 * \param reqlist The request structure whose I/O region will be accessed. 911 * \param pagenr The page index used to compute the I/O offset. 912 * \param sector The 512b sector index used to compute the page relative 913 * I/O offset. 914 * 915 * \return The computed global I/O address. 
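 *
 * (The byte offset into that region is pagenr * PAGE_SIZE +
 * sector * 512; e.g. pagenr 2, sector 3 yields offset 9728 with
 * 4KB pages.)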
916 * 917 * Depending on configuration, this will either be a local bounce buffer 918 * or a pointer to the memory mapped in from the front-end domain for 919 * this request. 920 */ 921 static inline uint8_t * 922 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 923 { 924 #ifdef XBB_USE_BOUNCE_BUFFERS 925 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 926 #else 927 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 928 #endif 929 } 930 931 /** 932 * Given a page index and 512b sector offset within that page, calculate 933 * an offset into the local psuedo-physical address space used to map a 934 * front-end's request data into a request. 935 * 936 * \param reqlist The request list structure whose pseudo-physical region 937 * will be accessed. 938 * \param pagenr The page index used to compute the pseudo-physical offset. 939 * \param sector The 512b sector index used to compute the page relative 940 * pseudo-physical offset. 941 * 942 * \return The computed global pseudo-phsyical address. 943 * 944 * Depending on configuration, this will either be a local bounce buffer 945 * or a pointer to the memory mapped in from the front-end domain for 946 * this request. 947 */ 948 static inline uintptr_t 949 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 950 { 951 struct xbb_softc *xbb; 952 953 xbb = reqlist->xbb; 954 955 return ((uintptr_t)(xbb->gnt_base_addr + 956 (uintptr_t)(reqlist->kva - xbb->kva) + 957 (PAGE_SIZE * pagenr) + (sector << 9))); 958 } 959 960 /** 961 * Get Kernel Virtual Address space for mapping requests. 962 * 963 * \param xbb Per-instance xbb configuration structure. 964 * \param nr_pages Number of pages needed. 965 * \param check_only If set, check for free KVA but don't allocate it. 966 * \param have_lock If set, xbb lock is already held. 967 * 968 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 969 * 970 * Note: This should be unnecessary once we have either chaining or 971 * scatter/gather support for struct bio. At that point we'll be able to 972 * put multiple addresses and lengths in one bio/bio chain and won't need 973 * to map everything into one virtual segment. 974 */ 975 static uint8_t * 976 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 977 { 978 intptr_t first_clear; 979 intptr_t num_clear; 980 uint8_t *free_kva; 981 int i; 982 983 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 984 985 first_clear = 0; 986 free_kva = NULL; 987 988 mtx_lock(&xbb->lock); 989 990 /* 991 * Look for the first available page. If there are none, we're done. 992 */ 993 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 994 995 if (first_clear == -1) 996 goto bailout; 997 998 /* 999 * Starting at the first available page, look for consecutive free 1000 * pages that will satisfy the user's request. 1001 */ 1002 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1003 /* 1004 * If this is true, the page is used, so we have to reset 1005 * the number of clear pages and the first clear page 1006 * (since it pointed to a region with an insufficient number 1007 * of clear pages). 1008 */ 1009 if (bit_test(xbb->kva_free, i)) { 1010 num_clear = 0; 1011 first_clear = -1; 1012 continue; 1013 } 1014 1015 if (first_clear == -1) 1016 first_clear = i; 1017 1018 /* 1019 * If this is true, we've found a large enough free region 1020 * to satisfy the request. 
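 * For example, with reqlist_kva_pages = 8, nr_pages = 3 and pages
 * 0, 3 and 7 marked in-use in kva_free, the run at pages 1-2 is cut
 * short by in-use page 3, the scan restarts at page 4 and settles on
 * pages 4-6, which are then marked in-use with bit_nset().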
1021 */ 1022 if (++num_clear == nr_pages) { 1023 1024 bit_nset(xbb->kva_free, first_clear, 1025 first_clear + nr_pages - 1); 1026 1027 free_kva = xbb->kva + 1028 (uint8_t *)(first_clear * PAGE_SIZE); 1029 1030 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1031 free_kva + (nr_pages * PAGE_SIZE) <= 1032 (uint8_t *)xbb->ring_config.va, 1033 ("Free KVA %p len %d out of range, " 1034 "kva = %#jx, ring VA = %#jx\n", free_kva, 1035 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1036 (uintmax_t)xbb->ring_config.va)); 1037 break; 1038 } 1039 } 1040 1041 bailout: 1042 1043 if (free_kva == NULL) { 1044 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1045 xbb->kva_shortages++; 1046 } 1047 1048 mtx_unlock(&xbb->lock); 1049 1050 return (free_kva); 1051 } 1052 1053 /** 1054 * Free allocated KVA. 1055 * 1056 * \param xbb Per-instance xbb configuration structure. 1057 * \param kva_ptr Pointer to allocated KVA region. 1058 * \param nr_pages Number of pages in the KVA region. 1059 */ 1060 static void 1061 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1062 { 1063 intptr_t start_page; 1064 1065 mtx_assert(&xbb->lock, MA_OWNED); 1066 1067 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1068 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1069 1070 } 1071 1072 /** 1073 * Unmap the front-end pages associated with this I/O request. 1074 * 1075 * \param req The request structure to unmap. 1076 */ 1077 static void 1078 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1079 { 1080 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1081 u_int i; 1082 u_int invcount; 1083 int error; 1084 1085 invcount = 0; 1086 for (i = 0; i < reqlist->nr_segments; i++) { 1087 1088 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1089 continue; 1090 1091 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1092 unmap[invcount].dev_bus_addr = 0; 1093 unmap[invcount].handle = reqlist->gnt_handles[i]; 1094 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1095 invcount++; 1096 } 1097 1098 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1099 unmap, invcount); 1100 KASSERT(error == 0, ("Grant table operation failed")); 1101 } 1102 1103 /** 1104 * Allocate an internal transaction tracking structure from the free pool. 1105 * 1106 * \param xbb Per-instance xbb configuration structure. 1107 * 1108 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1109 * Otherwise NULL. 1110 */ 1111 static inline struct xbb_xen_reqlist * 1112 xbb_get_reqlist(struct xbb_softc *xbb) 1113 { 1114 struct xbb_xen_reqlist *reqlist; 1115 1116 reqlist = NULL; 1117 1118 mtx_assert(&xbb->lock, MA_OWNED); 1119 1120 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1121 1122 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1123 reqlist->flags = XBB_REQLIST_NONE; 1124 reqlist->kva = NULL; 1125 reqlist->status = BLKIF_RSP_OKAY; 1126 reqlist->residual_512b_sectors = 0; 1127 reqlist->num_children = 0; 1128 reqlist->nr_segments = 0; 1129 STAILQ_INIT(&reqlist->contig_req_list); 1130 } 1131 1132 return (reqlist); 1133 } 1134 1135 /** 1136 * Return an allocated transaction tracking structure to the free pool. 1137 * 1138 * \param xbb Per-instance xbb configuration structure. 1139 * \param req The request list structure to free. 1140 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1141 * during a resource shortage condition. 
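 *
 * (xbb_complete_reqlist() passes a non-zero wakeup so that a
 * recorded resource shortage gets the work thread rescheduled,
 * while the error path in xbb_get_resources() passes zero because
 * it notes the shortage itself.)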
1142 */ 1143 static inline void 1144 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1145 int wakeup) 1146 { 1147 1148 mtx_lock(&xbb->lock); 1149 1150 if (wakeup) { 1151 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1152 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1153 } 1154 1155 if (reqlist->kva != NULL) 1156 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1157 1158 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1159 1160 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1161 1162 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1163 /* 1164 * Shutdown is in progress. See if we can 1165 * progress further now that one more request 1166 * has completed and been returned to the 1167 * free pool. 1168 */ 1169 xbb_shutdown(xbb); 1170 } 1171 1172 mtx_unlock(&xbb->lock); 1173 1174 if (wakeup != 0) 1175 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1176 } 1177 1178 /** 1179 * Request resources and do basic request setup. 1180 * 1181 * \param xbb Per-instance xbb configuration structure. 1182 * \param reqlist Pointer to reqlist pointer. 1183 * \param ring_req Pointer to a block ring request. 1184 * \param ring_index The ring index of this request. 1185 * 1186 * \return 0 for success, non-zero for failure. 1187 */ 1188 static int 1189 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1190 blkif_request_t *ring_req, RING_IDX ring_idx) 1191 { 1192 struct xbb_xen_reqlist *nreqlist; 1193 struct xbb_xen_req *nreq; 1194 1195 nreqlist = NULL; 1196 nreq = NULL; 1197 1198 mtx_lock(&xbb->lock); 1199 1200 /* 1201 * We don't allow new resources to be allocated if we're in the 1202 * process of shutting down. 1203 */ 1204 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1205 mtx_unlock(&xbb->lock); 1206 return (1); 1207 } 1208 1209 /* 1210 * Allocate a reqlist if the caller doesn't have one already. 1211 */ 1212 if (*reqlist == NULL) { 1213 nreqlist = xbb_get_reqlist(xbb); 1214 if (nreqlist == NULL) 1215 goto bailout_error; 1216 } 1217 1218 /* We always allocate a request. */ 1219 nreq = xbb_get_req(xbb); 1220 if (nreq == NULL) 1221 goto bailout_error; 1222 1223 mtx_unlock(&xbb->lock); 1224 1225 if (*reqlist == NULL) { 1226 *reqlist = nreqlist; 1227 nreqlist->operation = ring_req->operation; 1228 nreqlist->starting_sector_number = ring_req->sector_number; 1229 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1230 links); 1231 } 1232 1233 nreq->reqlist = *reqlist; 1234 nreq->req_ring_idx = ring_idx; 1235 nreq->id = ring_req->id; 1236 nreq->operation = ring_req->operation; 1237 1238 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1239 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1240 nreq->ring_req = &nreq->ring_req_storage; 1241 } else { 1242 nreq->ring_req = ring_req; 1243 } 1244 1245 binuptime(&nreq->ds_t0); 1246 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1247 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1248 (*reqlist)->num_children++; 1249 (*reqlist)->nr_segments += ring_req->nr_segments; 1250 1251 return (0); 1252 1253 bailout_error: 1254 1255 /* 1256 * We're out of resources, so set the shortage flag. The next time 1257 * a request is released, we'll try waking up the work thread to 1258 * see if we can allocate more resources. 
1259 */ 1260 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1261 xbb->request_shortages++; 1262 1263 if (nreq != NULL) 1264 xbb_release_req(xbb, nreq); 1265 1266 mtx_unlock(&xbb->lock); 1267 1268 if (nreqlist != NULL) 1269 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1270 1271 return (1); 1272 } 1273 1274 /** 1275 * Create and transmit a response to a blkif request. 1276 * 1277 * \param xbb Per-instance xbb configuration structure. 1278 * \param req The request structure to which to respond. 1279 * \param status The status code to report. See BLKIF_RSP_* 1280 * in sys/xen/interface/io/blkif.h. 1281 */ 1282 static void 1283 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1284 { 1285 blkif_response_t *resp; 1286 int more_to_do; 1287 int notify; 1288 1289 more_to_do = 0; 1290 1291 /* 1292 * Place on the response ring for the relevant domain. 1293 * For now, only the spacing between entries is different 1294 * in the different ABIs, not the response entry layout. 1295 */ 1296 mtx_lock(&xbb->lock); 1297 switch (xbb->abi) { 1298 case BLKIF_PROTOCOL_NATIVE: 1299 resp = RING_GET_RESPONSE(&xbb->rings.native, 1300 xbb->rings.native.rsp_prod_pvt); 1301 break; 1302 case BLKIF_PROTOCOL_X86_32: 1303 resp = (blkif_response_t *) 1304 RING_GET_RESPONSE(&xbb->rings.x86_32, 1305 xbb->rings.x86_32.rsp_prod_pvt); 1306 break; 1307 case BLKIF_PROTOCOL_X86_64: 1308 resp = (blkif_response_t *) 1309 RING_GET_RESPONSE(&xbb->rings.x86_64, 1310 xbb->rings.x86_64.rsp_prod_pvt); 1311 break; 1312 default: 1313 panic("Unexpected blkif protocol ABI."); 1314 } 1315 1316 resp->id = req->id; 1317 resp->operation = req->operation; 1318 resp->status = status; 1319 1320 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); 1321 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); 1322 1323 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1324 1325 /* 1326 * Tail check for pending requests. Allows frontend to avoid 1327 * notifications if requests are already in flight (lower 1328 * overheads and promotes batching). 1329 */ 1330 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1331 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1332 1333 more_to_do = 1; 1334 } 1335 1336 xbb->reqs_completed++; 1337 1338 mtx_unlock(&xbb->lock); 1339 1340 if (more_to_do) 1341 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1342 1343 if (notify) 1344 xen_intr_signal(xbb->xen_intr_handle); 1345 } 1346 1347 /** 1348 * Complete a request list. 1349 * 1350 * \param xbb Per-instance xbb configuration structure. 1351 * \param reqlist Allocated internal request list structure. 1352 */ 1353 static void 1354 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1355 { 1356 struct xbb_xen_req *nreq; 1357 off_t sectors_sent; 1358 1359 sectors_sent = 0; 1360 1361 if (reqlist->flags & XBB_REQLIST_MAPPED) 1362 xbb_unmap_reqlist(reqlist); 1363 1364 /* 1365 * All I/O is done, send the response. A lock should not be 1366 * necessary here because the request list is complete, and 1367 * therefore this is the only context accessing this request 1368 * right now. The functions we call do their own locking if 1369 * necessary. 1370 */ 1371 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1372 off_t cur_sectors_sent; 1373 1374 xbb_send_response(xbb, nreq, reqlist->status); 1375 1376 /* We don't report bytes sent if there is an error. 
*/ 1377 if (reqlist->status == BLKIF_RSP_OKAY) 1378 cur_sectors_sent = nreq->nr_512b_sectors; 1379 else 1380 cur_sectors_sent = 0; 1381 1382 sectors_sent += cur_sectors_sent; 1383 1384 devstat_end_transaction(xbb->xbb_stats_in, 1385 /*bytes*/cur_sectors_sent << 9, 1386 reqlist->ds_tag_type, 1387 reqlist->ds_trans_type, 1388 /*now*/NULL, 1389 /*then*/&nreq->ds_t0); 1390 } 1391 1392 /* 1393 * Take out any sectors not sent. If we wind up negative (which 1394 * might happen if an error is reported as well as a residual), just 1395 * report 0 sectors sent. 1396 */ 1397 sectors_sent -= reqlist->residual_512b_sectors; 1398 if (sectors_sent < 0) 1399 sectors_sent = 0; 1400 1401 devstat_end_transaction(xbb->xbb_stats, 1402 /*bytes*/ sectors_sent << 9, 1403 reqlist->ds_tag_type, 1404 reqlist->ds_trans_type, 1405 /*now*/NULL, 1406 /*then*/&reqlist->ds_t0); 1407 1408 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1409 } 1410 1411 /** 1412 * Completion handler for buffer I/O requests issued by the device 1413 * backend driver. 1414 * 1415 * \param bio The buffer I/O request on which to perform completion 1416 * processing. 1417 */ 1418 static void 1419 xbb_bio_done(struct bio *bio) 1420 { 1421 struct xbb_softc *xbb; 1422 struct xbb_xen_reqlist *reqlist; 1423 1424 reqlist = bio->bio_caller1; 1425 xbb = reqlist->xbb; 1426 1427 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1428 1429 /* 1430 * This is a bit imprecise. With aggregated I/O a single 1431 * request list can contain multiple front-end requests and 1432 * a multiple bios may point to a single request. By carefully 1433 * walking the request list, we could map residuals and errors 1434 * back to the original front-end request, but the interface 1435 * isn't sufficiently rich for us to properly report the error. 1436 * So, we just treat the entire request list as having failed if an 1437 * error occurs on any part. And, if an error occurs, we treat 1438 * the amount of data transferred as 0. 1439 * 1440 * For residuals, we report it on the overall aggregated device, 1441 * but not on the individual requests, since we don't currently 1442 * do the work to determine which front-end request to which the 1443 * residual applies. 1444 */ 1445 if (bio->bio_error) { 1446 DPRINTF("BIO returned error %d for operation on device %s\n", 1447 bio->bio_error, xbb->dev_name); 1448 reqlist->status = BLKIF_RSP_ERROR; 1449 1450 if (bio->bio_error == ENXIO 1451 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1452 1453 /* 1454 * Backend device has disappeared. Signal the 1455 * front-end that we (the device proxy) want to 1456 * go away. 1457 */ 1458 xenbus_set_state(xbb->dev, XenbusStateClosing); 1459 } 1460 } 1461 1462 #ifdef XBB_USE_BOUNCE_BUFFERS 1463 if (bio->bio_cmd == BIO_READ) { 1464 vm_offset_t kva_offset; 1465 1466 kva_offset = (vm_offset_t)bio->bio_data 1467 - (vm_offset_t)reqlist->bounce; 1468 memcpy((uint8_t *)reqlist->kva + kva_offset, 1469 bio->bio_data, bio->bio_bcount); 1470 } 1471 #endif /* XBB_USE_BOUNCE_BUFFERS */ 1472 1473 /* 1474 * Decrement the pending count for the request list. When we're 1475 * done with the requests, send status back for all of them. 1476 */ 1477 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1478 xbb_complete_reqlist(xbb, reqlist); 1479 1480 g_destroy_bio(bio); 1481 } 1482 1483 /** 1484 * Parse a blkif request into an internal request structure and send 1485 * it to the backend for processing. 1486 * 1487 * \param xbb Per-instance xbb configuration structure. 
1488 * \param reqlist Allocated internal request list structure. 1489 * 1490 * \return On success, 0. For resource shortages, non-zero. 1491 * 1492 * This routine performs the backend common aspects of request parsing 1493 * including compiling an internal request structure, parsing the S/G 1494 * list and any secondary ring requests in which they may reside, and 1495 * the mapping of front-end I/O pages into our domain. 1496 */ 1497 static int 1498 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1499 { 1500 struct xbb_sg *xbb_sg; 1501 struct gnttab_map_grant_ref *map; 1502 struct blkif_request_segment *sg; 1503 struct blkif_request_segment *last_block_sg; 1504 struct xbb_xen_req *nreq; 1505 u_int nseg; 1506 u_int seg_idx; 1507 u_int block_segs; 1508 int nr_sects; 1509 int total_sects; 1510 int operation; 1511 uint8_t bio_flags; 1512 int error; 1513 1514 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1515 bio_flags = 0; 1516 total_sects = 0; 1517 nr_sects = 0; 1518 1519 /* 1520 * First determine whether we have enough free KVA to satisfy this 1521 * request list. If not, tell xbb_run_queue() so it can go to 1522 * sleep until we have more KVA. 1523 */ 1524 reqlist->kva = NULL; 1525 if (reqlist->nr_segments != 0) { 1526 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1527 if (reqlist->kva == NULL) { 1528 /* 1529 * If we're out of KVA, return ENOMEM. 1530 */ 1531 return (ENOMEM); 1532 } 1533 } 1534 1535 binuptime(&reqlist->ds_t0); 1536 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1537 1538 switch (reqlist->operation) { 1539 case BLKIF_OP_WRITE_BARRIER: 1540 bio_flags |= BIO_ORDERED; 1541 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1542 /* FALLTHROUGH */ 1543 case BLKIF_OP_WRITE: 1544 operation = BIO_WRITE; 1545 reqlist->ds_trans_type = DEVSTAT_WRITE; 1546 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1547 DPRINTF("Attempt to write to read only device %s\n", 1548 xbb->dev_name); 1549 reqlist->status = BLKIF_RSP_ERROR; 1550 goto send_response; 1551 } 1552 break; 1553 case BLKIF_OP_READ: 1554 operation = BIO_READ; 1555 reqlist->ds_trans_type = DEVSTAT_READ; 1556 break; 1557 case BLKIF_OP_FLUSH_DISKCACHE: 1558 /* 1559 * If this is true, the user has requested that we disable 1560 * flush support. So we just complete the requests 1561 * successfully. 1562 */ 1563 if (xbb->disable_flush != 0) { 1564 goto send_response; 1565 } 1566 1567 /* 1568 * The user has requested that we only send a real flush 1569 * for every N flush requests. So keep count, and either 1570 * complete the request immediately or queue it for the 1571 * backend. 
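 * For example, with flush_interval set to 4, three consecutive
 * BLKIF_OP_FLUSH_DISKCACHE requests are acknowledged immediately
 * and every fourth one is actually dispatched.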
1572 */ 1573 if (xbb->flush_interval != 0) { 1574 if (++(xbb->flush_count) < xbb->flush_interval) { 1575 goto send_response; 1576 } else 1577 xbb->flush_count = 0; 1578 } 1579 1580 operation = BIO_FLUSH; 1581 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1582 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1583 goto do_dispatch; 1584 /*NOTREACHED*/ 1585 default: 1586 DPRINTF("error: unknown block io operation [%d]\n", 1587 reqlist->operation); 1588 reqlist->status = BLKIF_RSP_ERROR; 1589 goto send_response; 1590 } 1591 1592 reqlist->xbb = xbb; 1593 xbb_sg = xbb->xbb_sgs; 1594 map = xbb->maps; 1595 seg_idx = 0; 1596 1597 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1598 blkif_request_t *ring_req; 1599 RING_IDX req_ring_idx; 1600 u_int req_seg_idx; 1601 1602 ring_req = nreq->ring_req; 1603 req_ring_idx = nreq->req_ring_idx; 1604 nr_sects = 0; 1605 nseg = ring_req->nr_segments; 1606 nreq->nr_pages = nseg; 1607 nreq->nr_512b_sectors = 0; 1608 req_seg_idx = 0; 1609 sg = NULL; 1610 1611 /* Check that number of segments is sane. */ 1612 if (__predict_false(nseg == 0) 1613 || __predict_false(nseg > xbb->max_request_segments)) { 1614 DPRINTF("Bad number of segments in request (%d)\n", 1615 nseg); 1616 reqlist->status = BLKIF_RSP_ERROR; 1617 goto send_response; 1618 } 1619 1620 block_segs = MIN(nreq->nr_pages, 1621 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); 1622 sg = ring_req->seg; 1623 last_block_sg = sg + block_segs; 1624 while (1) { 1625 1626 while (sg < last_block_sg) { 1627 KASSERT(seg_idx < 1628 XBB_MAX_SEGMENTS_PER_REQLIST, 1629 ("seg_idx %d is too large, max " 1630 "segs %d\n", seg_idx, 1631 XBB_MAX_SEGMENTS_PER_REQLIST)); 1632 1633 xbb_sg->first_sect = sg->first_sect; 1634 xbb_sg->last_sect = sg->last_sect; 1635 xbb_sg->nsect = 1636 (int8_t)(sg->last_sect - 1637 sg->first_sect + 1); 1638 1639 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1640 || (xbb_sg->nsect <= 0)) { 1641 reqlist->status = BLKIF_RSP_ERROR; 1642 goto send_response; 1643 } 1644 1645 nr_sects += xbb_sg->nsect; 1646 map->host_addr = xbb_get_gntaddr(reqlist, 1647 seg_idx, /*sector*/0); 1648 KASSERT(map->host_addr + PAGE_SIZE <= 1649 xbb->ring_config.gnt_addr, 1650 ("Host address %#jx len %d overlaps " 1651 "ring address %#jx\n", 1652 (uintmax_t)map->host_addr, PAGE_SIZE, 1653 (uintmax_t)xbb->ring_config.gnt_addr)); 1654 1655 map->flags = GNTMAP_host_map; 1656 map->ref = sg->gref; 1657 map->dom = xbb->otherend_id; 1658 if (operation == BIO_WRITE) 1659 map->flags |= GNTMAP_readonly; 1660 sg++; 1661 map++; 1662 xbb_sg++; 1663 seg_idx++; 1664 req_seg_idx++; 1665 } 1666 1667 block_segs = MIN(nseg - req_seg_idx, 1668 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); 1669 if (block_segs == 0) 1670 break; 1671 1672 /* 1673 * Fetch the next request block full of SG elements. 1674 * For now, only the spacing between entries is 1675 * different in the different ABIs, not the sg entry 1676 * layout. 
1677 */ 1678 req_ring_idx++; 1679 switch (xbb->abi) { 1680 case BLKIF_PROTOCOL_NATIVE: 1681 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native, 1682 req_ring_idx); 1683 break; 1684 case BLKIF_PROTOCOL_X86_32: 1685 { 1686 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32, 1687 req_ring_idx); 1688 break; 1689 } 1690 case BLKIF_PROTOCOL_X86_64: 1691 { 1692 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64, 1693 req_ring_idx); 1694 break; 1695 } 1696 default: 1697 panic("Unexpected blkif protocol ABI."); 1698 /* NOTREACHED */ 1699 } 1700 last_block_sg = sg + block_segs; 1701 } 1702 1703 /* Convert to the disk's sector size */ 1704 nreq->nr_512b_sectors = nr_sects; 1705 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1706 total_sects += nr_sects; 1707 1708 if ((nreq->nr_512b_sectors & 1709 ((xbb->sector_size >> 9) - 1)) != 0) { 1710 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1711 "a multiple of the backing store sector " 1712 "size (%d)\n", __func__, 1713 nreq->nr_512b_sectors << 9, 1714 xbb->sector_size); 1715 reqlist->status = BLKIF_RSP_ERROR; 1716 goto send_response; 1717 } 1718 } 1719 1720 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1721 xbb->maps, reqlist->nr_segments); 1722 if (error != 0) 1723 panic("Grant table operation failed (%d)", error); 1724 1725 reqlist->flags |= XBB_REQLIST_MAPPED; 1726 1727 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1728 seg_idx++, map++){ 1729 1730 if (__predict_false(map->status != 0)) { 1731 DPRINTF("invalid buffer -- could not remap " 1732 "it (%d)\n", map->status); 1733 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags " 1734 "0x%x ref 0x%x, dom %d\n", seg_idx, 1735 map->host_addr, map->flags, map->ref, 1736 map->dom); 1737 reqlist->status = BLKIF_RSP_ERROR; 1738 goto send_response; 1739 } 1740 1741 reqlist->gnt_handles[seg_idx] = map->handle; 1742 } 1743 if (reqlist->starting_sector_number + total_sects > 1744 xbb->media_num_sectors) { 1745 1746 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1747 "extends past end of device %s\n", 1748 operation == BIO_READ ? "read" : "write", 1749 reqlist->starting_sector_number, 1750 reqlist->starting_sector_number + total_sects, 1751 xbb->dev_name); 1752 reqlist->status = BLKIF_RSP_ERROR; 1753 goto send_response; 1754 } 1755 1756 do_dispatch: 1757 1758 error = xbb->dispatch_io(xbb, 1759 reqlist, 1760 operation, 1761 bio_flags); 1762 1763 if (error != 0) { 1764 reqlist->status = BLKIF_RSP_ERROR; 1765 goto send_response; 1766 } 1767 1768 return (0); 1769 1770 send_response: 1771 1772 xbb_complete_reqlist(xbb, reqlist); 1773 1774 return (0); 1775 } 1776 1777 static __inline int 1778 xbb_count_sects(blkif_request_t *ring_req) 1779 { 1780 int i; 1781 int cur_size = 0; 1782 1783 for (i = 0; i < ring_req->nr_segments; i++) { 1784 int nsect; 1785 1786 nsect = (int8_t)(ring_req->seg[i].last_sect - 1787 ring_req->seg[i].first_sect + 1); 1788 if (nsect <= 0) 1789 break; 1790 1791 cur_size += nsect; 1792 } 1793 1794 return (cur_size); 1795 } 1796 1797 /** 1798 * Process incoming requests from the shared communication ring in response 1799 * to a signal on the ring's event channel. 1800 * 1801 * \param context Callback argument registerd during task initialization - 1802 * the xbb_softc for this instance. 1803 * \param pending The number of taskqueue_enqueue events that have 1804 * occurred since this handler was last run. 
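 *
 * The task is (re)enqueued from the event channel filter and from
 * paths that either free resources or notice more ring work, e.g.:
 * \code
 *	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
 * \endcode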
1805 */ 1806 static void 1807 xbb_run_queue(void *context, int pending) 1808 { 1809 struct xbb_softc *xbb; 1810 blkif_back_rings_t *rings; 1811 RING_IDX rp; 1812 uint64_t cur_sector; 1813 int cur_operation; 1814 struct xbb_xen_reqlist *reqlist; 1815 1816 1817 xbb = (struct xbb_softc *)context; 1818 rings = &xbb->rings; 1819 1820 /* 1821 * Work gather and dispatch loop. Note that we have a bias here 1822 * towards gathering I/O sent by blockfront. We first gather up 1823 * everything in the ring, as long as we have resources. Then we 1824 * dispatch one request, and then attempt to gather up any 1825 * additional requests that have come in while we were dispatching 1826 * the request. 1827 * 1828 * This allows us to get a clearer picture (via devstat) of how 1829 * many requests blockfront is queueing to us at any given time. 1830 */ 1831 for (;;) { 1832 int retval; 1833 1834 /* 1835 * Initialize reqlist to the last element in the pending 1836 * queue, if there is one. This allows us to add more 1837 * requests to that request list, if we have room. 1838 */ 1839 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1840 xbb_xen_reqlist, links); 1841 if (reqlist != NULL) { 1842 cur_sector = reqlist->next_contig_sector; 1843 cur_operation = reqlist->operation; 1844 } else { 1845 cur_operation = 0; 1846 cur_sector = 0; 1847 } 1848 1849 /* 1850 * Cache req_prod to avoid accessing a cache line shared 1851 * with the frontend. 1852 */ 1853 rp = rings->common.sring->req_prod; 1854 1855 /* Ensure we see queued requests up to 'rp'. */ 1856 rmb(); 1857 1858 /** 1859 * Run so long as there is work to consume and the generation 1860 * of a response will not overflow the ring. 1861 * 1862 * @note There's a 1 to 1 relationship between requests and 1863 * responses, so an overflow should never occur. This 1864 * test is to protect our domain from digesting bogus 1865 * data. Shouldn't we log this? 1866 */ 1867 while (rings->common.req_cons != rp 1868 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1869 rings->common.req_cons) == 0){ 1870 blkif_request_t ring_req_storage; 1871 blkif_request_t *ring_req; 1872 int cur_size; 1873 1874 switch (xbb->abi) { 1875 case BLKIF_PROTOCOL_NATIVE: 1876 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1877 rings->common.req_cons); 1878 break; 1879 case BLKIF_PROTOCOL_X86_32: 1880 { 1881 struct blkif_x86_32_request *ring_req32; 1882 1883 ring_req32 = RING_GET_REQUEST( 1884 &xbb->rings.x86_32, rings->common.req_cons); 1885 blkif_get_x86_32_req(&ring_req_storage, 1886 ring_req32); 1887 ring_req = &ring_req_storage; 1888 break; 1889 } 1890 case BLKIF_PROTOCOL_X86_64: 1891 { 1892 struct blkif_x86_64_request *ring_req64; 1893 1894 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1895 rings->common.req_cons); 1896 blkif_get_x86_64_req(&ring_req_storage, 1897 ring_req64); 1898 ring_req = &ring_req_storage; 1899 break; 1900 } 1901 default: 1902 panic("Unexpected blkif protocol ABI."); 1903 /* NOTREACHED */ 1904 } 1905 1906 /* 1907 * Check for situations that would require closing 1908 * off this I/O for further coalescing: 1909 * - Coalescing is turned off. 1910 * - Current I/O is out of sequence with the previous 1911 * I/O. 1912 * - Coalesced I/O would be too large. 
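 * For example, a 64KB read at sector 0 followed by a read at
 * sector 128 stays in the same request list (the second request
 * begins where the first ends and the segment limit is assumed to
 * hold), while a read at sector 512 or any write starts a new list.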
1913 */ 1914 if ((reqlist != NULL) 1915 && ((xbb->no_coalesce_reqs != 0) 1916 || ((xbb->no_coalesce_reqs == 0) 1917 && ((ring_req->sector_number != cur_sector) 1918 || (ring_req->operation != cur_operation) 1919 || ((ring_req->nr_segments + reqlist->nr_segments) > 1920 xbb->max_reqlist_segments))))) { 1921 reqlist = NULL; 1922 } 1923 1924 /* 1925 * Grab and check for all resources in one shot. 1926 * If we can't get all of the resources we need, 1927 * the shortage is noted and the thread will get 1928 * woken up when more resources are available. 1929 */ 1930 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1931 xbb->rings.common.req_cons); 1932 1933 if (retval != 0) { 1934 /* 1935 * Resource shortage has been recorded. 1936 * We'll be scheduled to run once a request 1937 * object frees up due to a completion. 1938 */ 1939 break; 1940 } 1941 1942 /* 1943 * Signify that we can overwrite this request with 1944 * a response by incrementing our consumer index. 1945 * The response won't be generated until after 1946 * we've already consumed all necessary data out 1947 * of the version of the request in the ring buffer 1948 * (for native mode). We must update the consumer 1949 * index before issueing back-end I/O so there is 1950 * no possibility that it will complete and a 1951 * response be generated before we make room in 1952 * the queue for that response. 1953 */ 1954 xbb->rings.common.req_cons += 1955 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 1956 xbb->reqs_received++; 1957 1958 cur_size = xbb_count_sects(ring_req); 1959 cur_sector = ring_req->sector_number + cur_size; 1960 reqlist->next_contig_sector = cur_sector; 1961 cur_operation = ring_req->operation; 1962 } 1963 1964 /* Check for I/O to dispatch */ 1965 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1966 if (reqlist == NULL) { 1967 /* 1968 * We're out of work to do, put the task queue to 1969 * sleep. 1970 */ 1971 break; 1972 } 1973 1974 /* 1975 * Grab the first request off the queue and attempt 1976 * to dispatch it. 1977 */ 1978 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1979 1980 retval = xbb_dispatch_io(xbb, reqlist); 1981 if (retval != 0) { 1982 /* 1983 * xbb_dispatch_io() returns non-zero only when 1984 * there is a resource shortage. If that's the 1985 * case, re-queue this request on the head of the 1986 * queue, and go to sleep until we have more 1987 * resources. 1988 */ 1989 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1990 reqlist, links); 1991 break; 1992 } else { 1993 /* 1994 * If we still have anything on the queue after 1995 * removing the head entry, that is because we 1996 * met one of the criteria to create a new 1997 * request list (outlined above), and we'll call 1998 * that a forced dispatch for statistical purposes. 1999 * 2000 * Otherwise, if there is only one element on the 2001 * queue, we coalesced everything available on 2002 * the ring and we'll call that a normal dispatch. 2003 */ 2004 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2005 2006 if (reqlist != NULL) 2007 xbb->forced_dispatch++; 2008 else 2009 xbb->normal_dispatch++; 2010 2011 xbb->total_dispatch++; 2012 } 2013 } 2014 } 2015 2016 /** 2017 * Interrupt handler bound to the shared ring's event channel. 2018 * 2019 * \param arg Callback argument registerd during event channel 2020 * binding - the xbb_softc for this instance. 2021 */ 2022 static int 2023 xbb_filter(void *arg) 2024 { 2025 struct xbb_softc *xbb; 2026 2027 /* Defer to taskqueue thread. 
*/ 2028 xbb = (struct xbb_softc *)arg; 2029 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2030 2031 return (FILTER_HANDLED); 2032 } 2033 2034 SDT_PROVIDER_DEFINE(xbb); 2035 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int"); 2036 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t", 2037 "uint64_t"); 2038 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int", 2039 "uint64_t", "uint64_t"); 2040 2041 /*----------------------------- Backend Handlers -----------------------------*/ 2042 /** 2043 * Backend handler for character device access. 2044 * 2045 * \param xbb Per-instance xbb configuration structure. 2046 * \param reqlist Allocated internal request list structure. 2047 * \param operation BIO_* I/O operation code. 2048 * \param bio_flags Additional bio_flag data to pass to any generated 2049 * bios (e.g. BIO_ORDERED).. 2050 * 2051 * \return 0 for success, errno codes for failure. 2052 */ 2053 static int 2054 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2055 int operation, int bio_flags) 2056 { 2057 struct xbb_dev_data *dev_data; 2058 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2059 off_t bio_offset; 2060 struct bio *bio; 2061 struct xbb_sg *xbb_sg; 2062 u_int nbio; 2063 u_int bio_idx; 2064 u_int nseg; 2065 u_int seg_idx; 2066 int error; 2067 2068 dev_data = &xbb->backend.dev; 2069 bio_offset = (off_t)reqlist->starting_sector_number 2070 << xbb->sector_size_shift; 2071 error = 0; 2072 nbio = 0; 2073 bio_idx = 0; 2074 2075 if (operation == BIO_FLUSH) { 2076 bio = g_new_bio(); 2077 if (__predict_false(bio == NULL)) { 2078 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2079 error = ENOMEM; 2080 return (error); 2081 } 2082 2083 bio->bio_cmd = BIO_FLUSH; 2084 bio->bio_flags |= BIO_ORDERED; 2085 bio->bio_dev = dev_data->cdev; 2086 bio->bio_offset = 0; 2087 bio->bio_data = 0; 2088 bio->bio_done = xbb_bio_done; 2089 bio->bio_caller1 = reqlist; 2090 bio->bio_pblkno = 0; 2091 2092 reqlist->pendcnt = 1; 2093 2094 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2095 device_get_unit(xbb->dev)); 2096 2097 (*dev_data->csw->d_strategy)(bio); 2098 2099 return (0); 2100 } 2101 2102 xbb_sg = xbb->xbb_sgs; 2103 bio = NULL; 2104 nseg = reqlist->nr_segments; 2105 2106 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2107 2108 /* 2109 * KVA will not be contiguous, so any additional 2110 * I/O will need to be represented in a new bio. 2111 */ 2112 if ((bio != NULL) 2113 && (xbb_sg->first_sect != 0)) { 2114 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2115 printf("%s: Discontiguous I/O request " 2116 "from domain %d ends on " 2117 "non-sector boundary\n", 2118 __func__, xbb->otherend_id); 2119 error = EINVAL; 2120 goto fail_free_bios; 2121 } 2122 bio = NULL; 2123 } 2124 2125 if (bio == NULL) { 2126 /* 2127 * Make sure that the start of this bio is 2128 * aligned to a device sector. 
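 * With the common 512-byte sector size, sector_size - 1 is 0x1ff,
 * so the check below simply tests the low nine bits of bio_offset;
 * larger device sector sizes widen the mask accordingly.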
2129 */ 2130 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2131 printf("%s: Misaligned I/O request " 2132 "from domain %d\n", __func__, 2133 xbb->otherend_id); 2134 error = EINVAL; 2135 goto fail_free_bios; 2136 } 2137 2138 bio = bios[nbio++] = g_new_bio(); 2139 if (__predict_false(bio == NULL)) { 2140 error = ENOMEM; 2141 goto fail_free_bios; 2142 } 2143 bio->bio_cmd = operation; 2144 bio->bio_flags |= bio_flags; 2145 bio->bio_dev = dev_data->cdev; 2146 bio->bio_offset = bio_offset; 2147 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2148 xbb_sg->first_sect); 2149 bio->bio_done = xbb_bio_done; 2150 bio->bio_caller1 = reqlist; 2151 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2152 } 2153 2154 bio->bio_length += xbb_sg->nsect << 9; 2155 bio->bio_bcount = bio->bio_length; 2156 bio_offset += xbb_sg->nsect << 9; 2157 2158 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2159 2160 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2161 printf("%s: Discontiguous I/O request " 2162 "from domain %d ends on " 2163 "non-sector boundary\n", 2164 __func__, xbb->otherend_id); 2165 error = EINVAL; 2166 goto fail_free_bios; 2167 } 2168 /* 2169 * KVA will not be contiguous, so any additional 2170 * I/O will need to be represented in a new bio. 2171 */ 2172 bio = NULL; 2173 } 2174 } 2175 2176 reqlist->pendcnt = nbio; 2177 2178 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2179 { 2180 #ifdef XBB_USE_BOUNCE_BUFFERS 2181 vm_offset_t kva_offset; 2182 2183 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2184 - (vm_offset_t)reqlist->bounce; 2185 if (operation == BIO_WRITE) { 2186 memcpy(bios[bio_idx]->bio_data, 2187 (uint8_t *)reqlist->kva + kva_offset, 2188 bios[bio_idx]->bio_bcount); 2189 } 2190 #endif 2191 if (operation == BIO_READ) { 2192 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2193 device_get_unit(xbb->dev), 2194 bios[bio_idx]->bio_offset, 2195 bios[bio_idx]->bio_length); 2196 } else if (operation == BIO_WRITE) { 2197 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2198 device_get_unit(xbb->dev), 2199 bios[bio_idx]->bio_offset, 2200 bios[bio_idx]->bio_length); 2201 } 2202 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2203 } 2204 2205 return (error); 2206 2207 fail_free_bios: 2208 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2209 g_destroy_bio(bios[bio_idx]); 2210 2211 return (error); 2212 } 2213 2214 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int"); 2215 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t", 2216 "uint64_t"); 2217 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int", 2218 "uint64_t", "uint64_t"); 2219 2220 /** 2221 * Backend handler for file access. 2222 * 2223 * \param xbb Per-instance xbb configuration structure. 2224 * \param reqlist Allocated internal request list. 2225 * \param operation BIO_* I/O operation code. 2226 * \param flags Additional bio_flag data to pass to any generated bios 2227 * (e.g. BIO_ORDERED).. 2228 * 2229 * \return 0 for success, errno codes for failure. 
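 *
 * A rough sketch of the flow implemented below (ioflags stands in
 * for the IO_DIRECT/IO_SYNC selection made at the call sites): the
 * request list's scatter/gather entries are folded into a struct
 * uio with one iovec per KVA-contiguous run, which is then handed
 * to the backing vnode:
 *
 *	bzero(&xuio, sizeof(xuio));
 *	xuio.uio_segflg = UIO_SYSSPACE;
 *	(build xuio.uio_iov[] from xbb->xbb_sgs)
 *	error = VOP_WRITE(xbb->vn, &xuio, ioflags, file_data->cred);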
2230 */ 2231 static int 2232 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2233 int operation, int flags) 2234 { 2235 struct xbb_file_data *file_data; 2236 u_int seg_idx; 2237 u_int nseg; 2238 off_t sectors_sent; 2239 struct uio xuio; 2240 struct xbb_sg *xbb_sg; 2241 struct iovec *xiovec; 2242 #ifdef XBB_USE_BOUNCE_BUFFERS 2243 void **p_vaddr; 2244 int saved_uio_iovcnt; 2245 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2246 int error; 2247 2248 file_data = &xbb->backend.file; 2249 sectors_sent = 0; 2250 error = 0; 2251 bzero(&xuio, sizeof(xuio)); 2252 2253 switch (operation) { 2254 case BIO_READ: 2255 xuio.uio_rw = UIO_READ; 2256 break; 2257 case BIO_WRITE: 2258 xuio.uio_rw = UIO_WRITE; 2259 break; 2260 case BIO_FLUSH: { 2261 struct mount *mountpoint; 2262 2263 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2264 device_get_unit(xbb->dev)); 2265 2266 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2267 2268 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2269 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2270 VOP_UNLOCK(xbb->vn, 0); 2271 2272 vn_finished_write(mountpoint); 2273 2274 goto bailout_send_response; 2275 /* NOTREACHED */ 2276 } 2277 default: 2278 panic("invalid operation %d", operation); 2279 /* NOTREACHED */ 2280 } 2281 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2282 << xbb->sector_size_shift; 2283 xuio.uio_segflg = UIO_SYSSPACE; 2284 xuio.uio_iov = file_data->xiovecs; 2285 xuio.uio_iovcnt = 0; 2286 xbb_sg = xbb->xbb_sgs; 2287 nseg = reqlist->nr_segments; 2288 2289 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2290 2291 /* 2292 * If the first sector is not 0, the KVA will 2293 * not be contiguous and we'll need to go on 2294 * to another segment. 2295 */ 2296 if (xbb_sg->first_sect != 0) 2297 xiovec = NULL; 2298 2299 if (xiovec == NULL) { 2300 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2301 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2302 seg_idx, xbb_sg->first_sect); 2303 #ifdef XBB_USE_BOUNCE_BUFFERS 2304 /* 2305 * Store the address of the incoming 2306 * buffer at this particular offset 2307 * as well, so we can do the copy 2308 * later without having to do more 2309 * work to recalculate this address. 2310 */ 2311 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2312 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2313 xbb_sg->first_sect); 2314 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2315 xiovec->iov_len = 0; 2316 xuio.uio_iovcnt++; 2317 } 2318 2319 xiovec->iov_len += xbb_sg->nsect << 9; 2320 2321 xuio.uio_resid += xbb_sg->nsect << 9; 2322 2323 /* 2324 * If the last sector is not the full page 2325 * size count, the next segment will not be 2326 * contiguous in KVA and we need a new iovec. 2327 */ 2328 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2329 xiovec = NULL; 2330 } 2331 2332 xuio.uio_td = curthread; 2333 2334 #ifdef XBB_USE_BOUNCE_BUFFERS 2335 saved_uio_iovcnt = xuio.uio_iovcnt; 2336 2337 if (operation == BIO_WRITE) { 2338 /* Copy the write data to the local buffer. */ 2339 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2340 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2341 seg_idx++, xiovec++, p_vaddr++) { 2342 2343 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2344 } 2345 } else { 2346 /* 2347 * We only need to save off the iovecs in the case of a 2348 * read, because the copy for the read happens after the 2349 * VOP_READ(). (The uio will get modified in that call 2350 * sequence.) 
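 *
 * Concretely, VOP_READ() advances uio_resid and the iov_base and
 * iov_len cursors as it copies, so the copy-out loop that follows
 * the read must work from saved_xiovecs, the snapshot taken here.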
2351 */ 2352 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2353 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2354 } 2355 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2356 2357 switch (operation) { 2358 case BIO_READ: 2359 2360 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2361 device_get_unit(xbb->dev), xuio.uio_offset, 2362 xuio.uio_resid); 2363 2364 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2365 2366 /* 2367 * UFS pays attention to IO_DIRECT for reads. If the 2368 * DIRECTIO option is configured into the kernel, it calls 2369 * ffs_rawread(). But that only works for single-segment 2370 * uios with user space addresses. In our case, with a 2371 * kernel uio, it still reads into the buffer cache, but it 2372 * will just try to release the buffer from the cache later 2373 * on in ffs_read(). 2374 * 2375 * ZFS does not pay attention to IO_DIRECT for reads. 2376 * 2377 * UFS does not pay attention to IO_SYNC for reads. 2378 * 2379 * ZFS pays attention to IO_SYNC (which translates into the 2380 * Solaris define FRSYNC for zfs_read()) for reads. It 2381 * attempts to sync the file before reading. 2382 * 2383 * So, to attempt to provide some barrier semantics in the 2384 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2385 */ 2386 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2387 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2388 2389 VOP_UNLOCK(xbb->vn, 0); 2390 break; 2391 case BIO_WRITE: { 2392 struct mount *mountpoint; 2393 2394 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2395 device_get_unit(xbb->dev), xuio.uio_offset, 2396 xuio.uio_resid); 2397 2398 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2399 2400 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2401 2402 /* 2403 * UFS pays attention to IO_DIRECT for writes. The write 2404 * is done asynchronously. (Normally the write would just 2405 * get put into cache. 2406 * 2407 * UFS pays attention to IO_SYNC for writes. It will 2408 * attempt to write the buffer out synchronously if that 2409 * flag is set. 2410 * 2411 * ZFS does not pay attention to IO_DIRECT for writes. 2412 * 2413 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2414 * for writes. It will flush the transaction from the 2415 * cache before returning. 2416 * 2417 * So if we've got the BIO_ORDERED flag set, we want 2418 * IO_SYNC in either the UFS or ZFS case. 2419 */ 2420 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2421 IO_SYNC : 0, file_data->cred); 2422 VOP_UNLOCK(xbb->vn, 0); 2423 2424 vn_finished_write(mountpoint); 2425 2426 break; 2427 } 2428 default: 2429 panic("invalid operation %d", operation); 2430 /* NOTREACHED */ 2431 } 2432 2433 #ifdef XBB_USE_BOUNCE_BUFFERS 2434 /* We only need to copy here for read operations */ 2435 if (operation == BIO_READ) { 2436 2437 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2438 xiovec = file_data->saved_xiovecs; 2439 seg_idx < saved_uio_iovcnt; seg_idx++, 2440 xiovec++, p_vaddr++) { 2441 2442 /* 2443 * Note that we have to use the copy of the 2444 * io vector we made above. uiomove() modifies 2445 * the uio and its referenced vector as uiomove 2446 * performs the copy, so we can't rely on any 2447 * state from the original uio. 
2448 */ 2449 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2450 } 2451 } 2452 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2453 2454 bailout_send_response: 2455 2456 if (error != 0) 2457 reqlist->status = BLKIF_RSP_ERROR; 2458 2459 xbb_complete_reqlist(xbb, reqlist); 2460 2461 return (0); 2462 } 2463 2464 /*--------------------------- Backend Configuration --------------------------*/ 2465 /** 2466 * Close and cleanup any backend device/file specific state for this 2467 * block back instance. 2468 * 2469 * \param xbb Per-instance xbb configuration structure. 2470 */ 2471 static void 2472 xbb_close_backend(struct xbb_softc *xbb) 2473 { 2474 DROP_GIANT(); 2475 DPRINTF("closing dev=%s\n", xbb->dev_name); 2476 if (xbb->vn) { 2477 int flags = FREAD; 2478 2479 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2480 flags |= FWRITE; 2481 2482 switch (xbb->device_type) { 2483 case XBB_TYPE_DISK: 2484 if (xbb->backend.dev.csw) { 2485 dev_relthread(xbb->backend.dev.cdev, 2486 xbb->backend.dev.dev_ref); 2487 xbb->backend.dev.csw = NULL; 2488 xbb->backend.dev.cdev = NULL; 2489 } 2490 break; 2491 case XBB_TYPE_FILE: 2492 break; 2493 case XBB_TYPE_NONE: 2494 default: 2495 panic("Unexpected backend type."); 2496 break; 2497 } 2498 2499 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2500 xbb->vn = NULL; 2501 2502 switch (xbb->device_type) { 2503 case XBB_TYPE_DISK: 2504 break; 2505 case XBB_TYPE_FILE: 2506 if (xbb->backend.file.cred != NULL) { 2507 crfree(xbb->backend.file.cred); 2508 xbb->backend.file.cred = NULL; 2509 } 2510 break; 2511 case XBB_TYPE_NONE: 2512 default: 2513 panic("Unexpected backend type."); 2514 break; 2515 } 2516 } 2517 PICKUP_GIANT(); 2518 } 2519 2520 /** 2521 * Open a character device to be used for backend I/O. 2522 * 2523 * \param xbb Per-instance xbb configuration structure. 2524 * 2525 * \return 0 for success, errno codes for failure. 2526 */ 2527 static int 2528 xbb_open_dev(struct xbb_softc *xbb) 2529 { 2530 struct vattr vattr; 2531 struct cdev *dev; 2532 struct cdevsw *devsw; 2533 int error; 2534 2535 xbb->device_type = XBB_TYPE_DISK; 2536 xbb->dispatch_io = xbb_dispatch_dev; 2537 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2538 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2539 &xbb->backend.dev.dev_ref); 2540 if (xbb->backend.dev.csw == NULL) 2541 panic("Unable to retrieve device switch"); 2542 2543 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2544 if (error) { 2545 xenbus_dev_fatal(xbb->dev, error, "error getting " 2546 "vnode attributes for device %s", 2547 xbb->dev_name); 2548 return (error); 2549 } 2550 2551 2552 dev = xbb->vn->v_rdev; 2553 devsw = dev->si_devsw; 2554 if (!devsw->d_ioctl) { 2555 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2556 "device %s!", xbb->dev_name); 2557 return (ENODEV); 2558 } 2559 2560 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2561 (caddr_t)&xbb->sector_size, FREAD, 2562 curthread); 2563 if (error) { 2564 xenbus_dev_fatal(xbb->dev, error, 2565 "error calling ioctl DIOCGSECTORSIZE " 2566 "for device %s", xbb->dev_name); 2567 return (error); 2568 } 2569 2570 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2571 (caddr_t)&xbb->media_size, FREAD, 2572 curthread); 2573 if (error) { 2574 xenbus_dev_fatal(xbb->dev, error, 2575 "error calling ioctl DIOCGMEDIASIZE " 2576 "for device %s", xbb->dev_name); 2577 return (error); 2578 } 2579 2580 return (0); 2581 } 2582 2583 /** 2584 * Open a file to be used for backend I/O. 2585 * 2586 * \param xbb Per-instance xbb configuration structure. 
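 *
 * Unlike xbb_open_dev(), which queries geometry through the
 * DIOCGSECTORSIZE and DIOCGMEDIASIZE ioctls, this routine sizes the
 * backing store from VOP_GETATTR() and always advertises a 512-byte
 * sector size.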
2587 * 2588 * \return 0 for success, errno codes for failure. 2589 */ 2590 static int 2591 xbb_open_file(struct xbb_softc *xbb) 2592 { 2593 struct xbb_file_data *file_data; 2594 struct vattr vattr; 2595 int error; 2596 2597 file_data = &xbb->backend.file; 2598 xbb->device_type = XBB_TYPE_FILE; 2599 xbb->dispatch_io = xbb_dispatch_file; 2600 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2601 if (error != 0) { 2602 xenbus_dev_fatal(xbb->dev, error, 2603 "error calling VOP_GETATTR()" 2604 "for file %s", xbb->dev_name); 2605 return (error); 2606 } 2607 2608 /* 2609 * Verify that we have the ability to upgrade to exclusive 2610 * access on this file so we can trap errors at open instead 2611 * of reporting them during first access. 2612 */ 2613 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2614 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2615 if (xbb->vn->v_iflag & VI_DOOMED) { 2616 error = EBADF; 2617 xenbus_dev_fatal(xbb->dev, error, 2618 "error locking file %s", 2619 xbb->dev_name); 2620 2621 return (error); 2622 } 2623 } 2624 2625 file_data->cred = crhold(curthread->td_ucred); 2626 xbb->media_size = vattr.va_size; 2627 2628 /* 2629 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2630 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2631 * with disklabel and UFS on FreeBSD at least. Large block sizes 2632 * may not work with other OSes as well. So just export a sector 2633 * size of 512 bytes, which should work with any OS or 2634 * application. Since our backing is a file, any block size will 2635 * work fine for the backing store. 2636 */ 2637 #if 0 2638 xbb->sector_size = vattr.va_blocksize; 2639 #endif 2640 xbb->sector_size = 512; 2641 2642 /* 2643 * Sanity check. The media size has to be at least one 2644 * sector long. 2645 */ 2646 if (xbb->media_size < xbb->sector_size) { 2647 error = EINVAL; 2648 xenbus_dev_fatal(xbb->dev, error, 2649 "file %s size %ju < block size %u", 2650 xbb->dev_name, 2651 (uintmax_t)xbb->media_size, 2652 xbb->sector_size); 2653 } 2654 return (error); 2655 } 2656 2657 /** 2658 * Open the backend provider for this connection. 2659 * 2660 * \param xbb Per-instance xbb configuration structure. 2661 * 2662 * \return 0 for success, errno codes for failure. 2663 */ 2664 static int 2665 xbb_open_backend(struct xbb_softc *xbb) 2666 { 2667 struct nameidata nd; 2668 int flags; 2669 int error; 2670 2671 flags = FREAD; 2672 error = 0; 2673 2674 DPRINTF("opening dev=%s\n", xbb->dev_name); 2675 2676 if (rootvnode == NULL) { 2677 xenbus_dev_fatal(xbb->dev, ENOENT, 2678 "Root file system not mounted"); 2679 return (ENOENT); 2680 } 2681 2682 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2683 flags |= FWRITE; 2684 2685 if (!curthread->td_proc->p_fd->fd_cdir) { 2686 curthread->td_proc->p_fd->fd_cdir = rootvnode; 2687 VREF(rootvnode); 2688 } 2689 if (!curthread->td_proc->p_fd->fd_rdir) { 2690 curthread->td_proc->p_fd->fd_rdir = rootvnode; 2691 VREF(rootvnode); 2692 } 2693 if (!curthread->td_proc->p_fd->fd_jdir) { 2694 curthread->td_proc->p_fd->fd_jdir = rootvnode; 2695 VREF(rootvnode); 2696 } 2697 2698 again: 2699 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2700 error = vn_open(&nd, &flags, 0, NULL); 2701 if (error) { 2702 /* 2703 * This is the only reasonable guess we can make as far as 2704 * path if the user doesn't give us a fully qualified path. 2705 * If they want to specify a file, they need to specify the 2706 * full path. 
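 *
 * For example, a "params" value of "ada0p3" is retried below as
 * "/dev/ada0p3", while a file name such as "disk.img" would only be
 * retried as "/dev/disk.img" and fail; file backends therefore need
 * an absolute path like "/images/disk.img".  (Names here are
 * illustrative, not taken from a real configuration.)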
2707 */ 2708 if (xbb->dev_name[0] != '/') { 2709 char *dev_path = "/dev/"; 2710 char *dev_name; 2711 2712 /* Try adding device path at beginning of name */ 2713 dev_name = malloc(strlen(xbb->dev_name) 2714 + strlen(dev_path) + 1, 2715 M_XENBLOCKBACK, M_NOWAIT); 2716 if (dev_name) { 2717 sprintf(dev_name, "%s%s", dev_path, 2718 xbb->dev_name); 2719 free(xbb->dev_name, M_XENBLOCKBACK); 2720 xbb->dev_name = dev_name; 2721 goto again; 2722 } 2723 } 2724 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2725 xbb->dev_name); 2726 return (error); 2727 } 2728 2729 NDFREE(&nd, NDF_ONLY_PNBUF); 2730 2731 xbb->vn = nd.ni_vp; 2732 2733 /* We only support disks and files. */ 2734 if (vn_isdisk(xbb->vn, &error)) { 2735 error = xbb_open_dev(xbb); 2736 } else if (xbb->vn->v_type == VREG) { 2737 error = xbb_open_file(xbb); 2738 } else { 2739 error = EINVAL; 2740 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2741 "or file", xbb->dev_name); 2742 } 2743 VOP_UNLOCK(xbb->vn, 0); 2744 2745 if (error != 0) { 2746 xbb_close_backend(xbb); 2747 return (error); 2748 } 2749 2750 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2751 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2752 2753 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2754 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2755 xbb->dev_name, xbb->sector_size, xbb->media_size); 2756 2757 return (0); 2758 } 2759 2760 /*------------------------ Inter-Domain Communication ------------------------*/ 2761 /** 2762 * Free dynamically allocated KVA or pseudo-physical address allocations. 2763 * 2764 * \param xbb Per-instance xbb configuration structure. 2765 */ 2766 static void 2767 xbb_free_communication_mem(struct xbb_softc *xbb) 2768 { 2769 if (xbb->kva != 0) { 2770 #ifndef XENHVM 2771 kva_free(xbb->kva, xbb->kva_size); 2772 #else 2773 if (xbb->pseudo_phys_res != NULL) { 2774 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2775 xbb->pseudo_phys_res_id, 2776 xbb->pseudo_phys_res); 2777 xbb->pseudo_phys_res = NULL; 2778 } 2779 #endif 2780 } 2781 xbb->kva = 0; 2782 xbb->gnt_base_addr = 0; 2783 if (xbb->kva_free != NULL) { 2784 free(xbb->kva_free, M_XENBLOCKBACK); 2785 xbb->kva_free = NULL; 2786 } 2787 } 2788 2789 /** 2790 * Cleanup all inter-domain communication mechanisms. 2791 * 2792 * \param xbb Per-instance xbb configuration structure. 2793 */ 2794 static int 2795 xbb_disconnect(struct xbb_softc *xbb) 2796 { 2797 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2798 struct gnttab_unmap_grant_ref *op; 2799 u_int ring_idx; 2800 int error; 2801 2802 DPRINTF("\n"); 2803 2804 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2805 return (0); 2806 2807 xen_intr_unbind(&xbb->xen_intr_handle); 2808 2809 mtx_unlock(&xbb->lock); 2810 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2811 mtx_lock(&xbb->lock); 2812 2813 /* 2814 * No new interrupts can generate work, but we must wait 2815 * for all currently active requests to drain. 
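 *
 * Returning EAGAIN here is expected rather than fatal: as noted in
 * xbb_shutdown(), we will be called again once the outstanding
 * requests complete, at which point the teardown below can proceed.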
2816 */ 2817 if (xbb->active_request_count != 0) 2818 return (EAGAIN); 2819 2820 for (ring_idx = 0, op = ops; 2821 ring_idx < xbb->ring_config.ring_pages; 2822 ring_idx++, op++) { 2823 2824 op->host_addr = xbb->ring_config.gnt_addr 2825 + (ring_idx * PAGE_SIZE); 2826 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2827 op->handle = xbb->ring_config.handle[ring_idx]; 2828 } 2829 2830 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2831 xbb->ring_config.ring_pages); 2832 if (error != 0) 2833 panic("Grant table op failed (%d)", error); 2834 2835 xbb_free_communication_mem(xbb); 2836 2837 if (xbb->requests != NULL) { 2838 free(xbb->requests, M_XENBLOCKBACK); 2839 xbb->requests = NULL; 2840 } 2841 2842 if (xbb->request_lists != NULL) { 2843 struct xbb_xen_reqlist *reqlist; 2844 int i; 2845 2846 /* There is one request list for ever allocated request. */ 2847 for (i = 0, reqlist = xbb->request_lists; 2848 i < xbb->max_requests; i++, reqlist++){ 2849 #ifdef XBB_USE_BOUNCE_BUFFERS 2850 if (reqlist->bounce != NULL) { 2851 free(reqlist->bounce, M_XENBLOCKBACK); 2852 reqlist->bounce = NULL; 2853 } 2854 #endif 2855 if (reqlist->gnt_handles != NULL) { 2856 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2857 reqlist->gnt_handles = NULL; 2858 } 2859 } 2860 free(xbb->request_lists, M_XENBLOCKBACK); 2861 xbb->request_lists = NULL; 2862 } 2863 2864 xbb->flags &= ~XBBF_RING_CONNECTED; 2865 return (0); 2866 } 2867 2868 /** 2869 * Map shared memory ring into domain local address space, initialize 2870 * ring control structures, and bind an interrupt to the event channel 2871 * used to notify us of ring changes. 2872 * 2873 * \param xbb Per-instance xbb configuration structure. 2874 */ 2875 static int 2876 xbb_connect_ring(struct xbb_softc *xbb) 2877 { 2878 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2879 struct gnttab_map_grant_ref *gnt; 2880 u_int ring_idx; 2881 int error; 2882 2883 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2884 return (0); 2885 2886 /* 2887 * Kva for our ring is at the tail of the region of kva allocated 2888 * by xbb_alloc_communication_mem(). 2889 */ 2890 xbb->ring_config.va = xbb->kva 2891 + (xbb->kva_size 2892 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2893 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2894 + (xbb->kva_size 2895 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2896 2897 for (ring_idx = 0, gnt = gnts; 2898 ring_idx < xbb->ring_config.ring_pages; 2899 ring_idx++, gnt++) { 2900 2901 gnt->host_addr = xbb->ring_config.gnt_addr 2902 + (ring_idx * PAGE_SIZE); 2903 gnt->flags = GNTMAP_host_map; 2904 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2905 gnt->dom = xbb->otherend_id; 2906 } 2907 2908 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2909 xbb->ring_config.ring_pages); 2910 if (error) 2911 panic("blkback: Ring page grant table op failed (%d)", error); 2912 2913 for (ring_idx = 0, gnt = gnts; 2914 ring_idx < xbb->ring_config.ring_pages; 2915 ring_idx++, gnt++) { 2916 if (gnt->status != 0) { 2917 xbb->ring_config.va = 0; 2918 xenbus_dev_fatal(xbb->dev, EACCES, 2919 "Ring shared page mapping failed. " 2920 "Status %d.", gnt->status); 2921 return (EACCES); 2922 } 2923 xbb->ring_config.handle[ring_idx] = gnt->handle; 2924 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2925 } 2926 2927 /* Initialize the ring based on ABI. 
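 * The three layouts share the same shared-ring memory and differ
 * only in how requests are padded and aligned, which is also why
 * the consumer loop in xbb_run_queue() converts foreign-ABI
 * requests with blkif_get_x86_32_req() and blkif_get_x86_64_req()
 * before using them.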
*/ 2928 switch (xbb->abi) { 2929 case BLKIF_PROTOCOL_NATIVE: 2930 { 2931 blkif_sring_t *sring; 2932 sring = (blkif_sring_t *)xbb->ring_config.va; 2933 BACK_RING_INIT(&xbb->rings.native, sring, 2934 xbb->ring_config.ring_pages * PAGE_SIZE); 2935 break; 2936 } 2937 case BLKIF_PROTOCOL_X86_32: 2938 { 2939 blkif_x86_32_sring_t *sring_x86_32; 2940 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2941 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2942 xbb->ring_config.ring_pages * PAGE_SIZE); 2943 break; 2944 } 2945 case BLKIF_PROTOCOL_X86_64: 2946 { 2947 blkif_x86_64_sring_t *sring_x86_64; 2948 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2949 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2950 xbb->ring_config.ring_pages * PAGE_SIZE); 2951 break; 2952 } 2953 default: 2954 panic("Unexpected blkif protocol ABI."); 2955 } 2956 2957 xbb->flags |= XBBF_RING_CONNECTED; 2958 2959 error = xen_intr_bind_remote_port(xbb->dev, 2960 xbb->otherend_id, 2961 xbb->ring_config.evtchn, 2962 xbb_filter, 2963 /*ithread_handler*/NULL, 2964 /*arg*/xbb, 2965 INTR_TYPE_BIO | INTR_MPSAFE, 2966 &xbb->xen_intr_handle); 2967 if (error) { 2968 (void)xbb_disconnect(xbb); 2969 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2970 return (error); 2971 } 2972 2973 DPRINTF("rings connected!\n"); 2974 2975 return 0; 2976 } 2977 2978 /* Needed to make bit_alloc() macro work */ 2979 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ 2980 M_NOWAIT|M_ZERO); 2981 2982 /** 2983 * Size KVA and pseudo-physical address allocations based on negotiated 2984 * values for the size and number of I/O requests, and the size of our 2985 * communication ring. 2986 * 2987 * \param xbb Per-instance xbb configuration structure. 2988 * 2989 * These address spaces are used to dynamically map pages in the 2990 * front-end's domain into our own. 2991 */ 2992 static int 2993 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2994 { 2995 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2996 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2997 xbb->kva_size = xbb->reqlist_kva_size + 2998 (xbb->ring_config.ring_pages * PAGE_SIZE); 2999 3000 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); 3001 if (xbb->kva_free == NULL) 3002 return (ENOMEM); 3003 3004 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 3005 device_get_nameunit(xbb->dev), xbb->kva_size, 3006 xbb->reqlist_kva_size); 3007 #ifndef XENHVM 3008 xbb->kva = kva_alloc(xbb->kva_size); 3009 if (xbb->kva == 0) 3010 return (ENOMEM); 3011 xbb->gnt_base_addr = xbb->kva; 3012 #else /* XENHVM */ 3013 /* 3014 * Reserve a range of pseudo physical memory that we can map 3015 * into kva. These pages will only be backed by machine 3016 * pages ("real memory") during the lifetime of front-end requests 3017 * via grant table operations. 3018 */ 3019 xbb->pseudo_phys_res_id = 0; 3020 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 3021 &xbb->pseudo_phys_res_id, 3022 0, ~0, xbb->kva_size, 3023 RF_ACTIVE); 3024 if (xbb->pseudo_phys_res == NULL) { 3025 xbb->kva = 0; 3026 return (ENOMEM); 3027 } 3028 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3029 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3030 #endif /* XENHVM */ 3031 3032 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3033 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3034 (uintmax_t)xbb->gnt_base_addr); 3035 return (0); 3036 } 3037 3038 /** 3039 * Collect front-end information from the XenStore. 
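 *
 * The keys consulted below, all relative to the front-end's
 * XenStore directory, are: "event-channel" (mandatory),
 * "ring-page-order" or the legacy "num-ring-pages", "max-requests",
 * "max-request-segments", "max-request-size", "ring-ref" (or
 * "ring-ref%u" per page for multi-page rings), and "protocol".
 * Keys the front end does not publish simply leave the legacy
 * defaults in place.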
3040 * 3041 * \param xbb Per-instance xbb configuration structure. 3042 */ 3043 static int 3044 xbb_collect_frontend_info(struct xbb_softc *xbb) 3045 { 3046 char protocol_abi[64]; 3047 const char *otherend_path; 3048 int error; 3049 u_int ring_idx; 3050 u_int ring_page_order; 3051 size_t ring_size; 3052 3053 otherend_path = xenbus_get_otherend_path(xbb->dev); 3054 3055 /* 3056 * Protocol defaults valid even if all negotiation fails. 3057 */ 3058 xbb->ring_config.ring_pages = 1; 3059 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; 3060 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3061 3062 /* 3063 * Mandatory data (used in all versions of the protocol) first. 3064 */ 3065 error = xs_scanf(XST_NIL, otherend_path, 3066 "event-channel", NULL, "%" PRIu32, 3067 &xbb->ring_config.evtchn); 3068 if (error != 0) { 3069 xenbus_dev_fatal(xbb->dev, error, 3070 "Unable to retrieve event-channel information " 3071 "from frontend %s. Unable to connect.", 3072 xenbus_get_otherend_path(xbb->dev)); 3073 return (error); 3074 } 3075 3076 /* 3077 * These fields are initialized to legacy protocol defaults 3078 * so we only need to fail if reading the updated value succeeds 3079 * and the new value is outside of its allowed range. 3080 * 3081 * \note xs_gather() returns on the first encountered error, so 3082 * we must use independant calls in order to guarantee 3083 * we don't miss information in a sparsly populated front-end 3084 * tree. 3085 * 3086 * \note xs_scanf() does not update variables for unmatched 3087 * fields. 3088 */ 3089 ring_page_order = 0; 3090 (void)xs_scanf(XST_NIL, otherend_path, 3091 "ring-page-order", NULL, "%u", 3092 &ring_page_order); 3093 xbb->ring_config.ring_pages = 1 << ring_page_order; 3094 (void)xs_scanf(XST_NIL, otherend_path, 3095 "num-ring-pages", NULL, "%u", 3096 &xbb->ring_config.ring_pages); 3097 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3098 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3099 3100 (void)xs_scanf(XST_NIL, otherend_path, 3101 "max-requests", NULL, "%u", 3102 &xbb->max_requests); 3103 3104 (void)xs_scanf(XST_NIL, otherend_path, 3105 "max-request-segments", NULL, "%u", 3106 &xbb->max_request_segments); 3107 3108 (void)xs_scanf(XST_NIL, otherend_path, 3109 "max-request-size", NULL, "%u", 3110 &xbb->max_request_size); 3111 3112 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3113 xenbus_dev_fatal(xbb->dev, EINVAL, 3114 "Front-end specified ring-pages of %u " 3115 "exceeds backend limit of %zu. " 3116 "Unable to connect.", 3117 xbb->ring_config.ring_pages, 3118 XBB_MAX_RING_PAGES); 3119 return (EINVAL); 3120 } else if (xbb->max_requests > XBB_MAX_REQUESTS) { 3121 xenbus_dev_fatal(xbb->dev, EINVAL, 3122 "Front-end specified max_requests of %u " 3123 "exceeds backend limit of %u. " 3124 "Unable to connect.", 3125 xbb->max_requests, 3126 XBB_MAX_REQUESTS); 3127 return (EINVAL); 3128 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) { 3129 xenbus_dev_fatal(xbb->dev, EINVAL, 3130 "Front-end specified max_requests_segments " 3131 "of %u exceeds backend limit of %u. " 3132 "Unable to connect.", 3133 xbb->max_request_segments, 3134 XBB_MAX_SEGMENTS_PER_REQUEST); 3135 return (EINVAL); 3136 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) { 3137 xenbus_dev_fatal(xbb->dev, EINVAL, 3138 "Front-end specified max_request_size " 3139 "of %u exceeds backend limit of %u. 
" 3140 "Unable to connect.", 3141 xbb->max_request_size, 3142 XBB_MAX_REQUEST_SIZE); 3143 return (EINVAL); 3144 } 3145 3146 if (xbb->ring_config.ring_pages == 1) { 3147 error = xs_gather(XST_NIL, otherend_path, 3148 "ring-ref", "%" PRIu32, 3149 &xbb->ring_config.ring_ref[0], 3150 NULL); 3151 if (error != 0) { 3152 xenbus_dev_fatal(xbb->dev, error, 3153 "Unable to retrieve ring information " 3154 "from frontend %s. Unable to " 3155 "connect.", 3156 xenbus_get_otherend_path(xbb->dev)); 3157 return (error); 3158 } 3159 } else { 3160 /* Multi-page ring format. */ 3161 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3162 ring_idx++) { 3163 char ring_ref_name[]= "ring_refXX"; 3164 3165 snprintf(ring_ref_name, sizeof(ring_ref_name), 3166 "ring-ref%u", ring_idx); 3167 error = xs_scanf(XST_NIL, otherend_path, 3168 ring_ref_name, NULL, "%" PRIu32, 3169 &xbb->ring_config.ring_ref[ring_idx]); 3170 if (error != 0) { 3171 xenbus_dev_fatal(xbb->dev, error, 3172 "Failed to retriev grant " 3173 "reference for page %u of " 3174 "shared ring. Unable " 3175 "to connect.", ring_idx); 3176 return (error); 3177 } 3178 } 3179 } 3180 3181 error = xs_gather(XST_NIL, otherend_path, 3182 "protocol", "%63s", protocol_abi, 3183 NULL); 3184 if (error != 0 3185 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3186 /* 3187 * Assume native if the frontend has not 3188 * published ABI data or it has published and 3189 * matches our own ABI. 3190 */ 3191 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3192 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3193 3194 xbb->abi = BLKIF_PROTOCOL_X86_32; 3195 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3196 3197 xbb->abi = BLKIF_PROTOCOL_X86_64; 3198 } else { 3199 3200 xenbus_dev_fatal(xbb->dev, EINVAL, 3201 "Unknown protocol ABI (%s) published by " 3202 "frontend. Unable to connect.", protocol_abi); 3203 return (EINVAL); 3204 } 3205 return (0); 3206 } 3207 3208 /** 3209 * Allocate per-request data structures given request size and number 3210 * information negotiated with the front-end. 3211 * 3212 * \param xbb Per-instance xbb configuration structure. 3213 */ 3214 static int 3215 xbb_alloc_requests(struct xbb_softc *xbb) 3216 { 3217 struct xbb_xen_req *req; 3218 struct xbb_xen_req *last_req; 3219 3220 /* 3221 * Allocate request book keeping datastructures. 3222 */ 3223 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3224 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3225 if (xbb->requests == NULL) { 3226 xenbus_dev_fatal(xbb->dev, ENOMEM, 3227 "Unable to allocate request structures"); 3228 return (ENOMEM); 3229 } 3230 3231 req = xbb->requests; 3232 last_req = &xbb->requests[xbb->max_requests - 1]; 3233 STAILQ_INIT(&xbb->request_free_stailq); 3234 while (req <= last_req) { 3235 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3236 req++; 3237 } 3238 return (0); 3239 } 3240 3241 static int 3242 xbb_alloc_request_lists(struct xbb_softc *xbb) 3243 { 3244 struct xbb_xen_reqlist *reqlist; 3245 int i; 3246 3247 /* 3248 * If no requests can be merged, we need 1 request list per 3249 * in flight request. 
3250 */ 3251 xbb->request_lists = malloc(xbb->max_requests * 3252 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3253 if (xbb->request_lists == NULL) { 3254 xenbus_dev_fatal(xbb->dev, ENOMEM, 3255 "Unable to allocate request list structures"); 3256 return (ENOMEM); 3257 } 3258 3259 STAILQ_INIT(&xbb->reqlist_free_stailq); 3260 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3261 for (i = 0; i < xbb->max_requests; i++) { 3262 int seg; 3263 3264 reqlist = &xbb->request_lists[i]; 3265 3266 reqlist->xbb = xbb; 3267 3268 #ifdef XBB_USE_BOUNCE_BUFFERS 3269 reqlist->bounce = malloc(xbb->max_reqlist_size, 3270 M_XENBLOCKBACK, M_NOWAIT); 3271 if (reqlist->bounce == NULL) { 3272 xenbus_dev_fatal(xbb->dev, ENOMEM, 3273 "Unable to allocate request " 3274 "bounce buffers"); 3275 return (ENOMEM); 3276 } 3277 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3278 3279 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3280 sizeof(*reqlist->gnt_handles), 3281 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3282 if (reqlist->gnt_handles == NULL) { 3283 xenbus_dev_fatal(xbb->dev, ENOMEM, 3284 "Unable to allocate request " 3285 "grant references"); 3286 return (ENOMEM); 3287 } 3288 3289 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3290 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3291 3292 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3293 } 3294 return (0); 3295 } 3296 3297 /** 3298 * Supply information about the physical device to the frontend 3299 * via XenBus. 3300 * 3301 * \param xbb Per-instance xbb configuration structure. 3302 */ 3303 static int 3304 xbb_publish_backend_info(struct xbb_softc *xbb) 3305 { 3306 struct xs_transaction xst; 3307 const char *our_path; 3308 const char *leaf; 3309 int error; 3310 3311 our_path = xenbus_get_node(xbb->dev); 3312 while (1) { 3313 error = xs_transaction_start(&xst); 3314 if (error != 0) { 3315 xenbus_dev_fatal(xbb->dev, error, 3316 "Error publishing backend info " 3317 "(start transaction)"); 3318 return (error); 3319 } 3320 3321 leaf = "sectors"; 3322 error = xs_printf(xst, our_path, leaf, 3323 "%"PRIu64, xbb->media_num_sectors); 3324 if (error != 0) 3325 break; 3326 3327 /* XXX Support all VBD attributes here. */ 3328 leaf = "info"; 3329 error = xs_printf(xst, our_path, leaf, "%u", 3330 xbb->flags & XBBF_READ_ONLY 3331 ? VDISK_READONLY : 0); 3332 if (error != 0) 3333 break; 3334 3335 leaf = "sector-size"; 3336 error = xs_printf(xst, our_path, leaf, "%u", 3337 xbb->sector_size); 3338 if (error != 0) 3339 break; 3340 3341 error = xs_transaction_end(xst, 0); 3342 if (error == 0) { 3343 return (0); 3344 } else if (error != EAGAIN) { 3345 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3346 return (error); 3347 } 3348 } 3349 3350 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3351 our_path, leaf); 3352 xs_transaction_end(xst, 1); 3353 return (error); 3354 } 3355 3356 /** 3357 * Connect to our blkfront peer now that it has completed publishing 3358 * its configuration into the XenStore. 3359 * 3360 * \param xbb Per-instance xbb configuration structure. 3361 */ 3362 static void 3363 xbb_connect(struct xbb_softc *xbb) 3364 { 3365 int error; 3366 3367 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 3368 return; 3369 3370 if (xbb_collect_frontend_info(xbb) != 0) 3371 return; 3372 3373 xbb->flags &= ~XBBF_SHUTDOWN; 3374 3375 /* 3376 * We limit the maximum number of reqlist segments to the maximum 3377 * number of segments in the ring, or our absolute maximum, 3378 * whichever is smaller. 
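 *
 * Hedged numeric example (actual values depend on the headers this
 * file is built against): assuming BLKIF_MAX_SEGMENTS_PER_REQUEST
 * is 11, 4 KB pages, and MAXPHYS of at least 44 KB, the clamp below
 * yields max_reqlist_segments = 11 even when the front end offers
 * max_request_segments = 11 and max_requests = 256, and
 * max_reqlist_size then becomes 11 pages (44 KB).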
3379 */ 3380 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3381 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3382 3383 /* 3384 * The maximum size is simply a function of the number of segments 3385 * we can handle. 3386 */ 3387 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3388 3389 /* Allocate resources whose size depends on front-end configuration. */ 3390 error = xbb_alloc_communication_mem(xbb); 3391 if (error != 0) { 3392 xenbus_dev_fatal(xbb->dev, error, 3393 "Unable to allocate communication memory"); 3394 return; 3395 } 3396 3397 error = xbb_alloc_requests(xbb); 3398 if (error != 0) { 3399 /* Specific errors are reported by xbb_alloc_requests(). */ 3400 return; 3401 } 3402 3403 error = xbb_alloc_request_lists(xbb); 3404 if (error != 0) { 3405 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3406 return; 3407 } 3408 3409 /* 3410 * Connect communication channel. 3411 */ 3412 error = xbb_connect_ring(xbb); 3413 if (error != 0) { 3414 /* Specific errors are reported by xbb_connect_ring(). */ 3415 return; 3416 } 3417 3418 if (xbb_publish_backend_info(xbb) != 0) { 3419 /* 3420 * If we can't publish our data, we cannot participate 3421 * in this connection, and waiting for a front-end state 3422 * change will not help the situation. 3423 */ 3424 (void)xbb_disconnect(xbb); 3425 return; 3426 } 3427 3428 /* Ready for I/O. */ 3429 xenbus_set_state(xbb->dev, XenbusStateConnected); 3430 } 3431 3432 /*-------------------------- Device Teardown Support -------------------------*/ 3433 /** 3434 * Perform device shutdown functions. 3435 * 3436 * \param xbb Per-instance xbb configuration structure. 3437 * 3438 * Mark this instance as shutting down, wait for any active I/O on the 3439 * backend device/file to drain, disconnect from the front-end, and notify 3440 * any waiters (e.g. a thread invoking our detach method) that detach can 3441 * now proceed. 3442 */ 3443 static int 3444 xbb_shutdown(struct xbb_softc *xbb) 3445 { 3446 XenbusState frontState; 3447 int error; 3448 3449 DPRINTF("\n"); 3450 3451 /* 3452 * Due to the need to drop our mutex during some 3453 * xenbus operations, it is possible for two threads 3454 * to attempt to close out shutdown processing at 3455 * the same time. Tell the caller that hits this 3456 * race to try back later. 3457 */ 3458 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3459 return (EAGAIN); 3460 3461 xbb->flags |= XBBF_IN_SHUTDOWN; 3462 mtx_unlock(&xbb->lock); 3463 3464 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3465 xenbus_set_state(xbb->dev, XenbusStateClosing); 3466 3467 frontState = xenbus_get_otherend_state(xbb->dev); 3468 mtx_lock(&xbb->lock); 3469 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3470 3471 /* The front can submit I/O until entering the closed state. */ 3472 if (frontState < XenbusStateClosed) 3473 return (EAGAIN); 3474 3475 DPRINTF("\n"); 3476 3477 /* Indicate shutdown is in progress. */ 3478 xbb->flags |= XBBF_SHUTDOWN; 3479 3480 /* Disconnect from the front-end. */ 3481 error = xbb_disconnect(xbb); 3482 if (error != 0) { 3483 /* 3484 * Requests still outstanding. We'll be called again 3485 * once they complete. 3486 */ 3487 KASSERT(error == EAGAIN, 3488 ("%s: Unexpected xbb_disconnect() failure %d", 3489 __func__, error)); 3490 3491 return (error); 3492 } 3493 3494 DPRINTF("\n"); 3495 3496 /* Indicate to xbb_detach() that is it safe to proceed. 
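 * The wakeup() below pairs with the msleep() loop in xbb_detach(),
 * which sleeps on the softc address until xbb_shutdown() stops
 * returning EAGAIN.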
*/ 3497 wakeup(xbb); 3498 3499 return (0); 3500 } 3501 3502 /** 3503 * Report an attach time error to the console and Xen, and cleanup 3504 * this instance by forcing immediate detach processing. 3505 * 3506 * \param xbb Per-instance xbb configuration structure. 3507 * \param err Errno describing the error. 3508 * \param fmt Printf style format and arguments 3509 */ 3510 static void 3511 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3512 { 3513 va_list ap; 3514 va_list ap_hotplug; 3515 3516 va_start(ap, fmt); 3517 va_copy(ap_hotplug, ap); 3518 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3519 "hotplug-error", fmt, ap_hotplug); 3520 va_end(ap_hotplug); 3521 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3522 "hotplug-status", "error"); 3523 3524 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3525 va_end(ap); 3526 3527 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3528 "online", "0"); 3529 xbb_detach(xbb->dev); 3530 } 3531 3532 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3533 /** 3534 * Inspect a XenBus device and claim it if is of the appropriate type. 3535 * 3536 * \param dev NewBus device object representing a candidate XenBus device. 3537 * 3538 * \return 0 for success, errno codes for failure. 3539 */ 3540 static int 3541 xbb_probe(device_t dev) 3542 { 3543 3544 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3545 device_set_desc(dev, "Backend Virtual Block Device"); 3546 device_quiet(dev); 3547 return (0); 3548 } 3549 3550 return (ENXIO); 3551 } 3552 3553 /** 3554 * Setup sysctl variables to control various Block Back parameters. 3555 * 3556 * \param xbb Xen Block Back softc. 3557 * 3558 */ 3559 static void 3560 xbb_setup_sysctl(struct xbb_softc *xbb) 3561 { 3562 struct sysctl_ctx_list *sysctl_ctx = NULL; 3563 struct sysctl_oid *sysctl_tree = NULL; 3564 3565 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3566 if (sysctl_ctx == NULL) 3567 return; 3568 3569 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3570 if (sysctl_tree == NULL) 3571 return; 3572 3573 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3574 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3575 "fake the flush command"); 3576 3577 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3578 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3579 "send a real flush for N flush requests"); 3580 3581 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3582 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3583 "Don't coalesce contiguous requests"); 3584 3585 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3586 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3587 "how many I/O requests we have received"); 3588 3589 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3590 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3591 "how many I/O requests have been completed"); 3592 3593 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3594 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3595 "how many I/O dispatches were forced"); 3596 3597 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3598 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3599 "how many I/O dispatches were normal"); 3600 3601 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3602 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3603 "total number of I/O dispatches"); 3604 3605 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), 
OID_AUTO, 3606 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3607 "how many times we have run out of KVA"); 3608 3609 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3610 "request_shortages", CTLFLAG_RW, 3611 &xbb->request_shortages, 3612 "how many times we have run out of requests"); 3613 3614 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3615 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3616 "maximum outstanding requests (negotiated)"); 3617 3618 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3619 "max_request_segments", CTLFLAG_RD, 3620 &xbb->max_request_segments, 0, 3621 "maximum number of pages per request (negotiated)"); 3622 3623 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3624 "max_request_size", CTLFLAG_RD, 3625 &xbb->max_request_size, 0, 3626 "maximum size in bytes of a request (negotiated)"); 3627 3628 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3629 "ring_pages", CTLFLAG_RD, 3630 &xbb->ring_config.ring_pages, 0, 3631 "communication channel pages (negotiated)"); 3632 } 3633 3634 /** 3635 * Attach to a XenBus device that has been claimed by our probe routine. 3636 * 3637 * \param dev NewBus device object representing this Xen Block Back instance. 3638 * 3639 * \return 0 for success, errno codes for failure. 3640 */ 3641 static int 3642 xbb_attach(device_t dev) 3643 { 3644 struct xbb_softc *xbb; 3645 int error; 3646 u_int max_ring_page_order; 3647 3648 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3649 3650 /* 3651 * Basic initialization. 3652 * After this block it is safe to call xbb_detach() 3653 * to clean up any allocated data for this instance. 3654 */ 3655 xbb = device_get_softc(dev); 3656 xbb->dev = dev; 3657 xbb->otherend_id = xenbus_get_otherend_id(dev); 3658 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3659 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3660 3661 /* 3662 * Publish protocol capabilities for consumption by the 3663 * front-end. 3664 */ 3665 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3666 "feature-barrier", "1"); 3667 if (error) { 3668 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3669 xenbus_get_node(xbb->dev)); 3670 return (error); 3671 } 3672 3673 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3674 "feature-flush-cache", "1"); 3675 if (error) { 3676 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3677 xenbus_get_node(xbb->dev)); 3678 return (error); 3679 } 3680 3681 /* 3682 * Amazon EC2 client compatibility. They refer to max-ring-pages 3683 * instead of to max-ring-page-order.
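 * The two nodes describe the same limit: max-ring-pages is the page
 * count itself, while max-ring-page-order is its base-2 logarithm,
 * so max-ring-pages == 1 << max-ring-page-order (an order of 2
 * advertises 4 ring pages).  Both are published below so either
 * style of front end can size its ring.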
3684 */ 3685 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3686 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 3687 if (error) { 3688 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 3689 xenbus_get_node(xbb->dev)); 3690 return (error); 3691 } 3692 3693 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3694 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3695 "max-ring-page-order", "%u", max_ring_page_order); 3696 if (error) { 3697 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3698 xenbus_get_node(xbb->dev)); 3699 return (error); 3700 } 3701 3702 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3703 "max-requests", "%u", XBB_MAX_REQUESTS); 3704 if (error) { 3705 xbb_attach_failed(xbb, error, "writing %s/max-requests", 3706 xenbus_get_node(xbb->dev)); 3707 return (error); 3708 } 3709 3710 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3711 "max-request-segments", "%u", 3712 XBB_MAX_SEGMENTS_PER_REQUEST); 3713 if (error) { 3714 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 3715 xenbus_get_node(xbb->dev)); 3716 return (error); 3717 } 3718 3719 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3720 "max-request-size", "%u", 3721 XBB_MAX_REQUEST_SIZE); 3722 if (error) { 3723 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 3724 xenbus_get_node(xbb->dev)); 3725 return (error); 3726 } 3727 3728 /* Collect physical device information. */ 3729 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3730 "device-type", NULL, &xbb->dev_type, 3731 NULL); 3732 if (error != 0) 3733 xbb->dev_type = NULL; 3734 3735 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3736 "mode", NULL, &xbb->dev_mode, 3737 "params", NULL, &xbb->dev_name, 3738 NULL); 3739 if (error != 0) { 3740 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3741 xenbus_get_node(dev)); 3742 return (ENXIO); 3743 } 3744 3745 /* Parse fopen style mode flags. */ 3746 if (strchr(xbb->dev_mode, 'w') == NULL) 3747 xbb->flags |= XBBF_READ_ONLY; 3748 3749 /* 3750 * Verify the physical device is present and can support 3751 * the desired I/O mode. 3752 */ 3753 DROP_GIANT(); 3754 error = xbb_open_backend(xbb); 3755 PICKUP_GIANT(); 3756 if (error != 0) { 3757 xbb_attach_failed(xbb, error, "Unable to open %s", 3758 xbb->dev_name); 3759 return (ENXIO); 3760 } 3761 3762 /* Use devstat(9) for recording statistics. */ 3763 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3764 xbb->sector_size, 3765 DEVSTAT_ALL_SUPPORTED, 3766 DEVSTAT_TYPE_DIRECT 3767 | DEVSTAT_TYPE_IF_OTHER, 3768 DEVSTAT_PRIORITY_OTHER); 3769 3770 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3771 xbb->sector_size, 3772 DEVSTAT_ALL_SUPPORTED, 3773 DEVSTAT_TYPE_DIRECT 3774 | DEVSTAT_TYPE_IF_OTHER, 3775 DEVSTAT_PRIORITY_OTHER); 3776 /* 3777 * Setup sysctl variables. 3778 */ 3779 xbb_setup_sysctl(xbb); 3780 3781 /* 3782 * Create a taskqueue for doing work that must occur from a 3783 * thread context. 3784 */ 3785 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3786 M_NOWAIT, 3787 taskqueue_thread_enqueue, 3788 /*contxt*/&xbb->io_taskqueue); 3789 if (xbb->io_taskqueue == NULL) { 3790 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3791 return (ENOMEM); 3792 } 3793 3794 taskqueue_start_threads(&xbb->io_taskqueue, 3795 /*num threads*/1, 3796 /*priority*/PWAIT, 3797 /*thread name*/ 3798 "%s taskq", device_get_nameunit(dev)); 3799 3800 /* Update hot-plug status to satisfy xend. 
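 * The toolstack's hotplug scripts would normally set this node once
 * they had prepared the backing device; since this driver opens its
 * backend itself, it writes "connected" directly so the toolstack
 * does not time out waiting for a script.  (This describes the
 * intent of the write below; toolstack behaviour varies by version.)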
*/ 3801 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3802 "hotplug-status", "connected"); 3803 if (error) { 3804 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3805 xenbus_get_node(xbb->dev)); 3806 return (error); 3807 } 3808 3809 /* Tell the front end that we are ready to connect. */ 3810 xenbus_set_state(dev, XenbusStateInitWait); 3811 3812 return (0); 3813 } 3814 3815 /** 3816 * Detach from a block back device instance. 3817 * 3818 * \param dev NewBus device object representing this Xen Block Back instance. 3819 * 3820 * \return 0 for success, errno codes for failure. 3821 * 3822 * \note A block back device may be detached at any time in its life-cycle, 3823 * including part way through the attach process. For this reason, 3824 * initialization order and the intialization state checks in this 3825 * routine must be carefully coupled so that attach time failures 3826 * are gracefully handled. 3827 */ 3828 static int 3829 xbb_detach(device_t dev) 3830 { 3831 struct xbb_softc *xbb; 3832 3833 DPRINTF("\n"); 3834 3835 xbb = device_get_softc(dev); 3836 mtx_lock(&xbb->lock); 3837 while (xbb_shutdown(xbb) == EAGAIN) { 3838 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3839 "xbb_shutdown", 0); 3840 } 3841 mtx_unlock(&xbb->lock); 3842 3843 DPRINTF("\n"); 3844 3845 if (xbb->io_taskqueue != NULL) 3846 taskqueue_free(xbb->io_taskqueue); 3847 3848 if (xbb->xbb_stats != NULL) 3849 devstat_remove_entry(xbb->xbb_stats); 3850 3851 if (xbb->xbb_stats_in != NULL) 3852 devstat_remove_entry(xbb->xbb_stats_in); 3853 3854 xbb_close_backend(xbb); 3855 3856 if (xbb->dev_mode != NULL) { 3857 free(xbb->dev_mode, M_XENBUS); 3858 xbb->dev_mode = NULL; 3859 } 3860 3861 if (xbb->dev_type != NULL) { 3862 free(xbb->dev_type, M_XENBUS); 3863 xbb->dev_type = NULL; 3864 } 3865 3866 if (xbb->dev_name != NULL) { 3867 free(xbb->dev_name, M_XENBUS); 3868 xbb->dev_name = NULL; 3869 } 3870 3871 mtx_destroy(&xbb->lock); 3872 return (0); 3873 } 3874 3875 /** 3876 * Prepare this block back device for suspension of this VM. 3877 * 3878 * \param dev NewBus device object representing this Xen Block Back instance. 3879 * 3880 * \return 0 for success, errno codes for failure. 3881 */ 3882 static int 3883 xbb_suspend(device_t dev) 3884 { 3885 #ifdef NOT_YET 3886 struct xbb_softc *sc = device_get_softc(dev); 3887 3888 /* Prevent new requests being issued until we fix things up. */ 3889 mtx_lock(&sc->xb_io_lock); 3890 sc->connected = BLKIF_STATE_SUSPENDED; 3891 mtx_unlock(&sc->xb_io_lock); 3892 #endif 3893 3894 return (0); 3895 } 3896 3897 /** 3898 * Perform any processing required to recover from a suspended state. 3899 * 3900 * \param dev NewBus device object representing this Xen Block Back instance. 3901 * 3902 * \return 0 for success, errno codes for failure. 3903 */ 3904 static int 3905 xbb_resume(device_t dev) 3906 { 3907 return (0); 3908 } 3909 3910 /** 3911 * Handle state changes expressed via the XenStore by our front-end peer. 3912 * 3913 * \param dev NewBus device object representing this Xen 3914 * Block Back instance. 3915 * \param frontend_state The new state of the front-end. 3916 * 3917 * \return 0 for success, errno codes for failure. 
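 *
 * In short: Initialised and Connected trigger xbb_connect();
 * Closing and Closed trigger xbb_shutdown(), with Closed also
 * mirrored back to our own XenStore state; Initialising is ignored;
 * any other state is reported as a fatal error.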
3918 */ 3919 static void 3920 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3921 { 3922 struct xbb_softc *xbb = device_get_softc(dev); 3923 3924 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3925 xenbus_strstate(frontend_state), 3926 xenbus_strstate(xenbus_get_state(xbb->dev))); 3927 3928 switch (frontend_state) { 3929 case XenbusStateInitialising: 3930 break; 3931 case XenbusStateInitialised: 3932 case XenbusStateConnected: 3933 xbb_connect(xbb); 3934 break; 3935 case XenbusStateClosing: 3936 case XenbusStateClosed: 3937 mtx_lock(&xbb->lock); 3938 xbb_shutdown(xbb); 3939 mtx_unlock(&xbb->lock); 3940 if (frontend_state == XenbusStateClosed) 3941 xenbus_set_state(xbb->dev, XenbusStateClosed); 3942 break; 3943 default: 3944 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3945 frontend_state); 3946 break; 3947 } 3948 } 3949 3950 /*---------------------------- NewBus Registration ---------------------------*/ 3951 static device_method_t xbb_methods[] = { 3952 /* Device interface */ 3953 DEVMETHOD(device_probe, xbb_probe), 3954 DEVMETHOD(device_attach, xbb_attach), 3955 DEVMETHOD(device_detach, xbb_detach), 3956 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3957 DEVMETHOD(device_suspend, xbb_suspend), 3958 DEVMETHOD(device_resume, xbb_resume), 3959 3960 /* Xenbus interface */ 3961 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3962 3963 { 0, 0 } 3964 }; 3965 3966 static driver_t xbb_driver = { 3967 "xbbd", 3968 xbb_methods, 3969 sizeof(struct xbb_softc), 3970 }; 3971 devclass_t xbb_devclass; 3972 3973 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3974
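/*
 * Illustrative only: the backend directory that xenbusb_back hands to
 * xbb_probe() and xbb_attach() is conventionally laid out roughly as
 * follows (the path and values below are an example, not taken from
 * this file):
 *
 *	backend/vbd/<front-end domid>/<handle>/
 *		mode   = "w"           (parsed for fopen-style flags)
 *		params = "/dev/ada0"    (device node or file path)
 *
 * xbb_attach() reads "mode" and "params", publishes the feature and
 * limit nodes seen above, and then waits in XenbusStateInitWait for
 * the front end to publish its half of the negotiation.
 */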