/*-
 * Copyright (c) 2009-2011 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
 *          Ken Merry           (Spectra Logic Corporation)
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/**
 * \file blkback.c
 *
 * \brief Device driver supporting the vending of block storage from
 *        a FreeBSD domain to other domains.
 */

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
#include <sys/sdt.h>

#include <geom/geom.h>

#include <machine/_inttypes.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <xen/xen-os.h>
#include <xen/blkif.h>
#include <xen/gnttab.h>
#include <xen/xen_intr.h>

#include <xen/interface/event_channel.h>
#include <xen/interface/grant_table.h>

#include <xen/xenbus/xenbusvar.h>

/*--------------------------- Compile-time Tunables --------------------------*/
/**
 * The maximum number of outstanding request blocks (request headers plus
 * additional segment blocks) we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_REQUESTS	256

/**
 * \brief Define to force all I/O to be performed on memory owned by the
 *        backend device, with a copy-in/out to the remote domain's memory.
 *
 * \note This option is currently required when this driver's domain is
 *       operating in HVM mode on a system using an IOMMU.
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves.  When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain.  This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API.  For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of shared memory ring pages we will allow in a
 * negotiated block-front/back communication channel.  Allow enough
 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
 */
#define	XBB_MAX_RING_PAGES						    \
	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
		       * XBB_MAX_REQUESTS)
/**
 * The maximum number of segments that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
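/*
 * Editorial sizing sketch (illustrative values only, not taken from the
 * system headers): with a 4KB PAGE_SIZE, if BLKIF_MAX_SEGMENTS_PER_REQUEST
 * times PAGE_SIZE exceeds MAXPHYS (historically 128KB), then
 * XBB_MAX_REQUEST_SIZE reduces to MAXPHYS and XBB_MAX_SEGMENTS_PER_REQUEST
 * works out to at most MAXPHYS/PAGE_SIZE + 1 = 33 segments (32 whole pages
 * plus one extra to cover a transfer that does not begin on a page
 * boundary), further capped by UIO_MAXIOV and
 * BLKIF_MAX_SEGMENTS_PER_REQUEST.
 */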
/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...) __attribute__((format(printf, 3, 4)));
static int  xbb_shutdown(struct xbb_softc *xbb);
static int  xbb_detach(device_t dev);

/*------------------------------ Data Structures -----------------------------*/

STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);

typedef enum {
	XBB_REQLIST_NONE	= 0x00,
	XBB_REQLIST_MAPPED	= 0x01
} xbb_reqlist_flags;

struct xbb_xen_reqlist {
	/**
	 * Back reference to the parent block back instance for this
	 * request.  Used during bio_done handling.
	 */
	struct xbb_softc *xbb;

	/**
	 * BLKIF_OP code for this request.
	 */
	int operation;

	/**
	 * Set to BLKIF_RSP_* to indicate request status.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int status;

	/**
	 * Number of 512 byte sectors not transferred.
	 */
	int residual_512b_sectors;

	/**
	 * Starting sector number of the first request in the list.
	 */
	off_t starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int num_children;

	/**
	 * Number of I/O requests dispatched to the backend.
	 */
	int pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O into our domain's address space.
	 */
	uint8_t *kva;

	/**
	 * Base, pseudo-physical address, corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t gnt_base;


#ifdef XBB_USE_BOUNCE_BUFFERS
	/**
	 * Pre-allocated domain local memory used to proxy remote
	 * domain memory during I/O operations.
	 */
	uint8_t *bounce;
#endif

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t *gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int nr_512b_sectors;

	/**
	 * The number of struct bio requests still outstanding for this
	 * request on the backend device.  This field is only used for
	 * device (rather than file) backed I/O.
	 */
	int pendcnt;

	/**
	 * BLKIF_OP code for this request.
	 */
	int operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};
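/*
 * Editorial example (illustrative values only): an S/G element with
 * first_sect = 1 and last_sect = 6 describes nsect = 6 512-byte chunks,
 * beginning 512 bytes into the machine page referenced by that element.
 */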
/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access. */
	struct cdev *cdev;

	/** Cdev switch used for device backend access. */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS

	/**
	 * \brief Array of io vectors used to handle bouncing of file reads.
	 *
	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
	 * we need some of the data from the original uio in order to
	 * bounce-out the read data.  This array serves as the temporary
	 * storage for this saved data.
	 */
	struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * \brief Array of memoized bounce buffer kva offsets used
	 *        in the file based backend.
	 *
	 * Due to the way that the mapping of the memory backing an
	 * I/O transaction is handled by Xen, a second pass through
	 * the request sg elements is unavoidable.  We memoize the computed
	 * bounce address here to reduce the cost of the second walk.
	 */
	void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
#endif /* XBB_USE_BOUNCE_BUFFERS */
};

/**
 * Collection of backend type specific data.
 */
union xbb_backend_data {
	struct xbb_dev_data  dev;
	struct xbb_file_data file;
};

/**
 * Function signature of backend specific I/O handlers.
 */
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
			      struct xbb_xen_reqlist *reqlist, int operation,
			      int flags);

/**
 * Per-instance configuration data.
 */
struct xbb_softc {

	/**
	 * Task-queue used to process I/O requests.
	 */
	struct taskqueue *io_taskqueue;

	/**
	 * Single "run the request queue" task enqueued
	 * on io_taskqueue.
	 */
	struct task io_task;

	/** Device type for this instance. */
	xbb_type device_type;

	/** NewBus device corresponding to this instance. */
	device_t dev;

	/** Backend specific dispatch routine for this instance. */
	xbb_dispatch_t dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t gnt_base_addr;

	/** The size of the global kva pool. */
	int kva_size;

	/** The size of the KVA area used for request lists. */
	int reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int reqlist_kva_pages;

	/** Bitmap of free KVA pages. */
	bitstr_t *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *        that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode *vn;

	union xbb_backend_data backend;

	/** The native sector size of the backend. */
	u_int sector_size;

	/** log2 of sector_size. */
	u_int sector_size_shift;

	/** Size in bytes of the backend device or file. */
	off_t media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *        sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *        conversion of blkif ring requests to internal xbb_xen_req
	 *        structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io().  When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx lock;

#ifdef XENHVM
	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int pseudo_phys_res_id;
#endif

	/**
	 * I/O statistics from BlockBack dispatch down.  These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack.  These are the requests as
	 * we get them from BlockFront.  They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat *xbb_stats_in;

	/** Disable sending flush to the backend. */
	int disable_flush;

	/** Send a real flush for every N flush requests. */
	int flush_interval;

	/** Count of flush requests in the interval. */
	int flush_count;

	/** Don't coalesce requests if this is set. */
	int no_coalesce_reqs;

	/** Number of requests we have received. */
	uint64_t reqs_received;

	/** Number of requests we have completed. */
	uint64_t reqs_completed;

	/** How many forced dispatches (i.e. without coalescing) have happened. */
	uint64_t forced_dispatch;

	/** How many normal dispatches have happened. */
	uint64_t normal_dispatch;

	/** How many total dispatches have happened. */
	uint64_t total_dispatch;

	/** How many times we have run out of KVA. */
	uint64_t kva_shortages;

	/** How many times we have run out of request structures. */
	uint64_t request_shortages;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_req structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_req *
xbb_get_req(struct xbb_softc *xbb)
{
	struct xbb_xen_req *req;

	req = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
		STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
		xbb->active_request_count++;
	}

	return (req);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 * \param req  The request structure to free.
 */
static inline void
xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
	mtx_assert(&xbb->lock, MA_OWNED);

	STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
	xbb->active_request_count--;

	KASSERT(xbb->active_request_count >= 0,
		("xbb_release_req: negative active count"));
}

/**
 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param req_list  The list of requests to free.
 * \param nreqs     The number of items in the list.
 */
static inline void
xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
		 int nreqs)
{
	mtx_assert(&xbb->lock, MA_OWNED);

	STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
	xbb->active_request_count -= nreqs;

	KASSERT(xbb->active_request_count >= 0,
		("xbb_release_reqs: negative active count"));
}

/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's kva region.
 *
 * \param reqlist  The request structure whose kva region will be accessed.
 * \param pagenr   The page index used to compute the kva offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 kva offset.
 *
 * \return  The computed global KVA offset.
 */
static inline uint8_t *
xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}

#ifdef XBB_USE_BOUNCE_BUFFERS
/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's local bounce memory region.
 *
 * \param reqlist  The request structure whose bounce region will be accessed.
 * \param pagenr   The page index used to compute the bounce offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 bounce offset.
 *
 * \return  The computed global bounce buffer address.
 */
static inline uint8_t *
xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
#endif

/**
 * Given a page number and 512b sector offset within that page,
 * calculate an offset into the request's memory region that the
 * underlying backend device/file should use for I/O.
 *
 * \param reqlist  The request structure whose I/O region will be accessed.
 * \param pagenr   The page index used to compute the I/O offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 I/O offset.
 *
 * \return  The computed global I/O address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist  The request list structure whose pseudo-physical region
 *                 will be accessed.
 * \param pagenr   The page index used to compute the pseudo-physical offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	return ((uintptr_t)(xbb->gnt_base_addr +
	    (uintptr_t)(reqlist->kva - xbb->kva) +
	    (PAGE_SIZE * pagenr) + (sector << 9)));
}
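/*
 * Editorial note on the address helpers above: for a given (pagenr, sector)
 * pair they all compute the same byte offset, pagenr * PAGE_SIZE +
 * (sector << 9), and add it to a different base -- the reqlist KVA, the
 * bounce buffer, or gnt_base_addr plus the reqlist's offset within the
 * global KVA pool.  For example (assuming 4KB pages), pagenr 2 and sector 3
 * resolve to byte offset 8192 + 1536 = 9728 from the chosen base.
 */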
/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb         Per-instance xbb configuration structure.
 * \param nr_pages    Number of pages needed.
 * \param check_only  If set, check for free KVA but don't allocate it.
 * \param have_lock   If set, xbb lock is already held.
 *
 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
 *
 * Note:  This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio.  At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	intptr_t first_clear;
	intptr_t num_clear;
	uint8_t *free_kva;
	int      i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page.  If there are none, we're done.
	 */
	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);

	if (first_clear == -1)
		goto bailout;

	/*
	 * Starting at the first available page, look for consecutive free
	 * pages that will satisfy the user's request.
	 */
	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
		/*
		 * If this is true, the page is used, so we have to reset
		 * the number of clear pages and the first clear page
		 * (since it pointed to a region with an insufficient number
		 * of clear pages).
		 */
		if (bit_test(xbb->kva_free, i)) {
			num_clear = 0;
			first_clear = -1;
			continue;
		}

		if (first_clear == -1)
			first_clear = i;

		/*
		 * If this is true, we've found a large enough free region
		 * to satisfy the request.
		 */
		if (++num_clear == nr_pages) {

			bit_nset(xbb->kva_free, first_clear,
				 first_clear + nr_pages - 1);

			free_kva = xbb->kva +
				(uint8_t *)(first_clear * PAGE_SIZE);

			KASSERT(free_kva >= (uint8_t *)xbb->kva &&
				free_kva + (nr_pages * PAGE_SIZE) <=
				(uint8_t *)xbb->ring_config.va,
				("Free KVA %p len %d out of range, "
				 "kva = %#jx, ring VA = %#jx\n", free_kva,
				 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
				 (uintmax_t)xbb->ring_config.va));
			break;
		}
	}

bailout:

	if (free_kva == NULL) {
		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
		xbb->kva_shortages++;
	}

	mtx_unlock(&xbb->lock);

	return (free_kva);
}

/**
 * Free allocated KVA.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param kva_ptr   Pointer to allocated KVA region.
 * \param nr_pages  Number of pages in the KVA region.
 */
static void
xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
{
	intptr_t start_page;

	mtx_assert(&xbb->lock, MA_OWNED);

	start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
	bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);

}

/**
 * Unmap the front-end pages associated with this I/O request.
 *
 * \param reqlist  The request list structure to unmap.
 */
static void
xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
{
	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
	u_int			      i;
	u_int			      invcount;
	int			      error;

	invcount = 0;
	for (i = 0; i < reqlist->nr_segments; i++) {

		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
			continue;

		unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
		unmap[invcount].dev_bus_addr = 0;
		unmap[invcount].handle       = reqlist->gnt_handles[i];
		reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
		invcount++;
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					  unmap, invcount);
	KASSERT(error == 0, ("Grant table operation failed"));
}

/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_reqlist *
xbb_get_reqlist(struct xbb_softc *xbb)
{
	struct xbb_xen_reqlist *reqlist;

	reqlist = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {

		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
		reqlist->flags = XBB_REQLIST_NONE;
		reqlist->kva = NULL;
		reqlist->status = BLKIF_RSP_OKAY;
		reqlist->residual_512b_sectors = 0;
		reqlist->num_children = 0;
		reqlist->nr_segments = 0;
		STAILQ_INIT(&reqlist->contig_req_list);
	}

	return (reqlist);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  The request list structure to free.
 * \param wakeup   If set, wakeup the work thread if freeing this reqlist
 *                 during a resource shortage condition.
 */
static inline void
xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
		    int wakeup)
{

	mtx_lock(&xbb->lock);

	if (wakeup) {
		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
	}

	if (reqlist->kva != NULL)
		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);

	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);

	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);

	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		/*
		 * Shutdown is in progress.  See if we can
		 * progress further now that one more request
		 * has completed and been returned to the
		 * free pool.
		 */
		xbb_shutdown(xbb);
	}

	mtx_unlock(&xbb->lock);

	if (wakeup != 0)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}

/**
 * Request resources and do basic request setup.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param reqlist   Pointer to reqlist pointer.
 * \param ring_req  Pointer to a block ring request.
 * \param ring_idx  The ring index of this request.
 *
 * \return  0 for success, non-zero for failure.
 */
static int
xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
		  blkif_request_t *ring_req, RING_IDX ring_idx)
{
	struct xbb_xen_reqlist *nreqlist;
	struct xbb_xen_req     *nreq;

	nreqlist = NULL;
	nreq     = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * We don't allow new resources to be allocated if we're in the
	 * process of shutting down.
	 */
	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		mtx_unlock(&xbb->lock);
		return (1);
	}

	/*
	 * Allocate a reqlist if the caller doesn't have one already.
	 */
	if (*reqlist == NULL) {
		nreqlist = xbb_get_reqlist(xbb);
		if (nreqlist == NULL)
			goto bailout_error;
	}

	/* We always allocate a request. */
	nreq = xbb_get_req(xbb);
	if (nreq == NULL)
		goto bailout_error;

	mtx_unlock(&xbb->lock);

	if (*reqlist == NULL) {
		*reqlist = nreqlist;
		nreqlist->operation = ring_req->operation;
		nreqlist->starting_sector_number = ring_req->sector_number;
		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
				   links);
	}

	nreq->reqlist = *reqlist;
	nreq->req_ring_idx = ring_idx;
	nreq->id = ring_req->id;

	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
		nreq->ring_req = &nreq->ring_req_storage;
	} else {
		nreq->ring_req = ring_req;
	}

	binuptime(&nreq->ds_t0);
	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
	(*reqlist)->num_children++;
	(*reqlist)->nr_segments += ring_req->nr_segments;

	return (0);

bailout_error:

	/*
	 * We're out of resources, so set the shortage flag.  The next time
	 * a request is released, we'll try waking up the work thread to
	 * see if we can allocate more resources.
	 */
	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
	xbb->request_shortages++;

	if (nreq != NULL)
		xbb_release_req(xbb, nreq);

	mtx_unlock(&xbb->lock);

	if (nreqlist != NULL)
		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);

	return (1);
}

/**
 * Create and transmit a response to a blkif request.
 *
 * \param xbb     Per-instance xbb configuration structure.
 * \param req     The request structure to which to respond.
 * \param status  The status code to report.  See BLKIF_RSP_*
 *                in sys/xen/interface/io/blkif.h.
 */
static void
xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
	blkif_response_t *resp;
	int		  more_to_do;
	int		  notify;

	more_to_do = 0;

	/*
	 * Place on the response ring for the relevant domain.
	 * For now, only the spacing between entries is different
	 * in the different ABIs, not the response entry layout.
	 */
	mtx_lock(&xbb->lock);
	switch (xbb->abi) {
	case BLKIF_PROTOCOL_NATIVE:
		resp = RING_GET_RESPONSE(&xbb->rings.native,
					 xbb->rings.native.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_32:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_32,
				      xbb->rings.x86_32.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_64:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_64,
				      xbb->rings.x86_64.rsp_prod_pvt);
		break;
	default:
		panic("Unexpected blkif protocol ABI.");
	}

	resp->id        = req->id;
	resp->operation = req->operation;
	resp->status    = status;

	xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);

	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {

		/*
		 * Tail check for pending requests.  Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {

		more_to_do = 1;
	}

	xbb->reqs_completed++;

	mtx_unlock(&xbb->lock);

	if (more_to_do)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	if (notify)
		xen_intr_signal(xbb->xen_intr_handle);
}

/**
 * Complete a request list.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 */
static void
xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_xen_req *nreq;
	off_t		    sectors_sent;

	sectors_sent = 0;

	if (reqlist->flags & XBB_REQLIST_MAPPED)
		xbb_unmap_reqlist(reqlist);

	/*
	 * All I/O is done, send the response.  A lock should not be
	 * necessary here because the request list is complete, and
	 * therefore this is the only context accessing this request
	 * right now.  The functions we call do their own locking if
	 * necessary.
	 */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		off_t cur_sectors_sent;

		xbb_send_response(xbb, nreq, reqlist->status);

		/* We don't report bytes sent if there is an error. */
		if (reqlist->status == BLKIF_RSP_OKAY)
			cur_sectors_sent = nreq->nr_512b_sectors;
		else
			cur_sectors_sent = 0;

		sectors_sent += cur_sectors_sent;

		devstat_end_transaction(xbb->xbb_stats_in,
					/*bytes*/cur_sectors_sent << 9,
					reqlist->ds_tag_type,
					reqlist->ds_trans_type,
					/*now*/NULL,
					/*then*/&nreq->ds_t0);
	}

	/*
	 * Take out any sectors not sent.  If we wind up negative (which
	 * might happen if an error is reported as well as a residual), just
	 * report 0 sectors sent.
	 */
	sectors_sent -= reqlist->residual_512b_sectors;
	if (sectors_sent < 0)
		sectors_sent = 0;

	devstat_end_transaction(xbb->xbb_stats,
				/*bytes*/ sectors_sent << 9,
				reqlist->ds_tag_type,
				reqlist->ds_trans_type,
				/*now*/NULL,
				/*then*/&reqlist->ds_t0);

	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
}

/**
 * Completion handler for buffer I/O requests issued by the device
 * backend driver.
 *
 * \param bio  The buffer I/O request on which to perform completion
 *             processing.
 */
static void
xbb_bio_done(struct bio *bio)
{
	struct xbb_softc       *xbb;
	struct xbb_xen_reqlist *reqlist;

	reqlist = bio->bio_caller1;
	xbb     = reqlist->xbb;

	reqlist->residual_512b_sectors += bio->bio_resid >> 9;

	/*
	 * This is a bit imprecise.  With aggregated I/O a single
	 * request list can contain multiple front-end requests and
	 * multiple bios may point to a single request.  By carefully
	 * walking the request list, we could map residuals and errors
	 * back to the original front-end request, but the interface
	 * isn't sufficiently rich for us to properly report the error.
	 * So, we just treat the entire request list as having failed if an
	 * error occurs on any part.  And, if an error occurs, we treat
	 * the amount of data transferred as 0.
	 *
	 * For residuals, we report them on the overall aggregated device,
	 * but not on the individual requests, since we don't currently
	 * do the work to determine which front-end request the residual
	 * applies to.
	 */
	if (bio->bio_error) {
		DPRINTF("BIO returned error %d for operation on device %s\n",
			bio->bio_error, xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;

		if (bio->bio_error == ENXIO
		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {

			/*
			 * Backend device has disappeared.  Signal the
			 * front-end that we (the device proxy) want to
			 * go away.
			 */
			xenbus_set_state(xbb->dev, XenbusStateClosing);
		}
	}

#ifdef XBB_USE_BOUNCE_BUFFERS
	if (bio->bio_cmd == BIO_READ) {
		vm_offset_t kva_offset;

		kva_offset = (vm_offset_t)bio->bio_data
			   - (vm_offset_t)reqlist->bounce;
		memcpy((uint8_t *)reqlist->kva + kva_offset,
		       bio->bio_data, bio->bio_bcount);
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

	/*
	 * Decrement the pending count for the request list.  When we're
	 * done with the requests, send status back for all of them.
	 */
	if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
		xbb_complete_reqlist(xbb, reqlist);

	g_destroy_bio(bio);
}

/**
 * Parse a blkif request into an internal request structure and send
 * it to the backend for processing.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 *
 * \return  On success, 0.  For resource shortages, non-zero.
 *
 * This routine performs the backend common aspects of request parsing
 * including compiling an internal request structure, parsing the S/G
 * list and any secondary ring requests in which they may reside, and
 * the mapping of front-end I/O pages into our domain.
 */
static int
xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_sg                *xbb_sg;
	struct gnttab_map_grant_ref  *map;
	struct blkif_request_segment *sg;
	struct blkif_request_segment *last_block_sg;
	struct xbb_xen_req	     *nreq;
	u_int			      nseg;
	u_int			      seg_idx;
	u_int			      block_segs;
	int			      nr_sects;
	int			      total_sects;
	int			      operation;
	uint8_t			      bio_flags;
	int			      error;

	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
	bio_flags            = 0;
	total_sects          = 0;
	nr_sects             = 0;

	/*
	 * First determine whether we have enough free KVA to satisfy this
	 * request list.  If not, tell xbb_run_queue() so it can go to
	 * sleep until we have more KVA.
	 */
	reqlist->kva = NULL;
	if (reqlist->nr_segments != 0) {
		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
		if (reqlist->kva == NULL) {
			/*
			 * If we're out of KVA, return ENOMEM.
			 */
			return (ENOMEM);
		}
	}

	binuptime(&reqlist->ds_t0);
	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);

	switch (reqlist->operation) {
	case BLKIF_OP_WRITE_BARRIER:
		bio_flags            |= BIO_ORDERED;
		reqlist->ds_tag_type  = DEVSTAT_TAG_ORDERED;
		/* FALLTHROUGH */
	case BLKIF_OP_WRITE:
		operation = BIO_WRITE;
		reqlist->ds_trans_type = DEVSTAT_WRITE;
		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
			DPRINTF("Attempt to write to read only device %s\n",
				xbb->dev_name);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
		break;
	case BLKIF_OP_READ:
		operation = BIO_READ;
		reqlist->ds_trans_type = DEVSTAT_READ;
		break;
	case BLKIF_OP_FLUSH_DISKCACHE:
		/*
		 * If this is true, the user has requested that we disable
		 * flush support.  So we just complete the requests
		 * successfully.
		 */
		if (xbb->disable_flush != 0) {
			goto send_response;
		}

		/*
		 * The user has requested that we only send a real flush
		 * for every N flush requests.  So keep count, and either
		 * complete the request immediately or queue it for the
		 * backend.
		 */
		if (xbb->flush_interval != 0) {
			if (++(xbb->flush_count) < xbb->flush_interval) {
				goto send_response;
			} else
				xbb->flush_count = 0;
		}

		operation = BIO_FLUSH;
		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
		goto do_dispatch;
		/*NOTREACHED*/
	default:
		DPRINTF("error: unknown block io operation [%d]\n",
			reqlist->operation);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	reqlist->xbb = xbb;
	xbb_sg       = xbb->xbb_sgs;
	map          = xbb->maps;
	seg_idx      = 0;

	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		blkif_request_t *ring_req;
		RING_IDX         req_ring_idx;
		u_int            req_seg_idx;

		ring_req              = nreq->ring_req;
		req_ring_idx          = nreq->req_ring_idx;
		nr_sects              = 0;
		nseg                  = ring_req->nr_segments;
		nreq->nr_pages        = nseg;
		nreq->nr_512b_sectors = 0;
		req_seg_idx           = 0;
		sg                    = NULL;

		/* Check that number of segments is sane. */
		if (__predict_false(nseg == 0)
		 || __predict_false(nseg > xbb->max_request_segments)) {
			DPRINTF("Bad number of segments in request (%d)\n",
				nseg);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		block_segs    = MIN(nreq->nr_pages,
				    BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
		sg            = ring_req->seg;
		last_block_sg = sg + block_segs;
		while (1) {

			while (sg < last_block_sg) {
				KASSERT(seg_idx <
					XBB_MAX_SEGMENTS_PER_REQLIST,
					("seg_idx %d is too large, max "
					 "segs %d\n", seg_idx,
					 XBB_MAX_SEGMENTS_PER_REQLIST));

				xbb_sg->first_sect = sg->first_sect;
				xbb_sg->last_sect  = sg->last_sect;
				xbb_sg->nsect =
				    (int8_t)(sg->last_sect -
				    sg->first_sect + 1);

				if ((sg->last_sect >= (PAGE_SIZE >> 9))
				 || (xbb_sg->nsect <= 0)) {
					reqlist->status = BLKIF_RSP_ERROR;
					goto send_response;
				}

				nr_sects += xbb_sg->nsect;
				map->host_addr = xbb_get_gntaddr(reqlist,
							seg_idx, /*sector*/0);
				KASSERT(map->host_addr + PAGE_SIZE <=
					xbb->ring_config.gnt_addr,
					("Host address %#jx len %d overlaps "
					 "ring address %#jx\n",
					(uintmax_t)map->host_addr, PAGE_SIZE,
					(uintmax_t)xbb->ring_config.gnt_addr));

				map->flags = GNTMAP_host_map;
				map->ref   = sg->gref;
				map->dom   = xbb->otherend_id;
				if (operation == BIO_WRITE)
					map->flags |= GNTMAP_readonly;
				sg++;
				map++;
				xbb_sg++;
				seg_idx++;
				req_seg_idx++;
			}

			block_segs = MIN(nseg - req_seg_idx,
					 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
			if (block_segs == 0)
				break;

			/*
			 * Fetch the next request block full of SG elements.
			 * For now, only the spacing between entries is
			 * different in the different ABIs, not the sg entry
			 * layout.
			 */
			req_ring_idx++;
			switch (xbb->abi) {
			case BLKIF_PROTOCOL_NATIVE:
				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
							   req_ring_idx);
				break;
			case BLKIF_PROTOCOL_X86_32:
			{
				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
							   req_ring_idx);
				break;
			}
			case BLKIF_PROTOCOL_X86_64:
			{
				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
							   req_ring_idx);
				break;
			}
			default:
				panic("Unexpected blkif protocol ABI.");
				/* NOTREACHED */
			}
			last_block_sg = sg + block_segs;
		}

		/* Convert to the disk's sector size */
		nreq->nr_512b_sectors = nr_sects;
		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
		total_sects += nr_sects;

		if ((nreq->nr_512b_sectors &
		    ((xbb->sector_size >> 9) - 1)) != 0) {
			device_printf(xbb->dev, "%s: I/O size (%d) is not "
				      "a multiple of the backing store sector "
				      "size (%d)\n", __func__,
				      nreq->nr_512b_sectors << 9,
				      xbb->sector_size);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					  xbb->maps, reqlist->nr_segments);
	if (error != 0)
		panic("Grant table operation failed (%d)", error);

	reqlist->flags |= XBB_REQLIST_MAPPED;

	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
	     seg_idx++, map++){

		if (__predict_false(map->status != 0)) {
			DPRINTF("invalid buffer -- could not remap "
				"it (%d)\n", map->status);
			DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
				"0x%x ref 0x%x, dom %d\n", seg_idx,
				map->host_addr, map->flags, map->ref,
				map->dom);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		reqlist->gnt_handles[seg_idx] = map->handle;
	}
	if (reqlist->starting_sector_number + total_sects >
	    xbb->media_num_sectors) {

		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
			"extends past end of device %s\n",
			operation == BIO_READ ? "read" : "write",
			reqlist->starting_sector_number,
			reqlist->starting_sector_number + total_sects,
			xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

do_dispatch:

	error = xbb->dispatch_io(xbb,
				 reqlist,
				 operation,
				 bio_flags);

	if (error != 0) {
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	return (0);

send_response:

	xbb_complete_reqlist(xbb, reqlist);

	return (0);
}

static __inline int
xbb_count_sects(blkif_request_t *ring_req)
{
	int i;
	int cur_size = 0;

	for (i = 0; i < ring_req->nr_segments; i++) {
		int nsect;

		nsect = (int8_t)(ring_req->seg[i].last_sect -
			 ring_req->seg[i].first_sect + 1);
		if (nsect <= 0)
			break;

		cur_size += nsect;
	}

	return (cur_size);
}
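/*
 * Editorial example for xbb_count_sects() (illustrative only): a request
 * with two segments whose (first_sect, last_sect) pairs are (0, 7) and
 * (0, 3) counts 8 + 4 = 12 512-byte sectors, i.e. 6KB of payload.
 */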
/**
 * Process incoming requests from the shared communication ring in response
 * to a signal on the ring's event channel.
 *
 * \param context  Callback argument registered during task initialization -
 *                 the xbb_softc for this instance.
 * \param pending  The number of taskqueue_enqueue events that have
 *                 occurred since this handler was last run.
 */
static void
xbb_run_queue(void *context, int pending)
{
	struct xbb_softc       *xbb;
	blkif_back_rings_t     *rings;
	RING_IDX		rp;
	uint64_t		cur_sector;
	int			cur_operation;
	struct xbb_xen_reqlist *reqlist;


	xbb   = (struct xbb_softc *)context;
	rings = &xbb->rings;

	/*
	 * Work gather and dispatch loop.  Note that we have a bias here
	 * towards gathering I/O sent by blockfront.  We first gather up
	 * everything in the ring, as long as we have resources.  Then we
	 * dispatch one request, and then attempt to gather up any
	 * additional requests that have come in while we were dispatching
	 * the request.
	 *
	 * This allows us to get a clearer picture (via devstat) of how
	 * many requests blockfront is queueing to us at any given time.
	 */
	for (;;) {
		int retval;

		/*
		 * Initialize reqlist to the last element in the pending
		 * queue, if there is one.  This allows us to add more
		 * requests to that request list, if we have room.
		 */
		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
				      xbb_xen_reqlist, links);
		if (reqlist != NULL) {
			cur_sector = reqlist->next_contig_sector;
			cur_operation = reqlist->operation;
		} else {
			cur_operation = 0;
			cur_sector    = 0;
		}

		/*
		 * Cache req_prod to avoid accessing a cache line shared
		 * with the frontend.
		 */
		rp = rings->common.sring->req_prod;

		/* Ensure we see queued requests up to 'rp'. */
		rmb();

		/**
		 * Run so long as there is work to consume and the generation
		 * of a response will not overflow the ring.
		 *
		 * @note There's a 1 to 1 relationship between requests and
		 *       responses, so an overflow should never occur.  This
		 *       test is to protect our domain from digesting bogus
		 *       data.  Shouldn't we log this?
		 */
		while (rings->common.req_cons != rp
		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
						  rings->common.req_cons) == 0){
			blkif_request_t	        ring_req_storage;
			blkif_request_t	       *ring_req;
			int			cur_size;

			switch (xbb->abi) {
			case BLKIF_PROTOCOL_NATIVE:
				ring_req = RING_GET_REQUEST(&xbb->rings.native,
				    rings->common.req_cons);
				break;
			case BLKIF_PROTOCOL_X86_32:
			{
				struct blkif_x86_32_request *ring_req32;

				ring_req32 = RING_GET_REQUEST(
				    &xbb->rings.x86_32, rings->common.req_cons);
				blkif_get_x86_32_req(&ring_req_storage,
						     ring_req32);
				ring_req = &ring_req_storage;
				break;
			}
			case BLKIF_PROTOCOL_X86_64:
			{
				struct blkif_x86_64_request *ring_req64;

				ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
				    rings->common.req_cons);
				blkif_get_x86_64_req(&ring_req_storage,
						     ring_req64);
				ring_req = &ring_req_storage;
				break;
			}
			default:
				panic("Unexpected blkif protocol ABI.");
				/* NOTREACHED */
			}

			/*
			 * Check for situations that would require closing
			 * off this I/O for further coalescing:
			 *  - Coalescing is turned off.
			 *  - Current I/O is out of sequence with the previous
			 *    I/O.
			 *  - Coalesced I/O would be too large.
1919 */ 1920 if ((reqlist != NULL) 1921 && ((xbb->no_coalesce_reqs != 0) 1922 || ((xbb->no_coalesce_reqs == 0) 1923 && ((ring_req->sector_number != cur_sector) 1924 || (ring_req->operation != cur_operation) 1925 || ((ring_req->nr_segments + reqlist->nr_segments) > 1926 xbb->max_reqlist_segments))))) { 1927 reqlist = NULL; 1928 } 1929 1930 /* 1931 * Grab and check for all resources in one shot. 1932 * If we can't get all of the resources we need, 1933 * the shortage is noted and the thread will get 1934 * woken up when more resources are available. 1935 */ 1936 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1937 xbb->rings.common.req_cons); 1938 1939 if (retval != 0) { 1940 /* 1941 * Resource shortage has been recorded. 1942 * We'll be scheduled to run once a request 1943 * object frees up due to a completion. 1944 */ 1945 break; 1946 } 1947 1948 /* 1949 * Signify that we can overwrite this request with 1950 * a response by incrementing our consumer index. 1951 * The response won't be generated until after 1952 * we've already consumed all necessary data out 1953 * of the version of the request in the ring buffer 1954 * (for native mode). We must update the consumer 1955 * index before issuing back-end I/O so there is 1956 * no possibility that it will complete and a 1957 * response be generated before we make room in 1958 * the queue for that response. 1959 */ 1960 xbb->rings.common.req_cons += 1961 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 1962 xbb->reqs_received++; 1963 1964 cur_size = xbb_count_sects(ring_req); 1965 cur_sector = ring_req->sector_number + cur_size; 1966 reqlist->next_contig_sector = cur_sector; 1967 cur_operation = ring_req->operation; 1968 } 1969 1970 /* Check for I/O to dispatch */ 1971 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1972 if (reqlist == NULL) { 1973 /* 1974 * We're out of work to do, put the task queue to 1975 * sleep. 1976 */ 1977 break; 1978 } 1979 1980 /* 1981 * Grab the first request off the queue and attempt 1982 * to dispatch it. 1983 */ 1984 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1985 1986 retval = xbb_dispatch_io(xbb, reqlist); 1987 if (retval != 0) { 1988 /* 1989 * xbb_dispatch_io() returns non-zero only when 1990 * there is a resource shortage. If that's the 1991 * case, re-queue this request on the head of the 1992 * queue, and go to sleep until we have more 1993 * resources. 1994 */ 1995 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1996 reqlist, links); 1997 break; 1998 } else { 1999 /* 2000 * If we still have anything on the queue after 2001 * removing the head entry, that is because we 2002 * met one of the criteria to create a new 2003 * request list (outlined above), and we'll call 2004 * that a forced dispatch for statistical purposes. 2005 * 2006 * Otherwise, if there is only one element on the 2007 * queue, we coalesced everything available on 2008 * the ring and we'll call that a normal dispatch. 2009 */ 2010 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2011 2012 if (reqlist != NULL) 2013 xbb->forced_dispatch++; 2014 else 2015 xbb->normal_dispatch++; 2016 2017 xbb->total_dispatch++; 2018 } 2019 } 2020 } 2021 2022 /** 2023 * Interrupt handler bound to the shared ring's event channel. 2024 * 2025 * \param arg Callback argument registered during event channel 2026 * binding - the xbb_softc for this instance. 2027 */ 2028 static int 2029 xbb_filter(void *arg) 2030 { 2031 struct xbb_softc *xbb; 2032 2033 /* Defer to taskqueue thread.
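 * Filter (interrupt) context must not sleep or perform lengthy work, so the ring is actually processed by xbb_run_queue() in the taskqueue's thread.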
*/ 2034 xbb = (struct xbb_softc *)arg; 2035 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2036 2037 return (FILTER_HANDLED); 2038 } 2039 2040 SDT_PROVIDER_DEFINE(xbb); 2041 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int"); 2042 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t", 2043 "uint64_t"); 2044 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int", 2045 "uint64_t", "uint64_t"); 2046 2047 /*----------------------------- Backend Handlers -----------------------------*/ 2048 /** 2049 * Backend handler for character device access. 2050 * 2051 * \param xbb Per-instance xbb configuration structure. 2052 * \param reqlist Allocated internal request list structure. 2053 * \param operation BIO_* I/O operation code. 2054 * \param bio_flags Additional bio_flag data to pass to any generated 2055 * bios (e.g. BIO_ORDERED).. 2056 * 2057 * \return 0 for success, errno codes for failure. 2058 */ 2059 static int 2060 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2061 int operation, int bio_flags) 2062 { 2063 struct xbb_dev_data *dev_data; 2064 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2065 struct xbb_xen_req *nreq; 2066 off_t bio_offset; 2067 struct bio *bio; 2068 struct xbb_sg *xbb_sg; 2069 u_int nbio; 2070 u_int bio_idx; 2071 u_int nseg; 2072 u_int seg_idx; 2073 int error; 2074 2075 dev_data = &xbb->backend.dev; 2076 bio_offset = (off_t)reqlist->starting_sector_number 2077 << xbb->sector_size_shift; 2078 error = 0; 2079 nbio = 0; 2080 bio_idx = 0; 2081 2082 if (operation == BIO_FLUSH) { 2083 nreq = STAILQ_FIRST(&reqlist->contig_req_list); 2084 bio = g_new_bio(); 2085 if (__predict_false(bio == NULL)) { 2086 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2087 error = ENOMEM; 2088 return (error); 2089 } 2090 2091 bio->bio_cmd = BIO_FLUSH; 2092 bio->bio_flags |= BIO_ORDERED; 2093 bio->bio_dev = dev_data->cdev; 2094 bio->bio_offset = 0; 2095 bio->bio_data = 0; 2096 bio->bio_done = xbb_bio_done; 2097 bio->bio_caller1 = nreq; 2098 bio->bio_pblkno = 0; 2099 2100 nreq->pendcnt = 1; 2101 2102 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2103 device_get_unit(xbb->dev)); 2104 2105 (*dev_data->csw->d_strategy)(bio); 2106 2107 return (0); 2108 } 2109 2110 xbb_sg = xbb->xbb_sgs; 2111 bio = NULL; 2112 nseg = reqlist->nr_segments; 2113 2114 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2115 2116 /* 2117 * KVA will not be contiguous, so any additional 2118 * I/O will need to be represented in a new bio. 2119 */ 2120 if ((bio != NULL) 2121 && (xbb_sg->first_sect != 0)) { 2122 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2123 printf("%s: Discontiguous I/O request " 2124 "from domain %d ends on " 2125 "non-sector boundary\n", 2126 __func__, xbb->otherend_id); 2127 error = EINVAL; 2128 goto fail_free_bios; 2129 } 2130 bio = NULL; 2131 } 2132 2133 if (bio == NULL) { 2134 /* 2135 * Make sure that the start of this bio is 2136 * aligned to a device sector. 
2137 */ 2138 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2139 printf("%s: Misaligned I/O request " 2140 "from domain %d\n", __func__, 2141 xbb->otherend_id); 2142 error = EINVAL; 2143 goto fail_free_bios; 2144 } 2145 2146 bio = bios[nbio++] = g_new_bio(); 2147 if (__predict_false(bio == NULL)) { 2148 error = ENOMEM; 2149 goto fail_free_bios; 2150 } 2151 bio->bio_cmd = operation; 2152 bio->bio_flags |= bio_flags; 2153 bio->bio_dev = dev_data->cdev; 2154 bio->bio_offset = bio_offset; 2155 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2156 xbb_sg->first_sect); 2157 bio->bio_done = xbb_bio_done; 2158 bio->bio_caller1 = reqlist; 2159 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2160 } 2161 2162 bio->bio_length += xbb_sg->nsect << 9; 2163 bio->bio_bcount = bio->bio_length; 2164 bio_offset += xbb_sg->nsect << 9; 2165 2166 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2167 2168 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2169 printf("%s: Discontiguous I/O request " 2170 "from domain %d ends on " 2171 "non-sector boundary\n", 2172 __func__, xbb->otherend_id); 2173 error = EINVAL; 2174 goto fail_free_bios; 2175 } 2176 /* 2177 * KVA will not be contiguous, so any additional 2178 * I/O will need to be represented in a new bio. 2179 */ 2180 bio = NULL; 2181 } 2182 } 2183 2184 reqlist->pendcnt = nbio; 2185 2186 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2187 { 2188 #ifdef XBB_USE_BOUNCE_BUFFERS 2189 vm_offset_t kva_offset; 2190 2191 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2192 - (vm_offset_t)reqlist->bounce; 2193 if (operation == BIO_WRITE) { 2194 memcpy(bios[bio_idx]->bio_data, 2195 (uint8_t *)reqlist->kva + kva_offset, 2196 bios[bio_idx]->bio_bcount); 2197 } 2198 #endif 2199 if (operation == BIO_READ) { 2200 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2201 device_get_unit(xbb->dev), 2202 bios[bio_idx]->bio_offset, 2203 bios[bio_idx]->bio_length); 2204 } else if (operation == BIO_WRITE) { 2205 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2206 device_get_unit(xbb->dev), 2207 bios[bio_idx]->bio_offset, 2208 bios[bio_idx]->bio_length); 2209 } 2210 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2211 } 2212 2213 return (error); 2214 2215 fail_free_bios: 2216 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2217 g_destroy_bio(bios[bio_idx]); 2218 2219 return (error); 2220 } 2221 2222 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int"); 2223 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t", 2224 "uint64_t"); 2225 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int", 2226 "uint64_t", "uint64_t"); 2227 2228 /** 2229 * Backend handler for file access. 2230 * 2231 * \param xbb Per-instance xbb configuration structure. 2232 * \param reqlist Allocated internal request list. 2233 * \param operation BIO_* I/O operation code. 2234 * \param flags Additional bio_flag data to pass to any generated bios 2235 * (e.g. BIO_ORDERED).. 2236 * 2237 * \return 0 for success, errno codes for failure. 
2238 */ 2239 static int 2240 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2241 int operation, int flags) 2242 { 2243 struct xbb_file_data *file_data; 2244 u_int seg_idx; 2245 u_int nseg; 2246 off_t sectors_sent; 2247 struct uio xuio; 2248 struct xbb_sg *xbb_sg; 2249 struct iovec *xiovec; 2250 #ifdef XBB_USE_BOUNCE_BUFFERS 2251 void **p_vaddr; 2252 int saved_uio_iovcnt; 2253 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2254 int error; 2255 2256 file_data = &xbb->backend.file; 2257 sectors_sent = 0; 2258 error = 0; 2259 bzero(&xuio, sizeof(xuio)); 2260 2261 switch (operation) { 2262 case BIO_READ: 2263 xuio.uio_rw = UIO_READ; 2264 break; 2265 case BIO_WRITE: 2266 xuio.uio_rw = UIO_WRITE; 2267 break; 2268 case BIO_FLUSH: { 2269 struct mount *mountpoint; 2270 2271 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2272 device_get_unit(xbb->dev)); 2273 2274 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2275 2276 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2277 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2278 VOP_UNLOCK(xbb->vn, 0); 2279 2280 vn_finished_write(mountpoint); 2281 2282 goto bailout_send_response; 2283 /* NOTREACHED */ 2284 } 2285 default: 2286 panic("invalid operation %d", operation); 2287 /* NOTREACHED */ 2288 } 2289 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2290 << xbb->sector_size_shift; 2291 xuio.uio_segflg = UIO_SYSSPACE; 2292 xuio.uio_iov = file_data->xiovecs; 2293 xuio.uio_iovcnt = 0; 2294 xbb_sg = xbb->xbb_sgs; 2295 nseg = reqlist->nr_segments; 2296 2297 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2298 2299 /* 2300 * If the first sector is not 0, the KVA will 2301 * not be contiguous and we'll need to go on 2302 * to another segment. 2303 */ 2304 if (xbb_sg->first_sect != 0) 2305 xiovec = NULL; 2306 2307 if (xiovec == NULL) { 2308 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2309 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2310 seg_idx, xbb_sg->first_sect); 2311 #ifdef XBB_USE_BOUNCE_BUFFERS 2312 /* 2313 * Store the address of the incoming 2314 * buffer at this particular offset 2315 * as well, so we can do the copy 2316 * later without having to do more 2317 * work to recalculate this address. 2318 */ 2319 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2320 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2321 xbb_sg->first_sect); 2322 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2323 xiovec->iov_len = 0; 2324 xuio.uio_iovcnt++; 2325 } 2326 2327 xiovec->iov_len += xbb_sg->nsect << 9; 2328 2329 xuio.uio_resid += xbb_sg->nsect << 9; 2330 2331 /* 2332 * If the last sector is not the full page 2333 * size count, the next segment will not be 2334 * contiguous in KVA and we need a new iovec. 2335 */ 2336 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2337 xiovec = NULL; 2338 } 2339 2340 xuio.uio_td = curthread; 2341 2342 #ifdef XBB_USE_BOUNCE_BUFFERS 2343 saved_uio_iovcnt = xuio.uio_iovcnt; 2344 2345 if (operation == BIO_WRITE) { 2346 /* Copy the write data to the local buffer. */ 2347 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2348 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2349 seg_idx++, xiovec++, p_vaddr++) { 2350 2351 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2352 } 2353 } else { 2354 /* 2355 * We only need to save off the iovecs in the case of a 2356 * read, because the copy for the read happens after the 2357 * VOP_READ(). (The uio will get modified in that call 2358 * sequence.) 
2359 */ 2360 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2361 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2362 } 2363 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2364 2365 switch (operation) { 2366 case BIO_READ: 2367 2368 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2369 device_get_unit(xbb->dev), xuio.uio_offset, 2370 xuio.uio_resid); 2371 2372 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2373 2374 /* 2375 * UFS pays attention to IO_DIRECT for reads. If the 2376 * DIRECTIO option is configured into the kernel, it calls 2377 * ffs_rawread(). But that only works for single-segment 2378 * uios with user space addresses. In our case, with a 2379 * kernel uio, it still reads into the buffer cache, but it 2380 * will just try to release the buffer from the cache later 2381 * on in ffs_read(). 2382 * 2383 * ZFS does not pay attention to IO_DIRECT for reads. 2384 * 2385 * UFS does not pay attention to IO_SYNC for reads. 2386 * 2387 * ZFS pays attention to IO_SYNC (which translates into the 2388 * Solaris define FRSYNC for zfs_read()) for reads. It 2389 * attempts to sync the file before reading. 2390 * 2391 * So, to attempt to provide some barrier semantics in the 2392 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2393 */ 2394 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2395 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2396 2397 VOP_UNLOCK(xbb->vn, 0); 2398 break; 2399 case BIO_WRITE: { 2400 struct mount *mountpoint; 2401 2402 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2403 device_get_unit(xbb->dev), xuio.uio_offset, 2404 xuio.uio_resid); 2405 2406 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2407 2408 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2409 2410 /* 2411 * UFS pays attention to IO_DIRECT for writes. The write 2412 * is done asynchronously. (Normally the write would just 2413 * get put into cache. 2414 * 2415 * UFS pays attention to IO_SYNC for writes. It will 2416 * attempt to write the buffer out synchronously if that 2417 * flag is set. 2418 * 2419 * ZFS does not pay attention to IO_DIRECT for writes. 2420 * 2421 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2422 * for writes. It will flush the transaction from the 2423 * cache before returning. 2424 * 2425 * So if we've got the BIO_ORDERED flag set, we want 2426 * IO_SYNC in either the UFS or ZFS case. 2427 */ 2428 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2429 IO_SYNC : 0, file_data->cred); 2430 VOP_UNLOCK(xbb->vn, 0); 2431 2432 vn_finished_write(mountpoint); 2433 2434 break; 2435 } 2436 default: 2437 panic("invalid operation %d", operation); 2438 /* NOTREACHED */ 2439 } 2440 2441 #ifdef XBB_USE_BOUNCE_BUFFERS 2442 /* We only need to copy here for read operations */ 2443 if (operation == BIO_READ) { 2444 2445 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2446 xiovec = file_data->saved_xiovecs; 2447 seg_idx < saved_uio_iovcnt; seg_idx++, 2448 xiovec++, p_vaddr++) { 2449 2450 /* 2451 * Note that we have to use the copy of the 2452 * io vector we made above. uiomove() modifies 2453 * the uio and its referenced vector as uiomove 2454 * performs the copy, so we can't rely on any 2455 * state from the original uio. 
2456 */ 2457 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2458 } 2459 } 2460 #endif /* XBB_USE_BOUNCE_BUFFERS */ 2461 2462 bailout_send_response: 2463 2464 if (error != 0) 2465 reqlist->status = BLKIF_RSP_ERROR; 2466 2467 xbb_complete_reqlist(xbb, reqlist); 2468 2469 return (0); 2470 } 2471 2472 /*--------------------------- Backend Configuration --------------------------*/ 2473 /** 2474 * Close and cleanup any backend device/file specific state for this 2475 * block back instance. 2476 * 2477 * \param xbb Per-instance xbb configuration structure. 2478 */ 2479 static void 2480 xbb_close_backend(struct xbb_softc *xbb) 2481 { 2482 DROP_GIANT(); 2483 DPRINTF("closing dev=%s\n", xbb->dev_name); 2484 if (xbb->vn) { 2485 int flags = FREAD; 2486 2487 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2488 flags |= FWRITE; 2489 2490 switch (xbb->device_type) { 2491 case XBB_TYPE_DISK: 2492 if (xbb->backend.dev.csw) { 2493 dev_relthread(xbb->backend.dev.cdev, 2494 xbb->backend.dev.dev_ref); 2495 xbb->backend.dev.csw = NULL; 2496 xbb->backend.dev.cdev = NULL; 2497 } 2498 break; 2499 case XBB_TYPE_FILE: 2500 break; 2501 case XBB_TYPE_NONE: 2502 default: 2503 panic("Unexpected backend type."); 2504 break; 2505 } 2506 2507 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2508 xbb->vn = NULL; 2509 2510 switch (xbb->device_type) { 2511 case XBB_TYPE_DISK: 2512 break; 2513 case XBB_TYPE_FILE: 2514 if (xbb->backend.file.cred != NULL) { 2515 crfree(xbb->backend.file.cred); 2516 xbb->backend.file.cred = NULL; 2517 } 2518 break; 2519 case XBB_TYPE_NONE: 2520 default: 2521 panic("Unexpected backend type."); 2522 break; 2523 } 2524 } 2525 PICKUP_GIANT(); 2526 } 2527 2528 /** 2529 * Open a character device to be used for backend I/O. 2530 * 2531 * \param xbb Per-instance xbb configuration structure. 2532 * 2533 * \return 0 for success, errno codes for failure. 2534 */ 2535 static int 2536 xbb_open_dev(struct xbb_softc *xbb) 2537 { 2538 struct vattr vattr; 2539 struct cdev *dev; 2540 struct cdevsw *devsw; 2541 int error; 2542 2543 xbb->device_type = XBB_TYPE_DISK; 2544 xbb->dispatch_io = xbb_dispatch_dev; 2545 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2546 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2547 &xbb->backend.dev.dev_ref); 2548 if (xbb->backend.dev.csw == NULL) 2549 panic("Unable to retrieve device switch"); 2550 2551 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2552 if (error) { 2553 xenbus_dev_fatal(xbb->dev, error, "error getting " 2554 "vnode attributes for device %s", 2555 xbb->dev_name); 2556 return (error); 2557 } 2558 2559 2560 dev = xbb->vn->v_rdev; 2561 devsw = dev->si_devsw; 2562 if (!devsw->d_ioctl) { 2563 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2564 "device %s!", xbb->dev_name); 2565 return (ENODEV); 2566 } 2567 2568 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2569 (caddr_t)&xbb->sector_size, FREAD, 2570 curthread); 2571 if (error) { 2572 xenbus_dev_fatal(xbb->dev, error, 2573 "error calling ioctl DIOCGSECTORSIZE " 2574 "for device %s", xbb->dev_name); 2575 return (error); 2576 } 2577 2578 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2579 (caddr_t)&xbb->media_size, FREAD, 2580 curthread); 2581 if (error) { 2582 xenbus_dev_fatal(xbb->dev, error, 2583 "error calling ioctl DIOCGMEDIASIZE " 2584 "for device %s", xbb->dev_name); 2585 return (error); 2586 } 2587 2588 return (0); 2589 } 2590 2591 /** 2592 * Open a file to be used for backend I/O. 2593 * 2594 * \param xbb Per-instance xbb configuration structure. 
2595 * 2596 * \return 0 for success, errno codes for failure. 2597 */ 2598 static int 2599 xbb_open_file(struct xbb_softc *xbb) 2600 { 2601 struct xbb_file_data *file_data; 2602 struct vattr vattr; 2603 int error; 2604 2605 file_data = &xbb->backend.file; 2606 xbb->device_type = XBB_TYPE_FILE; 2607 xbb->dispatch_io = xbb_dispatch_file; 2608 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2609 if (error != 0) { 2610 xenbus_dev_fatal(xbb->dev, error, 2611 "error calling VOP_GETATTR()" 2612 "for file %s", xbb->dev_name); 2613 return (error); 2614 } 2615 2616 /* 2617 * Verify that we have the ability to upgrade to exclusive 2618 * access on this file so we can trap errors at open instead 2619 * of reporting them during first access. 2620 */ 2621 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2622 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2623 if (xbb->vn->v_iflag & VI_DOOMED) { 2624 error = EBADF; 2625 xenbus_dev_fatal(xbb->dev, error, 2626 "error locking file %s", 2627 xbb->dev_name); 2628 2629 return (error); 2630 } 2631 } 2632 2633 file_data->cred = crhold(curthread->td_ucred); 2634 xbb->media_size = vattr.va_size; 2635 2636 /* 2637 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2638 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2639 * with disklabel and UFS on FreeBSD at least. Large block sizes 2640 * may not work with other OSes as well. So just export a sector 2641 * size of 512 bytes, which should work with any OS or 2642 * application. Since our backing is a file, any block size will 2643 * work fine for the backing store. 2644 */ 2645 #if 0 2646 xbb->sector_size = vattr.va_blocksize; 2647 #endif 2648 xbb->sector_size = 512; 2649 2650 /* 2651 * Sanity check. The media size has to be at least one 2652 * sector long. 2653 */ 2654 if (xbb->media_size < xbb->sector_size) { 2655 error = EINVAL; 2656 xenbus_dev_fatal(xbb->dev, error, 2657 "file %s size %ju < block size %u", 2658 xbb->dev_name, 2659 (uintmax_t)xbb->media_size, 2660 xbb->sector_size); 2661 } 2662 return (error); 2663 } 2664 2665 /** 2666 * Open the backend provider for this connection. 2667 * 2668 * \param xbb Per-instance xbb configuration structure. 2669 * 2670 * \return 0 for success, errno codes for failure. 2671 */ 2672 static int 2673 xbb_open_backend(struct xbb_softc *xbb) 2674 { 2675 struct nameidata nd; 2676 int flags; 2677 int error; 2678 2679 flags = FREAD; 2680 error = 0; 2681 2682 DPRINTF("opening dev=%s\n", xbb->dev_name); 2683 2684 if (rootvnode == NULL) { 2685 xenbus_dev_fatal(xbb->dev, ENOENT, 2686 "Root file system not mounted"); 2687 return (ENOENT); 2688 } 2689 2690 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2691 flags |= FWRITE; 2692 2693 if (!curthread->td_proc->p_fd->fd_cdir) { 2694 curthread->td_proc->p_fd->fd_cdir = rootvnode; 2695 VREF(rootvnode); 2696 } 2697 if (!curthread->td_proc->p_fd->fd_rdir) { 2698 curthread->td_proc->p_fd->fd_rdir = rootvnode; 2699 VREF(rootvnode); 2700 } 2701 if (!curthread->td_proc->p_fd->fd_jdir) { 2702 curthread->td_proc->p_fd->fd_jdir = rootvnode; 2703 VREF(rootvnode); 2704 } 2705 2706 again: 2707 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2708 error = vn_open(&nd, &flags, 0, NULL); 2709 if (error) { 2710 /* 2711 * This is the only reasonable guess we can make as far as 2712 * path if the user doesn't give us a fully qualified path. 2713 * If they want to specify a file, they need to specify the 2714 * full path. 
2715 */ 2716 if (xbb->dev_name[0] != '/') { 2717 char *dev_path = "/dev/"; 2718 char *dev_name; 2719 2720 /* Try adding device path at beginning of name */ 2721 dev_name = malloc(strlen(xbb->dev_name) 2722 + strlen(dev_path) + 1, 2723 M_XENBLOCKBACK, M_NOWAIT); 2724 if (dev_name) { 2725 sprintf(dev_name, "%s%s", dev_path, 2726 xbb->dev_name); 2727 free(xbb->dev_name, M_XENBLOCKBACK); 2728 xbb->dev_name = dev_name; 2729 goto again; 2730 } 2731 } 2732 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2733 xbb->dev_name); 2734 return (error); 2735 } 2736 2737 NDFREE(&nd, NDF_ONLY_PNBUF); 2738 2739 xbb->vn = nd.ni_vp; 2740 2741 /* We only support disks and files. */ 2742 if (vn_isdisk(xbb->vn, &error)) { 2743 error = xbb_open_dev(xbb); 2744 } else if (xbb->vn->v_type == VREG) { 2745 error = xbb_open_file(xbb); 2746 } else { 2747 error = EINVAL; 2748 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2749 "or file", xbb->dev_name); 2750 } 2751 VOP_UNLOCK(xbb->vn, 0); 2752 2753 if (error != 0) { 2754 xbb_close_backend(xbb); 2755 return (error); 2756 } 2757 2758 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2759 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2760 2761 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2762 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2763 xbb->dev_name, xbb->sector_size, xbb->media_size); 2764 2765 return (0); 2766 } 2767 2768 /*------------------------ Inter-Domain Communication ------------------------*/ 2769 /** 2770 * Free dynamically allocated KVA or pseudo-physical address allocations. 2771 * 2772 * \param xbb Per-instance xbb configuration structure. 2773 */ 2774 static void 2775 xbb_free_communication_mem(struct xbb_softc *xbb) 2776 { 2777 if (xbb->kva != 0) { 2778 #ifndef XENHVM 2779 kva_free(xbb->kva, xbb->kva_size); 2780 #else 2781 if (xbb->pseudo_phys_res != NULL) { 2782 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2783 xbb->pseudo_phys_res_id, 2784 xbb->pseudo_phys_res); 2785 xbb->pseudo_phys_res = NULL; 2786 } 2787 #endif 2788 } 2789 xbb->kva = 0; 2790 xbb->gnt_base_addr = 0; 2791 if (xbb->kva_free != NULL) { 2792 free(xbb->kva_free, M_XENBLOCKBACK); 2793 xbb->kva_free = NULL; 2794 } 2795 } 2796 2797 /** 2798 * Cleanup all inter-domain communication mechanisms. 2799 * 2800 * \param xbb Per-instance xbb configuration structure. 2801 */ 2802 static int 2803 xbb_disconnect(struct xbb_softc *xbb) 2804 { 2805 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2806 struct gnttab_unmap_grant_ref *op; 2807 u_int ring_idx; 2808 int error; 2809 2810 DPRINTF("\n"); 2811 2812 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2813 return (0); 2814 2815 xen_intr_unbind(&xbb->xen_intr_handle); 2816 2817 mtx_unlock(&xbb->lock); 2818 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2819 mtx_lock(&xbb->lock); 2820 2821 /* 2822 * No new interrupts can generate work, but we must wait 2823 * for all currently active requests to drain. 
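 * If requests are still in flight, return EAGAIN; the shutdown path will call us again once they have all completed.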
2824 */ 2825 if (xbb->active_request_count != 0) 2826 return (EAGAIN); 2827 2828 for (ring_idx = 0, op = ops; 2829 ring_idx < xbb->ring_config.ring_pages; 2830 ring_idx++, op++) { 2831 2832 op->host_addr = xbb->ring_config.gnt_addr 2833 + (ring_idx * PAGE_SIZE); 2834 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2835 op->handle = xbb->ring_config.handle[ring_idx]; 2836 } 2837 2838 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2839 xbb->ring_config.ring_pages); 2840 if (error != 0) 2841 panic("Grant table op failed (%d)", error); 2842 2843 xbb_free_communication_mem(xbb); 2844 2845 if (xbb->requests != NULL) { 2846 free(xbb->requests, M_XENBLOCKBACK); 2847 xbb->requests = NULL; 2848 } 2849 2850 if (xbb->request_lists != NULL) { 2851 struct xbb_xen_reqlist *reqlist; 2852 int i; 2853 2854 /* There is one request list for ever allocated request. */ 2855 for (i = 0, reqlist = xbb->request_lists; 2856 i < xbb->max_requests; i++, reqlist++){ 2857 #ifdef XBB_USE_BOUNCE_BUFFERS 2858 if (reqlist->bounce != NULL) { 2859 free(reqlist->bounce, M_XENBLOCKBACK); 2860 reqlist->bounce = NULL; 2861 } 2862 #endif 2863 if (reqlist->gnt_handles != NULL) { 2864 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2865 reqlist->gnt_handles = NULL; 2866 } 2867 } 2868 free(xbb->request_lists, M_XENBLOCKBACK); 2869 xbb->request_lists = NULL; 2870 } 2871 2872 xbb->flags &= ~XBBF_RING_CONNECTED; 2873 return (0); 2874 } 2875 2876 /** 2877 * Map shared memory ring into domain local address space, initialize 2878 * ring control structures, and bind an interrupt to the event channel 2879 * used to notify us of ring changes. 2880 * 2881 * \param xbb Per-instance xbb configuration structure. 2882 */ 2883 static int 2884 xbb_connect_ring(struct xbb_softc *xbb) 2885 { 2886 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2887 struct gnttab_map_grant_ref *gnt; 2888 u_int ring_idx; 2889 int error; 2890 2891 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2892 return (0); 2893 2894 /* 2895 * Kva for our ring is at the tail of the region of kva allocated 2896 * by xbb_alloc_communication_mem(). 2897 */ 2898 xbb->ring_config.va = xbb->kva 2899 + (xbb->kva_size 2900 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2901 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2902 + (xbb->kva_size 2903 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2904 2905 for (ring_idx = 0, gnt = gnts; 2906 ring_idx < xbb->ring_config.ring_pages; 2907 ring_idx++, gnt++) { 2908 2909 gnt->host_addr = xbb->ring_config.gnt_addr 2910 + (ring_idx * PAGE_SIZE); 2911 gnt->flags = GNTMAP_host_map; 2912 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2913 gnt->dom = xbb->otherend_id; 2914 } 2915 2916 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2917 xbb->ring_config.ring_pages); 2918 if (error) 2919 panic("blkback: Ring page grant table op failed (%d)", error); 2920 2921 for (ring_idx = 0, gnt = gnts; 2922 ring_idx < xbb->ring_config.ring_pages; 2923 ring_idx++, gnt++) { 2924 if (gnt->status != 0) { 2925 xbb->ring_config.va = 0; 2926 xenbus_dev_fatal(xbb->dev, EACCES, 2927 "Ring shared page mapping failed. " 2928 "Status %d.", gnt->status); 2929 return (EACCES); 2930 } 2931 xbb->ring_config.handle[ring_idx] = gnt->handle; 2932 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2933 } 2934 2935 /* Initialize the ring based on ABI. 
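 * The three ABIs share the same shared ring pages and differ only in the layout of their request/response structures, so we simply select the matching BACK_RING_INIT() variant.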
*/ 2936 switch (xbb->abi) { 2937 case BLKIF_PROTOCOL_NATIVE: 2938 { 2939 blkif_sring_t *sring; 2940 sring = (blkif_sring_t *)xbb->ring_config.va; 2941 BACK_RING_INIT(&xbb->rings.native, sring, 2942 xbb->ring_config.ring_pages * PAGE_SIZE); 2943 break; 2944 } 2945 case BLKIF_PROTOCOL_X86_32: 2946 { 2947 blkif_x86_32_sring_t *sring_x86_32; 2948 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2949 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2950 xbb->ring_config.ring_pages * PAGE_SIZE); 2951 break; 2952 } 2953 case BLKIF_PROTOCOL_X86_64: 2954 { 2955 blkif_x86_64_sring_t *sring_x86_64; 2956 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2957 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2958 xbb->ring_config.ring_pages * PAGE_SIZE); 2959 break; 2960 } 2961 default: 2962 panic("Unexpected blkif protocol ABI."); 2963 } 2964 2965 xbb->flags |= XBBF_RING_CONNECTED; 2966 2967 error = xen_intr_bind_remote_port(xbb->dev, 2968 xbb->otherend_id, 2969 xbb->ring_config.evtchn, 2970 xbb_filter, 2971 /*ithread_handler*/NULL, 2972 /*arg*/xbb, 2973 INTR_TYPE_BIO | INTR_MPSAFE, 2974 &xbb->xen_intr_handle); 2975 if (error) { 2976 (void)xbb_disconnect(xbb); 2977 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2978 return (error); 2979 } 2980 2981 DPRINTF("rings connected!\n"); 2982 2983 return 0; 2984 } 2985 2986 /* Needed to make bit_alloc() macro work */ 2987 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ 2988 M_NOWAIT|M_ZERO); 2989 2990 /** 2991 * Size KVA and pseudo-physical address allocations based on negotiated 2992 * values for the size and number of I/O requests, and the size of our 2993 * communication ring. 2994 * 2995 * \param xbb Per-instance xbb configuration structure. 2996 * 2997 * These address spaces are used to dynamically map pages in the 2998 * front-end's domain into our own. 2999 */ 3000 static int 3001 xbb_alloc_communication_mem(struct xbb_softc *xbb) 3002 { 3003 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 3004 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 3005 xbb->kva_size = xbb->reqlist_kva_size + 3006 (xbb->ring_config.ring_pages * PAGE_SIZE); 3007 3008 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); 3009 if (xbb->kva_free == NULL) 3010 return (ENOMEM); 3011 3012 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 3013 device_get_nameunit(xbb->dev), xbb->kva_size, 3014 xbb->reqlist_kva_size); 3015 #ifndef XENHVM 3016 xbb->kva = kva_alloc(xbb->kva_size); 3017 if (xbb->kva == 0) 3018 return (ENOMEM); 3019 xbb->gnt_base_addr = xbb->kva; 3020 #else /* XENHVM */ 3021 /* 3022 * Reserve a range of pseudo physical memory that we can map 3023 * into kva. These pages will only be backed by machine 3024 * pages ("real memory") during the lifetime of front-end requests 3025 * via grant table operations. 3026 */ 3027 xbb->pseudo_phys_res_id = 0; 3028 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 3029 &xbb->pseudo_phys_res_id, 3030 0, ~0, xbb->kva_size, 3031 RF_ACTIVE); 3032 if (xbb->pseudo_phys_res == NULL) { 3033 xbb->kva = 0; 3034 return (ENOMEM); 3035 } 3036 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3037 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3038 #endif /* XENHVM */ 3039 3040 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3041 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3042 (uintmax_t)xbb->gnt_base_addr); 3043 return (0); 3044 } 3045 3046 /** 3047 * Collect front-end information from the XenStore. 
3048 * 3049 * \param xbb Per-instance xbb configuration structure. 3050 */ 3051 static int 3052 xbb_collect_frontend_info(struct xbb_softc *xbb) 3053 { 3054 char protocol_abi[64]; 3055 const char *otherend_path; 3056 int error; 3057 u_int ring_idx; 3058 u_int ring_page_order; 3059 size_t ring_size; 3060 3061 otherend_path = xenbus_get_otherend_path(xbb->dev); 3062 3063 /* 3064 * Protocol defaults valid even if all negotiation fails. 3065 */ 3066 xbb->ring_config.ring_pages = 1; 3067 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; 3068 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3069 3070 /* 3071 * Mandatory data (used in all versions of the protocol) first. 3072 */ 3073 error = xs_scanf(XST_NIL, otherend_path, 3074 "event-channel", NULL, "%" PRIu32, 3075 &xbb->ring_config.evtchn); 3076 if (error != 0) { 3077 xenbus_dev_fatal(xbb->dev, error, 3078 "Unable to retrieve event-channel information " 3079 "from frontend %s. Unable to connect.", 3080 xenbus_get_otherend_path(xbb->dev)); 3081 return (error); 3082 } 3083 3084 /* 3085 * These fields are initialized to legacy protocol defaults 3086 * so we only need to fail if reading the updated value succeeds 3087 * and the new value is outside of its allowed range. 3088 * 3089 * \note xs_gather() returns on the first encountered error, so 3090 * we must use independent calls in order to guarantee 3091 * we don't miss information in a sparsely populated front-end 3092 * tree. 3093 * 3094 * \note xs_scanf() does not update variables for unmatched 3095 * fields. 3096 */ 3097 ring_page_order = 0; 3098 (void)xs_scanf(XST_NIL, otherend_path, 3099 "ring-page-order", NULL, "%u", 3100 &ring_page_order); 3101 xbb->ring_config.ring_pages = 1 << ring_page_order; 3102 (void)xs_scanf(XST_NIL, otherend_path, 3103 "num-ring-pages", NULL, "%u", 3104 &xbb->ring_config.ring_pages); 3105 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3106 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3107 3108 (void)xs_scanf(XST_NIL, otherend_path, 3109 "max-requests", NULL, "%u", 3110 &xbb->max_requests); 3111 3112 (void)xs_scanf(XST_NIL, otherend_path, 3113 "max-request-segments", NULL, "%u", 3114 &xbb->max_request_segments); 3115 3116 (void)xs_scanf(XST_NIL, otherend_path, 3117 "max-request-size", NULL, "%u", 3118 &xbb->max_request_size); 3119 3120 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3121 xenbus_dev_fatal(xbb->dev, EINVAL, 3122 "Front-end specified ring-pages of %u " 3123 "exceeds backend limit of %zu. " 3124 "Unable to connect.", 3125 xbb->ring_config.ring_pages, 3126 XBB_MAX_RING_PAGES); 3127 return (EINVAL); 3128 } else if (xbb->max_requests > XBB_MAX_REQUESTS) { 3129 xenbus_dev_fatal(xbb->dev, EINVAL, 3130 "Front-end specified max_requests of %u " 3131 "exceeds backend limit of %u. " 3132 "Unable to connect.", 3133 xbb->max_requests, 3134 XBB_MAX_REQUESTS); 3135 return (EINVAL); 3136 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) { 3137 xenbus_dev_fatal(xbb->dev, EINVAL, 3138 "Front-end specified max_requests_segments " 3139 "of %u exceeds backend limit of %u. " 3140 "Unable to connect.", 3141 xbb->max_request_segments, 3142 XBB_MAX_SEGMENTS_PER_REQUEST); 3143 return (EINVAL); 3144 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) { 3145 xenbus_dev_fatal(xbb->dev, EINVAL, 3146 "Front-end specified max_request_size " 3147 "of %u exceeds backend limit of %u. 
" 3148 "Unable to connect.", 3149 xbb->max_request_size, 3150 XBB_MAX_REQUEST_SIZE); 3151 return (EINVAL); 3152 } 3153 3154 if (xbb->ring_config.ring_pages == 1) { 3155 error = xs_gather(XST_NIL, otherend_path, 3156 "ring-ref", "%" PRIu32, 3157 &xbb->ring_config.ring_ref[0], 3158 NULL); 3159 if (error != 0) { 3160 xenbus_dev_fatal(xbb->dev, error, 3161 "Unable to retrieve ring information " 3162 "from frontend %s. Unable to " 3163 "connect.", 3164 xenbus_get_otherend_path(xbb->dev)); 3165 return (error); 3166 } 3167 } else { 3168 /* Multi-page ring format. */ 3169 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3170 ring_idx++) { 3171 char ring_ref_name[]= "ring_refXX"; 3172 3173 snprintf(ring_ref_name, sizeof(ring_ref_name), 3174 "ring-ref%u", ring_idx); 3175 error = xs_scanf(XST_NIL, otherend_path, 3176 ring_ref_name, NULL, "%" PRIu32, 3177 &xbb->ring_config.ring_ref[ring_idx]); 3178 if (error != 0) { 3179 xenbus_dev_fatal(xbb->dev, error, 3180 "Failed to retriev grant " 3181 "reference for page %u of " 3182 "shared ring. Unable " 3183 "to connect.", ring_idx); 3184 return (error); 3185 } 3186 } 3187 } 3188 3189 error = xs_gather(XST_NIL, otherend_path, 3190 "protocol", "%63s", protocol_abi, 3191 NULL); 3192 if (error != 0 3193 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3194 /* 3195 * Assume native if the frontend has not 3196 * published ABI data or it has published and 3197 * matches our own ABI. 3198 */ 3199 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3200 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3201 3202 xbb->abi = BLKIF_PROTOCOL_X86_32; 3203 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3204 3205 xbb->abi = BLKIF_PROTOCOL_X86_64; 3206 } else { 3207 3208 xenbus_dev_fatal(xbb->dev, EINVAL, 3209 "Unknown protocol ABI (%s) published by " 3210 "frontend. Unable to connect.", protocol_abi); 3211 return (EINVAL); 3212 } 3213 return (0); 3214 } 3215 3216 /** 3217 * Allocate per-request data structures given request size and number 3218 * information negotiated with the front-end. 3219 * 3220 * \param xbb Per-instance xbb configuration structure. 3221 */ 3222 static int 3223 xbb_alloc_requests(struct xbb_softc *xbb) 3224 { 3225 struct xbb_xen_req *req; 3226 struct xbb_xen_req *last_req; 3227 3228 /* 3229 * Allocate request book keeping datastructures. 3230 */ 3231 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3232 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3233 if (xbb->requests == NULL) { 3234 xenbus_dev_fatal(xbb->dev, ENOMEM, 3235 "Unable to allocate request structures"); 3236 return (ENOMEM); 3237 } 3238 3239 req = xbb->requests; 3240 last_req = &xbb->requests[xbb->max_requests - 1]; 3241 STAILQ_INIT(&xbb->request_free_stailq); 3242 while (req <= last_req) { 3243 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3244 req++; 3245 } 3246 return (0); 3247 } 3248 3249 static int 3250 xbb_alloc_request_lists(struct xbb_softc *xbb) 3251 { 3252 struct xbb_xen_reqlist *reqlist; 3253 int i; 3254 3255 /* 3256 * If no requests can be merged, we need 1 request list per 3257 * in flight request. 
3258 */ 3259 xbb->request_lists = malloc(xbb->max_requests * 3260 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3261 if (xbb->request_lists == NULL) { 3262 xenbus_dev_fatal(xbb->dev, ENOMEM, 3263 "Unable to allocate request list structures"); 3264 return (ENOMEM); 3265 } 3266 3267 STAILQ_INIT(&xbb->reqlist_free_stailq); 3268 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3269 for (i = 0; i < xbb->max_requests; i++) { 3270 int seg; 3271 3272 reqlist = &xbb->request_lists[i]; 3273 3274 reqlist->xbb = xbb; 3275 3276 #ifdef XBB_USE_BOUNCE_BUFFERS 3277 reqlist->bounce = malloc(xbb->max_reqlist_size, 3278 M_XENBLOCKBACK, M_NOWAIT); 3279 if (reqlist->bounce == NULL) { 3280 xenbus_dev_fatal(xbb->dev, ENOMEM, 3281 "Unable to allocate request " 3282 "bounce buffers"); 3283 return (ENOMEM); 3284 } 3285 #endif /* XBB_USE_BOUNCE_BUFFERS */ 3286 3287 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3288 sizeof(*reqlist->gnt_handles), 3289 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3290 if (reqlist->gnt_handles == NULL) { 3291 xenbus_dev_fatal(xbb->dev, ENOMEM, 3292 "Unable to allocate request " 3293 "grant references"); 3294 return (ENOMEM); 3295 } 3296 3297 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3298 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3299 3300 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3301 } 3302 return (0); 3303 } 3304 3305 /** 3306 * Supply information about the physical device to the frontend 3307 * via XenBus. 3308 * 3309 * \param xbb Per-instance xbb configuration structure. 3310 */ 3311 static int 3312 xbb_publish_backend_info(struct xbb_softc *xbb) 3313 { 3314 struct xs_transaction xst; 3315 const char *our_path; 3316 const char *leaf; 3317 int error; 3318 3319 our_path = xenbus_get_node(xbb->dev); 3320 while (1) { 3321 error = xs_transaction_start(&xst); 3322 if (error != 0) { 3323 xenbus_dev_fatal(xbb->dev, error, 3324 "Error publishing backend info " 3325 "(start transaction)"); 3326 return (error); 3327 } 3328 3329 leaf = "sectors"; 3330 error = xs_printf(xst, our_path, leaf, 3331 "%"PRIu64, xbb->media_num_sectors); 3332 if (error != 0) 3333 break; 3334 3335 /* XXX Support all VBD attributes here. */ 3336 leaf = "info"; 3337 error = xs_printf(xst, our_path, leaf, "%u", 3338 xbb->flags & XBBF_READ_ONLY 3339 ? VDISK_READONLY : 0); 3340 if (error != 0) 3341 break; 3342 3343 leaf = "sector-size"; 3344 error = xs_printf(xst, our_path, leaf, "%u", 3345 xbb->sector_size); 3346 if (error != 0) 3347 break; 3348 3349 error = xs_transaction_end(xst, 0); 3350 if (error == 0) { 3351 return (0); 3352 } else if (error != EAGAIN) { 3353 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3354 return (error); 3355 } 3356 } 3357 3358 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3359 our_path, leaf); 3360 xs_transaction_end(xst, 1); 3361 return (error); 3362 } 3363 3364 /** 3365 * Connect to our blkfront peer now that it has completed publishing 3366 * its configuration into the XenStore. 3367 * 3368 * \param xbb Per-instance xbb configuration structure. 3369 */ 3370 static void 3371 xbb_connect(struct xbb_softc *xbb) 3372 { 3373 int error; 3374 3375 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 3376 return; 3377 3378 if (xbb_collect_frontend_info(xbb) != 0) 3379 return; 3380 3381 xbb->flags &= ~XBBF_SHUTDOWN; 3382 3383 /* 3384 * We limit the maximum number of reqlist segments to the maximum 3385 * number of segments in the ring, or our absolute maximum, 3386 * whichever is smaller. 
3387 */ 3388 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3389 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3390 3391 /* 3392 * The maximum size is simply a function of the number of segments 3393 * we can handle. 3394 */ 3395 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3396 3397 /* Allocate resources whose size depends on front-end configuration. */ 3398 error = xbb_alloc_communication_mem(xbb); 3399 if (error != 0) { 3400 xenbus_dev_fatal(xbb->dev, error, 3401 "Unable to allocate communication memory"); 3402 return; 3403 } 3404 3405 error = xbb_alloc_requests(xbb); 3406 if (error != 0) { 3407 /* Specific errors are reported by xbb_alloc_requests(). */ 3408 return; 3409 } 3410 3411 error = xbb_alloc_request_lists(xbb); 3412 if (error != 0) { 3413 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3414 return; 3415 } 3416 3417 /* 3418 * Connect communication channel. 3419 */ 3420 error = xbb_connect_ring(xbb); 3421 if (error != 0) { 3422 /* Specific errors are reported by xbb_connect_ring(). */ 3423 return; 3424 } 3425 3426 if (xbb_publish_backend_info(xbb) != 0) { 3427 /* 3428 * If we can't publish our data, we cannot participate 3429 * in this connection, and waiting for a front-end state 3430 * change will not help the situation. 3431 */ 3432 (void)xbb_disconnect(xbb); 3433 return; 3434 } 3435 3436 /* Ready for I/O. */ 3437 xenbus_set_state(xbb->dev, XenbusStateConnected); 3438 } 3439 3440 /*-------------------------- Device Teardown Support -------------------------*/ 3441 /** 3442 * Perform device shutdown functions. 3443 * 3444 * \param xbb Per-instance xbb configuration structure. 3445 * 3446 * Mark this instance as shutting down, wait for any active I/O on the 3447 * backend device/file to drain, disconnect from the front-end, and notify 3448 * any waiters (e.g. a thread invoking our detach method) that detach can 3449 * now proceed. 3450 */ 3451 static int 3452 xbb_shutdown(struct xbb_softc *xbb) 3453 { 3454 XenbusState frontState; 3455 int error; 3456 3457 DPRINTF("\n"); 3458 3459 /* 3460 * Due to the need to drop our mutex during some 3461 * xenbus operations, it is possible for two threads 3462 * to attempt to close out shutdown processing at 3463 * the same time. Tell the caller that hits this 3464 * race to try back later. 3465 */ 3466 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3467 return (EAGAIN); 3468 3469 xbb->flags |= XBBF_IN_SHUTDOWN; 3470 mtx_unlock(&xbb->lock); 3471 3472 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3473 xenbus_set_state(xbb->dev, XenbusStateClosing); 3474 3475 frontState = xenbus_get_otherend_state(xbb->dev); 3476 mtx_lock(&xbb->lock); 3477 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3478 3479 /* The front can submit I/O until entering the closed state. */ 3480 if (frontState < XenbusStateClosed) 3481 return (EAGAIN); 3482 3483 DPRINTF("\n"); 3484 3485 /* Indicate shutdown is in progress. */ 3486 xbb->flags |= XBBF_SHUTDOWN; 3487 3488 /* Disconnect from the front-end. */ 3489 error = xbb_disconnect(xbb); 3490 if (error != 0) { 3491 /* 3492 * Requests still outstanding. We'll be called again 3493 * once they complete. 3494 */ 3495 KASSERT(error == EAGAIN, 3496 ("%s: Unexpected xbb_disconnect() failure %d", 3497 __func__, error)); 3498 3499 return (error); 3500 } 3501 3502 DPRINTF("\n"); 3503 3504 /* Indicate to xbb_detach() that is it safe to proceed. 
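 * xbb_detach() msleep()s on the softc address while shutdown is in progress; this wakeup() lets it continue.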
*/ 3505 wakeup(xbb); 3506 3507 return (0); 3508 } 3509 3510 /** 3511 * Report an attach time error to the console and Xen, and cleanup 3512 * this instance by forcing immediate detach processing. 3513 * 3514 * \param xbb Per-instance xbb configuration structure. 3515 * \param err Errno describing the error. 3516 * \param fmt Printf style format and arguments 3517 */ 3518 static void 3519 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3520 { 3521 va_list ap; 3522 va_list ap_hotplug; 3523 3524 va_start(ap, fmt); 3525 va_copy(ap_hotplug, ap); 3526 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3527 "hotplug-error", fmt, ap_hotplug); 3528 va_end(ap_hotplug); 3529 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3530 "hotplug-status", "error"); 3531 3532 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3533 va_end(ap); 3534 3535 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3536 "online", "0"); 3537 xbb_detach(xbb->dev); 3538 } 3539 3540 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3541 /** 3542 * Inspect a XenBus device and claim it if is of the appropriate type. 3543 * 3544 * \param dev NewBus device object representing a candidate XenBus device. 3545 * 3546 * \return 0 for success, errno codes for failure. 3547 */ 3548 static int 3549 xbb_probe(device_t dev) 3550 { 3551 3552 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3553 device_set_desc(dev, "Backend Virtual Block Device"); 3554 device_quiet(dev); 3555 return (0); 3556 } 3557 3558 return (ENXIO); 3559 } 3560 3561 /** 3562 * Setup sysctl variables to control various Block Back parameters. 3563 * 3564 * \param xbb Xen Block Back softc. 3565 * 3566 */ 3567 static void 3568 xbb_setup_sysctl(struct xbb_softc *xbb) 3569 { 3570 struct sysctl_ctx_list *sysctl_ctx = NULL; 3571 struct sysctl_oid *sysctl_tree = NULL; 3572 3573 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3574 if (sysctl_ctx == NULL) 3575 return; 3576 3577 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3578 if (sysctl_tree == NULL) 3579 return; 3580 3581 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3582 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3583 "fake the flush command"); 3584 3585 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3586 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3587 "send a real flush for N flush requests"); 3588 3589 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3590 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3591 "Don't coalesce contiguous requests"); 3592 3593 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3594 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3595 "how many I/O requests we have received"); 3596 3597 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3598 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3599 "how many I/O requests have been completed"); 3600 3601 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3602 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3603 "how many I/O dispatches were forced"); 3604 3605 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3606 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3607 "how many I/O dispatches were normal"); 3608 3609 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3610 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3611 "total number of I/O dispatches"); 3612 3613 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), 
OID_AUTO, 3614 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3615 "how many times we have run out of KVA"); 3616 3617 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3618 "request_shortages", CTLFLAG_RW, 3619 &xbb->request_shortages, 3620 "how many times we have run out of requests"); 3621 3622 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3623 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3624 "maximum outstanding requests (negotiated)"); 3625 3626 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3627 "max_request_segments", CTLFLAG_RD, 3628 &xbb->max_request_segments, 0, 3629 "maximum number of pages per request (negotiated)"); 3630 3631 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3632 "max_request_size", CTLFLAG_RD, 3633 &xbb->max_request_size, 0, 3634 "maximum size in bytes of a request (negotiated)"); 3635 3636 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3637 "ring_pages", CTLFLAG_RD, 3638 &xbb->ring_config.ring_pages, 0, 3639 "communication channel pages (negotiated)"); 3640 } 3641 3642 /** 3643 * Attach to a XenBus device that has been claimed by our probe routine. 3644 * 3645 * \param dev NewBus device object representing this Xen Block Back instance. 3646 * 3647 * \return 0 for success, errno codes for failure. 3648 */ 3649 static int 3650 xbb_attach(device_t dev) 3651 { 3652 struct xbb_softc *xbb; 3653 int error; 3654 u_int max_ring_page_order; 3655 3656 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3657 3658 /* 3659 * Basic initialization. 3660 * After this block it is safe to call xbb_detach() 3661 * to clean up any allocated data for this instance. 3662 */ 3663 xbb = device_get_softc(dev); 3664 xbb->dev = dev; 3665 xbb->otherend_id = xenbus_get_otherend_id(dev); 3666 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3667 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3668 3669 /* 3670 * Publish protocol capabilities for consumption by the 3671 * front-end. 3672 */ 3673 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3674 "feature-barrier", "1"); 3675 if (error) { 3676 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3677 xenbus_get_node(xbb->dev)); 3678 return (error); 3679 } 3680 3681 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3682 "feature-flush-cache", "1"); 3683 if (error) { 3684 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3685 xenbus_get_node(xbb->dev)); 3686 return (error); 3687 } 3688 3689 /* 3690 * Amazon EC2 client compatibility. They refer to max-ring-pages 3691 * instead of to max-ring-page-order. 
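 * Publish both keys so that either style of front-end can negotiate a multi-page ring.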
3692 */ 3693 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3694 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 3695 if (error) { 3696 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 3697 xenbus_get_node(xbb->dev)); 3698 return (error); 3699 } 3700 3701 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3702 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3703 "max-ring-page-order", "%u", max_ring_page_order); 3704 if (error) { 3705 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3706 xenbus_get_node(xbb->dev)); 3707 return (error); 3708 } 3709 3710 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3711 "max-requests", "%u", XBB_MAX_REQUESTS); 3712 if (error) { 3713 xbb_attach_failed(xbb, error, "writing %s/max-requests", 3714 xenbus_get_node(xbb->dev)); 3715 return (error); 3716 } 3717 3718 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3719 "max-request-segments", "%u", 3720 XBB_MAX_SEGMENTS_PER_REQUEST); 3721 if (error) { 3722 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 3723 xenbus_get_node(xbb->dev)); 3724 return (error); 3725 } 3726 3727 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3728 "max-request-size", "%u", 3729 XBB_MAX_REQUEST_SIZE); 3730 if (error) { 3731 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 3732 xenbus_get_node(xbb->dev)); 3733 return (error); 3734 } 3735 3736 /* Collect physical device information. */ 3737 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3738 "device-type", NULL, &xbb->dev_type, 3739 NULL); 3740 if (error != 0) 3741 xbb->dev_type = NULL; 3742 3743 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3744 "mode", NULL, &xbb->dev_mode, 3745 "params", NULL, &xbb->dev_name, 3746 NULL); 3747 if (error != 0) { 3748 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3749 xenbus_get_node(dev)); 3750 return (ENXIO); 3751 } 3752 3753 /* Parse fopen style mode flags. */ 3754 if (strchr(xbb->dev_mode, 'w') == NULL) 3755 xbb->flags |= XBBF_READ_ONLY; 3756 3757 /* 3758 * Verify the physical device is present and can support 3759 * the desired I/O mode. 3760 */ 3761 DROP_GIANT(); 3762 error = xbb_open_backend(xbb); 3763 PICKUP_GIANT(); 3764 if (error != 0) { 3765 xbb_attach_failed(xbb, error, "Unable to open %s", 3766 xbb->dev_name); 3767 return (ENXIO); 3768 } 3769 3770 /* Use devstat(9) for recording statistics. */ 3771 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3772 xbb->sector_size, 3773 DEVSTAT_ALL_SUPPORTED, 3774 DEVSTAT_TYPE_DIRECT 3775 | DEVSTAT_TYPE_IF_OTHER, 3776 DEVSTAT_PRIORITY_OTHER); 3777 3778 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3779 xbb->sector_size, 3780 DEVSTAT_ALL_SUPPORTED, 3781 DEVSTAT_TYPE_DIRECT 3782 | DEVSTAT_TYPE_IF_OTHER, 3783 DEVSTAT_PRIORITY_OTHER); 3784 /* 3785 * Setup sysctl variables. 3786 */ 3787 xbb_setup_sysctl(xbb); 3788 3789 /* 3790 * Create a taskqueue for doing work that must occur from a 3791 * thread context. 3792 */ 3793 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3794 M_NOWAIT, 3795 taskqueue_thread_enqueue, 3796 /*contxt*/&xbb->io_taskqueue); 3797 if (xbb->io_taskqueue == NULL) { 3798 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3799 return (ENOMEM); 3800 } 3801 3802 taskqueue_start_threads(&xbb->io_taskqueue, 3803 /*num threads*/1, 3804 /*priority*/PWAIT, 3805 /*thread name*/ 3806 "%s taskq", device_get_nameunit(dev)); 3807 3808 /* Update hot-plug status to satisfy xend. 
*/ 3809 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3810 "hotplug-status", "connected"); 3811 if (error) { 3812 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3813 xenbus_get_node(xbb->dev)); 3814 return (error); 3815 } 3816 3817 /* Tell the front end that we are ready to connect. */ 3818 xenbus_set_state(dev, XenbusStateInitWait); 3819 3820 return (0); 3821 } 3822 3823 /** 3824 * Detach from a block back device instance. 3825 * 3826 * \param dev NewBus device object representing this Xen Block Back instance. 3827 * 3828 * \return 0 for success, errno codes for failure. 3829 * 3830 * \note A block back device may be detached at any time in its life-cycle, 3831 * including part way through the attach process. For this reason, 3832 * initialization order and the intialization state checks in this 3833 * routine must be carefully coupled so that attach time failures 3834 * are gracefully handled. 3835 */ 3836 static int 3837 xbb_detach(device_t dev) 3838 { 3839 struct xbb_softc *xbb; 3840 3841 DPRINTF("\n"); 3842 3843 xbb = device_get_softc(dev); 3844 mtx_lock(&xbb->lock); 3845 while (xbb_shutdown(xbb) == EAGAIN) { 3846 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3847 "xbb_shutdown", 0); 3848 } 3849 mtx_unlock(&xbb->lock); 3850 3851 DPRINTF("\n"); 3852 3853 if (xbb->io_taskqueue != NULL) 3854 taskqueue_free(xbb->io_taskqueue); 3855 3856 if (xbb->xbb_stats != NULL) 3857 devstat_remove_entry(xbb->xbb_stats); 3858 3859 if (xbb->xbb_stats_in != NULL) 3860 devstat_remove_entry(xbb->xbb_stats_in); 3861 3862 xbb_close_backend(xbb); 3863 3864 if (xbb->dev_mode != NULL) { 3865 free(xbb->dev_mode, M_XENBUS); 3866 xbb->dev_mode = NULL; 3867 } 3868 3869 if (xbb->dev_type != NULL) { 3870 free(xbb->dev_type, M_XENBUS); 3871 xbb->dev_type = NULL; 3872 } 3873 3874 if (xbb->dev_name != NULL) { 3875 free(xbb->dev_name, M_XENBUS); 3876 xbb->dev_name = NULL; 3877 } 3878 3879 mtx_destroy(&xbb->lock); 3880 return (0); 3881 } 3882 3883 /** 3884 * Prepare this block back device for suspension of this VM. 3885 * 3886 * \param dev NewBus device object representing this Xen Block Back instance. 3887 * 3888 * \return 0 for success, errno codes for failure. 3889 */ 3890 static int 3891 xbb_suspend(device_t dev) 3892 { 3893 #ifdef NOT_YET 3894 struct xbb_softc *sc = device_get_softc(dev); 3895 3896 /* Prevent new requests being issued until we fix things up. */ 3897 mtx_lock(&sc->xb_io_lock); 3898 sc->connected = BLKIF_STATE_SUSPENDED; 3899 mtx_unlock(&sc->xb_io_lock); 3900 #endif 3901 3902 return (0); 3903 } 3904 3905 /** 3906 * Perform any processing required to recover from a suspended state. 3907 * 3908 * \param dev NewBus device object representing this Xen Block Back instance. 3909 * 3910 * \return 0 for success, errno codes for failure. 3911 */ 3912 static int 3913 xbb_resume(device_t dev) 3914 { 3915 return (0); 3916 } 3917 3918 /** 3919 * Handle state changes expressed via the XenStore by our front-end peer. 3920 * 3921 * \param dev NewBus device object representing this Xen 3922 * Block Back instance. 3923 * \param frontend_state The new state of the front-end. 3924 * 3925 * \return 0 for success, errno codes for failure. 
3926 */ 3927 static void 3928 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3929 { 3930 struct xbb_softc *xbb = device_get_softc(dev); 3931 3932 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3933 xenbus_strstate(frontend_state), 3934 xenbus_strstate(xenbus_get_state(xbb->dev))); 3935 3936 switch (frontend_state) { 3937 case XenbusStateInitialising: 3938 break; 3939 case XenbusStateInitialised: 3940 case XenbusStateConnected: 3941 xbb_connect(xbb); 3942 break; 3943 case XenbusStateClosing: 3944 case XenbusStateClosed: 3945 mtx_lock(&xbb->lock); 3946 xbb_shutdown(xbb); 3947 mtx_unlock(&xbb->lock); 3948 if (frontend_state == XenbusStateClosed) 3949 xenbus_set_state(xbb->dev, XenbusStateClosed); 3950 break; 3951 default: 3952 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3953 frontend_state); 3954 break; 3955 } 3956 } 3957 3958 /*---------------------------- NewBus Registration ---------------------------*/ 3959 static device_method_t xbb_methods[] = { 3960 /* Device interface */ 3961 DEVMETHOD(device_probe, xbb_probe), 3962 DEVMETHOD(device_attach, xbb_attach), 3963 DEVMETHOD(device_detach, xbb_detach), 3964 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3965 DEVMETHOD(device_suspend, xbb_suspend), 3966 DEVMETHOD(device_resume, xbb_resume), 3967 3968 /* Xenbus interface */ 3969 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3970 3971 { 0, 0 } 3972 }; 3973 3974 static driver_t xbb_driver = { 3975 "xbbd", 3976 xbb_methods, 3977 sizeof(struct xbb_softc), 3978 }; 3979 devclass_t xbb_devclass; 3980 3981 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3982