1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/malloc.h> 47 48 #include <sys/bio.h> 49 #include <sys/bus.h> 50 #include <sys/conf.h> 51 #include <sys/devicestat.h> 52 #include <sys/disk.h> 53 #include <sys/fcntl.h> 54 #include <sys/filedesc.h> 55 #include <sys/kdb.h> 56 #include <sys/module.h> 57 #include <sys/namei.h> 58 #include <sys/proc.h> 59 #include <sys/rman.h> 60 #include <sys/taskqueue.h> 61 #include <sys/types.h> 62 #include <sys/vnode.h> 63 #include <sys/mount.h> 64 #include <sys/sysctl.h> 65 #include <sys/bitstring.h> 66 #include <sys/sdt.h> 67 68 #include <geom/geom.h> 69 70 #include <machine/_inttypes.h> 71 72 #include <vm/vm.h> 73 #include <vm/vm_extern.h> 74 #include <vm/vm_kern.h> 75 76 #include <xen/xen-os.h> 77 #include <xen/blkif.h> 78 #include <xen/gnttab.h> 79 #include <xen/xen_intr.h> 80 81 #include <contrib/xen/event_channel.h> 82 #include <contrib/xen/grant_table.h> 83 84 #include <xen/xenbus/xenbusvar.h> 85 86 /*--------------------------- Compile-time Tunables --------------------------*/ 87 /** 88 * The maximum number of shared memory ring pages we will allow in a 89 * negotiated block-front/back communication channel. Allow enough 90 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 91 */ 92 #define XBB_MAX_RING_PAGES 32 93 94 /** 95 * The maximum number of outstanding request blocks (request headers plus 96 * additional segment blocks) we will allow in a negotiated block-front/back 97 * communication channel. 
98 */ 99 #define XBB_MAX_REQUESTS \ 100 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 101 102 /** 103 * \brief Define to enable rudimentary request logging to the console. 104 */ 105 #undef XBB_DEBUG 106 107 /*---------------------------------- Macros ----------------------------------*/ 108 /** 109 * Custom malloc type for all driver allocations. 110 */ 111 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 112 113 #ifdef XBB_DEBUG 114 #define DPRINTF(fmt, args...) \ 115 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 116 #else 117 #define DPRINTF(fmt, args...) do {} while(0) 118 #endif 119 120 /** 121 * The maximum mapped region size per request we will allow in a negotiated 122 * block-front/back communication channel. 123 * Use old default of MAXPHYS == 128K. 124 */ 125 #define XBB_MAX_REQUEST_SIZE \ 126 MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 127 128 /** 129 * The maximum number of segments (within a request header and accompanying 130 * segment blocks) per request we will allow in a negotiated block-front/back 131 * communication channel. 132 */ 133 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 134 (MIN(UIO_MAXIOV, \ 135 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 136 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 137 138 /** 139 * The maximum number of ring pages that we can allow per request list. 140 * We limit this to the maximum number of segments per request, because 141 * that is already a reasonable number of segments to aggregate. This 142 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 143 * because that would leave situations where we can't dispatch even one 144 * large request. 145 */ 146 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 147 148 #define XBD_SECTOR_SHFT 9 149 150 /*--------------------------- Forward Declarations ---------------------------*/ 151 struct xbb_softc; 152 struct xbb_xen_req; 153 154 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 155 ...) __attribute__((format(printf, 3, 4))); 156 static int xbb_shutdown(struct xbb_softc *xbb); 157 158 /*------------------------------ Data Structures -----------------------------*/ 159 160 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 161 162 typedef enum { 163 XBB_REQLIST_NONE = 0x00, 164 XBB_REQLIST_MAPPED = 0x01 165 } xbb_reqlist_flags; 166 167 struct xbb_xen_reqlist { 168 /** 169 * Back reference to the parent block back instance for this 170 * request. Used during bio_done handling. 171 */ 172 struct xbb_softc *xbb; 173 174 /** 175 * BLKIF_OP code for this request. 176 */ 177 int operation; 178 179 /** 180 * Set to BLKIF_RSP_* to indicate request status. 181 * 182 * This field allows an error status to be recorded even if the 183 * delivery of this status must be deferred. Deferred reporting 184 * is necessary, for example, when an error is detected during 185 * completion processing of one bio when other bios for this 186 * request are still outstanding. 187 */ 188 int status; 189 190 /** 191 * Number of 512 byte sectors not transferred. 192 */ 193 int residual_512b_sectors; 194 195 /** 196 * Starting sector number of the first request in the list. 197 */ 198 off_t starting_sector_number; 199 200 /** 201 * If we're going to coalesce, the next contiguous sector would be 202 * this one. 203 */ 204 off_t next_contig_sector; 205 206 /** 207 * Number of child requests in the list. 208 */ 209 int num_children; 210 211 /** 212 * Number of I/O requests still pending on the backend. 
213 */ 214 int pendcnt; 215 216 /** 217 * Total number of segments for requests in the list. 218 */ 219 int nr_segments; 220 221 /** 222 * Flags for this particular request list. 223 */ 224 xbb_reqlist_flags flags; 225 226 /** 227 * Kernel virtual address space reserved for this request 228 * list structure and used to map the remote domain's pages for 229 * this I/O, into our domain's address space. 230 */ 231 uint8_t *kva; 232 233 /** 234 * Base, pseudo-physical address, corresponding to the start 235 * of this request's kva region. 236 */ 237 uint64_t gnt_base; 238 239 /** 240 * Array of grant handles (one per page) used to map this request. 241 */ 242 grant_handle_t *gnt_handles; 243 244 /** 245 * Device statistics request ordering type (ordered or simple). 246 */ 247 devstat_tag_type ds_tag_type; 248 249 /** 250 * Device statistics request type (read, write, no_data). 251 */ 252 devstat_trans_flags ds_trans_type; 253 254 /** 255 * The start time for this request. 256 */ 257 struct bintime ds_t0; 258 259 /** 260 * Linked list of contiguous requests with the same operation type. 261 */ 262 struct xbb_xen_req_list contig_req_list; 263 264 /** 265 * Linked list links used to aggregate idle requests in the 266 * request list free pool (xbb->reqlist_free_stailq) and pending 267 * requests waiting for execution (xbb->reqlist_pending_stailq). 268 */ 269 STAILQ_ENTRY(xbb_xen_reqlist) links; 270 }; 271 272 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 273 274 /** 275 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 276 */ 277 struct xbb_xen_req { 278 /** 279 * Linked list links used to aggregate requests into a reqlist 280 * and to store them in the request free pool. 281 */ 282 STAILQ_ENTRY(xbb_xen_req) links; 283 284 /** 285 * The remote domain's identifier for this I/O request. 286 */ 287 uint64_t id; 288 289 /** 290 * The number of pages currently mapped for this request. 291 */ 292 int nr_pages; 293 294 /** 295 * The number of 512 byte sectors comprising this requests. 296 */ 297 int nr_512b_sectors; 298 299 /** 300 * BLKIF_OP code for this request. 301 */ 302 int operation; 303 304 /** 305 * Storage used for non-native ring requests. 306 */ 307 blkif_request_t ring_req_storage; 308 309 /** 310 * Pointer to the Xen request in the ring. 311 */ 312 blkif_request_t *ring_req; 313 314 /** 315 * Consumer index for this request. 316 */ 317 RING_IDX req_ring_idx; 318 319 /** 320 * The start time for this request. 321 */ 322 struct bintime ds_t0; 323 324 /** 325 * Pointer back to our parent request list. 326 */ 327 struct xbb_xen_reqlist *reqlist; 328 }; 329 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 330 331 /** 332 * \brief Configuration data for the shared memory request ring 333 * used to communicate with the front-end client of this 334 * this driver. 335 */ 336 struct xbb_ring_config { 337 /** KVA address where ring memory is mapped. */ 338 vm_offset_t va; 339 340 /** The pseudo-physical address where ring memory is mapped.*/ 341 uint64_t gnt_addr; 342 343 /** 344 * Grant table handles, one per-ring page, returned by the 345 * hyperpervisor upon mapping of the ring and required to 346 * unmap it when a connection is torn down. 347 */ 348 grant_handle_t handle[XBB_MAX_RING_PAGES]; 349 350 /** 351 * The device bus address returned by the hypervisor when 352 * mapping the ring and required to unmap it when a connection 353 * is torn down. 354 */ 355 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 356 357 /** The number of ring pages mapped for the current connection. 
*/ 358 u_int ring_pages; 359 360 /** 361 * The grant references, one per-ring page, supplied by the 362 * front-end, allowing us to reference the ring pages in the 363 * front-end's domain and to map these pages into our own domain. 364 */ 365 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 366 367 /** The interrupt driven even channel used to signal ring events. */ 368 evtchn_port_t evtchn; 369 }; 370 371 /** 372 * Per-instance connection state flags. 373 */ 374 typedef enum 375 { 376 /** 377 * The front-end requested a read-only mount of the 378 * back-end device/file. 379 */ 380 XBBF_READ_ONLY = 0x01, 381 382 /** Communication with the front-end has been established. */ 383 XBBF_RING_CONNECTED = 0x02, 384 385 /** 386 * Front-end requests exist in the ring and are waiting for 387 * xbb_xen_req objects to free up. 388 */ 389 XBBF_RESOURCE_SHORTAGE = 0x04, 390 391 /** Connection teardown in progress. */ 392 XBBF_SHUTDOWN = 0x08, 393 394 /** A thread is already performing shutdown processing. */ 395 XBBF_IN_SHUTDOWN = 0x10 396 } xbb_flag_t; 397 398 /** Backend device type. */ 399 typedef enum { 400 /** Backend type unknown. */ 401 XBB_TYPE_NONE = 0x00, 402 403 /** 404 * Backend type disk (access via cdev switch 405 * strategy routine). 406 */ 407 XBB_TYPE_DISK = 0x01, 408 409 /** Backend type file (access vnode operations.). */ 410 XBB_TYPE_FILE = 0x02 411 } xbb_type; 412 413 /** 414 * \brief Structure used to memoize information about a per-request 415 * scatter-gather list. 416 * 417 * The chief benefit of using this data structure is it avoids having 418 * to reparse the possibly discontiguous S/G list in the original 419 * request. Due to the way that the mapping of the memory backing an 420 * I/O transaction is handled by Xen, a second pass is unavoidable. 421 * At least this way the second walk is a simple array traversal. 422 * 423 * \note A single Scatter/Gather element in the block interface covers 424 * at most 1 machine page. In this context a sector (blkif 425 * nomenclature, not what I'd choose) is a 512b aligned unit 426 * of mapping within the machine page referenced by an S/G 427 * element. 428 */ 429 struct xbb_sg { 430 /** The number of 512b data chunks mapped in this S/G element. */ 431 int16_t nsect; 432 433 /** 434 * The index (0 based) of the first 512b data chunk mapped 435 * in this S/G element. 436 */ 437 uint8_t first_sect; 438 439 /** 440 * The index (0 based) of the last 512b data chunk mapped 441 * in this S/G element. 442 */ 443 uint8_t last_sect; 444 }; 445 446 /** 447 * Character device backend specific configuration data. 448 */ 449 struct xbb_dev_data { 450 /** Cdev used for device backend access. */ 451 struct cdev *cdev; 452 453 /** Cdev switch used for device backend access. */ 454 struct cdevsw *csw; 455 456 /** Used to hold a reference on opened cdev backend devices. */ 457 int dev_ref; 458 }; 459 460 /** 461 * File backend specific configuration data. 462 */ 463 struct xbb_file_data { 464 /** Credentials to use for vnode backed (file based) I/O. */ 465 struct ucred *cred; 466 467 /** 468 * \brief Array of io vectors used to process file based I/O. 469 * 470 * Only a single file based request is outstanding per-xbb instance, 471 * so we only need one of these. 472 */ 473 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 474 }; 475 476 /** 477 * Collection of backend type specific data. 478 */ 479 union xbb_backend_data { 480 struct xbb_dev_data dev; 481 struct xbb_file_data file; 482 }; 483 484 /** 485 * Function signature of backend specific I/O handlers. 
486 */ 487 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 488 struct xbb_xen_reqlist *reqlist, int operation, 489 int flags); 490 491 /** 492 * Per-instance configuration data. 493 */ 494 struct xbb_softc { 495 /** 496 * Task-queue used to process I/O requests. 497 */ 498 struct taskqueue *io_taskqueue; 499 500 /** 501 * Single "run the request queue" task enqueued 502 * on io_taskqueue. 503 */ 504 struct task io_task; 505 506 /** Device type for this instance. */ 507 xbb_type device_type; 508 509 /** NewBus device corresponding to this instance. */ 510 device_t dev; 511 512 /** Backend specific dispatch routine for this instance. */ 513 xbb_dispatch_t dispatch_io; 514 515 /** The number of requests outstanding on the backend device/file. */ 516 int active_request_count; 517 518 /** Free pool of request tracking structures. */ 519 struct xbb_xen_req_list request_free_stailq; 520 521 /** Array, sized at connection time, of request tracking structures. */ 522 struct xbb_xen_req *requests; 523 524 /** Free pool of request list structures. */ 525 struct xbb_xen_reqlist_list reqlist_free_stailq; 526 527 /** List of pending request lists awaiting execution. */ 528 struct xbb_xen_reqlist_list reqlist_pending_stailq; 529 530 /** Array, sized at connection time, of request list structures. */ 531 struct xbb_xen_reqlist *request_lists; 532 533 /** 534 * Global pool of kva used for mapping remote domain ring 535 * and I/O transaction data. 536 */ 537 vm_offset_t kva; 538 539 /** Pseudo-physical address corresponding to kva. */ 540 uint64_t gnt_base_addr; 541 542 /** The size of the global kva pool. */ 543 int kva_size; 544 545 /** The size of the KVA area used for request lists. */ 546 int reqlist_kva_size; 547 548 /** The number of pages of KVA used for request lists */ 549 int reqlist_kva_pages; 550 551 /** Bitmap of free KVA pages */ 552 bitstr_t *kva_free; 553 554 /** 555 * \brief Cached value of the front-end's domain id. 556 * 557 * This value is used at once for each mapped page in 558 * a transaction. We cache it to avoid incuring the 559 * cost of an ivar access every time this is needed. 560 */ 561 domid_t otherend_id; 562 563 /** 564 * \brief The blkif protocol abi in effect. 565 * 566 * There are situations where the back and front ends can 567 * have a different, native abi (e.g. intel x86_64 and 568 * 32bit x86 domains on the same machine). The back-end 569 * always accommodates the front-end's native abi. That 570 * value is pulled from the XenStore and recorded here. 571 */ 572 int abi; 573 574 /** 575 * \brief The maximum number of requests and request lists allowed 576 * to be in flight at a time. 577 * 578 * This value is negotiated via the XenStore. 579 */ 580 u_int max_requests; 581 582 /** 583 * \brief The maximum number of segments (1 page per segment) 584 * that can be mapped by a request. 585 * 586 * This value is negotiated via the XenStore. 587 */ 588 u_int max_request_segments; 589 590 /** 591 * \brief Maximum number of segments per request list. 592 * 593 * This value is derived from and will generally be larger than 594 * max_request_segments. 595 */ 596 u_int max_reqlist_segments; 597 598 /** 599 * The maximum size of any request to this back-end 600 * device. 601 * 602 * This value is negotiated via the XenStore. 603 */ 604 u_int max_request_size; 605 606 /** 607 * The maximum size of any request list. This is derived directly 608 * from max_reqlist_segments. 609 */ 610 u_int max_reqlist_size; 611 612 /** Various configuration and state bit flags. 
*/ 613 xbb_flag_t flags; 614 615 /** Ring mapping and interrupt configuration data. */ 616 struct xbb_ring_config ring_config; 617 618 /** Runtime, cross-abi safe, structures for ring access. */ 619 blkif_back_rings_t rings; 620 621 /** IRQ mapping for the communication ring event channel. */ 622 xen_intr_handle_t xen_intr_handle; 623 624 /** 625 * \brief Backend access mode flags (e.g. write, or read-only). 626 * 627 * This value is passed to us by the front-end via the XenStore. 628 */ 629 char *dev_mode; 630 631 /** 632 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 633 * 634 * This value is passed to us by the front-end via the XenStore. 635 * Currently unused. 636 */ 637 char *dev_type; 638 639 /** 640 * \brief Backend device/file identifier. 641 * 642 * This value is passed to us by the front-end via the XenStore. 643 * We expect this to be a POSIX path indicating the file or 644 * device to open. 645 */ 646 char *dev_name; 647 648 /** 649 * Vnode corresponding to the backend device node or file 650 * we are acessing. 651 */ 652 struct vnode *vn; 653 654 union xbb_backend_data backend; 655 656 /** The native sector size of the backend. */ 657 u_int sector_size; 658 659 /** log2 of sector_size. */ 660 u_int sector_size_shift; 661 662 /** Size in bytes of the backend device or file. */ 663 off_t media_size; 664 665 /** 666 * \brief media_size expressed in terms of the backend native 667 * sector size. 668 * 669 * (e.g. xbb->media_size >> xbb->sector_size_shift). 670 */ 671 uint64_t media_num_sectors; 672 673 /** 674 * \brief Array of memoized scatter gather data computed during the 675 * conversion of blkif ring requests to internal xbb_xen_req 676 * structures. 677 * 678 * Ring processing is serialized so we only need one of these. 679 */ 680 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 681 682 /** 683 * Temporary grant table map used in xbb_dispatch_io(). When 684 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 685 * stack could cause a stack overflow. 686 */ 687 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 688 689 /** Mutex protecting per-instance data. */ 690 struct mtx lock; 691 692 /** 693 * Resource representing allocated physical address space 694 * associated with our per-instance kva region. 695 */ 696 struct resource *pseudo_phys_res; 697 698 /** Resource id for allocated physical address space. */ 699 int pseudo_phys_res_id; 700 701 /** 702 * I/O statistics from BlockBack dispatch down. These are 703 * coalesced requests, and we start them right before execution. 704 */ 705 struct devstat *xbb_stats; 706 707 /** 708 * I/O statistics coming into BlockBack. These are the requests as 709 * we get them from BlockFront. They are started as soon as we 710 * receive a request, and completed when the I/O is complete. 
711 */ 712 struct devstat *xbb_stats_in; 713 714 /** Disable sending flush to the backend */ 715 int disable_flush; 716 717 /** Send a real flush for every N flush requests */ 718 int flush_interval; 719 720 /** Count of flush requests in the interval */ 721 int flush_count; 722 723 /** Don't coalesce requests if this is set */ 724 int no_coalesce_reqs; 725 726 /** Number of requests we have received */ 727 uint64_t reqs_received; 728 729 /** Number of requests we have completed*/ 730 uint64_t reqs_completed; 731 732 /** Number of requests we queued but not pushed*/ 733 uint64_t reqs_queued_for_completion; 734 735 /** Number of requests we completed with an error status*/ 736 uint64_t reqs_completed_with_error; 737 738 /** How many forced dispatches (i.e. without coalescing) have happened */ 739 uint64_t forced_dispatch; 740 741 /** How many normal dispatches have happened */ 742 uint64_t normal_dispatch; 743 744 /** How many total dispatches have happened */ 745 uint64_t total_dispatch; 746 747 /** How many times we have run out of KVA */ 748 uint64_t kva_shortages; 749 750 /** How many times we have run out of request structures */ 751 uint64_t request_shortages; 752 753 /** Watch to wait for hotplug script execution */ 754 struct xs_watch hotplug_watch; 755 756 /** Got the needed data from hotplug scripts? */ 757 bool hotplug_done; 758 }; 759 760 /*---------------------------- Request Processing ----------------------------*/ 761 /** 762 * Allocate an internal transaction tracking structure from the free pool. 763 * 764 * \param xbb Per-instance xbb configuration structure. 765 * 766 * \return On success, a pointer to the allocated xbb_xen_req structure. 767 * Otherwise NULL. 768 */ 769 static inline struct xbb_xen_req * 770 xbb_get_req(struct xbb_softc *xbb) 771 { 772 struct xbb_xen_req *req; 773 774 req = NULL; 775 776 mtx_assert(&xbb->lock, MA_OWNED); 777 778 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 779 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 780 xbb->active_request_count++; 781 } 782 783 return (req); 784 } 785 786 /** 787 * Return an allocated transaction tracking structure to the free pool. 788 * 789 * \param xbb Per-instance xbb configuration structure. 790 * \param req The request structure to free. 791 */ 792 static inline void 793 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 794 { 795 mtx_assert(&xbb->lock, MA_OWNED); 796 797 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 798 xbb->active_request_count--; 799 800 KASSERT(xbb->active_request_count >= 0, 801 ("xbb_release_req: negative active count")); 802 } 803 804 /** 805 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 806 * 807 * \param xbb Per-instance xbb configuration structure. 808 * \param req_list The list of requests to free. 809 * \param nreqs The number of items in the list. 810 */ 811 static inline void 812 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 813 int nreqs) 814 { 815 mtx_assert(&xbb->lock, MA_OWNED); 816 817 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 818 xbb->active_request_count -= nreqs; 819 820 KASSERT(xbb->active_request_count >= 0, 821 ("xbb_release_reqs: negative active count")); 822 } 823 824 /** 825 * Given a page index and 512b sector offset within that page, 826 * calculate an offset into a request's kva region. 827 * 828 * \param reqlist The request structure whose kva region will be accessed. 829 * \param pagenr The page index used to compute the kva offset. 
830 * \param sector The 512b sector index used to compute the page relative 831 * kva offset. 832 * 833 * \return The computed global KVA offset. 834 */ 835 static inline uint8_t * 836 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 837 { 838 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 839 } 840 841 /** 842 * Given a page number and 512b sector offset within that page, 843 * calculate an offset into the request's memory region that the 844 * underlying backend device/file should use for I/O. 845 * 846 * \param reqlist The request structure whose I/O region will be accessed. 847 * \param pagenr The page index used to compute the I/O offset. 848 * \param sector The 512b sector index used to compute the page relative 849 * I/O offset. 850 * 851 * \return The computed global I/O address. 852 * 853 * Depending on configuration, this will either be a local bounce buffer 854 * or a pointer to the memory mapped in from the front-end domain for 855 * this request. 856 */ 857 static inline uint8_t * 858 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 859 { 860 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 861 } 862 863 /** 864 * Given a page index and 512b sector offset within that page, calculate 865 * an offset into the local pseudo-physical address space used to map a 866 * front-end's request data into a request. 867 * 868 * \param reqlist The request list structure whose pseudo-physical region 869 * will be accessed. 870 * \param pagenr The page index used to compute the pseudo-physical offset. 871 * \param sector The 512b sector index used to compute the page relative 872 * pseudo-physical offset. 873 * 874 * \return The computed global pseudo-phsyical address. 875 * 876 * Depending on configuration, this will either be a local bounce buffer 877 * or a pointer to the memory mapped in from the front-end domain for 878 * this request. 879 */ 880 static inline uintptr_t 881 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 882 { 883 struct xbb_softc *xbb; 884 885 xbb = reqlist->xbb; 886 887 return ((uintptr_t)(xbb->gnt_base_addr + 888 (uintptr_t)(reqlist->kva - xbb->kva) + 889 (PAGE_SIZE * pagenr) + (sector << 9))); 890 } 891 892 /** 893 * Get Kernel Virtual Address space for mapping requests. 894 * 895 * \param xbb Per-instance xbb configuration structure. 896 * \param nr_pages Number of pages needed. 897 * \param check_only If set, check for free KVA but don't allocate it. 898 * \param have_lock If set, xbb lock is already held. 899 * 900 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 901 * 902 * Note: This should be unnecessary once we have either chaining or 903 * scatter/gather support for struct bio. At that point we'll be able to 904 * put multiple addresses and lengths in one bio/bio chain and won't need 905 * to map everything into one virtual segment. 906 */ 907 static uint8_t * 908 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 909 { 910 int first_clear; 911 int num_clear; 912 uint8_t *free_kva; 913 int i; 914 915 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 916 917 first_clear = 0; 918 free_kva = NULL; 919 920 mtx_lock(&xbb->lock); 921 922 /* 923 * Look for the first available page. If there are none, we're done. 
924 */ 925 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 926 927 if (first_clear == -1) 928 goto bailout; 929 930 /* 931 * Starting at the first available page, look for consecutive free 932 * pages that will satisfy the user's request. 933 */ 934 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 935 /* 936 * If this is true, the page is used, so we have to reset 937 * the number of clear pages and the first clear page 938 * (since it pointed to a region with an insufficient number 939 * of clear pages). 940 */ 941 if (bit_test(xbb->kva_free, i)) { 942 num_clear = 0; 943 first_clear = -1; 944 continue; 945 } 946 947 if (first_clear == -1) 948 first_clear = i; 949 950 /* 951 * If this is true, we've found a large enough free region 952 * to satisfy the request. 953 */ 954 if (++num_clear == nr_pages) { 955 bit_nset(xbb->kva_free, first_clear, 956 first_clear + nr_pages - 1); 957 958 free_kva = xbb->kva + 959 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 960 961 KASSERT(free_kva >= (uint8_t *)xbb->kva && 962 free_kva + (nr_pages * PAGE_SIZE) <= 963 (uint8_t *)xbb->ring_config.va, 964 ("Free KVA %p len %d out of range, " 965 "kva = %#jx, ring VA = %#jx\n", free_kva, 966 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 967 (uintmax_t)xbb->ring_config.va)); 968 break; 969 } 970 } 971 972 bailout: 973 974 if (free_kva == NULL) { 975 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 976 xbb->kva_shortages++; 977 } 978 979 mtx_unlock(&xbb->lock); 980 981 return (free_kva); 982 } 983 984 /** 985 * Free allocated KVA. 986 * 987 * \param xbb Per-instance xbb configuration structure. 988 * \param kva_ptr Pointer to allocated KVA region. 989 * \param nr_pages Number of pages in the KVA region. 990 */ 991 static void 992 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 993 { 994 intptr_t start_page; 995 996 mtx_assert(&xbb->lock, MA_OWNED); 997 998 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 999 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1000 1001 } 1002 1003 /** 1004 * Unmap the front-end pages associated with this I/O request. 1005 * 1006 * \param req The request structure to unmap. 1007 */ 1008 static void 1009 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1010 { 1011 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1012 u_int i; 1013 u_int invcount; 1014 int error __diagused; 1015 1016 invcount = 0; 1017 for (i = 0; i < reqlist->nr_segments; i++) { 1018 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1019 continue; 1020 1021 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1022 unmap[invcount].dev_bus_addr = 0; 1023 unmap[invcount].handle = reqlist->gnt_handles[i]; 1024 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1025 invcount++; 1026 } 1027 1028 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1029 unmap, invcount); 1030 KASSERT(error == 0, ("Grant table operation failed")); 1031 } 1032 1033 /** 1034 * Allocate an internal transaction tracking structure from the free pool. 1035 * 1036 * \param xbb Per-instance xbb configuration structure. 1037 * 1038 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1039 * Otherwise NULL. 
1040 */ 1041 static inline struct xbb_xen_reqlist * 1042 xbb_get_reqlist(struct xbb_softc *xbb) 1043 { 1044 struct xbb_xen_reqlist *reqlist; 1045 1046 reqlist = NULL; 1047 1048 mtx_assert(&xbb->lock, MA_OWNED); 1049 1050 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1051 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1052 reqlist->flags = XBB_REQLIST_NONE; 1053 reqlist->kva = NULL; 1054 reqlist->status = BLKIF_RSP_OKAY; 1055 reqlist->residual_512b_sectors = 0; 1056 reqlist->num_children = 0; 1057 reqlist->nr_segments = 0; 1058 STAILQ_INIT(&reqlist->contig_req_list); 1059 } 1060 1061 return (reqlist); 1062 } 1063 1064 /** 1065 * Return an allocated transaction tracking structure to the free pool. 1066 * 1067 * \param xbb Per-instance xbb configuration structure. 1068 * \param req The request list structure to free. 1069 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1070 * during a resource shortage condition. 1071 */ 1072 static inline void 1073 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1074 int wakeup) 1075 { 1076 1077 mtx_assert(&xbb->lock, MA_OWNED); 1078 1079 if (wakeup) { 1080 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1081 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1082 } 1083 1084 if (reqlist->kva != NULL) 1085 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1086 1087 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1088 1089 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1090 1091 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1092 /* 1093 * Shutdown is in progress. See if we can 1094 * progress further now that one more request 1095 * has completed and been returned to the 1096 * free pool. 1097 */ 1098 xbb_shutdown(xbb); 1099 } 1100 1101 if (wakeup != 0) 1102 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1103 } 1104 1105 /** 1106 * Request resources and do basic request setup. 1107 * 1108 * \param xbb Per-instance xbb configuration structure. 1109 * \param reqlist Pointer to reqlist pointer. 1110 * \param ring_req Pointer to a block ring request. 1111 * \param ring_index The ring index of this request. 1112 * 1113 * \return 0 for success, non-zero for failure. 1114 */ 1115 static int 1116 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1117 blkif_request_t *ring_req, RING_IDX ring_idx) 1118 { 1119 struct xbb_xen_reqlist *nreqlist; 1120 struct xbb_xen_req *nreq; 1121 1122 nreqlist = NULL; 1123 nreq = NULL; 1124 1125 mtx_lock(&xbb->lock); 1126 1127 /* 1128 * We don't allow new resources to be allocated if we're in the 1129 * process of shutting down. 1130 */ 1131 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1132 mtx_unlock(&xbb->lock); 1133 return (1); 1134 } 1135 1136 /* 1137 * Allocate a reqlist if the caller doesn't have one already. 1138 */ 1139 if (*reqlist == NULL) { 1140 nreqlist = xbb_get_reqlist(xbb); 1141 if (nreqlist == NULL) 1142 goto bailout_error; 1143 } 1144 1145 /* We always allocate a request. 
*/ 1146 nreq = xbb_get_req(xbb); 1147 if (nreq == NULL) 1148 goto bailout_error; 1149 1150 mtx_unlock(&xbb->lock); 1151 1152 if (*reqlist == NULL) { 1153 *reqlist = nreqlist; 1154 nreqlist->operation = ring_req->operation; 1155 nreqlist->starting_sector_number = 1156 (ring_req->sector_number << XBD_SECTOR_SHFT) >> 1157 xbb->sector_size_shift; 1158 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1159 links); 1160 } 1161 1162 nreq->reqlist = *reqlist; 1163 nreq->req_ring_idx = ring_idx; 1164 nreq->id = ring_req->id; 1165 nreq->operation = ring_req->operation; 1166 1167 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1168 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1169 nreq->ring_req = &nreq->ring_req_storage; 1170 } else { 1171 nreq->ring_req = ring_req; 1172 } 1173 1174 binuptime(&nreq->ds_t0); 1175 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1176 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1177 (*reqlist)->num_children++; 1178 (*reqlist)->nr_segments += ring_req->nr_segments; 1179 1180 return (0); 1181 1182 bailout_error: 1183 1184 /* 1185 * We're out of resources, so set the shortage flag. The next time 1186 * a request is released, we'll try waking up the work thread to 1187 * see if we can allocate more resources. 1188 */ 1189 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1190 xbb->request_shortages++; 1191 1192 if (nreq != NULL) 1193 xbb_release_req(xbb, nreq); 1194 1195 if (nreqlist != NULL) 1196 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1197 1198 mtx_unlock(&xbb->lock); 1199 1200 return (1); 1201 } 1202 1203 /** 1204 * Create and queue a response to a blkif request. 1205 * 1206 * \param xbb Per-instance xbb configuration structure. 1207 * \param req The request structure to which to respond. 1208 * \param status The status code to report. See BLKIF_RSP_* 1209 * in sys/contrib/xen/io/blkif.h. 1210 */ 1211 static void 1212 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1213 { 1214 blkif_response_t *resp; 1215 1216 /* 1217 * The mutex is required here, and should be held across this call 1218 * until after the subsequent call to xbb_push_responses(). This 1219 * is to guarantee that another context won't queue responses and 1220 * push them while we're active. 1221 * 1222 * That could lead to the other end being notified of responses 1223 * before the resources have been freed on this end. The other end 1224 * would then be able to queue additional I/O, and we may run out 1225 * of resources because we haven't freed them all yet. 1226 */ 1227 mtx_assert(&xbb->lock, MA_OWNED); 1228 1229 /* 1230 * Place on the response ring for the relevant domain. 1231 * For now, only the spacing between entries is different 1232 * in the different ABIs, not the response entry layout. 
1233 */ 1234 switch (xbb->abi) { 1235 case BLKIF_PROTOCOL_NATIVE: 1236 resp = RING_GET_RESPONSE(&xbb->rings.native, 1237 xbb->rings.native.rsp_prod_pvt); 1238 break; 1239 case BLKIF_PROTOCOL_X86_32: 1240 resp = (blkif_response_t *) 1241 RING_GET_RESPONSE(&xbb->rings.x86_32, 1242 xbb->rings.x86_32.rsp_prod_pvt); 1243 break; 1244 case BLKIF_PROTOCOL_X86_64: 1245 resp = (blkif_response_t *) 1246 RING_GET_RESPONSE(&xbb->rings.x86_64, 1247 xbb->rings.x86_64.rsp_prod_pvt); 1248 break; 1249 default: 1250 panic("Unexpected blkif protocol ABI."); 1251 } 1252 1253 resp->id = req->id; 1254 resp->operation = req->operation; 1255 resp->status = status; 1256 1257 if (status != BLKIF_RSP_OKAY) 1258 xbb->reqs_completed_with_error++; 1259 1260 xbb->rings.common.rsp_prod_pvt++; 1261 1262 xbb->reqs_queued_for_completion++; 1263 1264 } 1265 1266 /** 1267 * Send queued responses to blkif requests. 1268 * 1269 * \param xbb Per-instance xbb configuration structure. 1270 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1271 * should be run, 0 if it does not need to be run. 1272 * \param notify Flag that is set to 1 if the other end should be 1273 * notified via irq, 0 if the other end should not be 1274 * notified. 1275 */ 1276 static void 1277 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1278 { 1279 int more_to_do; 1280 1281 /* 1282 * The mutex is required here. 1283 */ 1284 mtx_assert(&xbb->lock, MA_OWNED); 1285 1286 more_to_do = 0; 1287 1288 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1289 1290 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1291 /* 1292 * Tail check for pending requests. Allows frontend to avoid 1293 * notifications if requests are already in flight (lower 1294 * overheads and promotes batching). 1295 */ 1296 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1297 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1298 more_to_do = 1; 1299 } 1300 1301 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1302 xbb->reqs_queued_for_completion = 0; 1303 1304 *run_taskqueue = more_to_do; 1305 } 1306 1307 /** 1308 * Complete a request list. 1309 * 1310 * \param xbb Per-instance xbb configuration structure. 1311 * \param reqlist Allocated internal request list structure. 1312 */ 1313 static void 1314 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1315 { 1316 struct xbb_xen_req *nreq; 1317 off_t sectors_sent; 1318 int notify, run_taskqueue; 1319 1320 sectors_sent = 0; 1321 1322 if (reqlist->flags & XBB_REQLIST_MAPPED) 1323 xbb_unmap_reqlist(reqlist); 1324 1325 mtx_lock(&xbb->lock); 1326 1327 /* 1328 * All I/O is done, send the response. A lock is not necessary 1329 * to protect the request list, because all requests have 1330 * completed. Therefore this is the only context accessing this 1331 * reqlist right now. However, in order to make sure that no one 1332 * else queues responses onto the queue or pushes them to the other 1333 * side while we're active, we need to hold the lock across the 1334 * calls to xbb_queue_response() and xbb_push_responses(). 1335 */ 1336 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1337 off_t cur_sectors_sent; 1338 1339 /* Put this response on the ring, but don't push yet */ 1340 xbb_queue_response(xbb, nreq, reqlist->status); 1341 1342 /* We don't report bytes sent if there is an error. 
*/ 1343 if (reqlist->status == BLKIF_RSP_OKAY) 1344 cur_sectors_sent = nreq->nr_512b_sectors; 1345 else 1346 cur_sectors_sent = 0; 1347 1348 sectors_sent += cur_sectors_sent; 1349 1350 devstat_end_transaction(xbb->xbb_stats_in, 1351 /*bytes*/cur_sectors_sent << 9, 1352 reqlist->ds_tag_type, 1353 reqlist->ds_trans_type, 1354 /*now*/NULL, 1355 /*then*/&nreq->ds_t0); 1356 } 1357 1358 /* 1359 * Take out any sectors not sent. If we wind up negative (which 1360 * might happen if an error is reported as well as a residual), just 1361 * report 0 sectors sent. 1362 */ 1363 sectors_sent -= reqlist->residual_512b_sectors; 1364 if (sectors_sent < 0) 1365 sectors_sent = 0; 1366 1367 devstat_end_transaction(xbb->xbb_stats, 1368 /*bytes*/ sectors_sent << 9, 1369 reqlist->ds_tag_type, 1370 reqlist->ds_trans_type, 1371 /*now*/NULL, 1372 /*then*/&reqlist->ds_t0); 1373 1374 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1375 1376 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1377 1378 mtx_unlock(&xbb->lock); 1379 1380 if (run_taskqueue) 1381 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1382 1383 if (notify) 1384 xen_intr_signal(xbb->xen_intr_handle); 1385 } 1386 1387 /** 1388 * Completion handler for buffer I/O requests issued by the device 1389 * backend driver. 1390 * 1391 * \param bio The buffer I/O request on which to perform completion 1392 * processing. 1393 */ 1394 static void 1395 xbb_bio_done(struct bio *bio) 1396 { 1397 struct xbb_softc *xbb; 1398 struct xbb_xen_reqlist *reqlist; 1399 1400 reqlist = bio->bio_caller1; 1401 xbb = reqlist->xbb; 1402 1403 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1404 1405 /* 1406 * This is a bit imprecise. With aggregated I/O a single 1407 * request list can contain multiple front-end requests and 1408 * a multiple bios may point to a single request. By carefully 1409 * walking the request list, we could map residuals and errors 1410 * back to the original front-end request, but the interface 1411 * isn't sufficiently rich for us to properly report the error. 1412 * So, we just treat the entire request list as having failed if an 1413 * error occurs on any part. And, if an error occurs, we treat 1414 * the amount of data transferred as 0. 1415 * 1416 * For residuals, we report it on the overall aggregated device, 1417 * but not on the individual requests, since we don't currently 1418 * do the work to determine which front-end request to which the 1419 * residual applies. 1420 */ 1421 if (bio->bio_error) { 1422 DPRINTF("BIO returned error %d for operation on device %s\n", 1423 bio->bio_error, xbb->dev_name); 1424 reqlist->status = BLKIF_RSP_ERROR; 1425 1426 if (bio->bio_error == ENXIO 1427 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1428 /* 1429 * Backend device has disappeared. Signal the 1430 * front-end that we (the device proxy) want to 1431 * go away. 1432 */ 1433 xenbus_set_state(xbb->dev, XenbusStateClosing); 1434 } 1435 } 1436 1437 /* 1438 * Decrement the pending count for the request list. When we're 1439 * done with the requests, send status back for all of them. 1440 */ 1441 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1442 xbb_complete_reqlist(xbb, reqlist); 1443 1444 g_destroy_bio(bio); 1445 } 1446 1447 /** 1448 * Parse a blkif request into an internal request structure and send 1449 * it to the backend for processing. 1450 * 1451 * \param xbb Per-instance xbb configuration structure. 1452 * \param reqlist Allocated internal request list structure. 1453 * 1454 * \return On success, 0. 
For resource shortages, non-zero. 1455 * 1456 * This routine performs the backend common aspects of request parsing 1457 * including compiling an internal request structure, parsing the S/G 1458 * list and any secondary ring requests in which they may reside, and 1459 * the mapping of front-end I/O pages into our domain. 1460 */ 1461 static int 1462 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1463 { 1464 struct xbb_sg *xbb_sg; 1465 struct gnttab_map_grant_ref *map; 1466 struct blkif_request_segment *sg; 1467 struct blkif_request_segment *last_block_sg; 1468 struct xbb_xen_req *nreq; 1469 u_int nseg; 1470 u_int seg_idx; 1471 u_int block_segs; 1472 int nr_sects; 1473 int total_sects; 1474 int operation; 1475 uint8_t bio_flags; 1476 int error; 1477 1478 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1479 bio_flags = 0; 1480 total_sects = 0; 1481 nr_sects = 0; 1482 1483 /* 1484 * First determine whether we have enough free KVA to satisfy this 1485 * request list. If not, tell xbb_run_queue() so it can go to 1486 * sleep until we have more KVA. 1487 */ 1488 reqlist->kva = NULL; 1489 if (reqlist->nr_segments != 0) { 1490 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1491 if (reqlist->kva == NULL) { 1492 /* 1493 * If we're out of KVA, return ENOMEM. 1494 */ 1495 return (ENOMEM); 1496 } 1497 } 1498 1499 binuptime(&reqlist->ds_t0); 1500 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1501 1502 switch (reqlist->operation) { 1503 case BLKIF_OP_WRITE_BARRIER: 1504 bio_flags |= BIO_ORDERED; 1505 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1506 /* FALLTHROUGH */ 1507 case BLKIF_OP_WRITE: 1508 operation = BIO_WRITE; 1509 reqlist->ds_trans_type = DEVSTAT_WRITE; 1510 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1511 DPRINTF("Attempt to write to read only device %s\n", 1512 xbb->dev_name); 1513 reqlist->status = BLKIF_RSP_ERROR; 1514 goto send_response; 1515 } 1516 break; 1517 case BLKIF_OP_READ: 1518 operation = BIO_READ; 1519 reqlist->ds_trans_type = DEVSTAT_READ; 1520 break; 1521 case BLKIF_OP_FLUSH_DISKCACHE: 1522 /* 1523 * If this is true, the user has requested that we disable 1524 * flush support. So we just complete the requests 1525 * successfully. 1526 */ 1527 if (xbb->disable_flush != 0) { 1528 goto send_response; 1529 } 1530 1531 /* 1532 * The user has requested that we only send a real flush 1533 * for every N flush requests. So keep count, and either 1534 * complete the request immediately or queue it for the 1535 * backend. 1536 */ 1537 if (xbb->flush_interval != 0) { 1538 if (++(xbb->flush_count) < xbb->flush_interval) { 1539 goto send_response; 1540 } else 1541 xbb->flush_count = 0; 1542 } 1543 1544 operation = BIO_FLUSH; 1545 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1546 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1547 goto do_dispatch; 1548 /*NOTREACHED*/ 1549 default: 1550 DPRINTF("error: unknown block io operation [%d]\n", 1551 reqlist->operation); 1552 reqlist->status = BLKIF_RSP_ERROR; 1553 goto send_response; 1554 } 1555 1556 reqlist->xbb = xbb; 1557 xbb_sg = xbb->xbb_sgs; 1558 map = xbb->maps; 1559 seg_idx = 0; 1560 1561 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1562 blkif_request_t *ring_req; 1563 1564 ring_req = nreq->ring_req; 1565 nr_sects = 0; 1566 nseg = ring_req->nr_segments; 1567 nreq->nr_pages = nseg; 1568 nreq->nr_512b_sectors = 0; 1569 sg = NULL; 1570 1571 /* Check that number of segments is sane. 
*/ 1572 if (__predict_false(nseg == 0) 1573 || __predict_false(nseg > xbb->max_request_segments)) { 1574 DPRINTF("Bad number of segments in request (%d)\n", 1575 nseg); 1576 reqlist->status = BLKIF_RSP_ERROR; 1577 goto send_response; 1578 } 1579 1580 block_segs = nseg; 1581 sg = ring_req->seg; 1582 last_block_sg = sg + block_segs; 1583 1584 while (sg < last_block_sg) { 1585 KASSERT(seg_idx < 1586 XBB_MAX_SEGMENTS_PER_REQLIST, 1587 ("seg_idx %d is too large, max " 1588 "segs %d\n", seg_idx, 1589 XBB_MAX_SEGMENTS_PER_REQLIST)); 1590 1591 xbb_sg->first_sect = sg->first_sect; 1592 xbb_sg->last_sect = sg->last_sect; 1593 xbb_sg->nsect = 1594 (int8_t)(sg->last_sect - 1595 sg->first_sect + 1); 1596 1597 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1598 || (xbb_sg->nsect <= 0)) { 1599 reqlist->status = BLKIF_RSP_ERROR; 1600 goto send_response; 1601 } 1602 1603 nr_sects += xbb_sg->nsect; 1604 map->host_addr = xbb_get_gntaddr(reqlist, 1605 seg_idx, /*sector*/0); 1606 KASSERT(map->host_addr + PAGE_SIZE <= 1607 xbb->ring_config.gnt_addr, 1608 ("Host address %#jx len %d overlaps " 1609 "ring address %#jx\n", 1610 (uintmax_t)map->host_addr, PAGE_SIZE, 1611 (uintmax_t)xbb->ring_config.gnt_addr)); 1612 1613 map->flags = GNTMAP_host_map; 1614 map->ref = sg->gref; 1615 map->dom = xbb->otherend_id; 1616 if (operation == BIO_WRITE) 1617 map->flags |= GNTMAP_readonly; 1618 sg++; 1619 map++; 1620 xbb_sg++; 1621 seg_idx++; 1622 } 1623 1624 /* Convert to the disk's sector size */ 1625 nreq->nr_512b_sectors = nr_sects; 1626 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1627 total_sects += nr_sects; 1628 1629 if ((nreq->nr_512b_sectors & 1630 ((xbb->sector_size >> 9) - 1)) != 0) { 1631 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1632 "a multiple of the backing store sector " 1633 "size (%d)\n", __func__, 1634 nreq->nr_512b_sectors << 9, 1635 xbb->sector_size); 1636 reqlist->status = BLKIF_RSP_ERROR; 1637 goto send_response; 1638 } 1639 } 1640 1641 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1642 xbb->maps, reqlist->nr_segments); 1643 if (error != 0) 1644 panic("Grant table operation failed (%d)", error); 1645 1646 reqlist->flags |= XBB_REQLIST_MAPPED; 1647 1648 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1649 seg_idx++, map++){ 1650 if (__predict_false(map->status != 0)) { 1651 DPRINTF("invalid buffer -- could not remap " 1652 "it (%d)\n", map->status); 1653 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1654 "0x%x ref 0x%x, dom %d\n", seg_idx, 1655 map->host_addr, map->flags, map->ref, 1656 map->dom); 1657 reqlist->status = BLKIF_RSP_ERROR; 1658 goto send_response; 1659 } 1660 1661 reqlist->gnt_handles[seg_idx] = map->handle; 1662 } 1663 if (reqlist->starting_sector_number + total_sects > 1664 xbb->media_num_sectors) { 1665 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1666 "extends past end of device %s\n", 1667 operation == BIO_READ ? 
"read" : "write", 1668 reqlist->starting_sector_number, 1669 reqlist->starting_sector_number + total_sects, 1670 xbb->dev_name); 1671 reqlist->status = BLKIF_RSP_ERROR; 1672 goto send_response; 1673 } 1674 1675 do_dispatch: 1676 1677 error = xbb->dispatch_io(xbb, 1678 reqlist, 1679 operation, 1680 bio_flags); 1681 1682 if (error != 0) { 1683 reqlist->status = BLKIF_RSP_ERROR; 1684 goto send_response; 1685 } 1686 1687 return (0); 1688 1689 send_response: 1690 1691 xbb_complete_reqlist(xbb, reqlist); 1692 1693 return (0); 1694 } 1695 1696 static __inline int 1697 xbb_count_sects(blkif_request_t *ring_req) 1698 { 1699 int i; 1700 int cur_size = 0; 1701 1702 for (i = 0; i < ring_req->nr_segments; i++) { 1703 int nsect; 1704 1705 nsect = (int8_t)(ring_req->seg[i].last_sect - 1706 ring_req->seg[i].first_sect + 1); 1707 if (nsect <= 0) 1708 break; 1709 1710 cur_size += nsect; 1711 } 1712 1713 return (cur_size); 1714 } 1715 1716 /** 1717 * Process incoming requests from the shared communication ring in response 1718 * to a signal on the ring's event channel. 1719 * 1720 * \param context Callback argument registerd during task initialization - 1721 * the xbb_softc for this instance. 1722 * \param pending The number of taskqueue_enqueue events that have 1723 * occurred since this handler was last run. 1724 */ 1725 static void 1726 xbb_run_queue(void *context, int pending) 1727 { 1728 struct xbb_softc *xbb; 1729 blkif_back_rings_t *rings; 1730 RING_IDX rp; 1731 uint64_t cur_sector; 1732 int cur_operation; 1733 struct xbb_xen_reqlist *reqlist; 1734 1735 xbb = (struct xbb_softc *)context; 1736 rings = &xbb->rings; 1737 1738 /* 1739 * Work gather and dispatch loop. Note that we have a bias here 1740 * towards gathering I/O sent by blockfront. We first gather up 1741 * everything in the ring, as long as we have resources. Then we 1742 * dispatch one request, and then attempt to gather up any 1743 * additional requests that have come in while we were dispatching 1744 * the request. 1745 * 1746 * This allows us to get a clearer picture (via devstat) of how 1747 * many requests blockfront is queueing to us at any given time. 1748 */ 1749 for (;;) { 1750 int retval; 1751 1752 /* 1753 * Initialize reqlist to the last element in the pending 1754 * queue, if there is one. This allows us to add more 1755 * requests to that request list, if we have room. 1756 */ 1757 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1758 xbb_xen_reqlist, links); 1759 if (reqlist != NULL) { 1760 cur_sector = reqlist->next_contig_sector; 1761 cur_operation = reqlist->operation; 1762 } else { 1763 cur_operation = 0; 1764 cur_sector = 0; 1765 } 1766 1767 /* 1768 * Cache req_prod to avoid accessing a cache line shared 1769 * with the frontend. 1770 */ 1771 rp = rings->common.sring->req_prod; 1772 1773 /* Ensure we see queued requests up to 'rp'. */ 1774 rmb(); 1775 1776 /** 1777 * Run so long as there is work to consume and the generation 1778 * of a response will not overflow the ring. 1779 * 1780 * @note There's a 1 to 1 relationship between requests and 1781 * responses, so an overflow should never occur. This 1782 * test is to protect our domain from digesting bogus 1783 * data. Shouldn't we log this? 
1784 */ 1785 while (rings->common.req_cons != rp 1786 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1787 rings->common.req_cons) == 0){ 1788 blkif_request_t ring_req_storage; 1789 blkif_request_t *ring_req; 1790 int cur_size; 1791 1792 switch (xbb->abi) { 1793 case BLKIF_PROTOCOL_NATIVE: 1794 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1795 rings->common.req_cons); 1796 break; 1797 case BLKIF_PROTOCOL_X86_32: 1798 { 1799 struct blkif_x86_32_request *ring_req32; 1800 1801 ring_req32 = RING_GET_REQUEST( 1802 &xbb->rings.x86_32, rings->common.req_cons); 1803 blkif_get_x86_32_req(&ring_req_storage, 1804 ring_req32); 1805 ring_req = &ring_req_storage; 1806 break; 1807 } 1808 case BLKIF_PROTOCOL_X86_64: 1809 { 1810 struct blkif_x86_64_request *ring_req64; 1811 1812 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1813 rings->common.req_cons); 1814 blkif_get_x86_64_req(&ring_req_storage, 1815 ring_req64); 1816 ring_req = &ring_req_storage; 1817 break; 1818 } 1819 default: 1820 panic("Unexpected blkif protocol ABI."); 1821 /* NOTREACHED */ 1822 } 1823 1824 /* 1825 * Check for situations that would require closing 1826 * off this I/O for further coalescing: 1827 * - Coalescing is turned off. 1828 * - Current I/O is out of sequence with the previous 1829 * I/O. 1830 * - Coalesced I/O would be too large. 1831 */ 1832 if ((reqlist != NULL) 1833 && ((xbb->no_coalesce_reqs != 0) 1834 || ((xbb->no_coalesce_reqs == 0) 1835 && ((ring_req->sector_number != cur_sector) 1836 || (ring_req->operation != cur_operation) 1837 || ((ring_req->nr_segments + reqlist->nr_segments) > 1838 xbb->max_reqlist_segments))))) { 1839 reqlist = NULL; 1840 } 1841 1842 /* 1843 * Grab and check for all resources in one shot. 1844 * If we can't get all of the resources we need, 1845 * the shortage is noted and the thread will get 1846 * woken up when more resources are available. 1847 */ 1848 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1849 xbb->rings.common.req_cons); 1850 1851 if (retval != 0) { 1852 /* 1853 * Resource shortage has been recorded. 1854 * We'll be scheduled to run once a request 1855 * object frees up due to a completion. 1856 */ 1857 break; 1858 } 1859 1860 /* 1861 * Signify that we can overwrite this request with 1862 * a response by incrementing our consumer index. 1863 * The response won't be generated until after 1864 * we've already consumed all necessary data out 1865 * of the version of the request in the ring buffer 1866 * (for native mode). We must update the consumer 1867 * index before issuing back-end I/O so there is 1868 * no possibility that it will complete and a 1869 * response be generated before we make room in 1870 * the queue for that response. 1871 */ 1872 xbb->rings.common.req_cons++; 1873 xbb->reqs_received++; 1874 1875 cur_size = xbb_count_sects(ring_req); 1876 cur_sector = ring_req->sector_number + cur_size; 1877 reqlist->next_contig_sector = cur_sector; 1878 cur_operation = ring_req->operation; 1879 } 1880 1881 /* Check for I/O to dispatch */ 1882 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1883 if (reqlist == NULL) { 1884 /* 1885 * We're out of work to do, put the task queue to 1886 * sleep. 1887 */ 1888 break; 1889 } 1890 1891 /* 1892 * Grab the first request off the queue and attempt 1893 * to dispatch it. 1894 */ 1895 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1896 1897 retval = xbb_dispatch_io(xbb, reqlist); 1898 if (retval != 0) { 1899 /* 1900 * xbb_dispatch_io() returns non-zero only when 1901 * there is a resource shortage. 
If that's the 1902 * case, re-queue this request on the head of the 1903 * queue, and go to sleep until we have more 1904 * resources. 1905 */ 1906 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1907 reqlist, links); 1908 break; 1909 } else { 1910 /* 1911 * If we still have anything on the queue after 1912 * removing the head entry, that is because we 1913 * met one of the criteria to create a new 1914 * request list (outlined above), and we'll call 1915 * that a forced dispatch for statistical purposes. 1916 * 1917 * Otherwise, if there is only one element on the 1918 * queue, we coalesced everything available on 1919 * the ring and we'll call that a normal dispatch. 1920 */ 1921 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1922 1923 if (reqlist != NULL) 1924 xbb->forced_dispatch++; 1925 else 1926 xbb->normal_dispatch++; 1927 1928 xbb->total_dispatch++; 1929 } 1930 } 1931 } 1932 1933 /** 1934 * Interrupt handler bound to the shared ring's event channel. 1935 * 1936 * \param arg Callback argument registerd during event channel 1937 * binding - the xbb_softc for this instance. 1938 */ 1939 static int 1940 xbb_filter(void *arg) 1941 { 1942 struct xbb_softc *xbb; 1943 1944 /* Defer to taskqueue thread. */ 1945 xbb = (struct xbb_softc *)arg; 1946 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1947 1948 return (FILTER_HANDLED); 1949 } 1950 1951 SDT_PROVIDER_DEFINE(xbb); 1952 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 1953 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 1954 "uint64_t"); 1955 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 1956 "uint64_t", "uint64_t"); 1957 1958 /*----------------------------- Backend Handlers -----------------------------*/ 1959 /** 1960 * Backend handler for character device access. 1961 * 1962 * \param xbb Per-instance xbb configuration structure. 1963 * \param reqlist Allocated internal request list structure. 1964 * \param operation BIO_* I/O operation code. 1965 * \param bio_flags Additional bio_flag data to pass to any generated 1966 * bios (e.g. BIO_ORDERED).. 1967 * 1968 * \return 0 for success, errno codes for failure. 
1969 */ 1970 static int 1971 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1972 int operation, int bio_flags) 1973 { 1974 struct xbb_dev_data *dev_data; 1975 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 1976 off_t bio_offset; 1977 struct bio *bio; 1978 struct xbb_sg *xbb_sg; 1979 u_int nbio; 1980 u_int bio_idx; 1981 u_int nseg; 1982 u_int seg_idx; 1983 int error; 1984 1985 dev_data = &xbb->backend.dev; 1986 bio_offset = (off_t)reqlist->starting_sector_number 1987 << xbb->sector_size_shift; 1988 error = 0; 1989 nbio = 0; 1990 bio_idx = 0; 1991 1992 if (operation == BIO_FLUSH) { 1993 bio = g_new_bio(); 1994 if (__predict_false(bio == NULL)) { 1995 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 1996 error = ENOMEM; 1997 return (error); 1998 } 1999 2000 bio->bio_cmd = BIO_FLUSH; 2001 bio->bio_flags |= BIO_ORDERED; 2002 bio->bio_dev = dev_data->cdev; 2003 bio->bio_offset = 0; 2004 bio->bio_data = 0; 2005 bio->bio_done = xbb_bio_done; 2006 bio->bio_caller1 = reqlist; 2007 bio->bio_pblkno = 0; 2008 2009 reqlist->pendcnt = 1; 2010 2011 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2012 device_get_unit(xbb->dev)); 2013 2014 (*dev_data->csw->d_strategy)(bio); 2015 2016 return (0); 2017 } 2018 2019 xbb_sg = xbb->xbb_sgs; 2020 bio = NULL; 2021 nseg = reqlist->nr_segments; 2022 2023 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2024 /* 2025 * KVA will not be contiguous, so any additional 2026 * I/O will need to be represented in a new bio. 2027 */ 2028 if ((bio != NULL) 2029 && (xbb_sg->first_sect != 0)) { 2030 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2031 printf("%s: Discontiguous I/O request " 2032 "from domain %d ends on " 2033 "non-sector boundary\n", 2034 __func__, xbb->otherend_id); 2035 error = EINVAL; 2036 goto fail_free_bios; 2037 } 2038 bio = NULL; 2039 } 2040 2041 if (bio == NULL) { 2042 /* 2043 * Make sure that the start of this bio is 2044 * aligned to a device sector. 2045 */ 2046 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2047 printf("%s: Misaligned I/O request " 2048 "from domain %d\n", __func__, 2049 xbb->otherend_id); 2050 error = EINVAL; 2051 goto fail_free_bios; 2052 } 2053 2054 bio = bios[nbio++] = g_new_bio(); 2055 if (__predict_false(bio == NULL)) { 2056 error = ENOMEM; 2057 goto fail_free_bios; 2058 } 2059 bio->bio_cmd = operation; 2060 bio->bio_flags |= bio_flags; 2061 bio->bio_dev = dev_data->cdev; 2062 bio->bio_offset = bio_offset; 2063 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2064 xbb_sg->first_sect); 2065 bio->bio_done = xbb_bio_done; 2066 bio->bio_caller1 = reqlist; 2067 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2068 } 2069 2070 bio->bio_length += xbb_sg->nsect << 9; 2071 bio->bio_bcount = bio->bio_length; 2072 bio_offset += xbb_sg->nsect << 9; 2073 2074 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2075 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2076 printf("%s: Discontiguous I/O request " 2077 "from domain %d ends on " 2078 "non-sector boundary\n", 2079 __func__, xbb->otherend_id); 2080 error = EINVAL; 2081 goto fail_free_bios; 2082 } 2083 /* 2084 * KVA will not be contiguous, so any additional 2085 * I/O will need to be represented in a new bio. 
2086 */ 2087 bio = NULL; 2088 } 2089 } 2090 2091 reqlist->pendcnt = nbio; 2092 2093 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2094 { 2095 if (operation == BIO_READ) { 2096 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2097 device_get_unit(xbb->dev), 2098 bios[bio_idx]->bio_offset, 2099 bios[bio_idx]->bio_length); 2100 } else if (operation == BIO_WRITE) { 2101 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2102 device_get_unit(xbb->dev), 2103 bios[bio_idx]->bio_offset, 2104 bios[bio_idx]->bio_length); 2105 } 2106 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2107 } 2108 2109 return (error); 2110 2111 fail_free_bios: 2112 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2113 g_destroy_bio(bios[bio_idx]); 2114 2115 return (error); 2116 } 2117 2118 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2119 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2120 "uint64_t"); 2121 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2122 "uint64_t", "uint64_t"); 2123 2124 /** 2125 * Backend handler for file access. 2126 * 2127 * \param xbb Per-instance xbb configuration structure. 2128 * \param reqlist Allocated internal request list. 2129 * \param operation BIO_* I/O operation code. 2130 * \param flags Additional bio_flag data to pass to any generated bios 2131 * (e.g. BIO_ORDERED).. 2132 * 2133 * \return 0 for success, errno codes for failure. 2134 */ 2135 static int 2136 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2137 int operation, int flags) 2138 { 2139 struct xbb_file_data *file_data; 2140 u_int seg_idx; 2141 u_int nseg; 2142 struct uio xuio; 2143 struct xbb_sg *xbb_sg; 2144 struct iovec *xiovec; 2145 int error; 2146 2147 file_data = &xbb->backend.file; 2148 error = 0; 2149 bzero(&xuio, sizeof(xuio)); 2150 2151 switch (operation) { 2152 case BIO_READ: 2153 xuio.uio_rw = UIO_READ; 2154 break; 2155 case BIO_WRITE: 2156 xuio.uio_rw = UIO_WRITE; 2157 break; 2158 case BIO_FLUSH: { 2159 struct mount *mountpoint; 2160 2161 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2162 device_get_unit(xbb->dev)); 2163 2164 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2165 2166 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2167 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2168 VOP_UNLOCK(xbb->vn); 2169 2170 vn_finished_write(mountpoint); 2171 2172 goto bailout_send_response; 2173 /* NOTREACHED */ 2174 } 2175 default: 2176 panic("invalid operation %d", operation); 2177 /* NOTREACHED */ 2178 } 2179 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2180 << xbb->sector_size_shift; 2181 xuio.uio_segflg = UIO_SYSSPACE; 2182 xuio.uio_iov = file_data->xiovecs; 2183 xuio.uio_iovcnt = 0; 2184 xbb_sg = xbb->xbb_sgs; 2185 nseg = reqlist->nr_segments; 2186 2187 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2188 /* 2189 * If the first sector is not 0, the KVA will 2190 * not be contiguous and we'll need to go on 2191 * to another segment. 2192 */ 2193 if (xbb_sg->first_sect != 0) 2194 xiovec = NULL; 2195 2196 if (xiovec == NULL) { 2197 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2198 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2199 seg_idx, xbb_sg->first_sect); 2200 xiovec->iov_len = 0; 2201 xuio.uio_iovcnt++; 2202 } 2203 2204 xiovec->iov_len += xbb_sg->nsect << 9; 2205 2206 xuio.uio_resid += xbb_sg->nsect << 9; 2207 2208 /* 2209 * If the last sector is not the full page 2210 * size count, the next segment will not be 2211 * contiguous in KVA and we need a new iovec. 
2212 */ 2213 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2214 xiovec = NULL; 2215 } 2216 2217 xuio.uio_td = curthread; 2218 2219 switch (operation) { 2220 case BIO_READ: 2221 2222 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2223 device_get_unit(xbb->dev), xuio.uio_offset, 2224 xuio.uio_resid); 2225 2226 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2227 2228 /* 2229 * UFS pays attention to IO_DIRECT for reads. If the 2230 * DIRECTIO option is configured into the kernel, it calls 2231 * ffs_rawread(). But that only works for single-segment 2232 * uios with user space addresses. In our case, with a 2233 * kernel uio, it still reads into the buffer cache, but it 2234 * will just try to release the buffer from the cache later 2235 * on in ffs_read(). 2236 * 2237 * ZFS does not pay attention to IO_DIRECT for reads. 2238 * 2239 * UFS does not pay attention to IO_SYNC for reads. 2240 * 2241 * ZFS pays attention to IO_SYNC (which translates into the 2242 * Solaris define FRSYNC for zfs_read()) for reads. It 2243 * attempts to sync the file before reading. 2244 * 2245 * So, to attempt to provide some barrier semantics in the 2246 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2247 */ 2248 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2249 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2250 2251 VOP_UNLOCK(xbb->vn); 2252 break; 2253 case BIO_WRITE: { 2254 struct mount *mountpoint; 2255 2256 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2257 device_get_unit(xbb->dev), xuio.uio_offset, 2258 xuio.uio_resid); 2259 2260 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2261 2262 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2263 2264 /* 2265 * UFS pays attention to IO_DIRECT for writes. The write 2266 * is done asynchronously. (Normally the write would just 2267 * get put into cache. 2268 * 2269 * UFS pays attention to IO_SYNC for writes. It will 2270 * attempt to write the buffer out synchronously if that 2271 * flag is set. 2272 * 2273 * ZFS does not pay attention to IO_DIRECT for writes. 2274 * 2275 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2276 * for writes. It will flush the transaction from the 2277 * cache before returning. 2278 * 2279 * So if we've got the BIO_ORDERED flag set, we want 2280 * IO_SYNC in either the UFS or ZFS case. 2281 */ 2282 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2283 IO_SYNC : 0, file_data->cred); 2284 VOP_UNLOCK(xbb->vn); 2285 2286 vn_finished_write(mountpoint); 2287 2288 break; 2289 } 2290 default: 2291 panic("invalid operation %d", operation); 2292 /* NOTREACHED */ 2293 } 2294 2295 bailout_send_response: 2296 2297 if (error != 0) 2298 reqlist->status = BLKIF_RSP_ERROR; 2299 2300 xbb_complete_reqlist(xbb, reqlist); 2301 2302 return (0); 2303 } 2304 2305 /*--------------------------- Backend Configuration --------------------------*/ 2306 /** 2307 * Close and cleanup any backend device/file specific state for this 2308 * block back instance. 2309 * 2310 * \param xbb Per-instance xbb configuration structure. 
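 *
 * \note Safe to call even if the backend was never (or only partially)
 *       opened; only state that is actually present (the vnode, the
 *       cdevsw reference, or the file credentials) is torn down.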
2311 */ 2312 static void 2313 xbb_close_backend(struct xbb_softc *xbb) 2314 { 2315 DROP_GIANT(); 2316 DPRINTF("closing dev=%s\n", xbb->dev_name); 2317 if (xbb->vn) { 2318 int flags = FREAD; 2319 2320 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2321 flags |= FWRITE; 2322 2323 switch (xbb->device_type) { 2324 case XBB_TYPE_DISK: 2325 if (xbb->backend.dev.csw) { 2326 dev_relthread(xbb->backend.dev.cdev, 2327 xbb->backend.dev.dev_ref); 2328 xbb->backend.dev.csw = NULL; 2329 xbb->backend.dev.cdev = NULL; 2330 } 2331 break; 2332 case XBB_TYPE_FILE: 2333 break; 2334 case XBB_TYPE_NONE: 2335 default: 2336 panic("Unexpected backend type."); 2337 break; 2338 } 2339 2340 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2341 xbb->vn = NULL; 2342 2343 switch (xbb->device_type) { 2344 case XBB_TYPE_DISK: 2345 break; 2346 case XBB_TYPE_FILE: 2347 if (xbb->backend.file.cred != NULL) { 2348 crfree(xbb->backend.file.cred); 2349 xbb->backend.file.cred = NULL; 2350 } 2351 break; 2352 case XBB_TYPE_NONE: 2353 default: 2354 panic("Unexpected backend type."); 2355 break; 2356 } 2357 } 2358 PICKUP_GIANT(); 2359 } 2360 2361 /** 2362 * Open a character device to be used for backend I/O. 2363 * 2364 * \param xbb Per-instance xbb configuration structure. 2365 * 2366 * \return 0 for success, errno codes for failure. 2367 */ 2368 static int 2369 xbb_open_dev(struct xbb_softc *xbb) 2370 { 2371 struct vattr vattr; 2372 struct cdev *dev; 2373 struct cdevsw *devsw; 2374 int error; 2375 2376 xbb->device_type = XBB_TYPE_DISK; 2377 xbb->dispatch_io = xbb_dispatch_dev; 2378 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2379 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2380 &xbb->backend.dev.dev_ref); 2381 if (xbb->backend.dev.csw == NULL) 2382 panic("Unable to retrieve device switch"); 2383 2384 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2385 if (error) { 2386 xenbus_dev_fatal(xbb->dev, error, "error getting " 2387 "vnode attributes for device %s", 2388 xbb->dev_name); 2389 return (error); 2390 } 2391 2392 dev = xbb->vn->v_rdev; 2393 devsw = dev->si_devsw; 2394 if (!devsw->d_ioctl) { 2395 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2396 "device %s!", xbb->dev_name); 2397 return (ENODEV); 2398 } 2399 2400 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2401 (caddr_t)&xbb->sector_size, FREAD, 2402 curthread); 2403 if (error) { 2404 xenbus_dev_fatal(xbb->dev, error, 2405 "error calling ioctl DIOCGSECTORSIZE " 2406 "for device %s", xbb->dev_name); 2407 return (error); 2408 } 2409 2410 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2411 (caddr_t)&xbb->media_size, FREAD, 2412 curthread); 2413 if (error) { 2414 xenbus_dev_fatal(xbb->dev, error, 2415 "error calling ioctl DIOCGMEDIASIZE " 2416 "for device %s", xbb->dev_name); 2417 return (error); 2418 } 2419 2420 return (0); 2421 } 2422 2423 /** 2424 * Open a file to be used for backend I/O. 2425 * 2426 * \param xbb Per-instance xbb configuration structure. 2427 * 2428 * \return 0 for success, errno codes for failure. 
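 *
 * \note The exported sector size is fixed at 512 bytes below, so the
 *       backing file's size must be a multiple of 512.  As an
 *       illustrative figure, a 1 GiB file is advertised to the
 *       front-end as 1073741824 / 512 = 2097152 sectors.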
2429 */ 2430 static int 2431 xbb_open_file(struct xbb_softc *xbb) 2432 { 2433 struct xbb_file_data *file_data; 2434 struct vattr vattr; 2435 int error; 2436 2437 file_data = &xbb->backend.file; 2438 xbb->device_type = XBB_TYPE_FILE; 2439 xbb->dispatch_io = xbb_dispatch_file; 2440 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2441 if (error != 0) { 2442 xenbus_dev_fatal(xbb->dev, error, 2443 "error calling VOP_GETATTR()" 2444 "for file %s", xbb->dev_name); 2445 return (error); 2446 } 2447 2448 /* 2449 * Verify that we have the ability to upgrade to exclusive 2450 * access on this file so we can trap errors at open instead 2451 * of reporting them during first access. 2452 */ 2453 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2454 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2455 if (VN_IS_DOOMED(xbb->vn)) { 2456 error = EBADF; 2457 xenbus_dev_fatal(xbb->dev, error, 2458 "error locking file %s", 2459 xbb->dev_name); 2460 2461 return (error); 2462 } 2463 } 2464 2465 file_data->cred = crhold(curthread->td_ucred); 2466 xbb->media_size = vattr.va_size; 2467 2468 /* 2469 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2470 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2471 * with disklabel and UFS on FreeBSD at least. Large block sizes 2472 * may not work with other OSes as well. So just export a sector 2473 * size of 512 bytes, which should work with any OS or 2474 * application. Since our backing is a file, any block size will 2475 * work fine for the backing store. 2476 */ 2477 #if 0 2478 xbb->sector_size = vattr.va_blocksize; 2479 #endif 2480 xbb->sector_size = 512; 2481 2482 /* 2483 * Sanity check. The media size must be a multiple of the sector 2484 * size. 2485 */ 2486 if ((xbb->media_size % xbb->sector_size) != 0) { 2487 error = EINVAL; 2488 xenbus_dev_fatal(xbb->dev, error, 2489 "file %s size %ju not multiple of block size %u", 2490 xbb->dev_name, 2491 (uintmax_t)xbb->media_size, 2492 xbb->sector_size); 2493 } 2494 return (error); 2495 } 2496 2497 /** 2498 * Open the backend provider for this connection. 2499 * 2500 * \param xbb Per-instance xbb configuration structure. 2501 * 2502 * \return 0 for success, errno codes for failure. 2503 */ 2504 static int 2505 xbb_open_backend(struct xbb_softc *xbb) 2506 { 2507 struct nameidata nd; 2508 int flags; 2509 int error; 2510 2511 flags = FREAD; 2512 error = 0; 2513 2514 DPRINTF("opening dev=%s\n", xbb->dev_name); 2515 2516 if (rootvnode == NULL) { 2517 xenbus_dev_fatal(xbb->dev, ENOENT, 2518 "Root file system not mounted"); 2519 return (ENOENT); 2520 } 2521 2522 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2523 flags |= FWRITE; 2524 2525 pwd_ensure_dirs(); 2526 2527 again: 2528 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); 2529 error = vn_open(&nd, &flags, 0, NULL); 2530 if (error) { 2531 /* 2532 * This is the only reasonable guess we can make as far as 2533 * path if the user doesn't give us a fully qualified path. 2534 * If they want to specify a file, they need to specify the 2535 * full path. 
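 * For example (device name purely illustrative), a backing store
 * configured as "ada0p2" is retried below as "/dev/ada0p2", while a
 * name that already begins with '/' is used unchanged.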
2536 */ 2537 if (xbb->dev_name[0] != '/') { 2538 char *dev_path = "/dev/"; 2539 char *dev_name; 2540 2541 /* Try adding device path at beginning of name */ 2542 dev_name = malloc(strlen(xbb->dev_name) 2543 + strlen(dev_path) + 1, 2544 M_XENBLOCKBACK, M_NOWAIT); 2545 if (dev_name) { 2546 sprintf(dev_name, "%s%s", dev_path, 2547 xbb->dev_name); 2548 free(xbb->dev_name, M_XENBLOCKBACK); 2549 xbb->dev_name = dev_name; 2550 goto again; 2551 } 2552 } 2553 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2554 xbb->dev_name); 2555 return (error); 2556 } 2557 2558 NDFREE_PNBUF(&nd); 2559 2560 xbb->vn = nd.ni_vp; 2561 2562 /* We only support disks and files. */ 2563 if (vn_isdisk_error(xbb->vn, &error)) { 2564 error = xbb_open_dev(xbb); 2565 } else if (xbb->vn->v_type == VREG) { 2566 error = xbb_open_file(xbb); 2567 } else { 2568 error = EINVAL; 2569 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2570 "or file", xbb->dev_name); 2571 } 2572 VOP_UNLOCK(xbb->vn); 2573 2574 if (error != 0) { 2575 xbb_close_backend(xbb); 2576 return (error); 2577 } 2578 2579 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2580 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2581 2582 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2583 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2584 xbb->dev_name, xbb->sector_size, xbb->media_size); 2585 2586 return (0); 2587 } 2588 2589 /*------------------------ Inter-Domain Communication ------------------------*/ 2590 /** 2591 * Free dynamically allocated KVA or pseudo-physical address allocations. 2592 * 2593 * \param xbb Per-instance xbb configuration structure. 2594 */ 2595 static void 2596 xbb_free_communication_mem(struct xbb_softc *xbb) 2597 { 2598 if (xbb->kva != 0) { 2599 if (xbb->pseudo_phys_res != NULL) { 2600 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2601 xbb->pseudo_phys_res); 2602 xbb->pseudo_phys_res = NULL; 2603 } 2604 } 2605 xbb->kva = 0; 2606 xbb->gnt_base_addr = 0; 2607 if (xbb->kva_free != NULL) { 2608 free(xbb->kva_free, M_XENBLOCKBACK); 2609 xbb->kva_free = NULL; 2610 } 2611 } 2612 2613 /** 2614 * Cleanup all inter-domain communication mechanisms. 2615 * 2616 * \param xbb Per-instance xbb configuration structure. 2617 */ 2618 static int 2619 xbb_disconnect(struct xbb_softc *xbb) 2620 { 2621 DPRINTF("\n"); 2622 2623 mtx_unlock(&xbb->lock); 2624 xen_intr_unbind(&xbb->xen_intr_handle); 2625 if (xbb->io_taskqueue != NULL) 2626 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2627 mtx_lock(&xbb->lock); 2628 2629 /* 2630 * No new interrupts can generate work, but we must wait 2631 * for all currently active requests to drain. 
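 * Returning EAGAIN at this point leaves the ring and request resources
 * intact; xbb_shutdown() passes the error back to its caller and this
 * routine is re-invoked once the last outstanding request completes.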
2632 */ 2633 if (xbb->active_request_count != 0) 2634 return (EAGAIN); 2635 2636 if (xbb->flags & XBBF_RING_CONNECTED) { 2637 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2638 struct gnttab_unmap_grant_ref *op; 2639 unsigned int ring_idx; 2640 int error; 2641 2642 for (ring_idx = 0, op = ops; 2643 ring_idx < xbb->ring_config.ring_pages; 2644 ring_idx++, op++) { 2645 op->host_addr = xbb->ring_config.gnt_addr 2646 + (ring_idx * PAGE_SIZE); 2647 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2648 op->handle = xbb->ring_config.handle[ring_idx]; 2649 } 2650 2651 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2652 xbb->ring_config.ring_pages); 2653 if (error != 0) 2654 panic("Grant table op failed (%d)", error); 2655 2656 xbb->flags &= ~XBBF_RING_CONNECTED; 2657 } 2658 2659 xbb_free_communication_mem(xbb); 2660 2661 if (xbb->requests != NULL) { 2662 free(xbb->requests, M_XENBLOCKBACK); 2663 xbb->requests = NULL; 2664 } 2665 2666 if (xbb->request_lists != NULL) { 2667 struct xbb_xen_reqlist *reqlist; 2668 int i; 2669 2670 /* There is one request list for ever allocated request. */ 2671 for (i = 0, reqlist = xbb->request_lists; 2672 i < xbb->max_requests; i++, reqlist++){ 2673 if (reqlist->gnt_handles != NULL) { 2674 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2675 reqlist->gnt_handles = NULL; 2676 } 2677 } 2678 free(xbb->request_lists, M_XENBLOCKBACK); 2679 xbb->request_lists = NULL; 2680 } 2681 2682 return (0); 2683 } 2684 2685 /** 2686 * Map shared memory ring into domain local address space, initialize 2687 * ring control structures, and bind an interrupt to the event channel 2688 * used to notify us of ring changes. 2689 * 2690 * \param xbb Per-instance xbb configuration structure. 2691 */ 2692 static int 2693 xbb_connect_ring(struct xbb_softc *xbb) 2694 { 2695 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2696 struct gnttab_map_grant_ref *gnt; 2697 u_int ring_idx; 2698 int error; 2699 2700 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2701 return (0); 2702 2703 /* 2704 * Kva for our ring is at the tail of the region of kva allocated 2705 * by xbb_alloc_communication_mem(). 2706 */ 2707 xbb->ring_config.va = xbb->kva 2708 + (xbb->kva_size 2709 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2710 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2711 + (xbb->kva_size 2712 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2713 2714 for (ring_idx = 0, gnt = gnts; 2715 ring_idx < xbb->ring_config.ring_pages; 2716 ring_idx++, gnt++) { 2717 gnt->host_addr = xbb->ring_config.gnt_addr 2718 + (ring_idx * PAGE_SIZE); 2719 gnt->flags = GNTMAP_host_map; 2720 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2721 gnt->dom = xbb->otherend_id; 2722 } 2723 2724 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2725 xbb->ring_config.ring_pages); 2726 if (error) 2727 panic("blkback: Ring page grant table op failed (%d)", error); 2728 2729 for (ring_idx = 0, gnt = gnts; 2730 ring_idx < xbb->ring_config.ring_pages; 2731 ring_idx++, gnt++) { 2732 if (gnt->status != 0) { 2733 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; 2734 unsigned int i, j; 2735 2736 xbb->ring_config.va = 0; 2737 xenbus_dev_fatal(xbb->dev, EACCES, 2738 "Ring shared page mapping failed. 
" 2739 "Status %d.", gnt->status); 2740 2741 /* Unmap everything to avoid leaking grant table maps */ 2742 for (i = 0, j = 0; i < xbb->ring_config.ring_pages; 2743 i++) { 2744 if (gnts[i].status != GNTST_okay) 2745 continue; 2746 2747 unmap[j].host_addr = gnts[i].host_addr; 2748 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; 2749 unmap[j++].handle = gnts[i].handle; 2750 } 2751 if (j != 0) { 2752 error = HYPERVISOR_grant_table_op( 2753 GNTTABOP_unmap_grant_ref, unmap, j); 2754 if (error != 0) 2755 panic("Unable to unmap grants (%d)", 2756 error); 2757 } 2758 return (EACCES); 2759 } 2760 xbb->ring_config.handle[ring_idx] = gnt->handle; 2761 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2762 } 2763 2764 /* Initialize the ring based on ABI. */ 2765 switch (xbb->abi) { 2766 case BLKIF_PROTOCOL_NATIVE: 2767 { 2768 blkif_sring_t *sring; 2769 sring = (blkif_sring_t *)xbb->ring_config.va; 2770 BACK_RING_INIT(&xbb->rings.native, sring, 2771 xbb->ring_config.ring_pages * PAGE_SIZE); 2772 break; 2773 } 2774 case BLKIF_PROTOCOL_X86_32: 2775 { 2776 blkif_x86_32_sring_t *sring_x86_32; 2777 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2778 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2779 xbb->ring_config.ring_pages * PAGE_SIZE); 2780 break; 2781 } 2782 case BLKIF_PROTOCOL_X86_64: 2783 { 2784 blkif_x86_64_sring_t *sring_x86_64; 2785 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2786 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2787 xbb->ring_config.ring_pages * PAGE_SIZE); 2788 break; 2789 } 2790 default: 2791 panic("Unexpected blkif protocol ABI."); 2792 } 2793 2794 xbb->flags |= XBBF_RING_CONNECTED; 2795 2796 error = xen_intr_bind_remote_port(xbb->dev, 2797 xbb->otherend_id, 2798 xbb->ring_config.evtchn, 2799 xbb_filter, 2800 /*ithread_handler*/NULL, 2801 /*arg*/xbb, 2802 INTR_TYPE_BIO | INTR_MPSAFE, 2803 &xbb->xen_intr_handle); 2804 if (error) { 2805 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2806 return (error); 2807 } 2808 2809 DPRINTF("rings connected!\n"); 2810 2811 return 0; 2812 } 2813 2814 /** 2815 * Size KVA and pseudo-physical address allocations based on negotiated 2816 * values for the size and number of I/O requests, and the size of our 2817 * communication ring. 2818 * 2819 * \param xbb Per-instance xbb configuration structure. 2820 * 2821 * These address spaces are used to dynamically map pages in the 2822 * front-end's domain into our own. 2823 */ 2824 static int 2825 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2826 { 2827 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2828 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2829 xbb->kva_size = xbb->reqlist_kva_size + 2830 (xbb->ring_config.ring_pages * PAGE_SIZE); 2831 2832 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2833 if (xbb->kva_free == NULL) 2834 return (ENOMEM); 2835 2836 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2837 device_get_nameunit(xbb->dev), xbb->kva_size, 2838 xbb->reqlist_kva_size); 2839 /* 2840 * Reserve a range of pseudo physical memory that we can map 2841 * into kva. These pages will only be backed by machine 2842 * pages ("real memory") during the lifetime of front-end requests 2843 * via grant table operations. 
2844 */ 2845 xbb->pseudo_phys_res_id = 0; 2846 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 2847 xbb->kva_size); 2848 if (xbb->pseudo_phys_res == NULL) { 2849 xbb->kva = 0; 2850 return (ENOMEM); 2851 } 2852 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2853 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2854 2855 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 2856 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 2857 (uintmax_t)xbb->gnt_base_addr); 2858 return (0); 2859 } 2860 2861 /** 2862 * Collect front-end information from the XenStore. 2863 * 2864 * \param xbb Per-instance xbb configuration structure. 2865 */ 2866 static int 2867 xbb_collect_frontend_info(struct xbb_softc *xbb) 2868 { 2869 char protocol_abi[64]; 2870 const char *otherend_path; 2871 int error; 2872 u_int ring_idx; 2873 u_int ring_page_order; 2874 size_t ring_size; 2875 2876 otherend_path = xenbus_get_otherend_path(xbb->dev); 2877 2878 /* 2879 * Protocol defaults valid even if all negotiation fails. 2880 */ 2881 xbb->ring_config.ring_pages = 1; 2882 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2883 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 2884 2885 /* 2886 * Mandatory data (used in all versions of the protocol) first. 2887 */ 2888 error = xs_scanf(XST_NIL, otherend_path, 2889 "event-channel", NULL, "%" PRIu32, 2890 &xbb->ring_config.evtchn); 2891 if (error != 0) { 2892 xenbus_dev_fatal(xbb->dev, error, 2893 "Unable to retrieve event-channel information " 2894 "from frontend %s. Unable to connect.", 2895 xenbus_get_otherend_path(xbb->dev)); 2896 return (error); 2897 } 2898 2899 /* 2900 * These fields are initialized to legacy protocol defaults 2901 * so we only need to fail if reading the updated value succeeds 2902 * and the new value is outside of its allowed range. 2903 * 2904 * \note xs_gather() returns on the first encountered error, so 2905 * we must use independent calls in order to guarantee 2906 * we don't miss information in a sparsly populated front-end 2907 * tree. 2908 * 2909 * \note xs_scanf() does not update variables for unmatched 2910 * fields. 2911 */ 2912 ring_page_order = 0; 2913 xbb->max_requests = 32; 2914 2915 (void)xs_scanf(XST_NIL, otherend_path, 2916 "ring-page-order", NULL, "%u", 2917 &ring_page_order); 2918 xbb->ring_config.ring_pages = 1 << ring_page_order; 2919 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 2920 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 2921 2922 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 2923 xenbus_dev_fatal(xbb->dev, EINVAL, 2924 "Front-end specified ring-pages of %u " 2925 "exceeds backend limit of %u. " 2926 "Unable to connect.", 2927 xbb->ring_config.ring_pages, 2928 XBB_MAX_RING_PAGES); 2929 return (EINVAL); 2930 } 2931 2932 if (xbb->ring_config.ring_pages == 1) { 2933 error = xs_gather(XST_NIL, otherend_path, 2934 "ring-ref", "%" PRIu32, 2935 &xbb->ring_config.ring_ref[0], 2936 NULL); 2937 if (error != 0) { 2938 xenbus_dev_fatal(xbb->dev, error, 2939 "Unable to retrieve ring information " 2940 "from frontend %s. Unable to " 2941 "connect.", 2942 xenbus_get_otherend_path(xbb->dev)); 2943 return (error); 2944 } 2945 } else { 2946 /* Multi-page ring format. 
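 * For example, a negotiated ring-page-order of 2 yields
 * 1 << 2 = 4 ring pages, which the front-end publishes as
 * "ring-ref0" through "ring-ref3"; each reference is gathered
 * individually below.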
*/ 2947 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 2948 ring_idx++) { 2949 char ring_ref_name[]= "ring_refXX"; 2950 2951 snprintf(ring_ref_name, sizeof(ring_ref_name), 2952 "ring-ref%u", ring_idx); 2953 error = xs_scanf(XST_NIL, otherend_path, 2954 ring_ref_name, NULL, "%" PRIu32, 2955 &xbb->ring_config.ring_ref[ring_idx]); 2956 if (error != 0) { 2957 xenbus_dev_fatal(xbb->dev, error, 2958 "Failed to retriev grant " 2959 "reference for page %u of " 2960 "shared ring. Unable " 2961 "to connect.", ring_idx); 2962 return (error); 2963 } 2964 } 2965 } 2966 2967 error = xs_gather(XST_NIL, otherend_path, 2968 "protocol", "%63s", protocol_abi, 2969 NULL); 2970 if (error != 0 2971 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 2972 /* 2973 * Assume native if the frontend has not 2974 * published ABI data or it has published and 2975 * matches our own ABI. 2976 */ 2977 xbb->abi = BLKIF_PROTOCOL_NATIVE; 2978 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 2979 xbb->abi = BLKIF_PROTOCOL_X86_32; 2980 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 2981 xbb->abi = BLKIF_PROTOCOL_X86_64; 2982 } else { 2983 xenbus_dev_fatal(xbb->dev, EINVAL, 2984 "Unknown protocol ABI (%s) published by " 2985 "frontend. Unable to connect.", protocol_abi); 2986 return (EINVAL); 2987 } 2988 return (0); 2989 } 2990 2991 /** 2992 * Allocate per-request data structures given request size and number 2993 * information negotiated with the front-end. 2994 * 2995 * \param xbb Per-instance xbb configuration structure. 2996 */ 2997 static int 2998 xbb_alloc_requests(struct xbb_softc *xbb) 2999 { 3000 struct xbb_xen_req *req; 3001 struct xbb_xen_req *last_req; 3002 3003 /* 3004 * Allocate request book keeping datastructures. 3005 */ 3006 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3007 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3008 if (xbb->requests == NULL) { 3009 xenbus_dev_fatal(xbb->dev, ENOMEM, 3010 "Unable to allocate request structures"); 3011 return (ENOMEM); 3012 } 3013 3014 req = xbb->requests; 3015 last_req = &xbb->requests[xbb->max_requests - 1]; 3016 STAILQ_INIT(&xbb->request_free_stailq); 3017 while (req <= last_req) { 3018 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3019 req++; 3020 } 3021 return (0); 3022 } 3023 3024 static int 3025 xbb_alloc_request_lists(struct xbb_softc *xbb) 3026 { 3027 struct xbb_xen_reqlist *reqlist; 3028 int i; 3029 3030 /* 3031 * If no requests can be merged, we need 1 request list per 3032 * in flight request. 
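 * As an illustration, a single 4 KiB ring page typically negotiates
 * 32 in-flight requests, so 32 request list structures are allocated
 * below, each carrying max_reqlist_segments grant handles initialized
 * to GRANT_REF_INVALID.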
3033 */ 3034 xbb->request_lists = malloc(xbb->max_requests * 3035 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3036 if (xbb->request_lists == NULL) { 3037 xenbus_dev_fatal(xbb->dev, ENOMEM, 3038 "Unable to allocate request list structures"); 3039 return (ENOMEM); 3040 } 3041 3042 STAILQ_INIT(&xbb->reqlist_free_stailq); 3043 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3044 for (i = 0; i < xbb->max_requests; i++) { 3045 int seg; 3046 3047 reqlist = &xbb->request_lists[i]; 3048 3049 reqlist->xbb = xbb; 3050 3051 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3052 sizeof(*reqlist->gnt_handles), 3053 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3054 if (reqlist->gnt_handles == NULL) { 3055 xenbus_dev_fatal(xbb->dev, ENOMEM, 3056 "Unable to allocate request " 3057 "grant references"); 3058 return (ENOMEM); 3059 } 3060 3061 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3062 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3063 3064 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3065 } 3066 return (0); 3067 } 3068 3069 /** 3070 * Supply information about the physical device to the frontend 3071 * via XenBus. 3072 * 3073 * \param xbb Per-instance xbb configuration structure. 3074 */ 3075 static int 3076 xbb_publish_backend_info(struct xbb_softc *xbb) 3077 { 3078 struct xs_transaction xst; 3079 const char *our_path; 3080 const char *leaf; 3081 int error; 3082 3083 our_path = xenbus_get_node(xbb->dev); 3084 while (1) { 3085 error = xs_transaction_start(&xst); 3086 if (error != 0) { 3087 xenbus_dev_fatal(xbb->dev, error, 3088 "Error publishing backend info " 3089 "(start transaction)"); 3090 return (error); 3091 } 3092 3093 /* 3094 * The 'sectors' node is special and always contains the size 3095 * in units of 512b, regardless of the value in 'sector-size'. 3096 */ 3097 leaf = "sectors"; 3098 error = xs_printf(xst, our_path, leaf, "%ju", 3099 (uintmax_t)(xbb->media_size >> XBD_SECTOR_SHFT)); 3100 if (error != 0) 3101 break; 3102 3103 /* XXX Support all VBD attributes here. */ 3104 leaf = "info"; 3105 error = xs_printf(xst, our_path, leaf, "%u", 3106 xbb->flags & XBBF_READ_ONLY 3107 ? VDISK_READONLY : 0); 3108 if (error != 0) 3109 break; 3110 3111 leaf = "sector-size"; 3112 error = xs_printf(xst, our_path, leaf, "%u", 3113 xbb->sector_size); 3114 if (error != 0) 3115 break; 3116 3117 error = xs_transaction_end(xst, 0); 3118 if (error == 0) { 3119 return (0); 3120 } else if (error != EAGAIN) { 3121 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3122 return (error); 3123 } 3124 } 3125 3126 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3127 our_path, leaf); 3128 xs_transaction_end(xst, 1); 3129 return (error); 3130 } 3131 3132 /** 3133 * Connect to our blkfront peer now that it has completed publishing 3134 * its configuration into the XenStore. 3135 * 3136 * \param xbb Per-instance xbb configuration structure. 3137 */ 3138 static void 3139 xbb_connect(struct xbb_softc *xbb) 3140 { 3141 int error; 3142 3143 if (!xbb->hotplug_done || 3144 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3145 (xbb_collect_frontend_info(xbb) != 0)) 3146 return; 3147 3148 xbb->flags &= ~XBBF_SHUTDOWN; 3149 3150 /* 3151 * We limit the maximum number of reqlist segments to the maximum 3152 * number of segments in the ring, or our absolute maximum, 3153 * whichever is smaller. 
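 * Each segment maps one page of front-end data, so the byte limit
 * computed just below is that segment count times PAGE_SIZE; e.g.
 * (illustrative figures) 11 segments with 4 KiB pages cap a coalesced
 * request at 11 * 4096 = 45056 bytes.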
3154 */ 3155 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3156 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3157 3158 /* 3159 * The maximum size is simply a function of the number of segments 3160 * we can handle. 3161 */ 3162 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3163 3164 /* Allocate resources whose size depends on front-end configuration. */ 3165 error = xbb_alloc_communication_mem(xbb); 3166 if (error != 0) { 3167 xenbus_dev_fatal(xbb->dev, error, 3168 "Unable to allocate communication memory"); 3169 return; 3170 } 3171 3172 error = xbb_publish_backend_info(xbb); 3173 if (error != 0) { 3174 xenbus_dev_fatal(xbb->dev, error, 3175 "Unable to publish device information"); 3176 return; 3177 } 3178 3179 error = xbb_alloc_requests(xbb); 3180 if (error != 0) { 3181 /* Specific errors are reported by xbb_alloc_requests(). */ 3182 return; 3183 } 3184 3185 error = xbb_alloc_request_lists(xbb); 3186 if (error != 0) { 3187 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3188 return; 3189 } 3190 3191 /* 3192 * Connect communication channel. 3193 */ 3194 error = xbb_connect_ring(xbb); 3195 if (error != 0) { 3196 /* Specific errors are reported by xbb_connect_ring(). */ 3197 return; 3198 } 3199 3200 /* Ready for I/O. */ 3201 xenbus_set_state(xbb->dev, XenbusStateConnected); 3202 } 3203 3204 /*-------------------------- Device Teardown Support -------------------------*/ 3205 /** 3206 * Perform device shutdown functions. 3207 * 3208 * \param xbb Per-instance xbb configuration structure. 3209 * 3210 * Mark this instance as shutting down, wait for any active I/O on the 3211 * backend device/file to drain, disconnect from the front-end, and notify 3212 * any waiters (e.g. a thread invoking our detach method) that detach can 3213 * now proceed. 3214 */ 3215 static int 3216 xbb_shutdown(struct xbb_softc *xbb) 3217 { 3218 XenbusState frontState; 3219 int error; 3220 3221 DPRINTF("\n"); 3222 3223 /* 3224 * Due to the need to drop our mutex during some 3225 * xenbus operations, it is possible for two threads 3226 * to attempt to close out shutdown processing at 3227 * the same time. Tell the caller that hits this 3228 * race to try back later. 3229 */ 3230 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3231 return (EAGAIN); 3232 3233 xbb->flags |= XBBF_IN_SHUTDOWN; 3234 mtx_unlock(&xbb->lock); 3235 3236 if (xbb->hotplug_watch.node != NULL) { 3237 xs_unregister_watch(&xbb->hotplug_watch); 3238 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3239 xbb->hotplug_watch.node = NULL; 3240 } 3241 3242 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3243 xenbus_set_state(xbb->dev, XenbusStateClosing); 3244 3245 frontState = xenbus_get_otherend_state(xbb->dev); 3246 mtx_lock(&xbb->lock); 3247 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3248 3249 /* Wait for the frontend to disconnect (if it's connected). */ 3250 if (frontState == XenbusStateConnected) 3251 return (EAGAIN); 3252 3253 DPRINTF("\n"); 3254 3255 /* Indicate shutdown is in progress. */ 3256 xbb->flags |= XBBF_SHUTDOWN; 3257 3258 /* Disconnect from the front-end. */ 3259 error = xbb_disconnect(xbb); 3260 if (error != 0) { 3261 /* 3262 * Requests still outstanding. We'll be called again 3263 * once they complete. 3264 */ 3265 KASSERT(error == EAGAIN, 3266 ("%s: Unexpected xbb_disconnect() failure %d", 3267 __func__, error)); 3268 3269 return (error); 3270 } 3271 3272 DPRINTF("\n"); 3273 3274 /* Indicate to xbb_detach() that is it safe to proceed. 
*/
3275 wakeup(xbb);
3276
3277 return (0);
3278 }
3279
3280 /**
3281 * Report an attach time error to the console and Xen, and clean up
3282 * this instance by forcing immediate detach processing.
3283 *
3284 * \param xbb Per-instance xbb configuration structure.
3285 * \param err Errno describing the error.
3286 * \param fmt Printf-style format string and arguments.
3287 */
3288 static void
3289 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3290 {
3291 va_list ap;
3292 va_list ap_hotplug;
3293
3294 va_start(ap, fmt);
3295 va_copy(ap_hotplug, ap);
3296 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3297 "hotplug-error", fmt, ap_hotplug);
3298 va_end(ap_hotplug);
3299 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3300 "hotplug-status", "error");
3301
3302 xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3303 va_end(ap);
3304
3305 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3306 "online", "0");
3307 mtx_lock(&xbb->lock);
3308 xbb_shutdown(xbb);
3309 mtx_unlock(&xbb->lock);
3310 }
3311
3312 /*---------------------------- NewBus Entrypoints ----------------------------*/
3313 /**
3314 * Inspect a XenBus device and claim it if it is of the appropriate type.
3315 *
3316 * \param dev NewBus device object representing a candidate XenBus device.
3317 *
3318 * \return 0 for success, errno codes for failure.
3319 */
3320 static int
3321 xbb_probe(device_t dev)
3322 {
3323
3324 if (strcmp(xenbus_get_type(dev), "vbd"))
3325 return (ENXIO);
3326
3327 /* Only attach if Xen creates IOMMU entries for grant mapped pages. */
3328 if (!xen_has_iommu_maps()) {
3329 static bool warned;
3330
3331 if (!warned) {
3332 warned = true;
3333 printf(
3334 "xen-blkback disabled due to grant maps lacking IOMMU entries\n");
3335 }
3336 return (ENXIO);
3337 }
3338
3339 device_set_desc(dev, "Backend Virtual Block Device");
3340 device_quiet(dev);
3341 return (0);
3342 }
3343
3344 /**
3345 * Set up sysctl variables to control various Block Back parameters.
3346 *
3347 * \param xbb Xen Block Back softc.
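 *
 * \note The nodes appear under the device's sysctl tree; for example
 *       (unit number illustrative), request coalescing can be disabled
 *       at run time with "sysctl dev.xbbd.0.no_coalesce_reqs=1".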
3348 * 3349 */ 3350 static void 3351 xbb_setup_sysctl(struct xbb_softc *xbb) 3352 { 3353 struct sysctl_ctx_list *sysctl_ctx = NULL; 3354 struct sysctl_oid *sysctl_tree = NULL; 3355 3356 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3357 if (sysctl_ctx == NULL) 3358 return; 3359 3360 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3361 if (sysctl_tree == NULL) 3362 return; 3363 3364 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3365 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3366 "fake the flush command"); 3367 3368 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3369 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3370 "send a real flush for N flush requests"); 3371 3372 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3373 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3374 "Don't coalesce contiguous requests"); 3375 3376 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3377 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3378 "how many I/O requests we have received"); 3379 3380 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3381 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3382 "how many I/O requests have been completed"); 3383 3384 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3385 "reqs_queued_for_completion", CTLFLAG_RW, 3386 &xbb->reqs_queued_for_completion, 3387 "how many I/O requests queued but not yet pushed"); 3388 3389 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3390 "reqs_completed_with_error", CTLFLAG_RW, 3391 &xbb->reqs_completed_with_error, 3392 "how many I/O requests completed with error status"); 3393 3394 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3395 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3396 "how many I/O dispatches were forced"); 3397 3398 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3399 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3400 "how many I/O dispatches were normal"); 3401 3402 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3403 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3404 "total number of I/O dispatches"); 3405 3406 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3407 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3408 "how many times we have run out of KVA"); 3409 3410 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3411 "request_shortages", CTLFLAG_RW, 3412 &xbb->request_shortages, 3413 "how many times we have run out of requests"); 3414 3415 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3416 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3417 "maximum outstanding requests (negotiated)"); 3418 3419 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3420 "max_request_segments", CTLFLAG_RD, 3421 &xbb->max_request_segments, 0, 3422 "maximum number of pages per requests (negotiated)"); 3423 3424 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3425 "max_request_size", CTLFLAG_RD, 3426 &xbb->max_request_size, 0, 3427 "maximum size in bytes of a request (negotiated)"); 3428 3429 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3430 "ring_pages", CTLFLAG_RD, 3431 &xbb->ring_config.ring_pages, 0, 3432 "communication channel pages (negotiated)"); 3433 } 3434 3435 static void 3436 xbb_attach_disk(device_t dev) 3437 { 3438 struct xbb_softc *xbb; 3439 int 
error; 3440 3441 xbb = device_get_softc(dev); 3442 3443 KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); 3444 3445 /* Parse fopen style mode flags. */ 3446 if (strchr(xbb->dev_mode, 'w') == NULL) 3447 xbb->flags |= XBBF_READ_ONLY; 3448 3449 /* 3450 * Verify the physical device is present and can support 3451 * the desired I/O mode. 3452 */ 3453 error = xbb_open_backend(xbb); 3454 if (error != 0) { 3455 xbb_attach_failed(xbb, error, "Unable to open %s", 3456 xbb->dev_name); 3457 return; 3458 } 3459 3460 /* Use devstat(9) for recording statistics. */ 3461 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3462 xbb->sector_size, 3463 DEVSTAT_ALL_SUPPORTED, 3464 DEVSTAT_TYPE_DIRECT 3465 | DEVSTAT_TYPE_IF_OTHER, 3466 DEVSTAT_PRIORITY_OTHER); 3467 3468 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3469 xbb->sector_size, 3470 DEVSTAT_ALL_SUPPORTED, 3471 DEVSTAT_TYPE_DIRECT 3472 | DEVSTAT_TYPE_IF_OTHER, 3473 DEVSTAT_PRIORITY_OTHER); 3474 /* 3475 * Setup sysctl variables. 3476 */ 3477 xbb_setup_sysctl(xbb); 3478 3479 /* 3480 * Create a taskqueue for doing work that must occur from a 3481 * thread context. 3482 */ 3483 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3484 M_NOWAIT, 3485 taskqueue_thread_enqueue, 3486 /*contxt*/&xbb->io_taskqueue); 3487 if (xbb->io_taskqueue == NULL) { 3488 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3489 return; 3490 } 3491 3492 taskqueue_start_threads(&xbb->io_taskqueue, 3493 /*num threads*/1, 3494 /*priority*/PWAIT, 3495 /*thread name*/ 3496 "%s taskq", device_get_nameunit(dev)); 3497 3498 /* Update hot-plug status to satisfy xend. */ 3499 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3500 "hotplug-status", "connected"); 3501 if (error) { 3502 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3503 xenbus_get_node(xbb->dev)); 3504 return; 3505 } 3506 3507 /* The front end might be waiting for the backend, attach if so. */ 3508 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3509 xbb_connect(xbb); 3510 } 3511 3512 static void 3513 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) 3514 { 3515 device_t dev; 3516 struct xbb_softc *xbb; 3517 int error; 3518 3519 dev = (device_t)watch->callback_data; 3520 xbb = device_get_softc(dev); 3521 3522 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3523 NULL, &xbb->dev_name, NULL); 3524 if (error != 0) 3525 return; 3526 3527 xs_unregister_watch(watch); 3528 free(watch->node, M_XENBLOCKBACK); 3529 watch->node = NULL; 3530 xbb->hotplug_done = true; 3531 3532 /* Collect physical device information. */ 3533 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", 3534 NULL, &xbb->dev_type, NULL); 3535 if (error != 0) 3536 xbb->dev_type = NULL; 3537 3538 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, 3539 &xbb->dev_mode, NULL); 3540 if (error != 0) { 3541 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3542 xenbus_get_node(dev)); 3543 return; 3544 } 3545 3546 xbb_attach_disk(dev); 3547 } 3548 3549 /** 3550 * Attach to a XenBus device that has been claimed by our probe routine. 3551 * 3552 * \param dev NewBus device object representing this Xen Block Back instance. 3553 * 3554 * \return 0 for success, errno codes for failure. 
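 *
 * \note Attach publishes the backend's capabilities (feature-barrier,
 *       feature-flush-cache and max-ring-page-order) to the XenStore,
 *       then waits on a "physical-device-path" watch for the hotplug
 *       script before opening the backing store.  As an illustration,
 *       a 32-page ring limit is advertised as max-ring-page-order 5,
 *       since flsl(32) - 1 == 5.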
3555 */ 3556 static int 3557 xbb_attach(device_t dev) 3558 { 3559 struct xbb_softc *xbb; 3560 int error; 3561 u_int max_ring_page_order; 3562 struct sbuf *watch_path; 3563 3564 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3565 3566 /* 3567 * Basic initialization. 3568 * After this block it is safe to call xbb_detach() 3569 * to clean up any allocated data for this instance. 3570 */ 3571 xbb = device_get_softc(dev); 3572 xbb->dev = dev; 3573 xbb->otherend_id = xenbus_get_otherend_id(dev); 3574 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3575 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3576 3577 /* 3578 * Publish protocol capabilities for consumption by the 3579 * front-end. 3580 */ 3581 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3582 "feature-barrier", "1"); 3583 if (error) { 3584 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3585 xenbus_get_node(xbb->dev)); 3586 return (error); 3587 } 3588 3589 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3590 "feature-flush-cache", "1"); 3591 if (error) { 3592 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3593 xenbus_get_node(xbb->dev)); 3594 return (error); 3595 } 3596 3597 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3598 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3599 "max-ring-page-order", "%u", max_ring_page_order); 3600 if (error) { 3601 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3602 xenbus_get_node(xbb->dev)); 3603 return (error); 3604 } 3605 3606 /* Tell the toolstack blkback has attached. */ 3607 xenbus_set_state(dev, XenbusStateInitWait); 3608 3609 if (xbb->hotplug_done) { 3610 xbb_attach_disk(dev); 3611 return (0); 3612 } 3613 3614 /* 3615 * We need to wait for hotplug script execution before 3616 * moving forward. 3617 */ 3618 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3619 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3620 xbb->hotplug_watch.callback = xbb_attach_cb; 3621 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3622 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3623 /* 3624 * We don't care about the path updated, just about the value changes 3625 * on that single node, hence there's no need to queue more that one 3626 * event. 3627 */ 3628 xbb->hotplug_watch.max_pending = 1; 3629 sbuf_delete(watch_path); 3630 error = xs_register_watch(&xbb->hotplug_watch); 3631 if (error != 0) { 3632 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3633 xbb->hotplug_watch.node); 3634 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3635 return (error); 3636 } 3637 3638 return (0); 3639 } 3640 3641 /** 3642 * Detach from a block back device instance. 3643 * 3644 * \param dev NewBus device object representing this Xen Block Back instance. 3645 * 3646 * \return 0 for success, errno codes for failure. 3647 * 3648 * \note A block back device may be detached at any time in its life-cycle, 3649 * including part way through the attach process. For this reason, 3650 * initialization order and the initialization state checks in this 3651 * routine must be carefully coupled so that attach time failures 3652 * are gracefully handled. 
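 *
 * \note Detach sleeps on the softc until xbb_shutdown() stops
 *       returning EAGAIN; the matching wakeup() is issued by
 *       xbb_shutdown() once the front-end has disconnected and all
 *       outstanding I/O has drained.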
3653 */ 3654 static int 3655 xbb_detach(device_t dev) 3656 { 3657 struct xbb_softc *xbb; 3658 3659 DPRINTF("\n"); 3660 3661 xbb = device_get_softc(dev); 3662 mtx_lock(&xbb->lock); 3663 while (xbb_shutdown(xbb) == EAGAIN) { 3664 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3665 "xbb_shutdown", 0); 3666 } 3667 mtx_unlock(&xbb->lock); 3668 3669 DPRINTF("\n"); 3670 3671 if (xbb->io_taskqueue != NULL) 3672 taskqueue_free(xbb->io_taskqueue); 3673 3674 if (xbb->xbb_stats != NULL) 3675 devstat_remove_entry(xbb->xbb_stats); 3676 3677 if (xbb->xbb_stats_in != NULL) 3678 devstat_remove_entry(xbb->xbb_stats_in); 3679 3680 xbb_close_backend(xbb); 3681 3682 if (xbb->dev_mode != NULL) { 3683 free(xbb->dev_mode, M_XENSTORE); 3684 xbb->dev_mode = NULL; 3685 } 3686 3687 if (xbb->dev_type != NULL) { 3688 free(xbb->dev_type, M_XENSTORE); 3689 xbb->dev_type = NULL; 3690 } 3691 3692 if (xbb->dev_name != NULL) { 3693 free(xbb->dev_name, M_XENSTORE); 3694 xbb->dev_name = NULL; 3695 } 3696 3697 mtx_destroy(&xbb->lock); 3698 return (0); 3699 } 3700 3701 /** 3702 * Prepare this block back device for suspension of this VM. 3703 * 3704 * \param dev NewBus device object representing this Xen Block Back instance. 3705 * 3706 * \return 0 for success, errno codes for failure. 3707 */ 3708 static int 3709 xbb_suspend(device_t dev) 3710 { 3711 #ifdef NOT_YET 3712 struct xbb_softc *sc = device_get_softc(dev); 3713 3714 /* Prevent new requests being issued until we fix things up. */ 3715 mtx_lock(&sc->xb_io_lock); 3716 sc->connected = BLKIF_STATE_SUSPENDED; 3717 mtx_unlock(&sc->xb_io_lock); 3718 #endif 3719 3720 return (0); 3721 } 3722 3723 /** 3724 * Perform any processing required to recover from a suspended state. 3725 * 3726 * \param dev NewBus device object representing this Xen Block Back instance. 3727 * 3728 * \return 0 for success, errno codes for failure. 3729 */ 3730 static int 3731 xbb_resume(device_t dev) 3732 { 3733 return (0); 3734 } 3735 3736 /** 3737 * Handle state changes expressed via the XenStore by our front-end peer. 3738 * 3739 * \param dev NewBus device object representing this Xen 3740 * Block Back instance. 3741 * \param frontend_state The new state of the front-end. 3742 * 3743 * \return 0 for success, errno codes for failure. 
3744 */ 3745 static void 3746 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3747 { 3748 struct xbb_softc *xbb = device_get_softc(dev); 3749 3750 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3751 xenbus_strstate(frontend_state), 3752 xenbus_strstate(xenbus_get_state(xbb->dev))); 3753 3754 switch (frontend_state) { 3755 case XenbusStateInitialising: 3756 break; 3757 case XenbusStateInitialised: 3758 case XenbusStateConnected: 3759 xbb_connect(xbb); 3760 break; 3761 case XenbusStateClosing: 3762 case XenbusStateClosed: 3763 mtx_lock(&xbb->lock); 3764 xbb_shutdown(xbb); 3765 mtx_unlock(&xbb->lock); 3766 if (frontend_state == XenbusStateClosed) 3767 xenbus_set_state(xbb->dev, XenbusStateClosed); 3768 break; 3769 default: 3770 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3771 frontend_state); 3772 break; 3773 } 3774 } 3775 3776 /*---------------------------- NewBus Registration ---------------------------*/ 3777 static device_method_t xbb_methods[] = { 3778 /* Device interface */ 3779 DEVMETHOD(device_probe, xbb_probe), 3780 DEVMETHOD(device_attach, xbb_attach), 3781 DEVMETHOD(device_detach, xbb_detach), 3782 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3783 DEVMETHOD(device_suspend, xbb_suspend), 3784 DEVMETHOD(device_resume, xbb_resume), 3785 3786 /* Xenbus interface */ 3787 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3788 3789 DEVMETHOD_END 3790 }; 3791 3792 static driver_t xbb_driver = { 3793 "xbbd", 3794 xbb_methods, 3795 sizeof(struct xbb_softc), 3796 }; 3797 3798 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0); 3799