1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 /** 39 * \file blkback.c 40 * 41 * \brief Device driver supporting the vending of block storage from 42 * a FreeBSD domain to other domains. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 50 #include <sys/bio.h> 51 #include <sys/bus.h> 52 #include <sys/conf.h> 53 #include <sys/devicestat.h> 54 #include <sys/disk.h> 55 #include <sys/fcntl.h> 56 #include <sys/filedesc.h> 57 #include <sys/kdb.h> 58 #include <sys/module.h> 59 #include <sys/namei.h> 60 #include <sys/proc.h> 61 #include <sys/rman.h> 62 #include <sys/taskqueue.h> 63 #include <sys/types.h> 64 #include <sys/vnode.h> 65 #include <sys/mount.h> 66 #include <sys/sysctl.h> 67 #include <sys/bitstring.h> 68 #include <sys/sdt.h> 69 70 #include <geom/geom.h> 71 72 #include <machine/_inttypes.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 78 #include <xen/xen-os.h> 79 #include <xen/blkif.h> 80 #include <xen/gnttab.h> 81 #include <xen/xen_intr.h> 82 83 #include <contrib/xen/arch-x86/cpuid.h> 84 #include <contrib/xen/event_channel.h> 85 #include <contrib/xen/grant_table.h> 86 87 #include <xen/xenbus/xenbusvar.h> 88 89 /*--------------------------- Compile-time Tunables --------------------------*/ 90 /** 91 * The maximum number of shared memory ring pages we will allow in a 92 * negotiated block-front/back communication channel. Allow enough 93 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 94 */ 95 #define XBB_MAX_RING_PAGES 32 96 97 /** 98 * The maximum number of outstanding request blocks (request headers plus 99 * additional segment blocks) we will allow in a negotiated block-front/back 100 * communication channel. 
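 *
 * (Illustrative aside, based on the generic Xen ring definitions in
 * ring.h rather than on anything defined in this file:
 * __CONST_RING_SIZE(blkif, sz) works out to the number of request
 * entries that fit in "sz" bytes of shared ring space, rounded down to
 * a power of two, roughly:)
 *
 *     rounddown_pow2((sz - offsetof(blkif_sring_t, ring)) /
 *                    sizeof(((blkif_sring_t *)0)->ring[0]))
 *
 * (rounddown_pow2 here is shorthand for the __RD32() helper in ring.h.)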
101 */ 102 #define XBB_MAX_REQUESTS \ 103 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 104 105 /** 106 * \brief Define to enable rudimentary request logging to the console. 107 */ 108 #undef XBB_DEBUG 109 110 /*---------------------------------- Macros ----------------------------------*/ 111 /** 112 * Custom malloc type for all driver allocations. 113 */ 114 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 115 116 #ifdef XBB_DEBUG 117 #define DPRINTF(fmt, args...) \ 118 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 119 #else 120 #define DPRINTF(fmt, args...) do {} while(0) 121 #endif 122 123 /** 124 * The maximum mapped region size per request we will allow in a negotiated 125 * block-front/back communication channel. 126 * Use old default of MAXPHYS == 128K. 127 */ 128 #define XBB_MAX_REQUEST_SIZE \ 129 MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 130 131 /** 132 * The maximum number of segments (within a request header and accompanying 133 * segment blocks) per request we will allow in a negotiated block-front/back 134 * communication channel. 135 */ 136 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 137 (MIN(UIO_MAXIOV, \ 138 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 139 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 140 141 /** 142 * The maximum number of ring pages that we can allow per request list. 143 * We limit this to the maximum number of segments per request, because 144 * that is already a reasonable number of segments to aggregate. This 145 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 146 * because that would leave situations where we can't dispatch even one 147 * large request. 148 */ 149 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 150 151 /*--------------------------- Forward Declarations ---------------------------*/ 152 struct xbb_softc; 153 struct xbb_xen_req; 154 155 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 156 ...) __attribute__((format(printf, 3, 4))); 157 static int xbb_shutdown(struct xbb_softc *xbb); 158 159 /*------------------------------ Data Structures -----------------------------*/ 160 161 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 162 163 typedef enum { 164 XBB_REQLIST_NONE = 0x00, 165 XBB_REQLIST_MAPPED = 0x01 166 } xbb_reqlist_flags; 167 168 struct xbb_xen_reqlist { 169 /** 170 * Back reference to the parent block back instance for this 171 * request. Used during bio_done handling. 172 */ 173 struct xbb_softc *xbb; 174 175 /** 176 * BLKIF_OP code for this request. 177 */ 178 int operation; 179 180 /** 181 * Set to BLKIF_RSP_* to indicate request status. 182 * 183 * This field allows an error status to be recorded even if the 184 * delivery of this status must be deferred. Deferred reporting 185 * is necessary, for example, when an error is detected during 186 * completion processing of one bio when other bios for this 187 * request are still outstanding. 188 */ 189 int status; 190 191 /** 192 * Number of 512 byte sectors not transferred. 193 */ 194 int residual_512b_sectors; 195 196 /** 197 * Starting sector number of the first request in the list. 198 */ 199 off_t starting_sector_number; 200 201 /** 202 * If we're going to coalesce, the next contiguous sector would be 203 * this one. 204 */ 205 off_t next_contig_sector; 206 207 /** 208 * Number of child requests in the list. 209 */ 210 int num_children; 211 212 /** 213 * Number of I/O requests still pending on the backend. 
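	 *
	 * (Illustrative: xbb_bio_done() later in this file decrements this
	 * count once per completing bio and only finalizes the request list
	 * when the last outstanding bio finishes:)
	 *
	 *     if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
	 *             xbb_complete_reqlist(xbb, reqlist);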
 */
	int			 pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int			 nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags	 flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
	 */
	uint8_t			*kva;

	/**
	 * Base pseudo-physical address corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t		 gnt_base;

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t		*gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type	 ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags	 ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime		 ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list	 contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t		  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int			  nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int			  nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			  operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t		  ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t		 *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX		  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime		  ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist	 *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection.
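	 *
	 * (Hedged sketch of how the connected ring is sized from this value;
	 * the negotiation and ring-attach code lies outside this excerpt, so
	 * the exact statements below are an assumption:)
	 *
	 *     ring_size = xbb->ring_config.ring_pages * PAGE_SIZE;
	 *     sring     = (blkif_sring_t *)xbb->ring_config.va;
	 *     BACK_RING_INIT(&xbb->rings.native, sring, ring_size);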
*/ 359 u_int ring_pages; 360 361 /** 362 * The grant references, one per-ring page, supplied by the 363 * front-end, allowing us to reference the ring pages in the 364 * front-end's domain and to map these pages into our own domain. 365 */ 366 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 367 368 /** The interrupt driven even channel used to signal ring events. */ 369 evtchn_port_t evtchn; 370 }; 371 372 /** 373 * Per-instance connection state flags. 374 */ 375 typedef enum 376 { 377 /** 378 * The front-end requested a read-only mount of the 379 * back-end device/file. 380 */ 381 XBBF_READ_ONLY = 0x01, 382 383 /** Communication with the front-end has been established. */ 384 XBBF_RING_CONNECTED = 0x02, 385 386 /** 387 * Front-end requests exist in the ring and are waiting for 388 * xbb_xen_req objects to free up. 389 */ 390 XBBF_RESOURCE_SHORTAGE = 0x04, 391 392 /** Connection teardown in progress. */ 393 XBBF_SHUTDOWN = 0x08, 394 395 /** A thread is already performing shutdown processing. */ 396 XBBF_IN_SHUTDOWN = 0x10 397 } xbb_flag_t; 398 399 /** Backend device type. */ 400 typedef enum { 401 /** Backend type unknown. */ 402 XBB_TYPE_NONE = 0x00, 403 404 /** 405 * Backend type disk (access via cdev switch 406 * strategy routine). 407 */ 408 XBB_TYPE_DISK = 0x01, 409 410 /** Backend type file (access vnode operations.). */ 411 XBB_TYPE_FILE = 0x02 412 } xbb_type; 413 414 /** 415 * \brief Structure used to memoize information about a per-request 416 * scatter-gather list. 417 * 418 * The chief benefit of using this data structure is it avoids having 419 * to reparse the possibly discontiguous S/G list in the original 420 * request. Due to the way that the mapping of the memory backing an 421 * I/O transaction is handled by Xen, a second pass is unavoidable. 422 * At least this way the second walk is a simple array traversal. 423 * 424 * \note A single Scatter/Gather element in the block interface covers 425 * at most 1 machine page. In this context a sector (blkif 426 * nomenclature, not what I'd choose) is a 512b aligned unit 427 * of mapping within the machine page referenced by an S/G 428 * element. 429 */ 430 struct xbb_sg { 431 /** The number of 512b data chunks mapped in this S/G element. */ 432 int16_t nsect; 433 434 /** 435 * The index (0 based) of the first 512b data chunk mapped 436 * in this S/G element. 437 */ 438 uint8_t first_sect; 439 440 /** 441 * The index (0 based) of the last 512b data chunk mapped 442 * in this S/G element. 443 */ 444 uint8_t last_sect; 445 }; 446 447 /** 448 * Character device backend specific configuration data. 449 */ 450 struct xbb_dev_data { 451 /** Cdev used for device backend access. */ 452 struct cdev *cdev; 453 454 /** Cdev switch used for device backend access. */ 455 struct cdevsw *csw; 456 457 /** Used to hold a reference on opened cdev backend devices. */ 458 int dev_ref; 459 }; 460 461 /** 462 * File backend specific configuration data. 463 */ 464 struct xbb_file_data { 465 /** Credentials to use for vnode backed (file based) I/O. */ 466 struct ucred *cred; 467 468 /** 469 * \brief Array of io vectors used to process file based I/O. 470 * 471 * Only a single file based request is outstanding per-xbb instance, 472 * so we only need one of these. 473 */ 474 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 475 }; 476 477 /** 478 * Collection of backend type specific data. 479 */ 480 union xbb_backend_data { 481 struct xbb_dev_data dev; 482 struct xbb_file_data file; 483 }; 484 485 /** 486 * Function signature of backend specific I/O handlers. 
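 *
 * (Illustrative: xbb_dispatch_dev() and xbb_dispatch_file() later in this
 * file both match this signature, and xbb_dispatch_io() calls whichever
 * handler was installed in the softc:)
 *
 *     error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags);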
487 */ 488 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 489 struct xbb_xen_reqlist *reqlist, int operation, 490 int flags); 491 492 /** 493 * Per-instance configuration data. 494 */ 495 struct xbb_softc { 496 /** 497 * Task-queue used to process I/O requests. 498 */ 499 struct taskqueue *io_taskqueue; 500 501 /** 502 * Single "run the request queue" task enqueued 503 * on io_taskqueue. 504 */ 505 struct task io_task; 506 507 /** Device type for this instance. */ 508 xbb_type device_type; 509 510 /** NewBus device corresponding to this instance. */ 511 device_t dev; 512 513 /** Backend specific dispatch routine for this instance. */ 514 xbb_dispatch_t dispatch_io; 515 516 /** The number of requests outstanding on the backend device/file. */ 517 int active_request_count; 518 519 /** Free pool of request tracking structures. */ 520 struct xbb_xen_req_list request_free_stailq; 521 522 /** Array, sized at connection time, of request tracking structures. */ 523 struct xbb_xen_req *requests; 524 525 /** Free pool of request list structures. */ 526 struct xbb_xen_reqlist_list reqlist_free_stailq; 527 528 /** List of pending request lists awaiting execution. */ 529 struct xbb_xen_reqlist_list reqlist_pending_stailq; 530 531 /** Array, sized at connection time, of request list structures. */ 532 struct xbb_xen_reqlist *request_lists; 533 534 /** 535 * Global pool of kva used for mapping remote domain ring 536 * and I/O transaction data. 537 */ 538 vm_offset_t kva; 539 540 /** Pseudo-physical address corresponding to kva. */ 541 uint64_t gnt_base_addr; 542 543 /** The size of the global kva pool. */ 544 int kva_size; 545 546 /** The size of the KVA area used for request lists. */ 547 int reqlist_kva_size; 548 549 /** The number of pages of KVA used for request lists */ 550 int reqlist_kva_pages; 551 552 /** Bitmap of free KVA pages */ 553 bitstr_t *kva_free; 554 555 /** 556 * \brief Cached value of the front-end's domain id. 557 * 558 * This value is used at once for each mapped page in 559 * a transaction. We cache it to avoid incuring the 560 * cost of an ivar access every time this is needed. 561 */ 562 domid_t otherend_id; 563 564 /** 565 * \brief The blkif protocol abi in effect. 566 * 567 * There are situations where the back and front ends can 568 * have a different, native abi (e.g. intel x86_64 and 569 * 32bit x86 domains on the same machine). The back-end 570 * always accommodates the front-end's native abi. That 571 * value is pulled from the XenStore and recorded here. 572 */ 573 int abi; 574 575 /** 576 * \brief The maximum number of requests and request lists allowed 577 * to be in flight at a time. 578 * 579 * This value is negotiated via the XenStore. 580 */ 581 u_int max_requests; 582 583 /** 584 * \brief The maximum number of segments (1 page per segment) 585 * that can be mapped by a request. 586 * 587 * This value is negotiated via the XenStore. 588 */ 589 u_int max_request_segments; 590 591 /** 592 * \brief Maximum number of segments per request list. 593 * 594 * This value is derived from and will generally be larger than 595 * max_request_segments. 596 */ 597 u_int max_reqlist_segments; 598 599 /** 600 * The maximum size of any request to this back-end 601 * device. 602 * 603 * This value is negotiated via the XenStore. 604 */ 605 u_int max_request_size; 606 607 /** 608 * The maximum size of any request list. This is derived directly 609 * from max_reqlist_segments. 610 */ 611 u_int max_reqlist_size; 612 613 /** Various configuration and state bit flags. 
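	 *
	 * (Illustrative: XBBF_RESOURCE_SHORTAGE is set under the lock when KVA
	 * or request structures run out, and xbb_release_reqlist() later in
	 * this file converts it into a taskqueue wakeup:)
	 *
	 *     wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
	 *     xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
	 *     ...
	 *     if (wakeup != 0)
	 *             taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);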
*/ 614 xbb_flag_t flags; 615 616 /** Ring mapping and interrupt configuration data. */ 617 struct xbb_ring_config ring_config; 618 619 /** Runtime, cross-abi safe, structures for ring access. */ 620 blkif_back_rings_t rings; 621 622 /** IRQ mapping for the communication ring event channel. */ 623 xen_intr_handle_t xen_intr_handle; 624 625 /** 626 * \brief Backend access mode flags (e.g. write, or read-only). 627 * 628 * This value is passed to us by the front-end via the XenStore. 629 */ 630 char *dev_mode; 631 632 /** 633 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 634 * 635 * This value is passed to us by the front-end via the XenStore. 636 * Currently unused. 637 */ 638 char *dev_type; 639 640 /** 641 * \brief Backend device/file identifier. 642 * 643 * This value is passed to us by the front-end via the XenStore. 644 * We expect this to be a POSIX path indicating the file or 645 * device to open. 646 */ 647 char *dev_name; 648 649 /** 650 * Vnode corresponding to the backend device node or file 651 * we are acessing. 652 */ 653 struct vnode *vn; 654 655 union xbb_backend_data backend; 656 657 /** The native sector size of the backend. */ 658 u_int sector_size; 659 660 /** log2 of sector_size. */ 661 u_int sector_size_shift; 662 663 /** Size in bytes of the backend device or file. */ 664 off_t media_size; 665 666 /** 667 * \brief media_size expressed in terms of the backend native 668 * sector size. 669 * 670 * (e.g. xbb->media_size >> xbb->sector_size_shift). 671 */ 672 uint64_t media_num_sectors; 673 674 /** 675 * \brief Array of memoized scatter gather data computed during the 676 * conversion of blkif ring requests to internal xbb_xen_req 677 * structures. 678 * 679 * Ring processing is serialized so we only need one of these. 680 */ 681 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 682 683 /** 684 * Temporary grant table map used in xbb_dispatch_io(). When 685 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 686 * stack could cause a stack overflow. 687 */ 688 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 689 690 /** Mutex protecting per-instance data. */ 691 struct mtx lock; 692 693 /** 694 * Resource representing allocated physical address space 695 * associated with our per-instance kva region. 696 */ 697 struct resource *pseudo_phys_res; 698 699 /** Resource id for allocated physical address space. */ 700 int pseudo_phys_res_id; 701 702 /** 703 * I/O statistics from BlockBack dispatch down. These are 704 * coalesced requests, and we start them right before execution. 705 */ 706 struct devstat *xbb_stats; 707 708 /** 709 * I/O statistics coming into BlockBack. These are the requests as 710 * we get them from BlockFront. They are started as soon as we 711 * receive a request, and completed when the I/O is complete. 
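	 *
	 * (Illustrative pairing, taken from xbb_get_resources() and
	 * xbb_complete_reqlist() later in this file:)
	 *
	 *     binuptime(&nreq->ds_t0);
	 *     devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
	 *     ...
	 *     devstat_end_transaction(xbb->xbb_stats_in,
	 *         cur_sectors_sent << 9, reqlist->ds_tag_type,
	 *         reqlist->ds_trans_type, NULL, &nreq->ds_t0);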
712 */ 713 struct devstat *xbb_stats_in; 714 715 /** Disable sending flush to the backend */ 716 int disable_flush; 717 718 /** Send a real flush for every N flush requests */ 719 int flush_interval; 720 721 /** Count of flush requests in the interval */ 722 int flush_count; 723 724 /** Don't coalesce requests if this is set */ 725 int no_coalesce_reqs; 726 727 /** Number of requests we have received */ 728 uint64_t reqs_received; 729 730 /** Number of requests we have completed*/ 731 uint64_t reqs_completed; 732 733 /** Number of requests we queued but not pushed*/ 734 uint64_t reqs_queued_for_completion; 735 736 /** Number of requests we completed with an error status*/ 737 uint64_t reqs_completed_with_error; 738 739 /** How many forced dispatches (i.e. without coalescing) have happened */ 740 uint64_t forced_dispatch; 741 742 /** How many normal dispatches have happened */ 743 uint64_t normal_dispatch; 744 745 /** How many total dispatches have happened */ 746 uint64_t total_dispatch; 747 748 /** How many times we have run out of KVA */ 749 uint64_t kva_shortages; 750 751 /** How many times we have run out of request structures */ 752 uint64_t request_shortages; 753 754 /** Watch to wait for hotplug script execution */ 755 struct xs_watch hotplug_watch; 756 757 /** Got the needed data from hotplug scripts? */ 758 bool hotplug_done; 759 }; 760 761 /*---------------------------- Request Processing ----------------------------*/ 762 /** 763 * Allocate an internal transaction tracking structure from the free pool. 764 * 765 * \param xbb Per-instance xbb configuration structure. 766 * 767 * \return On success, a pointer to the allocated xbb_xen_req structure. 768 * Otherwise NULL. 769 */ 770 static inline struct xbb_xen_req * 771 xbb_get_req(struct xbb_softc *xbb) 772 { 773 struct xbb_xen_req *req; 774 775 req = NULL; 776 777 mtx_assert(&xbb->lock, MA_OWNED); 778 779 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 780 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 781 xbb->active_request_count++; 782 } 783 784 return (req); 785 } 786 787 /** 788 * Return an allocated transaction tracking structure to the free pool. 789 * 790 * \param xbb Per-instance xbb configuration structure. 791 * \param req The request structure to free. 792 */ 793 static inline void 794 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 795 { 796 mtx_assert(&xbb->lock, MA_OWNED); 797 798 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 799 xbb->active_request_count--; 800 801 KASSERT(xbb->active_request_count >= 0, 802 ("xbb_release_req: negative active count")); 803 } 804 805 /** 806 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 807 * 808 * \param xbb Per-instance xbb configuration structure. 809 * \param req_list The list of requests to free. 810 * \param nreqs The number of items in the list. 811 */ 812 static inline void 813 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 814 int nreqs) 815 { 816 mtx_assert(&xbb->lock, MA_OWNED); 817 818 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 819 xbb->active_request_count -= nreqs; 820 821 KASSERT(xbb->active_request_count >= 0, 822 ("xbb_release_reqs: negative active count")); 823 } 824 825 /** 826 * Given a page index and 512b sector offset within that page, 827 * calculate an offset into a request's kva region. 828 * 829 * \param reqlist The request structure whose kva region will be accessed. 830 * \param pagenr The page index used to compute the kva offset. 
 * \param sector  The 512b sector index used to compute the page relative
 *                kva offset.
 *
 * \return  The computed global KVA offset.
 */
static inline uint8_t *
xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}

/**
 * Given a page number and 512b sector offset within that page,
 * calculate an offset into the request's memory region that the
 * underlying backend device/file should use for I/O.
 *
 * \param reqlist The request structure whose I/O region will be accessed.
 * \param pagenr  The page index used to compute the I/O offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                I/O offset.
 *
 * \return  The computed global I/O address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist The request list structure whose pseudo-physical region
 *                will be accessed.
 * \param pagenr  The page index used to compute the pseudo-physical offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	return ((uintptr_t)(xbb->gnt_base_addr +
	    (uintptr_t)(reqlist->kva - xbb->kva) +
	    (PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param nr_pages Number of pages needed.
 *
 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
 *
 * Note:  This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio.  At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	int	 first_clear;
	int	 num_clear;
	uint8_t	*free_kva;
	int	 i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page.  If there are none, we're done.
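	 *
	 * (Illustrative: a successful scan below returns
	 * xbb->kva + first_clear * PAGE_SIZE, and xbb_free_kva() recovers the
	 * bitmap range from that pointer with the inverse calculation:)
	 *
	 *     start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
	 *     bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);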
925 */ 926 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 927 928 if (first_clear == -1) 929 goto bailout; 930 931 /* 932 * Starting at the first available page, look for consecutive free 933 * pages that will satisfy the user's request. 934 */ 935 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 936 /* 937 * If this is true, the page is used, so we have to reset 938 * the number of clear pages and the first clear page 939 * (since it pointed to a region with an insufficient number 940 * of clear pages). 941 */ 942 if (bit_test(xbb->kva_free, i)) { 943 num_clear = 0; 944 first_clear = -1; 945 continue; 946 } 947 948 if (first_clear == -1) 949 first_clear = i; 950 951 /* 952 * If this is true, we've found a large enough free region 953 * to satisfy the request. 954 */ 955 if (++num_clear == nr_pages) { 956 bit_nset(xbb->kva_free, first_clear, 957 first_clear + nr_pages - 1); 958 959 free_kva = xbb->kva + 960 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 961 962 KASSERT(free_kva >= (uint8_t *)xbb->kva && 963 free_kva + (nr_pages * PAGE_SIZE) <= 964 (uint8_t *)xbb->ring_config.va, 965 ("Free KVA %p len %d out of range, " 966 "kva = %#jx, ring VA = %#jx\n", free_kva, 967 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 968 (uintmax_t)xbb->ring_config.va)); 969 break; 970 } 971 } 972 973 bailout: 974 975 if (free_kva == NULL) { 976 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 977 xbb->kva_shortages++; 978 } 979 980 mtx_unlock(&xbb->lock); 981 982 return (free_kva); 983 } 984 985 /** 986 * Free allocated KVA. 987 * 988 * \param xbb Per-instance xbb configuration structure. 989 * \param kva_ptr Pointer to allocated KVA region. 990 * \param nr_pages Number of pages in the KVA region. 991 */ 992 static void 993 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 994 { 995 intptr_t start_page; 996 997 mtx_assert(&xbb->lock, MA_OWNED); 998 999 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1000 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1001 1002 } 1003 1004 /** 1005 * Unmap the front-end pages associated with this I/O request. 1006 * 1007 * \param req The request structure to unmap. 1008 */ 1009 static void 1010 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1011 { 1012 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1013 u_int i; 1014 u_int invcount; 1015 int error __diagused; 1016 1017 invcount = 0; 1018 for (i = 0; i < reqlist->nr_segments; i++) { 1019 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1020 continue; 1021 1022 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1023 unmap[invcount].dev_bus_addr = 0; 1024 unmap[invcount].handle = reqlist->gnt_handles[i]; 1025 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1026 invcount++; 1027 } 1028 1029 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1030 unmap, invcount); 1031 KASSERT(error == 0, ("Grant table operation failed")); 1032 } 1033 1034 /** 1035 * Allocate an internal transaction tracking structure from the free pool. 1036 * 1037 * \param xbb Per-instance xbb configuration structure. 1038 * 1039 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1040 * Otherwise NULL. 
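 *
 * \note Illustrative caller pattern, as used by xbb_get_resources() later
 *       in this file (the xbb lock must already be held):
 *
 *           mtx_lock(&xbb->lock);
 *           nreqlist = xbb_get_reqlist(xbb);
 *           if (nreqlist == NULL)
 *                   goto bailout_error;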
1041 */ 1042 static inline struct xbb_xen_reqlist * 1043 xbb_get_reqlist(struct xbb_softc *xbb) 1044 { 1045 struct xbb_xen_reqlist *reqlist; 1046 1047 reqlist = NULL; 1048 1049 mtx_assert(&xbb->lock, MA_OWNED); 1050 1051 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1052 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1053 reqlist->flags = XBB_REQLIST_NONE; 1054 reqlist->kva = NULL; 1055 reqlist->status = BLKIF_RSP_OKAY; 1056 reqlist->residual_512b_sectors = 0; 1057 reqlist->num_children = 0; 1058 reqlist->nr_segments = 0; 1059 STAILQ_INIT(&reqlist->contig_req_list); 1060 } 1061 1062 return (reqlist); 1063 } 1064 1065 /** 1066 * Return an allocated transaction tracking structure to the free pool. 1067 * 1068 * \param xbb Per-instance xbb configuration structure. 1069 * \param req The request list structure to free. 1070 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1071 * during a resource shortage condition. 1072 */ 1073 static inline void 1074 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1075 int wakeup) 1076 { 1077 1078 mtx_assert(&xbb->lock, MA_OWNED); 1079 1080 if (wakeup) { 1081 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1082 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1083 } 1084 1085 if (reqlist->kva != NULL) 1086 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1087 1088 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1089 1090 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1091 1092 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1093 /* 1094 * Shutdown is in progress. See if we can 1095 * progress further now that one more request 1096 * has completed and been returned to the 1097 * free pool. 1098 */ 1099 xbb_shutdown(xbb); 1100 } 1101 1102 if (wakeup != 0) 1103 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1104 } 1105 1106 /** 1107 * Request resources and do basic request setup. 1108 * 1109 * \param xbb Per-instance xbb configuration structure. 1110 * \param reqlist Pointer to reqlist pointer. 1111 * \param ring_req Pointer to a block ring request. 1112 * \param ring_index The ring index of this request. 1113 * 1114 * \return 0 for success, non-zero for failure. 1115 */ 1116 static int 1117 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1118 blkif_request_t *ring_req, RING_IDX ring_idx) 1119 { 1120 struct xbb_xen_reqlist *nreqlist; 1121 struct xbb_xen_req *nreq; 1122 1123 nreqlist = NULL; 1124 nreq = NULL; 1125 1126 mtx_lock(&xbb->lock); 1127 1128 /* 1129 * We don't allow new resources to be allocated if we're in the 1130 * process of shutting down. 1131 */ 1132 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1133 mtx_unlock(&xbb->lock); 1134 return (1); 1135 } 1136 1137 /* 1138 * Allocate a reqlist if the caller doesn't have one already. 1139 */ 1140 if (*reqlist == NULL) { 1141 nreqlist = xbb_get_reqlist(xbb); 1142 if (nreqlist == NULL) 1143 goto bailout_error; 1144 } 1145 1146 /* We always allocate a request. 
*/ 1147 nreq = xbb_get_req(xbb); 1148 if (nreq == NULL) 1149 goto bailout_error; 1150 1151 mtx_unlock(&xbb->lock); 1152 1153 if (*reqlist == NULL) { 1154 *reqlist = nreqlist; 1155 nreqlist->operation = ring_req->operation; 1156 nreqlist->starting_sector_number = ring_req->sector_number; 1157 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1158 links); 1159 } 1160 1161 nreq->reqlist = *reqlist; 1162 nreq->req_ring_idx = ring_idx; 1163 nreq->id = ring_req->id; 1164 nreq->operation = ring_req->operation; 1165 1166 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1167 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1168 nreq->ring_req = &nreq->ring_req_storage; 1169 } else { 1170 nreq->ring_req = ring_req; 1171 } 1172 1173 binuptime(&nreq->ds_t0); 1174 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1175 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1176 (*reqlist)->num_children++; 1177 (*reqlist)->nr_segments += ring_req->nr_segments; 1178 1179 return (0); 1180 1181 bailout_error: 1182 1183 /* 1184 * We're out of resources, so set the shortage flag. The next time 1185 * a request is released, we'll try waking up the work thread to 1186 * see if we can allocate more resources. 1187 */ 1188 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1189 xbb->request_shortages++; 1190 1191 if (nreq != NULL) 1192 xbb_release_req(xbb, nreq); 1193 1194 if (nreqlist != NULL) 1195 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1196 1197 mtx_unlock(&xbb->lock); 1198 1199 return (1); 1200 } 1201 1202 /** 1203 * Create and queue a response to a blkif request. 1204 * 1205 * \param xbb Per-instance xbb configuration structure. 1206 * \param req The request structure to which to respond. 1207 * \param status The status code to report. See BLKIF_RSP_* 1208 * in sys/contrib/xen/io/blkif.h. 1209 */ 1210 static void 1211 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1212 { 1213 blkif_response_t *resp; 1214 1215 /* 1216 * The mutex is required here, and should be held across this call 1217 * until after the subsequent call to xbb_push_responses(). This 1218 * is to guarantee that another context won't queue responses and 1219 * push them while we're active. 1220 * 1221 * That could lead to the other end being notified of responses 1222 * before the resources have been freed on this end. The other end 1223 * would then be able to queue additional I/O, and we may run out 1224 * of resources because we haven't freed them all yet. 1225 */ 1226 mtx_assert(&xbb->lock, MA_OWNED); 1227 1228 /* 1229 * Place on the response ring for the relevant domain. 1230 * For now, only the spacing between entries is different 1231 * in the different ABIs, not the response entry layout. 
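	 *
	 * (Illustrative: queuing a response only advances the private
	 * producer index; nothing becomes visible to the front-end until
	 * xbb_push_responses() publishes the batch:)
	 *
	 *     xbb->rings.common.rsp_prod_pvt++;
	 *     ...
	 *     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);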
1232 */ 1233 switch (xbb->abi) { 1234 case BLKIF_PROTOCOL_NATIVE: 1235 resp = RING_GET_RESPONSE(&xbb->rings.native, 1236 xbb->rings.native.rsp_prod_pvt); 1237 break; 1238 case BLKIF_PROTOCOL_X86_32: 1239 resp = (blkif_response_t *) 1240 RING_GET_RESPONSE(&xbb->rings.x86_32, 1241 xbb->rings.x86_32.rsp_prod_pvt); 1242 break; 1243 case BLKIF_PROTOCOL_X86_64: 1244 resp = (blkif_response_t *) 1245 RING_GET_RESPONSE(&xbb->rings.x86_64, 1246 xbb->rings.x86_64.rsp_prod_pvt); 1247 break; 1248 default: 1249 panic("Unexpected blkif protocol ABI."); 1250 } 1251 1252 resp->id = req->id; 1253 resp->operation = req->operation; 1254 resp->status = status; 1255 1256 if (status != BLKIF_RSP_OKAY) 1257 xbb->reqs_completed_with_error++; 1258 1259 xbb->rings.common.rsp_prod_pvt++; 1260 1261 xbb->reqs_queued_for_completion++; 1262 1263 } 1264 1265 /** 1266 * Send queued responses to blkif requests. 1267 * 1268 * \param xbb Per-instance xbb configuration structure. 1269 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1270 * should be run, 0 if it does not need to be run. 1271 * \param notify Flag that is set to 1 if the other end should be 1272 * notified via irq, 0 if the other end should not be 1273 * notified. 1274 */ 1275 static void 1276 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1277 { 1278 int more_to_do; 1279 1280 /* 1281 * The mutex is required here. 1282 */ 1283 mtx_assert(&xbb->lock, MA_OWNED); 1284 1285 more_to_do = 0; 1286 1287 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1288 1289 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1290 /* 1291 * Tail check for pending requests. Allows frontend to avoid 1292 * notifications if requests are already in flight (lower 1293 * overheads and promotes batching). 1294 */ 1295 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1296 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1297 more_to_do = 1; 1298 } 1299 1300 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1301 xbb->reqs_queued_for_completion = 0; 1302 1303 *run_taskqueue = more_to_do; 1304 } 1305 1306 /** 1307 * Complete a request list. 1308 * 1309 * \param xbb Per-instance xbb configuration structure. 1310 * \param reqlist Allocated internal request list structure. 1311 */ 1312 static void 1313 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1314 { 1315 struct xbb_xen_req *nreq; 1316 off_t sectors_sent; 1317 int notify, run_taskqueue; 1318 1319 sectors_sent = 0; 1320 1321 if (reqlist->flags & XBB_REQLIST_MAPPED) 1322 xbb_unmap_reqlist(reqlist); 1323 1324 mtx_lock(&xbb->lock); 1325 1326 /* 1327 * All I/O is done, send the response. A lock is not necessary 1328 * to protect the request list, because all requests have 1329 * completed. Therefore this is the only context accessing this 1330 * reqlist right now. However, in order to make sure that no one 1331 * else queues responses onto the queue or pushes them to the other 1332 * side while we're active, we need to hold the lock across the 1333 * calls to xbb_queue_response() and xbb_push_responses(). 1334 */ 1335 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1336 off_t cur_sectors_sent; 1337 1338 /* Put this response on the ring, but don't push yet */ 1339 xbb_queue_response(xbb, nreq, reqlist->status); 1340 1341 /* We don't report bytes sent if there is an error. 
*/ 1342 if (reqlist->status == BLKIF_RSP_OKAY) 1343 cur_sectors_sent = nreq->nr_512b_sectors; 1344 else 1345 cur_sectors_sent = 0; 1346 1347 sectors_sent += cur_sectors_sent; 1348 1349 devstat_end_transaction(xbb->xbb_stats_in, 1350 /*bytes*/cur_sectors_sent << 9, 1351 reqlist->ds_tag_type, 1352 reqlist->ds_trans_type, 1353 /*now*/NULL, 1354 /*then*/&nreq->ds_t0); 1355 } 1356 1357 /* 1358 * Take out any sectors not sent. If we wind up negative (which 1359 * might happen if an error is reported as well as a residual), just 1360 * report 0 sectors sent. 1361 */ 1362 sectors_sent -= reqlist->residual_512b_sectors; 1363 if (sectors_sent < 0) 1364 sectors_sent = 0; 1365 1366 devstat_end_transaction(xbb->xbb_stats, 1367 /*bytes*/ sectors_sent << 9, 1368 reqlist->ds_tag_type, 1369 reqlist->ds_trans_type, 1370 /*now*/NULL, 1371 /*then*/&reqlist->ds_t0); 1372 1373 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1374 1375 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1376 1377 mtx_unlock(&xbb->lock); 1378 1379 if (run_taskqueue) 1380 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1381 1382 if (notify) 1383 xen_intr_signal(xbb->xen_intr_handle); 1384 } 1385 1386 /** 1387 * Completion handler for buffer I/O requests issued by the device 1388 * backend driver. 1389 * 1390 * \param bio The buffer I/O request on which to perform completion 1391 * processing. 1392 */ 1393 static void 1394 xbb_bio_done(struct bio *bio) 1395 { 1396 struct xbb_softc *xbb; 1397 struct xbb_xen_reqlist *reqlist; 1398 1399 reqlist = bio->bio_caller1; 1400 xbb = reqlist->xbb; 1401 1402 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1403 1404 /* 1405 * This is a bit imprecise. With aggregated I/O a single 1406 * request list can contain multiple front-end requests and 1407 * a multiple bios may point to a single request. By carefully 1408 * walking the request list, we could map residuals and errors 1409 * back to the original front-end request, but the interface 1410 * isn't sufficiently rich for us to properly report the error. 1411 * So, we just treat the entire request list as having failed if an 1412 * error occurs on any part. And, if an error occurs, we treat 1413 * the amount of data transferred as 0. 1414 * 1415 * For residuals, we report it on the overall aggregated device, 1416 * but not on the individual requests, since we don't currently 1417 * do the work to determine which front-end request to which the 1418 * residual applies. 1419 */ 1420 if (bio->bio_error) { 1421 DPRINTF("BIO returned error %d for operation on device %s\n", 1422 bio->bio_error, xbb->dev_name); 1423 reqlist->status = BLKIF_RSP_ERROR; 1424 1425 if (bio->bio_error == ENXIO 1426 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1427 /* 1428 * Backend device has disappeared. Signal the 1429 * front-end that we (the device proxy) want to 1430 * go away. 1431 */ 1432 xenbus_set_state(xbb->dev, XenbusStateClosing); 1433 } 1434 } 1435 1436 /* 1437 * Decrement the pending count for the request list. When we're 1438 * done with the requests, send status back for all of them. 1439 */ 1440 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1441 xbb_complete_reqlist(xbb, reqlist); 1442 1443 g_destroy_bio(bio); 1444 } 1445 1446 /** 1447 * Parse a blkif request into an internal request structure and send 1448 * it to the backend for processing. 1449 * 1450 * \param xbb Per-instance xbb configuration structure. 1451 * \param reqlist Allocated internal request list structure. 1452 * 1453 * \return On success, 0. 
For resource shortages, non-zero. 1454 * 1455 * This routine performs the backend common aspects of request parsing 1456 * including compiling an internal request structure, parsing the S/G 1457 * list and any secondary ring requests in which they may reside, and 1458 * the mapping of front-end I/O pages into our domain. 1459 */ 1460 static int 1461 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1462 { 1463 struct xbb_sg *xbb_sg; 1464 struct gnttab_map_grant_ref *map; 1465 struct blkif_request_segment *sg; 1466 struct blkif_request_segment *last_block_sg; 1467 struct xbb_xen_req *nreq; 1468 u_int nseg; 1469 u_int seg_idx; 1470 u_int block_segs; 1471 int nr_sects; 1472 int total_sects; 1473 int operation; 1474 uint8_t bio_flags; 1475 int error; 1476 1477 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1478 bio_flags = 0; 1479 total_sects = 0; 1480 nr_sects = 0; 1481 1482 /* 1483 * First determine whether we have enough free KVA to satisfy this 1484 * request list. If not, tell xbb_run_queue() so it can go to 1485 * sleep until we have more KVA. 1486 */ 1487 reqlist->kva = NULL; 1488 if (reqlist->nr_segments != 0) { 1489 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1490 if (reqlist->kva == NULL) { 1491 /* 1492 * If we're out of KVA, return ENOMEM. 1493 */ 1494 return (ENOMEM); 1495 } 1496 } 1497 1498 binuptime(&reqlist->ds_t0); 1499 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1500 1501 switch (reqlist->operation) { 1502 case BLKIF_OP_WRITE_BARRIER: 1503 bio_flags |= BIO_ORDERED; 1504 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1505 /* FALLTHROUGH */ 1506 case BLKIF_OP_WRITE: 1507 operation = BIO_WRITE; 1508 reqlist->ds_trans_type = DEVSTAT_WRITE; 1509 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1510 DPRINTF("Attempt to write to read only device %s\n", 1511 xbb->dev_name); 1512 reqlist->status = BLKIF_RSP_ERROR; 1513 goto send_response; 1514 } 1515 break; 1516 case BLKIF_OP_READ: 1517 operation = BIO_READ; 1518 reqlist->ds_trans_type = DEVSTAT_READ; 1519 break; 1520 case BLKIF_OP_FLUSH_DISKCACHE: 1521 /* 1522 * If this is true, the user has requested that we disable 1523 * flush support. So we just complete the requests 1524 * successfully. 1525 */ 1526 if (xbb->disable_flush != 0) { 1527 goto send_response; 1528 } 1529 1530 /* 1531 * The user has requested that we only send a real flush 1532 * for every N flush requests. So keep count, and either 1533 * complete the request immediately or queue it for the 1534 * backend. 1535 */ 1536 if (xbb->flush_interval != 0) { 1537 if (++(xbb->flush_count) < xbb->flush_interval) { 1538 goto send_response; 1539 } else 1540 xbb->flush_count = 0; 1541 } 1542 1543 operation = BIO_FLUSH; 1544 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1545 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1546 goto do_dispatch; 1547 /*NOTREACHED*/ 1548 default: 1549 DPRINTF("error: unknown block io operation [%d]\n", 1550 reqlist->operation); 1551 reqlist->status = BLKIF_RSP_ERROR; 1552 goto send_response; 1553 } 1554 1555 reqlist->xbb = xbb; 1556 xbb_sg = xbb->xbb_sgs; 1557 map = xbb->maps; 1558 seg_idx = 0; 1559 1560 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1561 blkif_request_t *ring_req; 1562 u_int req_seg_idx; 1563 1564 ring_req = nreq->ring_req; 1565 nr_sects = 0; 1566 nseg = ring_req->nr_segments; 1567 nreq->nr_pages = nseg; 1568 nreq->nr_512b_sectors = 0; 1569 req_seg_idx = 0; 1570 sg = NULL; 1571 1572 /* Check that number of segments is sane. 
*/ 1573 if (__predict_false(nseg == 0) 1574 || __predict_false(nseg > xbb->max_request_segments)) { 1575 DPRINTF("Bad number of segments in request (%d)\n", 1576 nseg); 1577 reqlist->status = BLKIF_RSP_ERROR; 1578 goto send_response; 1579 } 1580 1581 block_segs = nseg; 1582 sg = ring_req->seg; 1583 last_block_sg = sg + block_segs; 1584 1585 while (sg < last_block_sg) { 1586 KASSERT(seg_idx < 1587 XBB_MAX_SEGMENTS_PER_REQLIST, 1588 ("seg_idx %d is too large, max " 1589 "segs %d\n", seg_idx, 1590 XBB_MAX_SEGMENTS_PER_REQLIST)); 1591 1592 xbb_sg->first_sect = sg->first_sect; 1593 xbb_sg->last_sect = sg->last_sect; 1594 xbb_sg->nsect = 1595 (int8_t)(sg->last_sect - 1596 sg->first_sect + 1); 1597 1598 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1599 || (xbb_sg->nsect <= 0)) { 1600 reqlist->status = BLKIF_RSP_ERROR; 1601 goto send_response; 1602 } 1603 1604 nr_sects += xbb_sg->nsect; 1605 map->host_addr = xbb_get_gntaddr(reqlist, 1606 seg_idx, /*sector*/0); 1607 KASSERT(map->host_addr + PAGE_SIZE <= 1608 xbb->ring_config.gnt_addr, 1609 ("Host address %#jx len %d overlaps " 1610 "ring address %#jx\n", 1611 (uintmax_t)map->host_addr, PAGE_SIZE, 1612 (uintmax_t)xbb->ring_config.gnt_addr)); 1613 1614 map->flags = GNTMAP_host_map; 1615 map->ref = sg->gref; 1616 map->dom = xbb->otherend_id; 1617 if (operation == BIO_WRITE) 1618 map->flags |= GNTMAP_readonly; 1619 sg++; 1620 map++; 1621 xbb_sg++; 1622 seg_idx++; 1623 req_seg_idx++; 1624 } 1625 1626 /* Convert to the disk's sector size */ 1627 nreq->nr_512b_sectors = nr_sects; 1628 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1629 total_sects += nr_sects; 1630 1631 if ((nreq->nr_512b_sectors & 1632 ((xbb->sector_size >> 9) - 1)) != 0) { 1633 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1634 "a multiple of the backing store sector " 1635 "size (%d)\n", __func__, 1636 nreq->nr_512b_sectors << 9, 1637 xbb->sector_size); 1638 reqlist->status = BLKIF_RSP_ERROR; 1639 goto send_response; 1640 } 1641 } 1642 1643 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1644 xbb->maps, reqlist->nr_segments); 1645 if (error != 0) 1646 panic("Grant table operation failed (%d)", error); 1647 1648 reqlist->flags |= XBB_REQLIST_MAPPED; 1649 1650 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1651 seg_idx++, map++){ 1652 if (__predict_false(map->status != 0)) { 1653 DPRINTF("invalid buffer -- could not remap " 1654 "it (%d)\n", map->status); 1655 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1656 "0x%x ref 0x%x, dom %d\n", seg_idx, 1657 map->host_addr, map->flags, map->ref, 1658 map->dom); 1659 reqlist->status = BLKIF_RSP_ERROR; 1660 goto send_response; 1661 } 1662 1663 reqlist->gnt_handles[seg_idx] = map->handle; 1664 } 1665 if (reqlist->starting_sector_number + total_sects > 1666 xbb->media_num_sectors) { 1667 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1668 "extends past end of device %s\n", 1669 operation == BIO_READ ? 
"read" : "write", 1670 reqlist->starting_sector_number, 1671 reqlist->starting_sector_number + total_sects, 1672 xbb->dev_name); 1673 reqlist->status = BLKIF_RSP_ERROR; 1674 goto send_response; 1675 } 1676 1677 do_dispatch: 1678 1679 error = xbb->dispatch_io(xbb, 1680 reqlist, 1681 operation, 1682 bio_flags); 1683 1684 if (error != 0) { 1685 reqlist->status = BLKIF_RSP_ERROR; 1686 goto send_response; 1687 } 1688 1689 return (0); 1690 1691 send_response: 1692 1693 xbb_complete_reqlist(xbb, reqlist); 1694 1695 return (0); 1696 } 1697 1698 static __inline int 1699 xbb_count_sects(blkif_request_t *ring_req) 1700 { 1701 int i; 1702 int cur_size = 0; 1703 1704 for (i = 0; i < ring_req->nr_segments; i++) { 1705 int nsect; 1706 1707 nsect = (int8_t)(ring_req->seg[i].last_sect - 1708 ring_req->seg[i].first_sect + 1); 1709 if (nsect <= 0) 1710 break; 1711 1712 cur_size += nsect; 1713 } 1714 1715 return (cur_size); 1716 } 1717 1718 /** 1719 * Process incoming requests from the shared communication ring in response 1720 * to a signal on the ring's event channel. 1721 * 1722 * \param context Callback argument registerd during task initialization - 1723 * the xbb_softc for this instance. 1724 * \param pending The number of taskqueue_enqueue events that have 1725 * occurred since this handler was last run. 1726 */ 1727 static void 1728 xbb_run_queue(void *context, int pending) 1729 { 1730 struct xbb_softc *xbb; 1731 blkif_back_rings_t *rings; 1732 RING_IDX rp; 1733 uint64_t cur_sector; 1734 int cur_operation; 1735 struct xbb_xen_reqlist *reqlist; 1736 1737 xbb = (struct xbb_softc *)context; 1738 rings = &xbb->rings; 1739 1740 /* 1741 * Work gather and dispatch loop. Note that we have a bias here 1742 * towards gathering I/O sent by blockfront. We first gather up 1743 * everything in the ring, as long as we have resources. Then we 1744 * dispatch one request, and then attempt to gather up any 1745 * additional requests that have come in while we were dispatching 1746 * the request. 1747 * 1748 * This allows us to get a clearer picture (via devstat) of how 1749 * many requests blockfront is queueing to us at any given time. 1750 */ 1751 for (;;) { 1752 int retval; 1753 1754 /* 1755 * Initialize reqlist to the last element in the pending 1756 * queue, if there is one. This allows us to add more 1757 * requests to that request list, if we have room. 1758 */ 1759 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1760 xbb_xen_reqlist, links); 1761 if (reqlist != NULL) { 1762 cur_sector = reqlist->next_contig_sector; 1763 cur_operation = reqlist->operation; 1764 } else { 1765 cur_operation = 0; 1766 cur_sector = 0; 1767 } 1768 1769 /* 1770 * Cache req_prod to avoid accessing a cache line shared 1771 * with the frontend. 1772 */ 1773 rp = rings->common.sring->req_prod; 1774 1775 /* Ensure we see queued requests up to 'rp'. */ 1776 rmb(); 1777 1778 /** 1779 * Run so long as there is work to consume and the generation 1780 * of a response will not overflow the ring. 1781 * 1782 * @note There's a 1 to 1 relationship between requests and 1783 * responses, so an overflow should never occur. This 1784 * test is to protect our domain from digesting bogus 1785 * data. Shouldn't we log this? 
1786 */ 1787 while (rings->common.req_cons != rp 1788 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1789 rings->common.req_cons) == 0){ 1790 blkif_request_t ring_req_storage; 1791 blkif_request_t *ring_req; 1792 int cur_size; 1793 1794 switch (xbb->abi) { 1795 case BLKIF_PROTOCOL_NATIVE: 1796 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1797 rings->common.req_cons); 1798 break; 1799 case BLKIF_PROTOCOL_X86_32: 1800 { 1801 struct blkif_x86_32_request *ring_req32; 1802 1803 ring_req32 = RING_GET_REQUEST( 1804 &xbb->rings.x86_32, rings->common.req_cons); 1805 blkif_get_x86_32_req(&ring_req_storage, 1806 ring_req32); 1807 ring_req = &ring_req_storage; 1808 break; 1809 } 1810 case BLKIF_PROTOCOL_X86_64: 1811 { 1812 struct blkif_x86_64_request *ring_req64; 1813 1814 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1815 rings->common.req_cons); 1816 blkif_get_x86_64_req(&ring_req_storage, 1817 ring_req64); 1818 ring_req = &ring_req_storage; 1819 break; 1820 } 1821 default: 1822 panic("Unexpected blkif protocol ABI."); 1823 /* NOTREACHED */ 1824 } 1825 1826 /* 1827 * Check for situations that would require closing 1828 * off this I/O for further coalescing: 1829 * - Coalescing is turned off. 1830 * - Current I/O is out of sequence with the previous 1831 * I/O. 1832 * - Coalesced I/O would be too large. 1833 */ 1834 if ((reqlist != NULL) 1835 && ((xbb->no_coalesce_reqs != 0) 1836 || ((xbb->no_coalesce_reqs == 0) 1837 && ((ring_req->sector_number != cur_sector) 1838 || (ring_req->operation != cur_operation) 1839 || ((ring_req->nr_segments + reqlist->nr_segments) > 1840 xbb->max_reqlist_segments))))) { 1841 reqlist = NULL; 1842 } 1843 1844 /* 1845 * Grab and check for all resources in one shot. 1846 * If we can't get all of the resources we need, 1847 * the shortage is noted and the thread will get 1848 * woken up when more resources are available. 1849 */ 1850 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1851 xbb->rings.common.req_cons); 1852 1853 if (retval != 0) { 1854 /* 1855 * Resource shortage has been recorded. 1856 * We'll be scheduled to run once a request 1857 * object frees up due to a completion. 1858 */ 1859 break; 1860 } 1861 1862 /* 1863 * Signify that we can overwrite this request with 1864 * a response by incrementing our consumer index. 1865 * The response won't be generated until after 1866 * we've already consumed all necessary data out 1867 * of the version of the request in the ring buffer 1868 * (for native mode). We must update the consumer 1869 * index before issuing back-end I/O so there is 1870 * no possibility that it will complete and a 1871 * response be generated before we make room in 1872 * the queue for that response. 1873 */ 1874 xbb->rings.common.req_cons++; 1875 xbb->reqs_received++; 1876 1877 cur_size = xbb_count_sects(ring_req); 1878 cur_sector = ring_req->sector_number + cur_size; 1879 reqlist->next_contig_sector = cur_sector; 1880 cur_operation = ring_req->operation; 1881 } 1882 1883 /* Check for I/O to dispatch */ 1884 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1885 if (reqlist == NULL) { 1886 /* 1887 * We're out of work to do, put the task queue to 1888 * sleep. 1889 */ 1890 break; 1891 } 1892 1893 /* 1894 * Grab the first request off the queue and attempt 1895 * to dispatch it. 1896 */ 1897 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1898 1899 retval = xbb_dispatch_io(xbb, reqlist); 1900 if (retval != 0) { 1901 /* 1902 * xbb_dispatch_io() returns non-zero only when 1903 * there is a resource shortage. 
If that's the 1904 * case, re-queue this request on the head of the 1905 * queue, and go to sleep until we have more 1906 * resources. 1907 */ 1908 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1909 reqlist, links); 1910 break; 1911 } else { 1912 /* 1913 * If we still have anything on the queue after 1914 * removing the head entry, that is because we 1915 * met one of the criteria to create a new 1916 * request list (outlined above), and we'll call 1917 * that a forced dispatch for statistical purposes. 1918 * 1919 * Otherwise, if there is only one element on the 1920 * queue, we coalesced everything available on 1921 * the ring and we'll call that a normal dispatch. 1922 */ 1923 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1924 1925 if (reqlist != NULL) 1926 xbb->forced_dispatch++; 1927 else 1928 xbb->normal_dispatch++; 1929 1930 xbb->total_dispatch++; 1931 } 1932 } 1933 } 1934 1935 /** 1936 * Interrupt handler bound to the shared ring's event channel. 1937 * 1938 * \param arg Callback argument registerd during event channel 1939 * binding - the xbb_softc for this instance. 1940 */ 1941 static int 1942 xbb_filter(void *arg) 1943 { 1944 struct xbb_softc *xbb; 1945 1946 /* Defer to taskqueue thread. */ 1947 xbb = (struct xbb_softc *)arg; 1948 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1949 1950 return (FILTER_HANDLED); 1951 } 1952 1953 SDT_PROVIDER_DEFINE(xbb); 1954 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 1955 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 1956 "uint64_t"); 1957 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 1958 "uint64_t", "uint64_t"); 1959 1960 /*----------------------------- Backend Handlers -----------------------------*/ 1961 /** 1962 * Backend handler for character device access. 1963 * 1964 * \param xbb Per-instance xbb configuration structure. 1965 * \param reqlist Allocated internal request list structure. 1966 * \param operation BIO_* I/O operation code. 1967 * \param bio_flags Additional bio_flag data to pass to any generated 1968 * bios (e.g. BIO_ORDERED).. 1969 * 1970 * \return 0 for success, errno codes for failure. 
1971 */ 1972 static int 1973 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1974 int operation, int bio_flags) 1975 { 1976 struct xbb_dev_data *dev_data; 1977 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 1978 off_t bio_offset; 1979 struct bio *bio; 1980 struct xbb_sg *xbb_sg; 1981 u_int nbio; 1982 u_int bio_idx; 1983 u_int nseg; 1984 u_int seg_idx; 1985 int error; 1986 1987 dev_data = &xbb->backend.dev; 1988 bio_offset = (off_t)reqlist->starting_sector_number 1989 << xbb->sector_size_shift; 1990 error = 0; 1991 nbio = 0; 1992 bio_idx = 0; 1993 1994 if (operation == BIO_FLUSH) { 1995 bio = g_new_bio(); 1996 if (__predict_false(bio == NULL)) { 1997 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 1998 error = ENOMEM; 1999 return (error); 2000 } 2001 2002 bio->bio_cmd = BIO_FLUSH; 2003 bio->bio_flags |= BIO_ORDERED; 2004 bio->bio_dev = dev_data->cdev; 2005 bio->bio_offset = 0; 2006 bio->bio_data = 0; 2007 bio->bio_done = xbb_bio_done; 2008 bio->bio_caller1 = reqlist; 2009 bio->bio_pblkno = 0; 2010 2011 reqlist->pendcnt = 1; 2012 2013 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2014 device_get_unit(xbb->dev)); 2015 2016 (*dev_data->csw->d_strategy)(bio); 2017 2018 return (0); 2019 } 2020 2021 xbb_sg = xbb->xbb_sgs; 2022 bio = NULL; 2023 nseg = reqlist->nr_segments; 2024 2025 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2026 /* 2027 * KVA will not be contiguous, so any additional 2028 * I/O will need to be represented in a new bio. 2029 */ 2030 if ((bio != NULL) 2031 && (xbb_sg->first_sect != 0)) { 2032 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2033 printf("%s: Discontiguous I/O request " 2034 "from domain %d ends on " 2035 "non-sector boundary\n", 2036 __func__, xbb->otherend_id); 2037 error = EINVAL; 2038 goto fail_free_bios; 2039 } 2040 bio = NULL; 2041 } 2042 2043 if (bio == NULL) { 2044 /* 2045 * Make sure that the start of this bio is 2046 * aligned to a device sector. 2047 */ 2048 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2049 printf("%s: Misaligned I/O request " 2050 "from domain %d\n", __func__, 2051 xbb->otherend_id); 2052 error = EINVAL; 2053 goto fail_free_bios; 2054 } 2055 2056 bio = bios[nbio++] = g_new_bio(); 2057 if (__predict_false(bio == NULL)) { 2058 error = ENOMEM; 2059 goto fail_free_bios; 2060 } 2061 bio->bio_cmd = operation; 2062 bio->bio_flags |= bio_flags; 2063 bio->bio_dev = dev_data->cdev; 2064 bio->bio_offset = bio_offset; 2065 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2066 xbb_sg->first_sect); 2067 bio->bio_done = xbb_bio_done; 2068 bio->bio_caller1 = reqlist; 2069 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2070 } 2071 2072 bio->bio_length += xbb_sg->nsect << 9; 2073 bio->bio_bcount = bio->bio_length; 2074 bio_offset += xbb_sg->nsect << 9; 2075 2076 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2077 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2078 printf("%s: Discontiguous I/O request " 2079 "from domain %d ends on " 2080 "non-sector boundary\n", 2081 __func__, xbb->otherend_id); 2082 error = EINVAL; 2083 goto fail_free_bios; 2084 } 2085 /* 2086 * KVA will not be contiguous, so any additional 2087 * I/O will need to be represented in a new bio. 
2088 */ 2089 bio = NULL; 2090 } 2091 } 2092 2093 reqlist->pendcnt = nbio; 2094 2095 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2096 { 2097 if (operation == BIO_READ) { 2098 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2099 device_get_unit(xbb->dev), 2100 bios[bio_idx]->bio_offset, 2101 bios[bio_idx]->bio_length); 2102 } else if (operation == BIO_WRITE) { 2103 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2104 device_get_unit(xbb->dev), 2105 bios[bio_idx]->bio_offset, 2106 bios[bio_idx]->bio_length); 2107 } 2108 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2109 } 2110 2111 return (error); 2112 2113 fail_free_bios: 2114 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2115 g_destroy_bio(bios[bio_idx]); 2116 2117 return (error); 2118 } 2119 2120 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2121 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2122 "uint64_t"); 2123 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2124 "uint64_t", "uint64_t"); 2125 2126 /** 2127 * Backend handler for file access. 2128 * 2129 * \param xbb Per-instance xbb configuration structure. 2130 * \param reqlist Allocated internal request list. 2131 * \param operation BIO_* I/O operation code. 2132 * \param flags Additional bio_flag data to pass to any generated bios 2133 * (e.g. BIO_ORDERED).. 2134 * 2135 * \return 0 for success, errno codes for failure. 2136 */ 2137 static int 2138 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2139 int operation, int flags) 2140 { 2141 struct xbb_file_data *file_data; 2142 u_int seg_idx; 2143 u_int nseg; 2144 struct uio xuio; 2145 struct xbb_sg *xbb_sg; 2146 struct iovec *xiovec; 2147 int error; 2148 2149 file_data = &xbb->backend.file; 2150 error = 0; 2151 bzero(&xuio, sizeof(xuio)); 2152 2153 switch (operation) { 2154 case BIO_READ: 2155 xuio.uio_rw = UIO_READ; 2156 break; 2157 case BIO_WRITE: 2158 xuio.uio_rw = UIO_WRITE; 2159 break; 2160 case BIO_FLUSH: { 2161 struct mount *mountpoint; 2162 2163 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2164 device_get_unit(xbb->dev)); 2165 2166 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2167 2168 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2169 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2170 VOP_UNLOCK(xbb->vn); 2171 2172 vn_finished_write(mountpoint); 2173 2174 goto bailout_send_response; 2175 /* NOTREACHED */ 2176 } 2177 default: 2178 panic("invalid operation %d", operation); 2179 /* NOTREACHED */ 2180 } 2181 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2182 << xbb->sector_size_shift; 2183 xuio.uio_segflg = UIO_SYSSPACE; 2184 xuio.uio_iov = file_data->xiovecs; 2185 xuio.uio_iovcnt = 0; 2186 xbb_sg = xbb->xbb_sgs; 2187 nseg = reqlist->nr_segments; 2188 2189 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2190 /* 2191 * If the first sector is not 0, the KVA will 2192 * not be contiguous and we'll need to go on 2193 * to another segment. 2194 */ 2195 if (xbb_sg->first_sect != 0) 2196 xiovec = NULL; 2197 2198 if (xiovec == NULL) { 2199 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2200 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2201 seg_idx, xbb_sg->first_sect); 2202 xiovec->iov_len = 0; 2203 xuio.uio_iovcnt++; 2204 } 2205 2206 xiovec->iov_len += xbb_sg->nsect << 9; 2207 2208 xuio.uio_resid += xbb_sg->nsect << 9; 2209 2210 /* 2211 * If the last sector is not the full page 2212 * size count, the next segment will not be 2213 * contiguous in KVA and we need a new iovec. 
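	 *
	 * For example, assuming 4KiB pages and 512 byte sectors,
	 * (PAGE_SIZE - 512) >> 9 == 7, so a segment whose last_sect is
	 * anything other than 7 stops short of its page boundary and the
	 * following segment cannot be folded into this iovec.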
2214 */ 2215 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2216 xiovec = NULL; 2217 } 2218 2219 xuio.uio_td = curthread; 2220 2221 switch (operation) { 2222 case BIO_READ: 2223 2224 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2225 device_get_unit(xbb->dev), xuio.uio_offset, 2226 xuio.uio_resid); 2227 2228 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2229 2230 /* 2231 * UFS pays attention to IO_DIRECT for reads. If the 2232 * DIRECTIO option is configured into the kernel, it calls 2233 * ffs_rawread(). But that only works for single-segment 2234 * uios with user space addresses. In our case, with a 2235 * kernel uio, it still reads into the buffer cache, but it 2236 * will just try to release the buffer from the cache later 2237 * on in ffs_read(). 2238 * 2239 * ZFS does not pay attention to IO_DIRECT for reads. 2240 * 2241 * UFS does not pay attention to IO_SYNC for reads. 2242 * 2243 * ZFS pays attention to IO_SYNC (which translates into the 2244 * Solaris define FRSYNC for zfs_read()) for reads. It 2245 * attempts to sync the file before reading. 2246 * 2247 * So, to attempt to provide some barrier semantics in the 2248 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2249 */ 2250 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2251 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2252 2253 VOP_UNLOCK(xbb->vn); 2254 break; 2255 case BIO_WRITE: { 2256 struct mount *mountpoint; 2257 2258 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2259 device_get_unit(xbb->dev), xuio.uio_offset, 2260 xuio.uio_resid); 2261 2262 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2263 2264 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2265 2266 /* 2267 * UFS pays attention to IO_DIRECT for writes. The write 2268 * is done asynchronously. (Normally the write would just 2269 * get put into cache. 2270 * 2271 * UFS pays attention to IO_SYNC for writes. It will 2272 * attempt to write the buffer out synchronously if that 2273 * flag is set. 2274 * 2275 * ZFS does not pay attention to IO_DIRECT for writes. 2276 * 2277 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2278 * for writes. It will flush the transaction from the 2279 * cache before returning. 2280 * 2281 * So if we've got the BIO_ORDERED flag set, we want 2282 * IO_SYNC in either the UFS or ZFS case. 2283 */ 2284 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2285 IO_SYNC : 0, file_data->cred); 2286 VOP_UNLOCK(xbb->vn); 2287 2288 vn_finished_write(mountpoint); 2289 2290 break; 2291 } 2292 default: 2293 panic("invalid operation %d", operation); 2294 /* NOTREACHED */ 2295 } 2296 2297 bailout_send_response: 2298 2299 if (error != 0) 2300 reqlist->status = BLKIF_RSP_ERROR; 2301 2302 xbb_complete_reqlist(xbb, reqlist); 2303 2304 return (0); 2305 } 2306 2307 /*--------------------------- Backend Configuration --------------------------*/ 2308 /** 2309 * Close and cleanup any backend device/file specific state for this 2310 * block back instance. 2311 * 2312 * \param xbb Per-instance xbb configuration structure. 
2313 */ 2314 static void 2315 xbb_close_backend(struct xbb_softc *xbb) 2316 { 2317 DROP_GIANT(); 2318 DPRINTF("closing dev=%s\n", xbb->dev_name); 2319 if (xbb->vn) { 2320 int flags = FREAD; 2321 2322 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2323 flags |= FWRITE; 2324 2325 switch (xbb->device_type) { 2326 case XBB_TYPE_DISK: 2327 if (xbb->backend.dev.csw) { 2328 dev_relthread(xbb->backend.dev.cdev, 2329 xbb->backend.dev.dev_ref); 2330 xbb->backend.dev.csw = NULL; 2331 xbb->backend.dev.cdev = NULL; 2332 } 2333 break; 2334 case XBB_TYPE_FILE: 2335 break; 2336 case XBB_TYPE_NONE: 2337 default: 2338 panic("Unexpected backend type."); 2339 break; 2340 } 2341 2342 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2343 xbb->vn = NULL; 2344 2345 switch (xbb->device_type) { 2346 case XBB_TYPE_DISK: 2347 break; 2348 case XBB_TYPE_FILE: 2349 if (xbb->backend.file.cred != NULL) { 2350 crfree(xbb->backend.file.cred); 2351 xbb->backend.file.cred = NULL; 2352 } 2353 break; 2354 case XBB_TYPE_NONE: 2355 default: 2356 panic("Unexpected backend type."); 2357 break; 2358 } 2359 } 2360 PICKUP_GIANT(); 2361 } 2362 2363 /** 2364 * Open a character device to be used for backend I/O. 2365 * 2366 * \param xbb Per-instance xbb configuration structure. 2367 * 2368 * \return 0 for success, errno codes for failure. 2369 */ 2370 static int 2371 xbb_open_dev(struct xbb_softc *xbb) 2372 { 2373 struct vattr vattr; 2374 struct cdev *dev; 2375 struct cdevsw *devsw; 2376 int error; 2377 2378 xbb->device_type = XBB_TYPE_DISK; 2379 xbb->dispatch_io = xbb_dispatch_dev; 2380 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2381 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2382 &xbb->backend.dev.dev_ref); 2383 if (xbb->backend.dev.csw == NULL) 2384 panic("Unable to retrieve device switch"); 2385 2386 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2387 if (error) { 2388 xenbus_dev_fatal(xbb->dev, error, "error getting " 2389 "vnode attributes for device %s", 2390 xbb->dev_name); 2391 return (error); 2392 } 2393 2394 dev = xbb->vn->v_rdev; 2395 devsw = dev->si_devsw; 2396 if (!devsw->d_ioctl) { 2397 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2398 "device %s!", xbb->dev_name); 2399 return (ENODEV); 2400 } 2401 2402 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2403 (caddr_t)&xbb->sector_size, FREAD, 2404 curthread); 2405 if (error) { 2406 xenbus_dev_fatal(xbb->dev, error, 2407 "error calling ioctl DIOCGSECTORSIZE " 2408 "for device %s", xbb->dev_name); 2409 return (error); 2410 } 2411 2412 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2413 (caddr_t)&xbb->media_size, FREAD, 2414 curthread); 2415 if (error) { 2416 xenbus_dev_fatal(xbb->dev, error, 2417 "error calling ioctl DIOCGMEDIASIZE " 2418 "for device %s", xbb->dev_name); 2419 return (error); 2420 } 2421 2422 return (0); 2423 } 2424 2425 /** 2426 * Open a file to be used for backend I/O. 2427 * 2428 * \param xbb Per-instance xbb configuration structure. 2429 * 2430 * \return 0 for success, errno codes for failure. 
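 *
 * \note Unlike the character-device backend above, which sizes the media
 *       via the DIOCGSECTORSIZE and DIOCGMEDIASIZE ioctls, a file backend
 *       takes its media size from VOP_GETATTR() and always advertises a
 *       512 byte sector size to the front-end.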
2431 */ 2432 static int 2433 xbb_open_file(struct xbb_softc *xbb) 2434 { 2435 struct xbb_file_data *file_data; 2436 struct vattr vattr; 2437 int error; 2438 2439 file_data = &xbb->backend.file; 2440 xbb->device_type = XBB_TYPE_FILE; 2441 xbb->dispatch_io = xbb_dispatch_file; 2442 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2443 if (error != 0) { 2444 xenbus_dev_fatal(xbb->dev, error, 2445 "error calling VOP_GETATTR()" 2446 "for file %s", xbb->dev_name); 2447 return (error); 2448 } 2449 2450 /* 2451 * Verify that we have the ability to upgrade to exclusive 2452 * access on this file so we can trap errors at open instead 2453 * of reporting them during first access. 2454 */ 2455 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2456 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2457 if (VN_IS_DOOMED(xbb->vn)) { 2458 error = EBADF; 2459 xenbus_dev_fatal(xbb->dev, error, 2460 "error locking file %s", 2461 xbb->dev_name); 2462 2463 return (error); 2464 } 2465 } 2466 2467 file_data->cred = crhold(curthread->td_ucred); 2468 xbb->media_size = vattr.va_size; 2469 2470 /* 2471 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2472 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2473 * with disklabel and UFS on FreeBSD at least. Large block sizes 2474 * may not work with other OSes as well. So just export a sector 2475 * size of 512 bytes, which should work with any OS or 2476 * application. Since our backing is a file, any block size will 2477 * work fine for the backing store. 2478 */ 2479 #if 0 2480 xbb->sector_size = vattr.va_blocksize; 2481 #endif 2482 xbb->sector_size = 512; 2483 2484 /* 2485 * Sanity check. The media size has to be at least one 2486 * sector long. 2487 */ 2488 if (xbb->media_size < xbb->sector_size) { 2489 error = EINVAL; 2490 xenbus_dev_fatal(xbb->dev, error, 2491 "file %s size %ju < block size %u", 2492 xbb->dev_name, 2493 (uintmax_t)xbb->media_size, 2494 xbb->sector_size); 2495 } 2496 return (error); 2497 } 2498 2499 /** 2500 * Open the backend provider for this connection. 2501 * 2502 * \param xbb Per-instance xbb configuration structure. 2503 * 2504 * \return 0 for success, errno codes for failure. 2505 */ 2506 static int 2507 xbb_open_backend(struct xbb_softc *xbb) 2508 { 2509 struct nameidata nd; 2510 int flags; 2511 int error; 2512 2513 flags = FREAD; 2514 error = 0; 2515 2516 DPRINTF("opening dev=%s\n", xbb->dev_name); 2517 2518 if (rootvnode == NULL) { 2519 xenbus_dev_fatal(xbb->dev, ENOENT, 2520 "Root file system not mounted"); 2521 return (ENOENT); 2522 } 2523 2524 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2525 flags |= FWRITE; 2526 2527 pwd_ensure_dirs(); 2528 2529 again: 2530 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); 2531 error = vn_open(&nd, &flags, 0, NULL); 2532 if (error) { 2533 /* 2534 * This is the only reasonable guess we can make as far as 2535 * path if the user doesn't give us a fully qualified path. 2536 * If they want to specify a file, they need to specify the 2537 * full path. 
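		 *
		 * For example (illustrative names only), a backend name of
		 * "ada0p3" is retried below as "/dev/ada0p3", whereas a
		 * file backend must already be an absolute path such as
		 * "/var/xen/disk0.img".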
2538 */ 2539 if (xbb->dev_name[0] != '/') { 2540 char *dev_path = "/dev/"; 2541 char *dev_name; 2542 2543 /* Try adding device path at beginning of name */ 2544 dev_name = malloc(strlen(xbb->dev_name) 2545 + strlen(dev_path) + 1, 2546 M_XENBLOCKBACK, M_NOWAIT); 2547 if (dev_name) { 2548 sprintf(dev_name, "%s%s", dev_path, 2549 xbb->dev_name); 2550 free(xbb->dev_name, M_XENBLOCKBACK); 2551 xbb->dev_name = dev_name; 2552 goto again; 2553 } 2554 } 2555 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2556 xbb->dev_name); 2557 return (error); 2558 } 2559 2560 NDFREE_PNBUF(&nd); 2561 2562 xbb->vn = nd.ni_vp; 2563 2564 /* We only support disks and files. */ 2565 if (vn_isdisk_error(xbb->vn, &error)) { 2566 error = xbb_open_dev(xbb); 2567 } else if (xbb->vn->v_type == VREG) { 2568 error = xbb_open_file(xbb); 2569 } else { 2570 error = EINVAL; 2571 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2572 "or file", xbb->dev_name); 2573 } 2574 VOP_UNLOCK(xbb->vn); 2575 2576 if (error != 0) { 2577 xbb_close_backend(xbb); 2578 return (error); 2579 } 2580 2581 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2582 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2583 2584 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2585 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2586 xbb->dev_name, xbb->sector_size, xbb->media_size); 2587 2588 return (0); 2589 } 2590 2591 /*------------------------ Inter-Domain Communication ------------------------*/ 2592 /** 2593 * Free dynamically allocated KVA or pseudo-physical address allocations. 2594 * 2595 * \param xbb Per-instance xbb configuration structure. 2596 */ 2597 static void 2598 xbb_free_communication_mem(struct xbb_softc *xbb) 2599 { 2600 if (xbb->kva != 0) { 2601 if (xbb->pseudo_phys_res != NULL) { 2602 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2603 xbb->pseudo_phys_res); 2604 xbb->pseudo_phys_res = NULL; 2605 } 2606 } 2607 xbb->kva = 0; 2608 xbb->gnt_base_addr = 0; 2609 if (xbb->kva_free != NULL) { 2610 free(xbb->kva_free, M_XENBLOCKBACK); 2611 xbb->kva_free = NULL; 2612 } 2613 } 2614 2615 /** 2616 * Cleanup all inter-domain communication mechanisms. 2617 * 2618 * \param xbb Per-instance xbb configuration structure. 2619 */ 2620 static int 2621 xbb_disconnect(struct xbb_softc *xbb) 2622 { 2623 DPRINTF("\n"); 2624 2625 mtx_unlock(&xbb->lock); 2626 xen_intr_unbind(&xbb->xen_intr_handle); 2627 if (xbb->io_taskqueue != NULL) 2628 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2629 mtx_lock(&xbb->lock); 2630 2631 /* 2632 * No new interrupts can generate work, but we must wait 2633 * for all currently active requests to drain. 
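	 *
	 * Returning EAGAIN here simply asks the caller to retry;
	 * xbb_shutdown() propagates it, and shutdown processing is
	 * re-triggered once the last outstanding request completes, so
	 * the teardown below eventually runs.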
2634 */ 2635 if (xbb->active_request_count != 0) 2636 return (EAGAIN); 2637 2638 if (xbb->flags & XBBF_RING_CONNECTED) { 2639 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2640 struct gnttab_unmap_grant_ref *op; 2641 unsigned int ring_idx; 2642 int error; 2643 2644 for (ring_idx = 0, op = ops; 2645 ring_idx < xbb->ring_config.ring_pages; 2646 ring_idx++, op++) { 2647 op->host_addr = xbb->ring_config.gnt_addr 2648 + (ring_idx * PAGE_SIZE); 2649 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2650 op->handle = xbb->ring_config.handle[ring_idx]; 2651 } 2652 2653 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2654 xbb->ring_config.ring_pages); 2655 if (error != 0) 2656 panic("Grant table op failed (%d)", error); 2657 2658 xbb->flags &= ~XBBF_RING_CONNECTED; 2659 } 2660 2661 xbb_free_communication_mem(xbb); 2662 2663 if (xbb->requests != NULL) { 2664 free(xbb->requests, M_XENBLOCKBACK); 2665 xbb->requests = NULL; 2666 } 2667 2668 if (xbb->request_lists != NULL) { 2669 struct xbb_xen_reqlist *reqlist; 2670 int i; 2671 2672 /* There is one request list for ever allocated request. */ 2673 for (i = 0, reqlist = xbb->request_lists; 2674 i < xbb->max_requests; i++, reqlist++){ 2675 if (reqlist->gnt_handles != NULL) { 2676 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2677 reqlist->gnt_handles = NULL; 2678 } 2679 } 2680 free(xbb->request_lists, M_XENBLOCKBACK); 2681 xbb->request_lists = NULL; 2682 } 2683 2684 return (0); 2685 } 2686 2687 /** 2688 * Map shared memory ring into domain local address space, initialize 2689 * ring control structures, and bind an interrupt to the event channel 2690 * used to notify us of ring changes. 2691 * 2692 * \param xbb Per-instance xbb configuration structure. 2693 */ 2694 static int 2695 xbb_connect_ring(struct xbb_softc *xbb) 2696 { 2697 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2698 struct gnttab_map_grant_ref *gnt; 2699 u_int ring_idx; 2700 int error; 2701 2702 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2703 return (0); 2704 2705 /* 2706 * Kva for our ring is at the tail of the region of kva allocated 2707 * by xbb_alloc_communication_mem(). 2708 */ 2709 xbb->ring_config.va = xbb->kva 2710 + (xbb->kva_size 2711 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2712 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2713 + (xbb->kva_size 2714 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2715 2716 for (ring_idx = 0, gnt = gnts; 2717 ring_idx < xbb->ring_config.ring_pages; 2718 ring_idx++, gnt++) { 2719 gnt->host_addr = xbb->ring_config.gnt_addr 2720 + (ring_idx * PAGE_SIZE); 2721 gnt->flags = GNTMAP_host_map; 2722 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2723 gnt->dom = xbb->otherend_id; 2724 } 2725 2726 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2727 xbb->ring_config.ring_pages); 2728 if (error) 2729 panic("blkback: Ring page grant table op failed (%d)", error); 2730 2731 for (ring_idx = 0, gnt = gnts; 2732 ring_idx < xbb->ring_config.ring_pages; 2733 ring_idx++, gnt++) { 2734 if (gnt->status != 0) { 2735 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; 2736 unsigned int i, j; 2737 2738 xbb->ring_config.va = 0; 2739 xenbus_dev_fatal(xbb->dev, EACCES, 2740 "Ring shared page mapping failed. 
" 2741 "Status %d.", gnt->status); 2742 2743 /* Unmap everything to avoid leaking grant table maps */ 2744 for (i = 0, j = 0; i < xbb->ring_config.ring_pages; 2745 i++) { 2746 if (gnts[i].status != GNTST_okay) 2747 continue; 2748 2749 unmap[j].host_addr = gnts[i].host_addr; 2750 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; 2751 unmap[j++].handle = gnts[i].handle; 2752 } 2753 if (j != 0) { 2754 error = HYPERVISOR_grant_table_op( 2755 GNTTABOP_unmap_grant_ref, unmap, j); 2756 if (error != 0) 2757 panic("Unable to unmap grants (%d)", 2758 error); 2759 } 2760 return (EACCES); 2761 } 2762 xbb->ring_config.handle[ring_idx] = gnt->handle; 2763 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2764 } 2765 2766 /* Initialize the ring based on ABI. */ 2767 switch (xbb->abi) { 2768 case BLKIF_PROTOCOL_NATIVE: 2769 { 2770 blkif_sring_t *sring; 2771 sring = (blkif_sring_t *)xbb->ring_config.va; 2772 BACK_RING_INIT(&xbb->rings.native, sring, 2773 xbb->ring_config.ring_pages * PAGE_SIZE); 2774 break; 2775 } 2776 case BLKIF_PROTOCOL_X86_32: 2777 { 2778 blkif_x86_32_sring_t *sring_x86_32; 2779 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2780 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2781 xbb->ring_config.ring_pages * PAGE_SIZE); 2782 break; 2783 } 2784 case BLKIF_PROTOCOL_X86_64: 2785 { 2786 blkif_x86_64_sring_t *sring_x86_64; 2787 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2788 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2789 xbb->ring_config.ring_pages * PAGE_SIZE); 2790 break; 2791 } 2792 default: 2793 panic("Unexpected blkif protocol ABI."); 2794 } 2795 2796 xbb->flags |= XBBF_RING_CONNECTED; 2797 2798 error = xen_intr_bind_remote_port(xbb->dev, 2799 xbb->otherend_id, 2800 xbb->ring_config.evtchn, 2801 xbb_filter, 2802 /*ithread_handler*/NULL, 2803 /*arg*/xbb, 2804 INTR_TYPE_BIO | INTR_MPSAFE, 2805 &xbb->xen_intr_handle); 2806 if (error) { 2807 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2808 return (error); 2809 } 2810 2811 DPRINTF("rings connected!\n"); 2812 2813 return 0; 2814 } 2815 2816 /** 2817 * Size KVA and pseudo-physical address allocations based on negotiated 2818 * values for the size and number of I/O requests, and the size of our 2819 * communication ring. 2820 * 2821 * \param xbb Per-instance xbb configuration structure. 2822 * 2823 * These address spaces are used to dynamically map pages in the 2824 * front-end's domain into our own. 2825 */ 2826 static int 2827 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2828 { 2829 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2830 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2831 xbb->kva_size = xbb->reqlist_kva_size + 2832 (xbb->ring_config.ring_pages * PAGE_SIZE); 2833 2834 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2835 if (xbb->kva_free == NULL) 2836 return (ENOMEM); 2837 2838 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2839 device_get_nameunit(xbb->dev), xbb->kva_size, 2840 xbb->reqlist_kva_size); 2841 /* 2842 * Reserve a range of pseudo physical memory that we can map 2843 * into kva. These pages will only be backed by machine 2844 * pages ("real memory") during the lifetime of front-end requests 2845 * via grant table operations. 
2846 */ 2847 xbb->pseudo_phys_res_id = 0; 2848 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 2849 xbb->kva_size); 2850 if (xbb->pseudo_phys_res == NULL) { 2851 xbb->kva = 0; 2852 return (ENOMEM); 2853 } 2854 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2855 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2856 2857 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 2858 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 2859 (uintmax_t)xbb->gnt_base_addr); 2860 return (0); 2861 } 2862 2863 /** 2864 * Collect front-end information from the XenStore. 2865 * 2866 * \param xbb Per-instance xbb configuration structure. 2867 */ 2868 static int 2869 xbb_collect_frontend_info(struct xbb_softc *xbb) 2870 { 2871 char protocol_abi[64]; 2872 const char *otherend_path; 2873 int error; 2874 u_int ring_idx; 2875 u_int ring_page_order; 2876 size_t ring_size; 2877 2878 otherend_path = xenbus_get_otherend_path(xbb->dev); 2879 2880 /* 2881 * Protocol defaults valid even if all negotiation fails. 2882 */ 2883 xbb->ring_config.ring_pages = 1; 2884 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2885 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 2886 2887 /* 2888 * Mandatory data (used in all versions of the protocol) first. 2889 */ 2890 error = xs_scanf(XST_NIL, otherend_path, 2891 "event-channel", NULL, "%" PRIu32, 2892 &xbb->ring_config.evtchn); 2893 if (error != 0) { 2894 xenbus_dev_fatal(xbb->dev, error, 2895 "Unable to retrieve event-channel information " 2896 "from frontend %s. Unable to connect.", 2897 xenbus_get_otherend_path(xbb->dev)); 2898 return (error); 2899 } 2900 2901 /* 2902 * These fields are initialized to legacy protocol defaults 2903 * so we only need to fail if reading the updated value succeeds 2904 * and the new value is outside of its allowed range. 2905 * 2906 * \note xs_gather() returns on the first encountered error, so 2907 * we must use independent calls in order to guarantee 2908 * we don't miss information in a sparsly populated front-end 2909 * tree. 2910 * 2911 * \note xs_scanf() does not update variables for unmatched 2912 * fields. 2913 */ 2914 ring_page_order = 0; 2915 xbb->max_requests = 32; 2916 2917 (void)xs_scanf(XST_NIL, otherend_path, 2918 "ring-page-order", NULL, "%u", 2919 &ring_page_order); 2920 xbb->ring_config.ring_pages = 1 << ring_page_order; 2921 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 2922 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 2923 2924 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 2925 xenbus_dev_fatal(xbb->dev, EINVAL, 2926 "Front-end specified ring-pages of %u " 2927 "exceeds backend limit of %u. " 2928 "Unable to connect.", 2929 xbb->ring_config.ring_pages, 2930 XBB_MAX_RING_PAGES); 2931 return (EINVAL); 2932 } 2933 2934 if (xbb->ring_config.ring_pages == 1) { 2935 error = xs_gather(XST_NIL, otherend_path, 2936 "ring-ref", "%" PRIu32, 2937 &xbb->ring_config.ring_ref[0], 2938 NULL); 2939 if (error != 0) { 2940 xenbus_dev_fatal(xbb->dev, error, 2941 "Unable to retrieve ring information " 2942 "from frontend %s. Unable to " 2943 "connect.", 2944 xenbus_get_otherend_path(xbb->dev)); 2945 return (error); 2946 } 2947 } else { 2948 /* Multi-page ring format. 
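		 * Each ring page has its own grant reference, published by
		 * the front-end as ring-ref0, ring-ref1, ... (e.g. a
		 * ring-page-order of 2 yields ring-ref0 through ring-ref3),
		 * so the references are gathered one XenStore node at a
		 * time below.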
*/ 2949 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 2950 ring_idx++) { 2951 char ring_ref_name[]= "ring_refXX"; 2952 2953 snprintf(ring_ref_name, sizeof(ring_ref_name), 2954 "ring-ref%u", ring_idx); 2955 error = xs_scanf(XST_NIL, otherend_path, 2956 ring_ref_name, NULL, "%" PRIu32, 2957 &xbb->ring_config.ring_ref[ring_idx]); 2958 if (error != 0) { 2959 xenbus_dev_fatal(xbb->dev, error, 2960 "Failed to retriev grant " 2961 "reference for page %u of " 2962 "shared ring. Unable " 2963 "to connect.", ring_idx); 2964 return (error); 2965 } 2966 } 2967 } 2968 2969 error = xs_gather(XST_NIL, otherend_path, 2970 "protocol", "%63s", protocol_abi, 2971 NULL); 2972 if (error != 0 2973 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 2974 /* 2975 * Assume native if the frontend has not 2976 * published ABI data or it has published and 2977 * matches our own ABI. 2978 */ 2979 xbb->abi = BLKIF_PROTOCOL_NATIVE; 2980 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 2981 xbb->abi = BLKIF_PROTOCOL_X86_32; 2982 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 2983 xbb->abi = BLKIF_PROTOCOL_X86_64; 2984 } else { 2985 xenbus_dev_fatal(xbb->dev, EINVAL, 2986 "Unknown protocol ABI (%s) published by " 2987 "frontend. Unable to connect.", protocol_abi); 2988 return (EINVAL); 2989 } 2990 return (0); 2991 } 2992 2993 /** 2994 * Allocate per-request data structures given request size and number 2995 * information negotiated with the front-end. 2996 * 2997 * \param xbb Per-instance xbb configuration structure. 2998 */ 2999 static int 3000 xbb_alloc_requests(struct xbb_softc *xbb) 3001 { 3002 struct xbb_xen_req *req; 3003 struct xbb_xen_req *last_req; 3004 3005 /* 3006 * Allocate request book keeping datastructures. 3007 */ 3008 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3009 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3010 if (xbb->requests == NULL) { 3011 xenbus_dev_fatal(xbb->dev, ENOMEM, 3012 "Unable to allocate request structures"); 3013 return (ENOMEM); 3014 } 3015 3016 req = xbb->requests; 3017 last_req = &xbb->requests[xbb->max_requests - 1]; 3018 STAILQ_INIT(&xbb->request_free_stailq); 3019 while (req <= last_req) { 3020 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3021 req++; 3022 } 3023 return (0); 3024 } 3025 3026 static int 3027 xbb_alloc_request_lists(struct xbb_softc *xbb) 3028 { 3029 struct xbb_xen_reqlist *reqlist; 3030 int i; 3031 3032 /* 3033 * If no requests can be merged, we need 1 request list per 3034 * in flight request. 
3035 */ 3036 xbb->request_lists = malloc(xbb->max_requests * 3037 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3038 if (xbb->request_lists == NULL) { 3039 xenbus_dev_fatal(xbb->dev, ENOMEM, 3040 "Unable to allocate request list structures"); 3041 return (ENOMEM); 3042 } 3043 3044 STAILQ_INIT(&xbb->reqlist_free_stailq); 3045 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3046 for (i = 0; i < xbb->max_requests; i++) { 3047 int seg; 3048 3049 reqlist = &xbb->request_lists[i]; 3050 3051 reqlist->xbb = xbb; 3052 3053 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3054 sizeof(*reqlist->gnt_handles), 3055 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3056 if (reqlist->gnt_handles == NULL) { 3057 xenbus_dev_fatal(xbb->dev, ENOMEM, 3058 "Unable to allocate request " 3059 "grant references"); 3060 return (ENOMEM); 3061 } 3062 3063 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3064 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3065 3066 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3067 } 3068 return (0); 3069 } 3070 3071 /** 3072 * Supply information about the physical device to the frontend 3073 * via XenBus. 3074 * 3075 * \param xbb Per-instance xbb configuration structure. 3076 */ 3077 static int 3078 xbb_publish_backend_info(struct xbb_softc *xbb) 3079 { 3080 struct xs_transaction xst; 3081 const char *our_path; 3082 const char *leaf; 3083 int error; 3084 3085 our_path = xenbus_get_node(xbb->dev); 3086 while (1) { 3087 error = xs_transaction_start(&xst); 3088 if (error != 0) { 3089 xenbus_dev_fatal(xbb->dev, error, 3090 "Error publishing backend info " 3091 "(start transaction)"); 3092 return (error); 3093 } 3094 3095 leaf = "sectors"; 3096 error = xs_printf(xst, our_path, leaf, 3097 "%"PRIu64, xbb->media_num_sectors); 3098 if (error != 0) 3099 break; 3100 3101 /* XXX Support all VBD attributes here. */ 3102 leaf = "info"; 3103 error = xs_printf(xst, our_path, leaf, "%u", 3104 xbb->flags & XBBF_READ_ONLY 3105 ? VDISK_READONLY : 0); 3106 if (error != 0) 3107 break; 3108 3109 leaf = "sector-size"; 3110 error = xs_printf(xst, our_path, leaf, "%u", 3111 xbb->sector_size); 3112 if (error != 0) 3113 break; 3114 3115 error = xs_transaction_end(xst, 0); 3116 if (error == 0) { 3117 return (0); 3118 } else if (error != EAGAIN) { 3119 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3120 return (error); 3121 } 3122 } 3123 3124 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3125 our_path, leaf); 3126 xs_transaction_end(xst, 1); 3127 return (error); 3128 } 3129 3130 /** 3131 * Connect to our blkfront peer now that it has completed publishing 3132 * its configuration into the XenStore. 3133 * 3134 * \param xbb Per-instance xbb configuration structure. 3135 */ 3136 static void 3137 xbb_connect(struct xbb_softc *xbb) 3138 { 3139 int error; 3140 3141 if (!xbb->hotplug_done || 3142 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3143 (xbb_collect_frontend_info(xbb) != 0)) 3144 return; 3145 3146 xbb->flags &= ~XBBF_SHUTDOWN; 3147 3148 /* 3149 * We limit the maximum number of reqlist segments to the maximum 3150 * number of segments in the ring, or our absolute maximum, 3151 * whichever is smaller. 3152 */ 3153 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3154 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3155 3156 /* 3157 * The maximum size is simply a function of the number of segments 3158 * we can handle. 
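	 * For example, assuming 4KiB pages and the usual
	 * BLKIF_MAX_SEGMENTS_PER_REQUEST of 11,
	 * XBB_MAX_SEGMENTS_PER_REQLIST is also 11 and this works out to
	 * 11 * 4096 = 45056 bytes (44KiB) per request list.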
3159 */ 3160 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3161 3162 /* Allocate resources whose size depends on front-end configuration. */ 3163 error = xbb_alloc_communication_mem(xbb); 3164 if (error != 0) { 3165 xenbus_dev_fatal(xbb->dev, error, 3166 "Unable to allocate communication memory"); 3167 return; 3168 } 3169 3170 error = xbb_publish_backend_info(xbb); 3171 if (error != 0) { 3172 xenbus_dev_fatal(xbb->dev, error, 3173 "Unable to publish device information"); 3174 return; 3175 } 3176 3177 error = xbb_alloc_requests(xbb); 3178 if (error != 0) { 3179 /* Specific errors are reported by xbb_alloc_requests(). */ 3180 return; 3181 } 3182 3183 error = xbb_alloc_request_lists(xbb); 3184 if (error != 0) { 3185 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3186 return; 3187 } 3188 3189 /* 3190 * Connect communication channel. 3191 */ 3192 error = xbb_connect_ring(xbb); 3193 if (error != 0) { 3194 /* Specific errors are reported by xbb_connect_ring(). */ 3195 return; 3196 } 3197 3198 /* Ready for I/O. */ 3199 xenbus_set_state(xbb->dev, XenbusStateConnected); 3200 } 3201 3202 /*-------------------------- Device Teardown Support -------------------------*/ 3203 /** 3204 * Perform device shutdown functions. 3205 * 3206 * \param xbb Per-instance xbb configuration structure. 3207 * 3208 * Mark this instance as shutting down, wait for any active I/O on the 3209 * backend device/file to drain, disconnect from the front-end, and notify 3210 * any waiters (e.g. a thread invoking our detach method) that detach can 3211 * now proceed. 3212 */ 3213 static int 3214 xbb_shutdown(struct xbb_softc *xbb) 3215 { 3216 XenbusState frontState; 3217 int error; 3218 3219 DPRINTF("\n"); 3220 3221 /* 3222 * Due to the need to drop our mutex during some 3223 * xenbus operations, it is possible for two threads 3224 * to attempt to close out shutdown processing at 3225 * the same time. Tell the caller that hits this 3226 * race to try back later. 3227 */ 3228 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3229 return (EAGAIN); 3230 3231 xbb->flags |= XBBF_IN_SHUTDOWN; 3232 mtx_unlock(&xbb->lock); 3233 3234 if (xbb->hotplug_watch.node != NULL) { 3235 xs_unregister_watch(&xbb->hotplug_watch); 3236 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3237 xbb->hotplug_watch.node = NULL; 3238 } 3239 3240 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3241 xenbus_set_state(xbb->dev, XenbusStateClosing); 3242 3243 frontState = xenbus_get_otherend_state(xbb->dev); 3244 mtx_lock(&xbb->lock); 3245 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3246 3247 /* Wait for the frontend to disconnect (if it's connected). */ 3248 if (frontState == XenbusStateConnected) 3249 return (EAGAIN); 3250 3251 DPRINTF("\n"); 3252 3253 /* Indicate shutdown is in progress. */ 3254 xbb->flags |= XBBF_SHUTDOWN; 3255 3256 /* Disconnect from the front-end. */ 3257 error = xbb_disconnect(xbb); 3258 if (error != 0) { 3259 /* 3260 * Requests still outstanding. We'll be called again 3261 * once they complete. 3262 */ 3263 KASSERT(error == EAGAIN, 3264 ("%s: Unexpected xbb_disconnect() failure %d", 3265 __func__, error)); 3266 3267 return (error); 3268 } 3269 3270 DPRINTF("\n"); 3271 3272 /* Indicate to xbb_detach() that is it safe to proceed. */ 3273 wakeup(xbb); 3274 3275 return (0); 3276 } 3277 3278 /** 3279 * Report an attach time error to the console and Xen, and cleanup 3280 * this instance by forcing immediate detach processing. 3281 * 3282 * \param xbb Per-instance xbb configuration structure. 
3283 * \param err Errno describing the error. 3284 * \param fmt Printf style format and arguments. 3285 */ 3286 static void 3287 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3288 { 3289 va_list ap; 3290 va_list ap_hotplug; 3291 3292 va_start(ap, fmt); 3293 va_copy(ap_hotplug, ap); 3294 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3295 "hotplug-error", fmt, ap_hotplug); 3296 va_end(ap_hotplug); 3297 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3298 "hotplug-status", "error"); 3299 3300 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3301 va_end(ap); 3302 3303 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3304 "online", "0"); 3305 mtx_lock(&xbb->lock); 3306 xbb_shutdown(xbb); 3307 mtx_unlock(&xbb->lock); 3308 } 3309 3310 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3311 /** 3312 * Inspect a XenBus device and claim it if it is of the appropriate type. 3313 * 3314 * \param dev NewBus device object representing a candidate XenBus device. 3315 * 3316 * \return 0 for success, errno codes for failure. 3317 */ 3318 static int 3319 xbb_probe(device_t dev) 3320 { 3321 uint32_t regs[4]; 3322 3323 if (strcmp(xenbus_get_type(dev), "vbd")) 3324 return (ENXIO); 3325 3326 KASSERT(xen_cpuid_base != 0, ("Invalid base Xen CPUID leaf")); 3327 cpuid_count(xen_cpuid_base + 4, 0, regs); 3328 3329 /* Only attach if Xen creates IOMMU entries for grant mapped pages. */ 3330 if (!(regs[0] & XEN_HVM_CPUID_IOMMU_MAPPINGS)) { 3331 static bool warned; 3332 3333 if (!warned) { 3334 warned = true; 3335 printf( 3336 "xen-blkback disabled due to grant maps lacking IOMMU entries\n"); 3337 } 3338 return (ENXIO); 3339 } 3340 3341 device_set_desc(dev, "Backend Virtual Block Device"); 3342 device_quiet(dev); 3343 return (0); 3344 } 3345 3346 /** 3347 * Set up sysctl variables to control various Block Back parameters. 3348 * 3349 * \param xbb Xen Block Back softc.
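 *
 * \note These knobs and counters hang off the device's sysctl tree, so
 *       they typically appear as dev.xbbd.<unit>.* (for example,
 *       dev.xbbd.0.no_coalesce_reqs); the statistics are exported
 *       read/write, which also allows them to be reset from userland.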
3350 * 3351 */ 3352 static void 3353 xbb_setup_sysctl(struct xbb_softc *xbb) 3354 { 3355 struct sysctl_ctx_list *sysctl_ctx = NULL; 3356 struct sysctl_oid *sysctl_tree = NULL; 3357 3358 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3359 if (sysctl_ctx == NULL) 3360 return; 3361 3362 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3363 if (sysctl_tree == NULL) 3364 return; 3365 3366 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3367 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3368 "fake the flush command"); 3369 3370 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3371 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3372 "send a real flush for N flush requests"); 3373 3374 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3375 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3376 "Don't coalesce contiguous requests"); 3377 3378 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3379 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3380 "how many I/O requests we have received"); 3381 3382 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3383 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3384 "how many I/O requests have been completed"); 3385 3386 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3387 "reqs_queued_for_completion", CTLFLAG_RW, 3388 &xbb->reqs_queued_for_completion, 3389 "how many I/O requests queued but not yet pushed"); 3390 3391 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3392 "reqs_completed_with_error", CTLFLAG_RW, 3393 &xbb->reqs_completed_with_error, 3394 "how many I/O requests completed with error status"); 3395 3396 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3397 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3398 "how many I/O dispatches were forced"); 3399 3400 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3401 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3402 "how many I/O dispatches were normal"); 3403 3404 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3405 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3406 "total number of I/O dispatches"); 3407 3408 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3409 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3410 "how many times we have run out of KVA"); 3411 3412 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3413 "request_shortages", CTLFLAG_RW, 3414 &xbb->request_shortages, 3415 "how many times we have run out of requests"); 3416 3417 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3418 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3419 "maximum outstanding requests (negotiated)"); 3420 3421 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3422 "max_request_segments", CTLFLAG_RD, 3423 &xbb->max_request_segments, 0, 3424 "maximum number of pages per requests (negotiated)"); 3425 3426 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3427 "max_request_size", CTLFLAG_RD, 3428 &xbb->max_request_size, 0, 3429 "maximum size in bytes of a request (negotiated)"); 3430 3431 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3432 "ring_pages", CTLFLAG_RD, 3433 &xbb->ring_config.ring_pages, 0, 3434 "communication channel pages (negotiated)"); 3435 } 3436 3437 static void 3438 xbb_attach_disk(device_t dev) 3439 { 3440 struct xbb_softc *xbb; 3441 int 
error; 3442 3443 xbb = device_get_softc(dev); 3444 3445 KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); 3446 3447 /* Parse fopen style mode flags. */ 3448 if (strchr(xbb->dev_mode, 'w') == NULL) 3449 xbb->flags |= XBBF_READ_ONLY; 3450 3451 /* 3452 * Verify the physical device is present and can support 3453 * the desired I/O mode. 3454 */ 3455 error = xbb_open_backend(xbb); 3456 if (error != 0) { 3457 xbb_attach_failed(xbb, error, "Unable to open %s", 3458 xbb->dev_name); 3459 return; 3460 } 3461 3462 /* Use devstat(9) for recording statistics. */ 3463 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3464 xbb->sector_size, 3465 DEVSTAT_ALL_SUPPORTED, 3466 DEVSTAT_TYPE_DIRECT 3467 | DEVSTAT_TYPE_IF_OTHER, 3468 DEVSTAT_PRIORITY_OTHER); 3469 3470 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3471 xbb->sector_size, 3472 DEVSTAT_ALL_SUPPORTED, 3473 DEVSTAT_TYPE_DIRECT 3474 | DEVSTAT_TYPE_IF_OTHER, 3475 DEVSTAT_PRIORITY_OTHER); 3476 /* 3477 * Setup sysctl variables. 3478 */ 3479 xbb_setup_sysctl(xbb); 3480 3481 /* 3482 * Create a taskqueue for doing work that must occur from a 3483 * thread context. 3484 */ 3485 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3486 M_NOWAIT, 3487 taskqueue_thread_enqueue, 3488 /*contxt*/&xbb->io_taskqueue); 3489 if (xbb->io_taskqueue == NULL) { 3490 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3491 return; 3492 } 3493 3494 taskqueue_start_threads(&xbb->io_taskqueue, 3495 /*num threads*/1, 3496 /*priority*/PWAIT, 3497 /*thread name*/ 3498 "%s taskq", device_get_nameunit(dev)); 3499 3500 /* Update hot-plug status to satisfy xend. */ 3501 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3502 "hotplug-status", "connected"); 3503 if (error) { 3504 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3505 xenbus_get_node(xbb->dev)); 3506 return; 3507 } 3508 3509 /* The front end might be waiting for the backend, attach if so. */ 3510 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3511 xbb_connect(xbb); 3512 } 3513 3514 static void 3515 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) 3516 { 3517 device_t dev; 3518 struct xbb_softc *xbb; 3519 int error; 3520 3521 dev = (device_t)watch->callback_data; 3522 xbb = device_get_softc(dev); 3523 3524 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3525 NULL, &xbb->dev_name, NULL); 3526 if (error != 0) 3527 return; 3528 3529 xs_unregister_watch(watch); 3530 free(watch->node, M_XENBLOCKBACK); 3531 watch->node = NULL; 3532 xbb->hotplug_done = true; 3533 3534 /* Collect physical device information. */ 3535 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", 3536 NULL, &xbb->dev_type, NULL); 3537 if (error != 0) 3538 xbb->dev_type = NULL; 3539 3540 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, 3541 &xbb->dev_mode, NULL); 3542 if (error != 0) { 3543 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3544 xenbus_get_node(dev)); 3545 return; 3546 } 3547 3548 xbb_attach_disk(dev); 3549 } 3550 3551 /** 3552 * Attach to a XenBus device that has been claimed by our probe routine. 3553 * 3554 * \param dev NewBus device object representing this Xen Block Back instance. 3555 * 3556 * \return 0 for success, errno codes for failure. 
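 *
 * \note Attach is a two stage process: this routine publishes the
 *       protocol features (feature-barrier, feature-flush-cache and
 *       max-ring-page-order) and, unless the hotplug script has already
 *       run, registers a watch on the physical-device-path node.  The
 *       remaining setup is completed asynchronously by xbb_attach_cb()
 *       and xbb_attach_disk() once the toolstack supplies the backing
 *       device information.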
3557 */ 3558 static int 3559 xbb_attach(device_t dev) 3560 { 3561 struct xbb_softc *xbb; 3562 int error; 3563 u_int max_ring_page_order; 3564 struct sbuf *watch_path; 3565 3566 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3567 3568 /* 3569 * Basic initialization. 3570 * After this block it is safe to call xbb_detach() 3571 * to clean up any allocated data for this instance. 3572 */ 3573 xbb = device_get_softc(dev); 3574 xbb->dev = dev; 3575 xbb->otherend_id = xenbus_get_otherend_id(dev); 3576 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3577 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3578 3579 /* 3580 * Publish protocol capabilities for consumption by the 3581 * front-end. 3582 */ 3583 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3584 "feature-barrier", "1"); 3585 if (error) { 3586 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3587 xenbus_get_node(xbb->dev)); 3588 return (error); 3589 } 3590 3591 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3592 "feature-flush-cache", "1"); 3593 if (error) { 3594 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3595 xenbus_get_node(xbb->dev)); 3596 return (error); 3597 } 3598 3599 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3600 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3601 "max-ring-page-order", "%u", max_ring_page_order); 3602 if (error) { 3603 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3604 xenbus_get_node(xbb->dev)); 3605 return (error); 3606 } 3607 3608 /* Tell the toolstack blkback has attached. */ 3609 xenbus_set_state(dev, XenbusStateInitWait); 3610 3611 if (xbb->hotplug_done) { 3612 xbb_attach_disk(dev); 3613 return (0); 3614 } 3615 3616 /* 3617 * We need to wait for hotplug script execution before 3618 * moving forward. 3619 */ 3620 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3621 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3622 xbb->hotplug_watch.callback = xbb_attach_cb; 3623 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3624 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3625 /* 3626 * We don't care about the path updated, just about the value changes 3627 * on that single node, hence there's no need to queue more that one 3628 * event. 3629 */ 3630 xbb->hotplug_watch.max_pending = 1; 3631 sbuf_delete(watch_path); 3632 error = xs_register_watch(&xbb->hotplug_watch); 3633 if (error != 0) { 3634 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3635 xbb->hotplug_watch.node); 3636 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3637 return (error); 3638 } 3639 3640 return (0); 3641 } 3642 3643 /** 3644 * Detach from a block back device instance. 3645 * 3646 * \param dev NewBus device object representing this Xen Block Back instance. 3647 * 3648 * \return 0 for success, errno codes for failure. 3649 * 3650 * \note A block back device may be detached at any time in its life-cycle, 3651 * including part way through the attach process. For this reason, 3652 * initialization order and the initialization state checks in this 3653 * routine must be carefully coupled so that attach time failures 3654 * are gracefully handled. 
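 *
 * \note Detach loops on xbb_shutdown() until it stops returning EAGAIN,
 *       sleeping on the softc in between; the wakeup() issued at the end
 *       of a successful shutdown is what allows the msleep() below to
 *       make progress.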
3655 */ 3656 static int 3657 xbb_detach(device_t dev) 3658 { 3659 struct xbb_softc *xbb; 3660 3661 DPRINTF("\n"); 3662 3663 xbb = device_get_softc(dev); 3664 mtx_lock(&xbb->lock); 3665 while (xbb_shutdown(xbb) == EAGAIN) { 3666 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3667 "xbb_shutdown", 0); 3668 } 3669 mtx_unlock(&xbb->lock); 3670 3671 DPRINTF("\n"); 3672 3673 if (xbb->io_taskqueue != NULL) 3674 taskqueue_free(xbb->io_taskqueue); 3675 3676 if (xbb->xbb_stats != NULL) 3677 devstat_remove_entry(xbb->xbb_stats); 3678 3679 if (xbb->xbb_stats_in != NULL) 3680 devstat_remove_entry(xbb->xbb_stats_in); 3681 3682 xbb_close_backend(xbb); 3683 3684 if (xbb->dev_mode != NULL) { 3685 free(xbb->dev_mode, M_XENSTORE); 3686 xbb->dev_mode = NULL; 3687 } 3688 3689 if (xbb->dev_type != NULL) { 3690 free(xbb->dev_type, M_XENSTORE); 3691 xbb->dev_type = NULL; 3692 } 3693 3694 if (xbb->dev_name != NULL) { 3695 free(xbb->dev_name, M_XENSTORE); 3696 xbb->dev_name = NULL; 3697 } 3698 3699 mtx_destroy(&xbb->lock); 3700 return (0); 3701 } 3702 3703 /** 3704 * Prepare this block back device for suspension of this VM. 3705 * 3706 * \param dev NewBus device object representing this Xen Block Back instance. 3707 * 3708 * \return 0 for success, errno codes for failure. 3709 */ 3710 static int 3711 xbb_suspend(device_t dev) 3712 { 3713 #ifdef NOT_YET 3714 struct xbb_softc *sc = device_get_softc(dev); 3715 3716 /* Prevent new requests being issued until we fix things up. */ 3717 mtx_lock(&sc->xb_io_lock); 3718 sc->connected = BLKIF_STATE_SUSPENDED; 3719 mtx_unlock(&sc->xb_io_lock); 3720 #endif 3721 3722 return (0); 3723 } 3724 3725 /** 3726 * Perform any processing required to recover from a suspended state. 3727 * 3728 * \param dev NewBus device object representing this Xen Block Back instance. 3729 * 3730 * \return 0 for success, errno codes for failure. 3731 */ 3732 static int 3733 xbb_resume(device_t dev) 3734 { 3735 return (0); 3736 } 3737 3738 /** 3739 * Handle state changes expressed via the XenStore by our front-end peer. 3740 * 3741 * \param dev NewBus device object representing this Xen 3742 * Block Back instance. 3743 * \param frontend_state The new state of the front-end. 3744 * 3745 * \return 0 for success, errno codes for failure. 
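 *
 * \note In brief: XenbusStateInitialising is ignored, Initialised and
 *       Connected trigger xbb_connect(), Closing and Closed trigger
 *       xbb_shutdown() (with Closed echoed back to the XenStore), and
 *       any unexpected state is reported via xenbus_dev_fatal().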
3746 */ 3747 static void 3748 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3749 { 3750 struct xbb_softc *xbb = device_get_softc(dev); 3751 3752 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3753 xenbus_strstate(frontend_state), 3754 xenbus_strstate(xenbus_get_state(xbb->dev))); 3755 3756 switch (frontend_state) { 3757 case XenbusStateInitialising: 3758 break; 3759 case XenbusStateInitialised: 3760 case XenbusStateConnected: 3761 xbb_connect(xbb); 3762 break; 3763 case XenbusStateClosing: 3764 case XenbusStateClosed: 3765 mtx_lock(&xbb->lock); 3766 xbb_shutdown(xbb); 3767 mtx_unlock(&xbb->lock); 3768 if (frontend_state == XenbusStateClosed) 3769 xenbus_set_state(xbb->dev, XenbusStateClosed); 3770 break; 3771 default: 3772 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3773 frontend_state); 3774 break; 3775 } 3776 } 3777 3778 /*---------------------------- NewBus Registration ---------------------------*/ 3779 static device_method_t xbb_methods[] = { 3780 /* Device interface */ 3781 DEVMETHOD(device_probe, xbb_probe), 3782 DEVMETHOD(device_attach, xbb_attach), 3783 DEVMETHOD(device_detach, xbb_detach), 3784 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3785 DEVMETHOD(device_suspend, xbb_suspend), 3786 DEVMETHOD(device_resume, xbb_resume), 3787 3788 /* Xenbus interface */ 3789 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3790 { 0, 0 } 3791 }; 3792 3793 static driver_t xbb_driver = { 3794 "xbbd", 3795 xbb_methods, 3796 sizeof(struct xbb_softc), 3797 }; 3798 3799 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0); 3800
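/*
 * Illustrative note for readers (not part of the driver proper): the
 * request consumption performed by xbb_run_queue() follows the standard
 * Xen shared-ring consumer pattern.  A minimal, hypothetical sketch of
 * that pattern, assuming a native-ABI back ring named "ring", a RING_IDX
 * "rp", a blkif_request_t pointer "req" and an int "more_to_do", and
 * eliding the multi-ABI translation and coalescing done above, is:
 *
 *	do {
 *		rp = ring.sring->req_prod;
 *		rmb();	// Ensure we see queued requests up to rp.
 *		while (ring.req_cons != rp) {
 *			req = RING_GET_REQUEST(&ring, ring.req_cons);
 *			ring.req_cons++;	// Consume before issuing I/O.
 *			// ... translate req into bios or a uio and dispatch ...
 *		}
 *		RING_FINAL_CHECK_FOR_REQUESTS(&ring, more_to_do);
 *	} while (more_to_do != 0);
 */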