1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 /** 39 * \file blkback.c 40 * 41 * \brief Device driver supporting the vending of block storage from 42 * a FreeBSD domain to other domains. 
43 */ 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 50 #include <sys/bio.h> 51 #include <sys/bus.h> 52 #include <sys/conf.h> 53 #include <sys/devicestat.h> 54 #include <sys/disk.h> 55 #include <sys/fcntl.h> 56 #include <sys/filedesc.h> 57 #include <sys/kdb.h> 58 #include <sys/module.h> 59 #include <sys/namei.h> 60 #include <sys/proc.h> 61 #include <sys/rman.h> 62 #include <sys/taskqueue.h> 63 #include <sys/types.h> 64 #include <sys/vnode.h> 65 #include <sys/mount.h> 66 #include <sys/sysctl.h> 67 #include <sys/bitstring.h> 68 #include <sys/sdt.h> 69 70 #include <geom/geom.h> 71 72 #include <machine/_inttypes.h> 73 74 #include <vm/vm.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_kern.h> 77 78 #include <xen/xen-os.h> 79 #include <xen/blkif.h> 80 #include <xen/gnttab.h> 81 #include <xen/xen_intr.h> 82 83 #include <contrib/xen/event_channel.h> 84 #include <contrib/xen/grant_table.h> 85 86 #include <xen/xenbus/xenbusvar.h> 87 88 /*--------------------------- Compile-time Tunables --------------------------*/ 89 /** 90 * The maximum number of shared memory ring pages we will allow in a 91 * negotiated block-front/back communication channel. Allow enough 92 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 93 */ 94 #define XBB_MAX_RING_PAGES 32 95 96 /** 97 * The maximum number of outstanding request blocks (request headers plus 98 * additional segment blocks) we will allow in a negotiated block-front/back 99 * communication channel. 100 */ 101 #define XBB_MAX_REQUESTS \ 102 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 103 104 /** 105 * \brief Define to enable rudimentary request logging to the console. 106 */ 107 #undef XBB_DEBUG 108 109 /*---------------------------------- Macros ----------------------------------*/ 110 /** 111 * Custom malloc type for all driver allocations. 
*/
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 * Use old default of MAXPHYS == 128K.
 */
#define XBB_MAX_REQUEST_SIZE					\
	MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		(XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of segments that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...) __attribute__((format(printf, 3, 4)));
static int xbb_shutdown(struct xbb_softc *xbb);

/*------------------------------ Data Structures -----------------------------*/

STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);

/**
 * Per request list state flags (stored in xbb_xen_reqlist::flags).
 */
typedef enum {
	XBB_REQLIST_NONE = 0x00,
	XBB_REQLIST_MAPPED = 0x01
} xbb_reqlist_flags;

/**
 * \brief Object tracking a list of xbb_xen_req structures (contig_req_list)
 *        together with the kva and grant handles used to map their data.
 */
struct xbb_xen_reqlist {
	/**
	 * Back reference to the parent block back instance for this
	 * request.  Used during bio_done handling.
	 */
	struct xbb_softc *xbb;

	/**
	 * BLKIF_OP code for this request.
	 */
	int operation;

	/**
	 * Set to BLKIF_RSP_* to indicate request status.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int status;

	/**
	 * Number of 512 byte sectors not transferred.
	 */
	int residual_512b_sectors;

	/**
	 * Starting sector number of the first request in the list.
	 */
	off_t starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int num_children;

	/**
	 * Number of I/O requests still pending on the backend.
	 */
	int pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
	 */
	uint8_t *kva;

	/**
	 * Base, pseudo-physical address, corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t gnt_base;

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t *gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of
 *        this driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t va;

	/** The pseudo-physical address where ring memory is mapped. */
	uint64_t gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE = 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK = 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE = 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};

/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access. */
	struct cdev *cdev;

	/** Cdev switch used for device backend access. */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
};

/**
 * Collection of backend type specific data.
 */
union xbb_backend_data {
	struct xbb_dev_data dev;
	struct xbb_file_data file;
};

/**
 * Function signature of backend specific I/O handlers.
 */
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
			      struct xbb_xen_reqlist *reqlist, int operation,
			      int flags);

/**
 * Per-instance configuration data.
 */
struct xbb_softc {
	/**
	 * Task-queue used to process I/O requests.
	 */
	struct taskqueue *io_taskqueue;

	/**
	 * Single "run the request queue" task enqueued
	 * on io_taskqueue.
	 */
	struct task io_task;

	/** Device type for this instance. */
	xbb_type device_type;

	/** NewBus device corresponding to this instance. */
	device_t dev;

	/** Backend specific dispatch routine for this instance. */
	xbb_dispatch_t dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t gnt_base_addr;

	/** The size of the global kva pool. */
	int kva_size;

	/** The size of the KVA area used for request lists. */
	int reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int reqlist_kva_pages;

	/** Bitmap of free KVA pages (a set bit marks a page in use). */
	bitstr_t *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used at least once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *        that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode *vn;

	/** Backend type specific (cdev or file) configuration data. */
	union xbb_backend_data backend;

	/** The native sector size of the backend. */
	u_int sector_size;

	/** log2 of sector_size. */
	u_int sector_size_shift;

	/** Size in bytes of the backend device or file. */
	off_t media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *        sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *        conversion of blkif ring requests to internal xbb_xen_req
	 *        structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io().  When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx lock;

	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int pseudo_phys_res_id;

	/**
	 * I/O statistics from BlockBack dispatch down.  These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack.  These are the requests as
	 * we get them from BlockFront.  They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat *xbb_stats_in;

	/** Disable sending flush to the backend. */
	int disable_flush;

	/** Send a real flush for every N flush requests. */
	int flush_interval;

	/** Count of flush requests in the interval. */
	int flush_count;

	/** Don't coalesce requests if this is set. */
	int no_coalesce_reqs;

	/** Number of requests we have received. */
	uint64_t reqs_received;

	/** Number of requests we have completed. */
	uint64_t reqs_completed;

	/** Number of requests we queued but not pushed. */
	uint64_t reqs_queued_for_completion;

	/** Number of requests we completed with an error status. */
	uint64_t reqs_completed_with_error;

	/** How many forced dispatches (i.e. without coalescing) have happened. */
	uint64_t forced_dispatch;

	/** How many normal dispatches have happened. */
	uint64_t normal_dispatch;

	/** How many total dispatches have happened. */
	uint64_t total_dispatch;

	/** How many times we have run out of KVA. */
	uint64_t kva_shortages;

	/** How many times we have run out of request structures. */
	uint64_t request_shortages;

	/** Watch to wait for hotplug script execution. */
	struct xs_watch hotplug_watch;

	/** Got the needed data from hotplug scripts? */
	bool hotplug_done;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_req structure.
 *          Otherwise NULL.
768 */ 769 static inline struct xbb_xen_req * 770 xbb_get_req(struct xbb_softc *xbb) 771 { 772 struct xbb_xen_req *req; 773 774 req = NULL; 775 776 mtx_assert(&xbb->lock, MA_OWNED); 777 778 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 779 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 780 xbb->active_request_count++; 781 } 782 783 return (req); 784 } 785 786 /** 787 * Return an allocated transaction tracking structure to the free pool. 788 * 789 * \param xbb Per-instance xbb configuration structure. 790 * \param req The request structure to free. 791 */ 792 static inline void 793 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 794 { 795 mtx_assert(&xbb->lock, MA_OWNED); 796 797 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 798 xbb->active_request_count--; 799 800 KASSERT(xbb->active_request_count >= 0, 801 ("xbb_release_req: negative active count")); 802 } 803 804 /** 805 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 806 * 807 * \param xbb Per-instance xbb configuration structure. 808 * \param req_list The list of requests to free. 809 * \param nreqs The number of items in the list. 810 */ 811 static inline void 812 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 813 int nreqs) 814 { 815 mtx_assert(&xbb->lock, MA_OWNED); 816 817 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 818 xbb->active_request_count -= nreqs; 819 820 KASSERT(xbb->active_request_count >= 0, 821 ("xbb_release_reqs: negative active count")); 822 } 823 824 /** 825 * Given a page index and 512b sector offset within that page, 826 * calculate an offset into a request's kva region. 827 * 828 * \param reqlist The request structure whose kva region will be accessed. 829 * \param pagenr The page index used to compute the kva offset. 830 * \param sector The 512b sector index used to compute the page relative 831 * kva offset. 832 * 833 * \return The computed global KVA offset. 
834 */ 835 static inline uint8_t * 836 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 837 { 838 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 839 } 840 841 /** 842 * Given a page number and 512b sector offset within that page, 843 * calculate an offset into the request's memory region that the 844 * underlying backend device/file should use for I/O. 845 * 846 * \param reqlist The request structure whose I/O region will be accessed. 847 * \param pagenr The page index used to compute the I/O offset. 848 * \param sector The 512b sector index used to compute the page relative 849 * I/O offset. 850 * 851 * \return The computed global I/O address. 852 * 853 * Depending on configuration, this will either be a local bounce buffer 854 * or a pointer to the memory mapped in from the front-end domain for 855 * this request. 856 */ 857 static inline uint8_t * 858 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 859 { 860 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 861 } 862 863 /** 864 * Given a page index and 512b sector offset within that page, calculate 865 * an offset into the local pseudo-physical address space used to map a 866 * front-end's request data into a request. 867 * 868 * \param reqlist The request list structure whose pseudo-physical region 869 * will be accessed. 870 * \param pagenr The page index used to compute the pseudo-physical offset. 871 * \param sector The 512b sector index used to compute the page relative 872 * pseudo-physical offset. 873 * 874 * \return The computed global pseudo-phsyical address. 875 * 876 * Depending on configuration, this will either be a local bounce buffer 877 * or a pointer to the memory mapped in from the front-end domain for 878 * this request. 
879 */ 880 static inline uintptr_t 881 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 882 { 883 struct xbb_softc *xbb; 884 885 xbb = reqlist->xbb; 886 887 return ((uintptr_t)(xbb->gnt_base_addr + 888 (uintptr_t)(reqlist->kva - xbb->kva) + 889 (PAGE_SIZE * pagenr) + (sector << 9))); 890 } 891 892 /** 893 * Get Kernel Virtual Address space for mapping requests. 894 * 895 * \param xbb Per-instance xbb configuration structure. 896 * \param nr_pages Number of pages needed. 897 * \param check_only If set, check for free KVA but don't allocate it. 898 * \param have_lock If set, xbb lock is already held. 899 * 900 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 901 * 902 * Note: This should be unnecessary once we have either chaining or 903 * scatter/gather support for struct bio. At that point we'll be able to 904 * put multiple addresses and lengths in one bio/bio chain and won't need 905 * to map everything into one virtual segment. 906 */ 907 static uint8_t * 908 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 909 { 910 int first_clear; 911 int num_clear; 912 uint8_t *free_kva; 913 int i; 914 915 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 916 917 first_clear = 0; 918 free_kva = NULL; 919 920 mtx_lock(&xbb->lock); 921 922 /* 923 * Look for the first available page. If there are none, we're done. 924 */ 925 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 926 927 if (first_clear == -1) 928 goto bailout; 929 930 /* 931 * Starting at the first available page, look for consecutive free 932 * pages that will satisfy the user's request. 933 */ 934 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 935 /* 936 * If this is true, the page is used, so we have to reset 937 * the number of clear pages and the first clear page 938 * (since it pointed to a region with an insufficient number 939 * of clear pages). 
940 */ 941 if (bit_test(xbb->kva_free, i)) { 942 num_clear = 0; 943 first_clear = -1; 944 continue; 945 } 946 947 if (first_clear == -1) 948 first_clear = i; 949 950 /* 951 * If this is true, we've found a large enough free region 952 * to satisfy the request. 953 */ 954 if (++num_clear == nr_pages) { 955 bit_nset(xbb->kva_free, first_clear, 956 first_clear + nr_pages - 1); 957 958 free_kva = xbb->kva + 959 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 960 961 KASSERT(free_kva >= (uint8_t *)xbb->kva && 962 free_kva + (nr_pages * PAGE_SIZE) <= 963 (uint8_t *)xbb->ring_config.va, 964 ("Free KVA %p len %d out of range, " 965 "kva = %#jx, ring VA = %#jx\n", free_kva, 966 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 967 (uintmax_t)xbb->ring_config.va)); 968 break; 969 } 970 } 971 972 bailout: 973 974 if (free_kva == NULL) { 975 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 976 xbb->kva_shortages++; 977 } 978 979 mtx_unlock(&xbb->lock); 980 981 return (free_kva); 982 } 983 984 /** 985 * Free allocated KVA. 986 * 987 * \param xbb Per-instance xbb configuration structure. 988 * \param kva_ptr Pointer to allocated KVA region. 989 * \param nr_pages Number of pages in the KVA region. 990 */ 991 static void 992 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 993 { 994 intptr_t start_page; 995 996 mtx_assert(&xbb->lock, MA_OWNED); 997 998 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 999 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1000 1001 } 1002 1003 /** 1004 * Unmap the front-end pages associated with this I/O request. 1005 * 1006 * \param req The request structure to unmap. 
1007 */ 1008 static void 1009 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1010 { 1011 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1012 u_int i; 1013 u_int invcount; 1014 int error __diagused; 1015 1016 invcount = 0; 1017 for (i = 0; i < reqlist->nr_segments; i++) { 1018 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1019 continue; 1020 1021 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1022 unmap[invcount].dev_bus_addr = 0; 1023 unmap[invcount].handle = reqlist->gnt_handles[i]; 1024 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1025 invcount++; 1026 } 1027 1028 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1029 unmap, invcount); 1030 KASSERT(error == 0, ("Grant table operation failed")); 1031 } 1032 1033 /** 1034 * Allocate an internal transaction tracking structure from the free pool. 1035 * 1036 * \param xbb Per-instance xbb configuration structure. 1037 * 1038 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1039 * Otherwise NULL. 1040 */ 1041 static inline struct xbb_xen_reqlist * 1042 xbb_get_reqlist(struct xbb_softc *xbb) 1043 { 1044 struct xbb_xen_reqlist *reqlist; 1045 1046 reqlist = NULL; 1047 1048 mtx_assert(&xbb->lock, MA_OWNED); 1049 1050 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1051 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1052 reqlist->flags = XBB_REQLIST_NONE; 1053 reqlist->kva = NULL; 1054 reqlist->status = BLKIF_RSP_OKAY; 1055 reqlist->residual_512b_sectors = 0; 1056 reqlist->num_children = 0; 1057 reqlist->nr_segments = 0; 1058 STAILQ_INIT(&reqlist->contig_req_list); 1059 } 1060 1061 return (reqlist); 1062 } 1063 1064 /** 1065 * Return an allocated transaction tracking structure to the free pool. 1066 * 1067 * \param xbb Per-instance xbb configuration structure. 1068 * \param req The request list structure to free. 
1069 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1070 * during a resource shortage condition. 1071 */ 1072 static inline void 1073 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1074 int wakeup) 1075 { 1076 1077 mtx_assert(&xbb->lock, MA_OWNED); 1078 1079 if (wakeup) { 1080 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1081 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1082 } 1083 1084 if (reqlist->kva != NULL) 1085 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1086 1087 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1088 1089 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1090 1091 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1092 /* 1093 * Shutdown is in progress. See if we can 1094 * progress further now that one more request 1095 * has completed and been returned to the 1096 * free pool. 1097 */ 1098 xbb_shutdown(xbb); 1099 } 1100 1101 if (wakeup != 0) 1102 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1103 } 1104 1105 /** 1106 * Request resources and do basic request setup. 1107 * 1108 * \param xbb Per-instance xbb configuration structure. 1109 * \param reqlist Pointer to reqlist pointer. 1110 * \param ring_req Pointer to a block ring request. 1111 * \param ring_index The ring index of this request. 1112 * 1113 * \return 0 for success, non-zero for failure. 1114 */ 1115 static int 1116 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1117 blkif_request_t *ring_req, RING_IDX ring_idx) 1118 { 1119 struct xbb_xen_reqlist *nreqlist; 1120 struct xbb_xen_req *nreq; 1121 1122 nreqlist = NULL; 1123 nreq = NULL; 1124 1125 mtx_lock(&xbb->lock); 1126 1127 /* 1128 * We don't allow new resources to be allocated if we're in the 1129 * process of shutting down. 1130 */ 1131 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1132 mtx_unlock(&xbb->lock); 1133 return (1); 1134 } 1135 1136 /* 1137 * Allocate a reqlist if the caller doesn't have one already. 
1138 */ 1139 if (*reqlist == NULL) { 1140 nreqlist = xbb_get_reqlist(xbb); 1141 if (nreqlist == NULL) 1142 goto bailout_error; 1143 } 1144 1145 /* We always allocate a request. */ 1146 nreq = xbb_get_req(xbb); 1147 if (nreq == NULL) 1148 goto bailout_error; 1149 1150 mtx_unlock(&xbb->lock); 1151 1152 if (*reqlist == NULL) { 1153 *reqlist = nreqlist; 1154 nreqlist->operation = ring_req->operation; 1155 nreqlist->starting_sector_number = ring_req->sector_number; 1156 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1157 links); 1158 } 1159 1160 nreq->reqlist = *reqlist; 1161 nreq->req_ring_idx = ring_idx; 1162 nreq->id = ring_req->id; 1163 nreq->operation = ring_req->operation; 1164 1165 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1166 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1167 nreq->ring_req = &nreq->ring_req_storage; 1168 } else { 1169 nreq->ring_req = ring_req; 1170 } 1171 1172 binuptime(&nreq->ds_t0); 1173 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1174 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1175 (*reqlist)->num_children++; 1176 (*reqlist)->nr_segments += ring_req->nr_segments; 1177 1178 return (0); 1179 1180 bailout_error: 1181 1182 /* 1183 * We're out of resources, so set the shortage flag. The next time 1184 * a request is released, we'll try waking up the work thread to 1185 * see if we can allocate more resources. 1186 */ 1187 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1188 xbb->request_shortages++; 1189 1190 if (nreq != NULL) 1191 xbb_release_req(xbb, nreq); 1192 1193 if (nreqlist != NULL) 1194 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1195 1196 mtx_unlock(&xbb->lock); 1197 1198 return (1); 1199 } 1200 1201 /** 1202 * Create and queue a response to a blkif request. 1203 * 1204 * \param xbb Per-instance xbb configuration structure. 1205 * \param req The request structure to which to respond. 1206 * \param status The status code to report. 
See BLKIF_RSP_* 1207 * in sys/contrib/xen/io/blkif.h. 1208 */ 1209 static void 1210 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1211 { 1212 blkif_response_t *resp; 1213 1214 /* 1215 * The mutex is required here, and should be held across this call 1216 * until after the subsequent call to xbb_push_responses(). This 1217 * is to guarantee that another context won't queue responses and 1218 * push them while we're active. 1219 * 1220 * That could lead to the other end being notified of responses 1221 * before the resources have been freed on this end. The other end 1222 * would then be able to queue additional I/O, and we may run out 1223 * of resources because we haven't freed them all yet. 1224 */ 1225 mtx_assert(&xbb->lock, MA_OWNED); 1226 1227 /* 1228 * Place on the response ring for the relevant domain. 1229 * For now, only the spacing between entries is different 1230 * in the different ABIs, not the response entry layout. 1231 */ 1232 switch (xbb->abi) { 1233 case BLKIF_PROTOCOL_NATIVE: 1234 resp = RING_GET_RESPONSE(&xbb->rings.native, 1235 xbb->rings.native.rsp_prod_pvt); 1236 break; 1237 case BLKIF_PROTOCOL_X86_32: 1238 resp = (blkif_response_t *) 1239 RING_GET_RESPONSE(&xbb->rings.x86_32, 1240 xbb->rings.x86_32.rsp_prod_pvt); 1241 break; 1242 case BLKIF_PROTOCOL_X86_64: 1243 resp = (blkif_response_t *) 1244 RING_GET_RESPONSE(&xbb->rings.x86_64, 1245 xbb->rings.x86_64.rsp_prod_pvt); 1246 break; 1247 default: 1248 panic("Unexpected blkif protocol ABI."); 1249 } 1250 1251 resp->id = req->id; 1252 resp->operation = req->operation; 1253 resp->status = status; 1254 1255 if (status != BLKIF_RSP_OKAY) 1256 xbb->reqs_completed_with_error++; 1257 1258 xbb->rings.common.rsp_prod_pvt++; 1259 1260 xbb->reqs_queued_for_completion++; 1261 1262 } 1263 1264 /** 1265 * Send queued responses to blkif requests. 1266 * 1267 * \param xbb Per-instance xbb configuration structure. 
1268 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1269 * should be run, 0 if it does not need to be run. 1270 * \param notify Flag that is set to 1 if the other end should be 1271 * notified via irq, 0 if the other end should not be 1272 * notified. 1273 */ 1274 static void 1275 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1276 { 1277 int more_to_do; 1278 1279 /* 1280 * The mutex is required here. 1281 */ 1282 mtx_assert(&xbb->lock, MA_OWNED); 1283 1284 more_to_do = 0; 1285 1286 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1287 1288 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1289 /* 1290 * Tail check for pending requests. Allows frontend to avoid 1291 * notifications if requests are already in flight (lower 1292 * overheads and promotes batching). 1293 */ 1294 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1295 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1296 more_to_do = 1; 1297 } 1298 1299 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1300 xbb->reqs_queued_for_completion = 0; 1301 1302 *run_taskqueue = more_to_do; 1303 } 1304 1305 /** 1306 * Complete a request list. 1307 * 1308 * \param xbb Per-instance xbb configuration structure. 1309 * \param reqlist Allocated internal request list structure. 1310 */ 1311 static void 1312 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1313 { 1314 struct xbb_xen_req *nreq; 1315 off_t sectors_sent; 1316 int notify, run_taskqueue; 1317 1318 sectors_sent = 0; 1319 1320 if (reqlist->flags & XBB_REQLIST_MAPPED) 1321 xbb_unmap_reqlist(reqlist); 1322 1323 mtx_lock(&xbb->lock); 1324 1325 /* 1326 * All I/O is done, send the response. A lock is not necessary 1327 * to protect the request list, because all requests have 1328 * completed. Therefore this is the only context accessing this 1329 * reqlist right now. 
However, in order to make sure that no one 1330 * else queues responses onto the queue or pushes them to the other 1331 * side while we're active, we need to hold the lock across the 1332 * calls to xbb_queue_response() and xbb_push_responses(). 1333 */ 1334 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1335 off_t cur_sectors_sent; 1336 1337 /* Put this response on the ring, but don't push yet */ 1338 xbb_queue_response(xbb, nreq, reqlist->status); 1339 1340 /* We don't report bytes sent if there is an error. */ 1341 if (reqlist->status == BLKIF_RSP_OKAY) 1342 cur_sectors_sent = nreq->nr_512b_sectors; 1343 else 1344 cur_sectors_sent = 0; 1345 1346 sectors_sent += cur_sectors_sent; 1347 1348 devstat_end_transaction(xbb->xbb_stats_in, 1349 /*bytes*/cur_sectors_sent << 9, 1350 reqlist->ds_tag_type, 1351 reqlist->ds_trans_type, 1352 /*now*/NULL, 1353 /*then*/&nreq->ds_t0); 1354 } 1355 1356 /* 1357 * Take out any sectors not sent. If we wind up negative (which 1358 * might happen if an error is reported as well as a residual), just 1359 * report 0 sectors sent. 1360 */ 1361 sectors_sent -= reqlist->residual_512b_sectors; 1362 if (sectors_sent < 0) 1363 sectors_sent = 0; 1364 1365 devstat_end_transaction(xbb->xbb_stats, 1366 /*bytes*/ sectors_sent << 9, 1367 reqlist->ds_tag_type, 1368 reqlist->ds_trans_type, 1369 /*now*/NULL, 1370 /*then*/&reqlist->ds_t0); 1371 1372 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1373 1374 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1375 1376 mtx_unlock(&xbb->lock); 1377 1378 if (run_taskqueue) 1379 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1380 1381 if (notify) 1382 xen_intr_signal(xbb->xen_intr_handle); 1383 } 1384 1385 /** 1386 * Completion handler for buffer I/O requests issued by the device 1387 * backend driver. 1388 * 1389 * \param bio The buffer I/O request on which to perform completion 1390 * processing. 
1391 */ 1392 static void 1393 xbb_bio_done(struct bio *bio) 1394 { 1395 struct xbb_softc *xbb; 1396 struct xbb_xen_reqlist *reqlist; 1397 1398 reqlist = bio->bio_caller1; 1399 xbb = reqlist->xbb; 1400 1401 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1402 1403 /* 1404 * This is a bit imprecise. With aggregated I/O a single 1405 * request list can contain multiple front-end requests and 1406 * a multiple bios may point to a single request. By carefully 1407 * walking the request list, we could map residuals and errors 1408 * back to the original front-end request, but the interface 1409 * isn't sufficiently rich for us to properly report the error. 1410 * So, we just treat the entire request list as having failed if an 1411 * error occurs on any part. And, if an error occurs, we treat 1412 * the amount of data transferred as 0. 1413 * 1414 * For residuals, we report it on the overall aggregated device, 1415 * but not on the individual requests, since we don't currently 1416 * do the work to determine which front-end request to which the 1417 * residual applies. 1418 */ 1419 if (bio->bio_error) { 1420 DPRINTF("BIO returned error %d for operation on device %s\n", 1421 bio->bio_error, xbb->dev_name); 1422 reqlist->status = BLKIF_RSP_ERROR; 1423 1424 if (bio->bio_error == ENXIO 1425 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1426 /* 1427 * Backend device has disappeared. Signal the 1428 * front-end that we (the device proxy) want to 1429 * go away. 1430 */ 1431 xenbus_set_state(xbb->dev, XenbusStateClosing); 1432 } 1433 } 1434 1435 /* 1436 * Decrement the pending count for the request list. When we're 1437 * done with the requests, send status back for all of them. 1438 */ 1439 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1440 xbb_complete_reqlist(xbb, reqlist); 1441 1442 g_destroy_bio(bio); 1443 } 1444 1445 /** 1446 * Parse a blkif request into an internal request structure and send 1447 * it to the backend for processing. 
1448 * 1449 * \param xbb Per-instance xbb configuration structure. 1450 * \param reqlist Allocated internal request list structure. 1451 * 1452 * \return On success, 0. For resource shortages, non-zero. 1453 * 1454 * This routine performs the backend common aspects of request parsing 1455 * including compiling an internal request structure, parsing the S/G 1456 * list and any secondary ring requests in which they may reside, and 1457 * the mapping of front-end I/O pages into our domain. 1458 */ 1459 static int 1460 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1461 { 1462 struct xbb_sg *xbb_sg; 1463 struct gnttab_map_grant_ref *map; 1464 struct blkif_request_segment *sg; 1465 struct blkif_request_segment *last_block_sg; 1466 struct xbb_xen_req *nreq; 1467 u_int nseg; 1468 u_int seg_idx; 1469 u_int block_segs; 1470 int nr_sects; 1471 int total_sects; 1472 int operation; 1473 uint8_t bio_flags; 1474 int error; 1475 1476 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1477 bio_flags = 0; 1478 total_sects = 0; 1479 nr_sects = 0; 1480 1481 /* 1482 * First determine whether we have enough free KVA to satisfy this 1483 * request list. If not, tell xbb_run_queue() so it can go to 1484 * sleep until we have more KVA. 1485 */ 1486 reqlist->kva = NULL; 1487 if (reqlist->nr_segments != 0) { 1488 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1489 if (reqlist->kva == NULL) { 1490 /* 1491 * If we're out of KVA, return ENOMEM. 
1492 */ 1493 return (ENOMEM); 1494 } 1495 } 1496 1497 binuptime(&reqlist->ds_t0); 1498 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1499 1500 switch (reqlist->operation) { 1501 case BLKIF_OP_WRITE_BARRIER: 1502 bio_flags |= BIO_ORDERED; 1503 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1504 /* FALLTHROUGH */ 1505 case BLKIF_OP_WRITE: 1506 operation = BIO_WRITE; 1507 reqlist->ds_trans_type = DEVSTAT_WRITE; 1508 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1509 DPRINTF("Attempt to write to read only device %s\n", 1510 xbb->dev_name); 1511 reqlist->status = BLKIF_RSP_ERROR; 1512 goto send_response; 1513 } 1514 break; 1515 case BLKIF_OP_READ: 1516 operation = BIO_READ; 1517 reqlist->ds_trans_type = DEVSTAT_READ; 1518 break; 1519 case BLKIF_OP_FLUSH_DISKCACHE: 1520 /* 1521 * If this is true, the user has requested that we disable 1522 * flush support. So we just complete the requests 1523 * successfully. 1524 */ 1525 if (xbb->disable_flush != 0) { 1526 goto send_response; 1527 } 1528 1529 /* 1530 * The user has requested that we only send a real flush 1531 * for every N flush requests. So keep count, and either 1532 * complete the request immediately or queue it for the 1533 * backend. 
1534 */ 1535 if (xbb->flush_interval != 0) { 1536 if (++(xbb->flush_count) < xbb->flush_interval) { 1537 goto send_response; 1538 } else 1539 xbb->flush_count = 0; 1540 } 1541 1542 operation = BIO_FLUSH; 1543 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1544 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1545 goto do_dispatch; 1546 /*NOTREACHED*/ 1547 default: 1548 DPRINTF("error: unknown block io operation [%d]\n", 1549 reqlist->operation); 1550 reqlist->status = BLKIF_RSP_ERROR; 1551 goto send_response; 1552 } 1553 1554 reqlist->xbb = xbb; 1555 xbb_sg = xbb->xbb_sgs; 1556 map = xbb->maps; 1557 seg_idx = 0; 1558 1559 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1560 blkif_request_t *ring_req; 1561 u_int req_seg_idx; 1562 1563 ring_req = nreq->ring_req; 1564 nr_sects = 0; 1565 nseg = ring_req->nr_segments; 1566 nreq->nr_pages = nseg; 1567 nreq->nr_512b_sectors = 0; 1568 req_seg_idx = 0; 1569 sg = NULL; 1570 1571 /* Check that number of segments is sane. */ 1572 if (__predict_false(nseg == 0) 1573 || __predict_false(nseg > xbb->max_request_segments)) { 1574 DPRINTF("Bad number of segments in request (%d)\n", 1575 nseg); 1576 reqlist->status = BLKIF_RSP_ERROR; 1577 goto send_response; 1578 } 1579 1580 block_segs = nseg; 1581 sg = ring_req->seg; 1582 last_block_sg = sg + block_segs; 1583 1584 while (sg < last_block_sg) { 1585 KASSERT(seg_idx < 1586 XBB_MAX_SEGMENTS_PER_REQLIST, 1587 ("seg_idx %d is too large, max " 1588 "segs %d\n", seg_idx, 1589 XBB_MAX_SEGMENTS_PER_REQLIST)); 1590 1591 xbb_sg->first_sect = sg->first_sect; 1592 xbb_sg->last_sect = sg->last_sect; 1593 xbb_sg->nsect = 1594 (int8_t)(sg->last_sect - 1595 sg->first_sect + 1); 1596 1597 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1598 || (xbb_sg->nsect <= 0)) { 1599 reqlist->status = BLKIF_RSP_ERROR; 1600 goto send_response; 1601 } 1602 1603 nr_sects += xbb_sg->nsect; 1604 map->host_addr = xbb_get_gntaddr(reqlist, 1605 seg_idx, /*sector*/0); 1606 KASSERT(map->host_addr + PAGE_SIZE <= 1607 
xbb->ring_config.gnt_addr, 1608 ("Host address %#jx len %d overlaps " 1609 "ring address %#jx\n", 1610 (uintmax_t)map->host_addr, PAGE_SIZE, 1611 (uintmax_t)xbb->ring_config.gnt_addr)); 1612 1613 map->flags = GNTMAP_host_map; 1614 map->ref = sg->gref; 1615 map->dom = xbb->otherend_id; 1616 if (operation == BIO_WRITE) 1617 map->flags |= GNTMAP_readonly; 1618 sg++; 1619 map++; 1620 xbb_sg++; 1621 seg_idx++; 1622 req_seg_idx++; 1623 } 1624 1625 /* Convert to the disk's sector size */ 1626 nreq->nr_512b_sectors = nr_sects; 1627 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1628 total_sects += nr_sects; 1629 1630 if ((nreq->nr_512b_sectors & 1631 ((xbb->sector_size >> 9) - 1)) != 0) { 1632 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1633 "a multiple of the backing store sector " 1634 "size (%d)\n", __func__, 1635 nreq->nr_512b_sectors << 9, 1636 xbb->sector_size); 1637 reqlist->status = BLKIF_RSP_ERROR; 1638 goto send_response; 1639 } 1640 } 1641 1642 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1643 xbb->maps, reqlist->nr_segments); 1644 if (error != 0) 1645 panic("Grant table operation failed (%d)", error); 1646 1647 reqlist->flags |= XBB_REQLIST_MAPPED; 1648 1649 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1650 seg_idx++, map++){ 1651 if (__predict_false(map->status != 0)) { 1652 DPRINTF("invalid buffer -- could not remap " 1653 "it (%d)\n", map->status); 1654 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1655 "0x%x ref 0x%x, dom %d\n", seg_idx, 1656 map->host_addr, map->flags, map->ref, 1657 map->dom); 1658 reqlist->status = BLKIF_RSP_ERROR; 1659 goto send_response; 1660 } 1661 1662 reqlist->gnt_handles[seg_idx] = map->handle; 1663 } 1664 if (reqlist->starting_sector_number + total_sects > 1665 xbb->media_num_sectors) { 1666 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1667 "extends past end of device %s\n", 1668 operation == BIO_READ ? 
"read" : "write", 1669 reqlist->starting_sector_number, 1670 reqlist->starting_sector_number + total_sects, 1671 xbb->dev_name); 1672 reqlist->status = BLKIF_RSP_ERROR; 1673 goto send_response; 1674 } 1675 1676 do_dispatch: 1677 1678 error = xbb->dispatch_io(xbb, 1679 reqlist, 1680 operation, 1681 bio_flags); 1682 1683 if (error != 0) { 1684 reqlist->status = BLKIF_RSP_ERROR; 1685 goto send_response; 1686 } 1687 1688 return (0); 1689 1690 send_response: 1691 1692 xbb_complete_reqlist(xbb, reqlist); 1693 1694 return (0); 1695 } 1696 1697 static __inline int 1698 xbb_count_sects(blkif_request_t *ring_req) 1699 { 1700 int i; 1701 int cur_size = 0; 1702 1703 for (i = 0; i < ring_req->nr_segments; i++) { 1704 int nsect; 1705 1706 nsect = (int8_t)(ring_req->seg[i].last_sect - 1707 ring_req->seg[i].first_sect + 1); 1708 if (nsect <= 0) 1709 break; 1710 1711 cur_size += nsect; 1712 } 1713 1714 return (cur_size); 1715 } 1716 1717 /** 1718 * Process incoming requests from the shared communication ring in response 1719 * to a signal on the ring's event channel. 1720 * 1721 * \param context Callback argument registerd during task initialization - 1722 * the xbb_softc for this instance. 1723 * \param pending The number of taskqueue_enqueue events that have 1724 * occurred since this handler was last run. 1725 */ 1726 static void 1727 xbb_run_queue(void *context, int pending) 1728 { 1729 struct xbb_softc *xbb; 1730 blkif_back_rings_t *rings; 1731 RING_IDX rp; 1732 uint64_t cur_sector; 1733 int cur_operation; 1734 struct xbb_xen_reqlist *reqlist; 1735 1736 xbb = (struct xbb_softc *)context; 1737 rings = &xbb->rings; 1738 1739 /* 1740 * Work gather and dispatch loop. Note that we have a bias here 1741 * towards gathering I/O sent by blockfront. We first gather up 1742 * everything in the ring, as long as we have resources. 
Then we 1743 * dispatch one request, and then attempt to gather up any 1744 * additional requests that have come in while we were dispatching 1745 * the request. 1746 * 1747 * This allows us to get a clearer picture (via devstat) of how 1748 * many requests blockfront is queueing to us at any given time. 1749 */ 1750 for (;;) { 1751 int retval; 1752 1753 /* 1754 * Initialize reqlist to the last element in the pending 1755 * queue, if there is one. This allows us to add more 1756 * requests to that request list, if we have room. 1757 */ 1758 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1759 xbb_xen_reqlist, links); 1760 if (reqlist != NULL) { 1761 cur_sector = reqlist->next_contig_sector; 1762 cur_operation = reqlist->operation; 1763 } else { 1764 cur_operation = 0; 1765 cur_sector = 0; 1766 } 1767 1768 /* 1769 * Cache req_prod to avoid accessing a cache line shared 1770 * with the frontend. 1771 */ 1772 rp = rings->common.sring->req_prod; 1773 1774 /* Ensure we see queued requests up to 'rp'. */ 1775 rmb(); 1776 1777 /** 1778 * Run so long as there is work to consume and the generation 1779 * of a response will not overflow the ring. 1780 * 1781 * @note There's a 1 to 1 relationship between requests and 1782 * responses, so an overflow should never occur. This 1783 * test is to protect our domain from digesting bogus 1784 * data. Shouldn't we log this? 
1785 */ 1786 while (rings->common.req_cons != rp 1787 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1788 rings->common.req_cons) == 0){ 1789 blkif_request_t ring_req_storage; 1790 blkif_request_t *ring_req; 1791 int cur_size; 1792 1793 switch (xbb->abi) { 1794 case BLKIF_PROTOCOL_NATIVE: 1795 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1796 rings->common.req_cons); 1797 break; 1798 case BLKIF_PROTOCOL_X86_32: 1799 { 1800 struct blkif_x86_32_request *ring_req32; 1801 1802 ring_req32 = RING_GET_REQUEST( 1803 &xbb->rings.x86_32, rings->common.req_cons); 1804 blkif_get_x86_32_req(&ring_req_storage, 1805 ring_req32); 1806 ring_req = &ring_req_storage; 1807 break; 1808 } 1809 case BLKIF_PROTOCOL_X86_64: 1810 { 1811 struct blkif_x86_64_request *ring_req64; 1812 1813 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1814 rings->common.req_cons); 1815 blkif_get_x86_64_req(&ring_req_storage, 1816 ring_req64); 1817 ring_req = &ring_req_storage; 1818 break; 1819 } 1820 default: 1821 panic("Unexpected blkif protocol ABI."); 1822 /* NOTREACHED */ 1823 } 1824 1825 /* 1826 * Check for situations that would require closing 1827 * off this I/O for further coalescing: 1828 * - Coalescing is turned off. 1829 * - Current I/O is out of sequence with the previous 1830 * I/O. 1831 * - Coalesced I/O would be too large. 1832 */ 1833 if ((reqlist != NULL) 1834 && ((xbb->no_coalesce_reqs != 0) 1835 || ((xbb->no_coalesce_reqs == 0) 1836 && ((ring_req->sector_number != cur_sector) 1837 || (ring_req->operation != cur_operation) 1838 || ((ring_req->nr_segments + reqlist->nr_segments) > 1839 xbb->max_reqlist_segments))))) { 1840 reqlist = NULL; 1841 } 1842 1843 /* 1844 * Grab and check for all resources in one shot. 1845 * If we can't get all of the resources we need, 1846 * the shortage is noted and the thread will get 1847 * woken up when more resources are available. 
1848 */ 1849 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1850 xbb->rings.common.req_cons); 1851 1852 if (retval != 0) { 1853 /* 1854 * Resource shortage has been recorded. 1855 * We'll be scheduled to run once a request 1856 * object frees up due to a completion. 1857 */ 1858 break; 1859 } 1860 1861 /* 1862 * Signify that we can overwrite this request with 1863 * a response by incrementing our consumer index. 1864 * The response won't be generated until after 1865 * we've already consumed all necessary data out 1866 * of the version of the request in the ring buffer 1867 * (for native mode). We must update the consumer 1868 * index before issuing back-end I/O so there is 1869 * no possibility that it will complete and a 1870 * response be generated before we make room in 1871 * the queue for that response. 1872 */ 1873 xbb->rings.common.req_cons++; 1874 xbb->reqs_received++; 1875 1876 cur_size = xbb_count_sects(ring_req); 1877 cur_sector = ring_req->sector_number + cur_size; 1878 reqlist->next_contig_sector = cur_sector; 1879 cur_operation = ring_req->operation; 1880 } 1881 1882 /* Check for I/O to dispatch */ 1883 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1884 if (reqlist == NULL) { 1885 /* 1886 * We're out of work to do, put the task queue to 1887 * sleep. 1888 */ 1889 break; 1890 } 1891 1892 /* 1893 * Grab the first request off the queue and attempt 1894 * to dispatch it. 1895 */ 1896 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1897 1898 retval = xbb_dispatch_io(xbb, reqlist); 1899 if (retval != 0) { 1900 /* 1901 * xbb_dispatch_io() returns non-zero only when 1902 * there is a resource shortage. If that's the 1903 * case, re-queue this request on the head of the 1904 * queue, and go to sleep until we have more 1905 * resources. 
1906 */ 1907 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1908 reqlist, links); 1909 break; 1910 } else { 1911 /* 1912 * If we still have anything on the queue after 1913 * removing the head entry, that is because we 1914 * met one of the criteria to create a new 1915 * request list (outlined above), and we'll call 1916 * that a forced dispatch for statistical purposes. 1917 * 1918 * Otherwise, if there is only one element on the 1919 * queue, we coalesced everything available on 1920 * the ring and we'll call that a normal dispatch. 1921 */ 1922 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1923 1924 if (reqlist != NULL) 1925 xbb->forced_dispatch++; 1926 else 1927 xbb->normal_dispatch++; 1928 1929 xbb->total_dispatch++; 1930 } 1931 } 1932 } 1933 1934 /** 1935 * Interrupt handler bound to the shared ring's event channel. 1936 * 1937 * \param arg Callback argument registerd during event channel 1938 * binding - the xbb_softc for this instance. 1939 */ 1940 static int 1941 xbb_filter(void *arg) 1942 { 1943 struct xbb_softc *xbb; 1944 1945 /* Defer to taskqueue thread. */ 1946 xbb = (struct xbb_softc *)arg; 1947 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1948 1949 return (FILTER_HANDLED); 1950 } 1951 1952 SDT_PROVIDER_DEFINE(xbb); 1953 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 1954 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 1955 "uint64_t"); 1956 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 1957 "uint64_t", "uint64_t"); 1958 1959 /*----------------------------- Backend Handlers -----------------------------*/ 1960 /** 1961 * Backend handler for character device access. 1962 * 1963 * \param xbb Per-instance xbb configuration structure. 1964 * \param reqlist Allocated internal request list structure. 1965 * \param operation BIO_* I/O operation code. 1966 * \param bio_flags Additional bio_flag data to pass to any generated 1967 * bios (e.g. BIO_ORDERED).. 
1968 * 1969 * \return 0 for success, errno codes for failure. 1970 */ 1971 static int 1972 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1973 int operation, int bio_flags) 1974 { 1975 struct xbb_dev_data *dev_data; 1976 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 1977 off_t bio_offset; 1978 struct bio *bio; 1979 struct xbb_sg *xbb_sg; 1980 u_int nbio; 1981 u_int bio_idx; 1982 u_int nseg; 1983 u_int seg_idx; 1984 int error; 1985 1986 dev_data = &xbb->backend.dev; 1987 bio_offset = (off_t)reqlist->starting_sector_number 1988 << xbb->sector_size_shift; 1989 error = 0; 1990 nbio = 0; 1991 bio_idx = 0; 1992 1993 if (operation == BIO_FLUSH) { 1994 bio = g_new_bio(); 1995 if (__predict_false(bio == NULL)) { 1996 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 1997 error = ENOMEM; 1998 return (error); 1999 } 2000 2001 bio->bio_cmd = BIO_FLUSH; 2002 bio->bio_flags |= BIO_ORDERED; 2003 bio->bio_dev = dev_data->cdev; 2004 bio->bio_offset = 0; 2005 bio->bio_data = 0; 2006 bio->bio_done = xbb_bio_done; 2007 bio->bio_caller1 = reqlist; 2008 bio->bio_pblkno = 0; 2009 2010 reqlist->pendcnt = 1; 2011 2012 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2013 device_get_unit(xbb->dev)); 2014 2015 (*dev_data->csw->d_strategy)(bio); 2016 2017 return (0); 2018 } 2019 2020 xbb_sg = xbb->xbb_sgs; 2021 bio = NULL; 2022 nseg = reqlist->nr_segments; 2023 2024 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2025 /* 2026 * KVA will not be contiguous, so any additional 2027 * I/O will need to be represented in a new bio. 
2028 */ 2029 if ((bio != NULL) 2030 && (xbb_sg->first_sect != 0)) { 2031 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2032 printf("%s: Discontiguous I/O request " 2033 "from domain %d ends on " 2034 "non-sector boundary\n", 2035 __func__, xbb->otherend_id); 2036 error = EINVAL; 2037 goto fail_free_bios; 2038 } 2039 bio = NULL; 2040 } 2041 2042 if (bio == NULL) { 2043 /* 2044 * Make sure that the start of this bio is 2045 * aligned to a device sector. 2046 */ 2047 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2048 printf("%s: Misaligned I/O request " 2049 "from domain %d\n", __func__, 2050 xbb->otherend_id); 2051 error = EINVAL; 2052 goto fail_free_bios; 2053 } 2054 2055 bio = bios[nbio++] = g_new_bio(); 2056 if (__predict_false(bio == NULL)) { 2057 error = ENOMEM; 2058 goto fail_free_bios; 2059 } 2060 bio->bio_cmd = operation; 2061 bio->bio_flags |= bio_flags; 2062 bio->bio_dev = dev_data->cdev; 2063 bio->bio_offset = bio_offset; 2064 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2065 xbb_sg->first_sect); 2066 bio->bio_done = xbb_bio_done; 2067 bio->bio_caller1 = reqlist; 2068 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2069 } 2070 2071 bio->bio_length += xbb_sg->nsect << 9; 2072 bio->bio_bcount = bio->bio_length; 2073 bio_offset += xbb_sg->nsect << 9; 2074 2075 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2076 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2077 printf("%s: Discontiguous I/O request " 2078 "from domain %d ends on " 2079 "non-sector boundary\n", 2080 __func__, xbb->otherend_id); 2081 error = EINVAL; 2082 goto fail_free_bios; 2083 } 2084 /* 2085 * KVA will not be contiguous, so any additional 2086 * I/O will need to be represented in a new bio. 
2087 */ 2088 bio = NULL; 2089 } 2090 } 2091 2092 reqlist->pendcnt = nbio; 2093 2094 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2095 { 2096 if (operation == BIO_READ) { 2097 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2098 device_get_unit(xbb->dev), 2099 bios[bio_idx]->bio_offset, 2100 bios[bio_idx]->bio_length); 2101 } else if (operation == BIO_WRITE) { 2102 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2103 device_get_unit(xbb->dev), 2104 bios[bio_idx]->bio_offset, 2105 bios[bio_idx]->bio_length); 2106 } 2107 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2108 } 2109 2110 return (error); 2111 2112 fail_free_bios: 2113 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2114 g_destroy_bio(bios[bio_idx]); 2115 2116 return (error); 2117 } 2118 2119 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2120 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2121 "uint64_t"); 2122 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2123 "uint64_t", "uint64_t"); 2124 2125 /** 2126 * Backend handler for file access. 2127 * 2128 * \param xbb Per-instance xbb configuration structure. 2129 * \param reqlist Allocated internal request list. 2130 * \param operation BIO_* I/O operation code. 2131 * \param flags Additional bio_flag data to pass to any generated bios 2132 * (e.g. BIO_ORDERED).. 2133 * 2134 * \return 0 for success, errno codes for failure. 
2135 */ 2136 static int 2137 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2138 int operation, int flags) 2139 { 2140 struct xbb_file_data *file_data; 2141 u_int seg_idx; 2142 u_int nseg; 2143 struct uio xuio; 2144 struct xbb_sg *xbb_sg; 2145 struct iovec *xiovec; 2146 int error; 2147 2148 file_data = &xbb->backend.file; 2149 error = 0; 2150 bzero(&xuio, sizeof(xuio)); 2151 2152 switch (operation) { 2153 case BIO_READ: 2154 xuio.uio_rw = UIO_READ; 2155 break; 2156 case BIO_WRITE: 2157 xuio.uio_rw = UIO_WRITE; 2158 break; 2159 case BIO_FLUSH: { 2160 struct mount *mountpoint; 2161 2162 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2163 device_get_unit(xbb->dev)); 2164 2165 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2166 2167 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2168 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2169 VOP_UNLOCK(xbb->vn); 2170 2171 vn_finished_write(mountpoint); 2172 2173 goto bailout_send_response; 2174 /* NOTREACHED */ 2175 } 2176 default: 2177 panic("invalid operation %d", operation); 2178 /* NOTREACHED */ 2179 } 2180 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2181 << xbb->sector_size_shift; 2182 xuio.uio_segflg = UIO_SYSSPACE; 2183 xuio.uio_iov = file_data->xiovecs; 2184 xuio.uio_iovcnt = 0; 2185 xbb_sg = xbb->xbb_sgs; 2186 nseg = reqlist->nr_segments; 2187 2188 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2189 /* 2190 * If the first sector is not 0, the KVA will 2191 * not be contiguous and we'll need to go on 2192 * to another segment. 
2193 */ 2194 if (xbb_sg->first_sect != 0) 2195 xiovec = NULL; 2196 2197 if (xiovec == NULL) { 2198 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2199 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2200 seg_idx, xbb_sg->first_sect); 2201 xiovec->iov_len = 0; 2202 xuio.uio_iovcnt++; 2203 } 2204 2205 xiovec->iov_len += xbb_sg->nsect << 9; 2206 2207 xuio.uio_resid += xbb_sg->nsect << 9; 2208 2209 /* 2210 * If the last sector is not the full page 2211 * size count, the next segment will not be 2212 * contiguous in KVA and we need a new iovec. 2213 */ 2214 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2215 xiovec = NULL; 2216 } 2217 2218 xuio.uio_td = curthread; 2219 2220 switch (operation) { 2221 case BIO_READ: 2222 2223 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2224 device_get_unit(xbb->dev), xuio.uio_offset, 2225 xuio.uio_resid); 2226 2227 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2228 2229 /* 2230 * UFS pays attention to IO_DIRECT for reads. If the 2231 * DIRECTIO option is configured into the kernel, it calls 2232 * ffs_rawread(). But that only works for single-segment 2233 * uios with user space addresses. In our case, with a 2234 * kernel uio, it still reads into the buffer cache, but it 2235 * will just try to release the buffer from the cache later 2236 * on in ffs_read(). 2237 * 2238 * ZFS does not pay attention to IO_DIRECT for reads. 2239 * 2240 * UFS does not pay attention to IO_SYNC for reads. 2241 * 2242 * ZFS pays attention to IO_SYNC (which translates into the 2243 * Solaris define FRSYNC for zfs_read()) for reads. It 2244 * attempts to sync the file before reading. 2245 * 2246 * So, to attempt to provide some barrier semantics in the 2247 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2248 */ 2249 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2250 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2251 2252 VOP_UNLOCK(xbb->vn); 2253 break; 2254 case BIO_WRITE: { 2255 struct mount *mountpoint; 2256 2257 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2258 device_get_unit(xbb->dev), xuio.uio_offset, 2259 xuio.uio_resid); 2260 2261 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2262 2263 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2264 2265 /* 2266 * UFS pays attention to IO_DIRECT for writes. The write 2267 * is done asynchronously. (Normally the write would just 2268 * get put into cache. 2269 * 2270 * UFS pays attention to IO_SYNC for writes. It will 2271 * attempt to write the buffer out synchronously if that 2272 * flag is set. 2273 * 2274 * ZFS does not pay attention to IO_DIRECT for writes. 2275 * 2276 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2277 * for writes. It will flush the transaction from the 2278 * cache before returning. 2279 * 2280 * So if we've got the BIO_ORDERED flag set, we want 2281 * IO_SYNC in either the UFS or ZFS case. 2282 */ 2283 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2284 IO_SYNC : 0, file_data->cred); 2285 VOP_UNLOCK(xbb->vn); 2286 2287 vn_finished_write(mountpoint); 2288 2289 break; 2290 } 2291 default: 2292 panic("invalid operation %d", operation); 2293 /* NOTREACHED */ 2294 } 2295 2296 bailout_send_response: 2297 2298 if (error != 0) 2299 reqlist->status = BLKIF_RSP_ERROR; 2300 2301 xbb_complete_reqlist(xbb, reqlist); 2302 2303 return (0); 2304 } 2305 2306 /*--------------------------- Backend Configuration --------------------------*/ 2307 /** 2308 * Close and cleanup any backend device/file specific state for this 2309 * block back instance. 2310 * 2311 * \param xbb Per-instance xbb configuration structure. 
2312 */ 2313 static void 2314 xbb_close_backend(struct xbb_softc *xbb) 2315 { 2316 DROP_GIANT(); 2317 DPRINTF("closing dev=%s\n", xbb->dev_name); 2318 if (xbb->vn) { 2319 int flags = FREAD; 2320 2321 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2322 flags |= FWRITE; 2323 2324 switch (xbb->device_type) { 2325 case XBB_TYPE_DISK: 2326 if (xbb->backend.dev.csw) { 2327 dev_relthread(xbb->backend.dev.cdev, 2328 xbb->backend.dev.dev_ref); 2329 xbb->backend.dev.csw = NULL; 2330 xbb->backend.dev.cdev = NULL; 2331 } 2332 break; 2333 case XBB_TYPE_FILE: 2334 break; 2335 case XBB_TYPE_NONE: 2336 default: 2337 panic("Unexpected backend type."); 2338 break; 2339 } 2340 2341 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2342 xbb->vn = NULL; 2343 2344 switch (xbb->device_type) { 2345 case XBB_TYPE_DISK: 2346 break; 2347 case XBB_TYPE_FILE: 2348 if (xbb->backend.file.cred != NULL) { 2349 crfree(xbb->backend.file.cred); 2350 xbb->backend.file.cred = NULL; 2351 } 2352 break; 2353 case XBB_TYPE_NONE: 2354 default: 2355 panic("Unexpected backend type."); 2356 break; 2357 } 2358 } 2359 PICKUP_GIANT(); 2360 } 2361 2362 /** 2363 * Open a character device to be used for backend I/O. 2364 * 2365 * \param xbb Per-instance xbb configuration structure. 2366 * 2367 * \return 0 for success, errno codes for failure. 
2368 */ 2369 static int 2370 xbb_open_dev(struct xbb_softc *xbb) 2371 { 2372 struct vattr vattr; 2373 struct cdev *dev; 2374 struct cdevsw *devsw; 2375 int error; 2376 2377 xbb->device_type = XBB_TYPE_DISK; 2378 xbb->dispatch_io = xbb_dispatch_dev; 2379 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2380 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2381 &xbb->backend.dev.dev_ref); 2382 if (xbb->backend.dev.csw == NULL) 2383 panic("Unable to retrieve device switch"); 2384 2385 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2386 if (error) { 2387 xenbus_dev_fatal(xbb->dev, error, "error getting " 2388 "vnode attributes for device %s", 2389 xbb->dev_name); 2390 return (error); 2391 } 2392 2393 dev = xbb->vn->v_rdev; 2394 devsw = dev->si_devsw; 2395 if (!devsw->d_ioctl) { 2396 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2397 "device %s!", xbb->dev_name); 2398 return (ENODEV); 2399 } 2400 2401 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2402 (caddr_t)&xbb->sector_size, FREAD, 2403 curthread); 2404 if (error) { 2405 xenbus_dev_fatal(xbb->dev, error, 2406 "error calling ioctl DIOCGSECTORSIZE " 2407 "for device %s", xbb->dev_name); 2408 return (error); 2409 } 2410 2411 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2412 (caddr_t)&xbb->media_size, FREAD, 2413 curthread); 2414 if (error) { 2415 xenbus_dev_fatal(xbb->dev, error, 2416 "error calling ioctl DIOCGMEDIASIZE " 2417 "for device %s", xbb->dev_name); 2418 return (error); 2419 } 2420 2421 return (0); 2422 } 2423 2424 /** 2425 * Open a file to be used for backend I/O. 2426 * 2427 * \param xbb Per-instance xbb configuration structure. 2428 * 2429 * \return 0 for success, errno codes for failure. 
2430 */ 2431 static int 2432 xbb_open_file(struct xbb_softc *xbb) 2433 { 2434 struct xbb_file_data *file_data; 2435 struct vattr vattr; 2436 int error; 2437 2438 file_data = &xbb->backend.file; 2439 xbb->device_type = XBB_TYPE_FILE; 2440 xbb->dispatch_io = xbb_dispatch_file; 2441 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2442 if (error != 0) { 2443 xenbus_dev_fatal(xbb->dev, error, 2444 "error calling VOP_GETATTR()" 2445 "for file %s", xbb->dev_name); 2446 return (error); 2447 } 2448 2449 /* 2450 * Verify that we have the ability to upgrade to exclusive 2451 * access on this file so we can trap errors at open instead 2452 * of reporting them during first access. 2453 */ 2454 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2455 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2456 if (VN_IS_DOOMED(xbb->vn)) { 2457 error = EBADF; 2458 xenbus_dev_fatal(xbb->dev, error, 2459 "error locking file %s", 2460 xbb->dev_name); 2461 2462 return (error); 2463 } 2464 } 2465 2466 file_data->cred = crhold(curthread->td_ucred); 2467 xbb->media_size = vattr.va_size; 2468 2469 /* 2470 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2471 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2472 * with disklabel and UFS on FreeBSD at least. Large block sizes 2473 * may not work with other OSes as well. So just export a sector 2474 * size of 512 bytes, which should work with any OS or 2475 * application. Since our backing is a file, any block size will 2476 * work fine for the backing store. 2477 */ 2478 #if 0 2479 xbb->sector_size = vattr.va_blocksize; 2480 #endif 2481 xbb->sector_size = 512; 2482 2483 /* 2484 * Sanity check. The media size has to be at least one 2485 * sector long. 
2486 */ 2487 if (xbb->media_size < xbb->sector_size) { 2488 error = EINVAL; 2489 xenbus_dev_fatal(xbb->dev, error, 2490 "file %s size %ju < block size %u", 2491 xbb->dev_name, 2492 (uintmax_t)xbb->media_size, 2493 xbb->sector_size); 2494 } 2495 return (error); 2496 } 2497 2498 /** 2499 * Open the backend provider for this connection. 2500 * 2501 * \param xbb Per-instance xbb configuration structure. 2502 * 2503 * \return 0 for success, errno codes for failure. 2504 */ 2505 static int 2506 xbb_open_backend(struct xbb_softc *xbb) 2507 { 2508 struct nameidata nd; 2509 int flags; 2510 int error; 2511 2512 flags = FREAD; 2513 error = 0; 2514 2515 DPRINTF("opening dev=%s\n", xbb->dev_name); 2516 2517 if (rootvnode == NULL) { 2518 xenbus_dev_fatal(xbb->dev, ENOENT, 2519 "Root file system not mounted"); 2520 return (ENOENT); 2521 } 2522 2523 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2524 flags |= FWRITE; 2525 2526 pwd_ensure_dirs(); 2527 2528 again: 2529 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); 2530 error = vn_open(&nd, &flags, 0, NULL); 2531 if (error) { 2532 /* 2533 * This is the only reasonable guess we can make as far as 2534 * path if the user doesn't give us a fully qualified path. 2535 * If they want to specify a file, they need to specify the 2536 * full path. 2537 */ 2538 if (xbb->dev_name[0] != '/') { 2539 char *dev_path = "/dev/"; 2540 char *dev_name; 2541 2542 /* Try adding device path at beginning of name */ 2543 dev_name = malloc(strlen(xbb->dev_name) 2544 + strlen(dev_path) + 1, 2545 M_XENBLOCKBACK, M_NOWAIT); 2546 if (dev_name) { 2547 sprintf(dev_name, "%s%s", dev_path, 2548 xbb->dev_name); 2549 free(xbb->dev_name, M_XENBLOCKBACK); 2550 xbb->dev_name = dev_name; 2551 goto again; 2552 } 2553 } 2554 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2555 xbb->dev_name); 2556 return (error); 2557 } 2558 2559 NDFREE_PNBUF(&nd); 2560 2561 xbb->vn = nd.ni_vp; 2562 2563 /* We only support disks and files. 
*/ 2564 if (vn_isdisk_error(xbb->vn, &error)) { 2565 error = xbb_open_dev(xbb); 2566 } else if (xbb->vn->v_type == VREG) { 2567 error = xbb_open_file(xbb); 2568 } else { 2569 error = EINVAL; 2570 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2571 "or file", xbb->dev_name); 2572 } 2573 VOP_UNLOCK(xbb->vn); 2574 2575 if (error != 0) { 2576 xbb_close_backend(xbb); 2577 return (error); 2578 } 2579 2580 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2581 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2582 2583 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2584 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2585 xbb->dev_name, xbb->sector_size, xbb->media_size); 2586 2587 return (0); 2588 } 2589 2590 /*------------------------ Inter-Domain Communication ------------------------*/ 2591 /** 2592 * Free dynamically allocated KVA or pseudo-physical address allocations. 2593 * 2594 * \param xbb Per-instance xbb configuration structure. 2595 */ 2596 static void 2597 xbb_free_communication_mem(struct xbb_softc *xbb) 2598 { 2599 if (xbb->kva != 0) { 2600 if (xbb->pseudo_phys_res != NULL) { 2601 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2602 xbb->pseudo_phys_res); 2603 xbb->pseudo_phys_res = NULL; 2604 } 2605 } 2606 xbb->kva = 0; 2607 xbb->gnt_base_addr = 0; 2608 if (xbb->kva_free != NULL) { 2609 free(xbb->kva_free, M_XENBLOCKBACK); 2610 xbb->kva_free = NULL; 2611 } 2612 } 2613 2614 /** 2615 * Cleanup all inter-domain communication mechanisms. 2616 * 2617 * \param xbb Per-instance xbb configuration structure. 
2618 */ 2619 static int 2620 xbb_disconnect(struct xbb_softc *xbb) 2621 { 2622 DPRINTF("\n"); 2623 2624 mtx_unlock(&xbb->lock); 2625 xen_intr_unbind(&xbb->xen_intr_handle); 2626 if (xbb->io_taskqueue != NULL) 2627 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2628 mtx_lock(&xbb->lock); 2629 2630 /* 2631 * No new interrupts can generate work, but we must wait 2632 * for all currently active requests to drain. 2633 */ 2634 if (xbb->active_request_count != 0) 2635 return (EAGAIN); 2636 2637 if (xbb->flags & XBBF_RING_CONNECTED) { 2638 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2639 struct gnttab_unmap_grant_ref *op; 2640 unsigned int ring_idx; 2641 int error; 2642 2643 for (ring_idx = 0, op = ops; 2644 ring_idx < xbb->ring_config.ring_pages; 2645 ring_idx++, op++) { 2646 op->host_addr = xbb->ring_config.gnt_addr 2647 + (ring_idx * PAGE_SIZE); 2648 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2649 op->handle = xbb->ring_config.handle[ring_idx]; 2650 } 2651 2652 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2653 xbb->ring_config.ring_pages); 2654 if (error != 0) 2655 panic("Grant table op failed (%d)", error); 2656 2657 xbb->flags &= ~XBBF_RING_CONNECTED; 2658 } 2659 2660 xbb_free_communication_mem(xbb); 2661 2662 if (xbb->requests != NULL) { 2663 free(xbb->requests, M_XENBLOCKBACK); 2664 xbb->requests = NULL; 2665 } 2666 2667 if (xbb->request_lists != NULL) { 2668 struct xbb_xen_reqlist *reqlist; 2669 int i; 2670 2671 /* There is one request list for ever allocated request. 
*/ 2672 for (i = 0, reqlist = xbb->request_lists; 2673 i < xbb->max_requests; i++, reqlist++){ 2674 if (reqlist->gnt_handles != NULL) { 2675 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2676 reqlist->gnt_handles = NULL; 2677 } 2678 } 2679 free(xbb->request_lists, M_XENBLOCKBACK); 2680 xbb->request_lists = NULL; 2681 } 2682 2683 return (0); 2684 } 2685 2686 /** 2687 * Map shared memory ring into domain local address space, initialize 2688 * ring control structures, and bind an interrupt to the event channel 2689 * used to notify us of ring changes. 2690 * 2691 * \param xbb Per-instance xbb configuration structure. 2692 */ 2693 static int 2694 xbb_connect_ring(struct xbb_softc *xbb) 2695 { 2696 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2697 struct gnttab_map_grant_ref *gnt; 2698 u_int ring_idx; 2699 int error; 2700 2701 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2702 return (0); 2703 2704 /* 2705 * Kva for our ring is at the tail of the region of kva allocated 2706 * by xbb_alloc_communication_mem(). 
2707 */ 2708 xbb->ring_config.va = xbb->kva 2709 + (xbb->kva_size 2710 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2711 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2712 + (xbb->kva_size 2713 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2714 2715 for (ring_idx = 0, gnt = gnts; 2716 ring_idx < xbb->ring_config.ring_pages; 2717 ring_idx++, gnt++) { 2718 gnt->host_addr = xbb->ring_config.gnt_addr 2719 + (ring_idx * PAGE_SIZE); 2720 gnt->flags = GNTMAP_host_map; 2721 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2722 gnt->dom = xbb->otherend_id; 2723 } 2724 2725 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2726 xbb->ring_config.ring_pages); 2727 if (error) 2728 panic("blkback: Ring page grant table op failed (%d)", error); 2729 2730 for (ring_idx = 0, gnt = gnts; 2731 ring_idx < xbb->ring_config.ring_pages; 2732 ring_idx++, gnt++) { 2733 if (gnt->status != 0) { 2734 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; 2735 unsigned int i, j; 2736 2737 xbb->ring_config.va = 0; 2738 xenbus_dev_fatal(xbb->dev, EACCES, 2739 "Ring shared page mapping failed. " 2740 "Status %d.", gnt->status); 2741 2742 /* Unmap everything to avoid leaking grant table maps */ 2743 for (i = 0, j = 0; i < xbb->ring_config.ring_pages; 2744 i++) { 2745 if (gnts[i].status != GNTST_okay) 2746 continue; 2747 2748 unmap[j].host_addr = gnts[i].host_addr; 2749 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; 2750 unmap[j++].handle = gnts[i].handle; 2751 } 2752 if (j != 0) { 2753 error = HYPERVISOR_grant_table_op( 2754 GNTTABOP_unmap_grant_ref, unmap, j); 2755 if (error != 0) 2756 panic("Unable to unmap grants (%d)", 2757 error); 2758 } 2759 return (EACCES); 2760 } 2761 xbb->ring_config.handle[ring_idx] = gnt->handle; 2762 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2763 } 2764 2765 /* Initialize the ring based on ABI. 
*/ 2766 switch (xbb->abi) { 2767 case BLKIF_PROTOCOL_NATIVE: 2768 { 2769 blkif_sring_t *sring; 2770 sring = (blkif_sring_t *)xbb->ring_config.va; 2771 BACK_RING_INIT(&xbb->rings.native, sring, 2772 xbb->ring_config.ring_pages * PAGE_SIZE); 2773 break; 2774 } 2775 case BLKIF_PROTOCOL_X86_32: 2776 { 2777 blkif_x86_32_sring_t *sring_x86_32; 2778 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2779 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2780 xbb->ring_config.ring_pages * PAGE_SIZE); 2781 break; 2782 } 2783 case BLKIF_PROTOCOL_X86_64: 2784 { 2785 blkif_x86_64_sring_t *sring_x86_64; 2786 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2787 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2788 xbb->ring_config.ring_pages * PAGE_SIZE); 2789 break; 2790 } 2791 default: 2792 panic("Unexpected blkif protocol ABI."); 2793 } 2794 2795 xbb->flags |= XBBF_RING_CONNECTED; 2796 2797 error = xen_intr_bind_remote_port(xbb->dev, 2798 xbb->otherend_id, 2799 xbb->ring_config.evtchn, 2800 xbb_filter, 2801 /*ithread_handler*/NULL, 2802 /*arg*/xbb, 2803 INTR_TYPE_BIO | INTR_MPSAFE, 2804 &xbb->xen_intr_handle); 2805 if (error) { 2806 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2807 return (error); 2808 } 2809 2810 DPRINTF("rings connected!\n"); 2811 2812 return 0; 2813 } 2814 2815 /** 2816 * Size KVA and pseudo-physical address allocations based on negotiated 2817 * values for the size and number of I/O requests, and the size of our 2818 * communication ring. 2819 * 2820 * \param xbb Per-instance xbb configuration structure. 2821 * 2822 * These address spaces are used to dynamically map pages in the 2823 * front-end's domain into our own. 
2824 */ 2825 static int 2826 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2827 { 2828 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2829 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2830 xbb->kva_size = xbb->reqlist_kva_size + 2831 (xbb->ring_config.ring_pages * PAGE_SIZE); 2832 2833 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2834 if (xbb->kva_free == NULL) 2835 return (ENOMEM); 2836 2837 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2838 device_get_nameunit(xbb->dev), xbb->kva_size, 2839 xbb->reqlist_kva_size); 2840 /* 2841 * Reserve a range of pseudo physical memory that we can map 2842 * into kva. These pages will only be backed by machine 2843 * pages ("real memory") during the lifetime of front-end requests 2844 * via grant table operations. 2845 */ 2846 xbb->pseudo_phys_res_id = 0; 2847 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 2848 xbb->kva_size); 2849 if (xbb->pseudo_phys_res == NULL) { 2850 xbb->kva = 0; 2851 return (ENOMEM); 2852 } 2853 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2854 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2855 2856 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 2857 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 2858 (uintmax_t)xbb->gnt_base_addr); 2859 return (0); 2860 } 2861 2862 /** 2863 * Collect front-end information from the XenStore. 2864 * 2865 * \param xbb Per-instance xbb configuration structure. 2866 */ 2867 static int 2868 xbb_collect_frontend_info(struct xbb_softc *xbb) 2869 { 2870 char protocol_abi[64]; 2871 const char *otherend_path; 2872 int error; 2873 u_int ring_idx; 2874 u_int ring_page_order; 2875 size_t ring_size; 2876 2877 otherend_path = xenbus_get_otherend_path(xbb->dev); 2878 2879 /* 2880 * Protocol defaults valid even if all negotiation fails. 
2881 */ 2882 xbb->ring_config.ring_pages = 1; 2883 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2884 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 2885 2886 /* 2887 * Mandatory data (used in all versions of the protocol) first. 2888 */ 2889 error = xs_scanf(XST_NIL, otherend_path, 2890 "event-channel", NULL, "%" PRIu32, 2891 &xbb->ring_config.evtchn); 2892 if (error != 0) { 2893 xenbus_dev_fatal(xbb->dev, error, 2894 "Unable to retrieve event-channel information " 2895 "from frontend %s. Unable to connect.", 2896 xenbus_get_otherend_path(xbb->dev)); 2897 return (error); 2898 } 2899 2900 /* 2901 * These fields are initialized to legacy protocol defaults 2902 * so we only need to fail if reading the updated value succeeds 2903 * and the new value is outside of its allowed range. 2904 * 2905 * \note xs_gather() returns on the first encountered error, so 2906 * we must use independent calls in order to guarantee 2907 * we don't miss information in a sparsly populated front-end 2908 * tree. 2909 * 2910 * \note xs_scanf() does not update variables for unmatched 2911 * fields. 2912 */ 2913 ring_page_order = 0; 2914 xbb->max_requests = 32; 2915 2916 (void)xs_scanf(XST_NIL, otherend_path, 2917 "ring-page-order", NULL, "%u", 2918 &ring_page_order); 2919 xbb->ring_config.ring_pages = 1 << ring_page_order; 2920 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 2921 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 2922 2923 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 2924 xenbus_dev_fatal(xbb->dev, EINVAL, 2925 "Front-end specified ring-pages of %u " 2926 "exceeds backend limit of %u. 
" 2927 "Unable to connect.", 2928 xbb->ring_config.ring_pages, 2929 XBB_MAX_RING_PAGES); 2930 return (EINVAL); 2931 } 2932 2933 if (xbb->ring_config.ring_pages == 1) { 2934 error = xs_gather(XST_NIL, otherend_path, 2935 "ring-ref", "%" PRIu32, 2936 &xbb->ring_config.ring_ref[0], 2937 NULL); 2938 if (error != 0) { 2939 xenbus_dev_fatal(xbb->dev, error, 2940 "Unable to retrieve ring information " 2941 "from frontend %s. Unable to " 2942 "connect.", 2943 xenbus_get_otherend_path(xbb->dev)); 2944 return (error); 2945 } 2946 } else { 2947 /* Multi-page ring format. */ 2948 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 2949 ring_idx++) { 2950 char ring_ref_name[]= "ring_refXX"; 2951 2952 snprintf(ring_ref_name, sizeof(ring_ref_name), 2953 "ring-ref%u", ring_idx); 2954 error = xs_scanf(XST_NIL, otherend_path, 2955 ring_ref_name, NULL, "%" PRIu32, 2956 &xbb->ring_config.ring_ref[ring_idx]); 2957 if (error != 0) { 2958 xenbus_dev_fatal(xbb->dev, error, 2959 "Failed to retriev grant " 2960 "reference for page %u of " 2961 "shared ring. Unable " 2962 "to connect.", ring_idx); 2963 return (error); 2964 } 2965 } 2966 } 2967 2968 error = xs_gather(XST_NIL, otherend_path, 2969 "protocol", "%63s", protocol_abi, 2970 NULL); 2971 if (error != 0 2972 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 2973 /* 2974 * Assume native if the frontend has not 2975 * published ABI data or it has published and 2976 * matches our own ABI. 2977 */ 2978 xbb->abi = BLKIF_PROTOCOL_NATIVE; 2979 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 2980 xbb->abi = BLKIF_PROTOCOL_X86_32; 2981 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 2982 xbb->abi = BLKIF_PROTOCOL_X86_64; 2983 } else { 2984 xenbus_dev_fatal(xbb->dev, EINVAL, 2985 "Unknown protocol ABI (%s) published by " 2986 "frontend. 
Unable to connect.", protocol_abi); 2987 return (EINVAL); 2988 } 2989 return (0); 2990 } 2991 2992 /** 2993 * Allocate per-request data structures given request size and number 2994 * information negotiated with the front-end. 2995 * 2996 * \param xbb Per-instance xbb configuration structure. 2997 */ 2998 static int 2999 xbb_alloc_requests(struct xbb_softc *xbb) 3000 { 3001 struct xbb_xen_req *req; 3002 struct xbb_xen_req *last_req; 3003 3004 /* 3005 * Allocate request book keeping datastructures. 3006 */ 3007 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3008 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3009 if (xbb->requests == NULL) { 3010 xenbus_dev_fatal(xbb->dev, ENOMEM, 3011 "Unable to allocate request structures"); 3012 return (ENOMEM); 3013 } 3014 3015 req = xbb->requests; 3016 last_req = &xbb->requests[xbb->max_requests - 1]; 3017 STAILQ_INIT(&xbb->request_free_stailq); 3018 while (req <= last_req) { 3019 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3020 req++; 3021 } 3022 return (0); 3023 } 3024 3025 static int 3026 xbb_alloc_request_lists(struct xbb_softc *xbb) 3027 { 3028 struct xbb_xen_reqlist *reqlist; 3029 int i; 3030 3031 /* 3032 * If no requests can be merged, we need 1 request list per 3033 * in flight request. 
3034 */ 3035 xbb->request_lists = malloc(xbb->max_requests * 3036 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3037 if (xbb->request_lists == NULL) { 3038 xenbus_dev_fatal(xbb->dev, ENOMEM, 3039 "Unable to allocate request list structures"); 3040 return (ENOMEM); 3041 } 3042 3043 STAILQ_INIT(&xbb->reqlist_free_stailq); 3044 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3045 for (i = 0; i < xbb->max_requests; i++) { 3046 int seg; 3047 3048 reqlist = &xbb->request_lists[i]; 3049 3050 reqlist->xbb = xbb; 3051 3052 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3053 sizeof(*reqlist->gnt_handles), 3054 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3055 if (reqlist->gnt_handles == NULL) { 3056 xenbus_dev_fatal(xbb->dev, ENOMEM, 3057 "Unable to allocate request " 3058 "grant references"); 3059 return (ENOMEM); 3060 } 3061 3062 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3063 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3064 3065 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3066 } 3067 return (0); 3068 } 3069 3070 /** 3071 * Supply information about the physical device to the frontend 3072 * via XenBus. 3073 * 3074 * \param xbb Per-instance xbb configuration structure. 3075 */ 3076 static int 3077 xbb_publish_backend_info(struct xbb_softc *xbb) 3078 { 3079 struct xs_transaction xst; 3080 const char *our_path; 3081 const char *leaf; 3082 int error; 3083 3084 our_path = xenbus_get_node(xbb->dev); 3085 while (1) { 3086 error = xs_transaction_start(&xst); 3087 if (error != 0) { 3088 xenbus_dev_fatal(xbb->dev, error, 3089 "Error publishing backend info " 3090 "(start transaction)"); 3091 return (error); 3092 } 3093 3094 leaf = "sectors"; 3095 error = xs_printf(xst, our_path, leaf, 3096 "%"PRIu64, xbb->media_num_sectors); 3097 if (error != 0) 3098 break; 3099 3100 /* XXX Support all VBD attributes here. */ 3101 leaf = "info"; 3102 error = xs_printf(xst, our_path, leaf, "%u", 3103 xbb->flags & XBBF_READ_ONLY 3104 ? 
VDISK_READONLY : 0); 3105 if (error != 0) 3106 break; 3107 3108 leaf = "sector-size"; 3109 error = xs_printf(xst, our_path, leaf, "%u", 3110 xbb->sector_size); 3111 if (error != 0) 3112 break; 3113 3114 error = xs_transaction_end(xst, 0); 3115 if (error == 0) { 3116 return (0); 3117 } else if (error != EAGAIN) { 3118 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3119 return (error); 3120 } 3121 } 3122 3123 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3124 our_path, leaf); 3125 xs_transaction_end(xst, 1); 3126 return (error); 3127 } 3128 3129 /** 3130 * Connect to our blkfront peer now that it has completed publishing 3131 * its configuration into the XenStore. 3132 * 3133 * \param xbb Per-instance xbb configuration structure. 3134 */ 3135 static void 3136 xbb_connect(struct xbb_softc *xbb) 3137 { 3138 int error; 3139 3140 if (!xbb->hotplug_done || 3141 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3142 (xbb_collect_frontend_info(xbb) != 0)) 3143 return; 3144 3145 xbb->flags &= ~XBBF_SHUTDOWN; 3146 3147 /* 3148 * We limit the maximum number of reqlist segments to the maximum 3149 * number of segments in the ring, or our absolute maximum, 3150 * whichever is smaller. 3151 */ 3152 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3153 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3154 3155 /* 3156 * The maximum size is simply a function of the number of segments 3157 * we can handle. 3158 */ 3159 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3160 3161 /* Allocate resources whose size depends on front-end configuration. 
*/ 3162 error = xbb_alloc_communication_mem(xbb); 3163 if (error != 0) { 3164 xenbus_dev_fatal(xbb->dev, error, 3165 "Unable to allocate communication memory"); 3166 return; 3167 } 3168 3169 error = xbb_publish_backend_info(xbb); 3170 if (error != 0) { 3171 xenbus_dev_fatal(xbb->dev, error, 3172 "Unable to publish device information"); 3173 return; 3174 } 3175 3176 error = xbb_alloc_requests(xbb); 3177 if (error != 0) { 3178 /* Specific errors are reported by xbb_alloc_requests(). */ 3179 return; 3180 } 3181 3182 error = xbb_alloc_request_lists(xbb); 3183 if (error != 0) { 3184 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3185 return; 3186 } 3187 3188 /* 3189 * Connect communication channel. 3190 */ 3191 error = xbb_connect_ring(xbb); 3192 if (error != 0) { 3193 /* Specific errors are reported by xbb_connect_ring(). */ 3194 return; 3195 } 3196 3197 /* Ready for I/O. */ 3198 xenbus_set_state(xbb->dev, XenbusStateConnected); 3199 } 3200 3201 /*-------------------------- Device Teardown Support -------------------------*/ 3202 /** 3203 * Perform device shutdown functions. 3204 * 3205 * \param xbb Per-instance xbb configuration structure. 3206 * 3207 * Mark this instance as shutting down, wait for any active I/O on the 3208 * backend device/file to drain, disconnect from the front-end, and notify 3209 * any waiters (e.g. a thread invoking our detach method) that detach can 3210 * now proceed. 3211 */ 3212 static int 3213 xbb_shutdown(struct xbb_softc *xbb) 3214 { 3215 XenbusState frontState; 3216 int error; 3217 3218 DPRINTF("\n"); 3219 3220 /* 3221 * Due to the need to drop our mutex during some 3222 * xenbus operations, it is possible for two threads 3223 * to attempt to close out shutdown processing at 3224 * the same time. Tell the caller that hits this 3225 * race to try back later. 
3226 */ 3227 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3228 return (EAGAIN); 3229 3230 xbb->flags |= XBBF_IN_SHUTDOWN; 3231 mtx_unlock(&xbb->lock); 3232 3233 if (xbb->hotplug_watch.node != NULL) { 3234 xs_unregister_watch(&xbb->hotplug_watch); 3235 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3236 xbb->hotplug_watch.node = NULL; 3237 } 3238 3239 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3240 xenbus_set_state(xbb->dev, XenbusStateClosing); 3241 3242 frontState = xenbus_get_otherend_state(xbb->dev); 3243 mtx_lock(&xbb->lock); 3244 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3245 3246 /* Wait for the frontend to disconnect (if it's connected). */ 3247 if (frontState == XenbusStateConnected) 3248 return (EAGAIN); 3249 3250 DPRINTF("\n"); 3251 3252 /* Indicate shutdown is in progress. */ 3253 xbb->flags |= XBBF_SHUTDOWN; 3254 3255 /* Disconnect from the front-end. */ 3256 error = xbb_disconnect(xbb); 3257 if (error != 0) { 3258 /* 3259 * Requests still outstanding. We'll be called again 3260 * once they complete. 3261 */ 3262 KASSERT(error == EAGAIN, 3263 ("%s: Unexpected xbb_disconnect() failure %d", 3264 __func__, error)); 3265 3266 return (error); 3267 } 3268 3269 DPRINTF("\n"); 3270 3271 /* Indicate to xbb_detach() that is it safe to proceed. */ 3272 wakeup(xbb); 3273 3274 return (0); 3275 } 3276 3277 /** 3278 * Report an attach time error to the console and Xen, and cleanup 3279 * this instance by forcing immediate detach processing. 3280 * 3281 * \param xbb Per-instance xbb configuration structure. 3282 * \param err Errno describing the error. 3283 * \param fmt Printf style format and arguments 3284 */ 3285 static void 3286 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 
3287 { 3288 va_list ap; 3289 va_list ap_hotplug; 3290 3291 va_start(ap, fmt); 3292 va_copy(ap_hotplug, ap); 3293 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3294 "hotplug-error", fmt, ap_hotplug); 3295 va_end(ap_hotplug); 3296 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3297 "hotplug-status", "error"); 3298 3299 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3300 va_end(ap); 3301 3302 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3303 "online", "0"); 3304 mtx_lock(&xbb->lock); 3305 xbb_shutdown(xbb); 3306 mtx_unlock(&xbb->lock); 3307 } 3308 3309 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3310 /** 3311 * Inspect a XenBus device and claim it if is of the appropriate type. 3312 * 3313 * \param dev NewBus device object representing a candidate XenBus device. 3314 * 3315 * \return 0 for success, errno codes for failure. 3316 */ 3317 static int 3318 xbb_probe(device_t dev) 3319 { 3320 3321 if (strcmp(xenbus_get_type(dev), "vbd")) 3322 return (ENXIO); 3323 3324 /* Only attach if Xen creates IOMMU entries for grant mapped pages. */ 3325 if (!xen_has_iommu_maps()) { 3326 static bool warned; 3327 3328 if (!warned) { 3329 warned = true; 3330 printf( 3331 "xen-blkback disabled due to grant maps lacking IOMMU entries\n"); 3332 } 3333 return (ENXIO); 3334 } 3335 3336 device_set_desc(dev, "Backend Virtual Block Device"); 3337 device_quiet(dev); 3338 return (0); 3339 } 3340 3341 /** 3342 * Setup sysctl variables to control various Block Back parameters. 3343 * 3344 * \param xbb Xen Block Back softc. 
3345 * 3346 */ 3347 static void 3348 xbb_setup_sysctl(struct xbb_softc *xbb) 3349 { 3350 struct sysctl_ctx_list *sysctl_ctx = NULL; 3351 struct sysctl_oid *sysctl_tree = NULL; 3352 3353 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3354 if (sysctl_ctx == NULL) 3355 return; 3356 3357 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3358 if (sysctl_tree == NULL) 3359 return; 3360 3361 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3362 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3363 "fake the flush command"); 3364 3365 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3366 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3367 "send a real flush for N flush requests"); 3368 3369 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3370 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3371 "Don't coalesce contiguous requests"); 3372 3373 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3374 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3375 "how many I/O requests we have received"); 3376 3377 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3378 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3379 "how many I/O requests have been completed"); 3380 3381 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3382 "reqs_queued_for_completion", CTLFLAG_RW, 3383 &xbb->reqs_queued_for_completion, 3384 "how many I/O requests queued but not yet pushed"); 3385 3386 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3387 "reqs_completed_with_error", CTLFLAG_RW, 3388 &xbb->reqs_completed_with_error, 3389 "how many I/O requests completed with error status"); 3390 3391 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3392 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3393 "how many I/O dispatches were forced"); 3394 3395 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3396 
"normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3397 "how many I/O dispatches were normal"); 3398 3399 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3400 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3401 "total number of I/O dispatches"); 3402 3403 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3404 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3405 "how many times we have run out of KVA"); 3406 3407 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3408 "request_shortages", CTLFLAG_RW, 3409 &xbb->request_shortages, 3410 "how many times we have run out of requests"); 3411 3412 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3413 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3414 "maximum outstanding requests (negotiated)"); 3415 3416 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3417 "max_request_segments", CTLFLAG_RD, 3418 &xbb->max_request_segments, 0, 3419 "maximum number of pages per requests (negotiated)"); 3420 3421 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3422 "max_request_size", CTLFLAG_RD, 3423 &xbb->max_request_size, 0, 3424 "maximum size in bytes of a request (negotiated)"); 3425 3426 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3427 "ring_pages", CTLFLAG_RD, 3428 &xbb->ring_config.ring_pages, 0, 3429 "communication channel pages (negotiated)"); 3430 } 3431 3432 static void 3433 xbb_attach_disk(device_t dev) 3434 { 3435 struct xbb_softc *xbb; 3436 int error; 3437 3438 xbb = device_get_softc(dev); 3439 3440 KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); 3441 3442 /* Parse fopen style mode flags. */ 3443 if (strchr(xbb->dev_mode, 'w') == NULL) 3444 xbb->flags |= XBBF_READ_ONLY; 3445 3446 /* 3447 * Verify the physical device is present and can support 3448 * the desired I/O mode. 
3449 */ 3450 error = xbb_open_backend(xbb); 3451 if (error != 0) { 3452 xbb_attach_failed(xbb, error, "Unable to open %s", 3453 xbb->dev_name); 3454 return; 3455 } 3456 3457 /* Use devstat(9) for recording statistics. */ 3458 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3459 xbb->sector_size, 3460 DEVSTAT_ALL_SUPPORTED, 3461 DEVSTAT_TYPE_DIRECT 3462 | DEVSTAT_TYPE_IF_OTHER, 3463 DEVSTAT_PRIORITY_OTHER); 3464 3465 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3466 xbb->sector_size, 3467 DEVSTAT_ALL_SUPPORTED, 3468 DEVSTAT_TYPE_DIRECT 3469 | DEVSTAT_TYPE_IF_OTHER, 3470 DEVSTAT_PRIORITY_OTHER); 3471 /* 3472 * Setup sysctl variables. 3473 */ 3474 xbb_setup_sysctl(xbb); 3475 3476 /* 3477 * Create a taskqueue for doing work that must occur from a 3478 * thread context. 3479 */ 3480 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3481 M_NOWAIT, 3482 taskqueue_thread_enqueue, 3483 /*contxt*/&xbb->io_taskqueue); 3484 if (xbb->io_taskqueue == NULL) { 3485 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3486 return; 3487 } 3488 3489 taskqueue_start_threads(&xbb->io_taskqueue, 3490 /*num threads*/1, 3491 /*priority*/PWAIT, 3492 /*thread name*/ 3493 "%s taskq", device_get_nameunit(dev)); 3494 3495 /* Update hot-plug status to satisfy xend. */ 3496 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3497 "hotplug-status", "connected"); 3498 if (error) { 3499 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3500 xenbus_get_node(xbb->dev)); 3501 return; 3502 } 3503 3504 /* The front end might be waiting for the backend, attach if so. 
*/ 3505 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3506 xbb_connect(xbb); 3507 } 3508 3509 static void 3510 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) 3511 { 3512 device_t dev; 3513 struct xbb_softc *xbb; 3514 int error; 3515 3516 dev = (device_t)watch->callback_data; 3517 xbb = device_get_softc(dev); 3518 3519 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3520 NULL, &xbb->dev_name, NULL); 3521 if (error != 0) 3522 return; 3523 3524 xs_unregister_watch(watch); 3525 free(watch->node, M_XENBLOCKBACK); 3526 watch->node = NULL; 3527 xbb->hotplug_done = true; 3528 3529 /* Collect physical device information. */ 3530 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", 3531 NULL, &xbb->dev_type, NULL); 3532 if (error != 0) 3533 xbb->dev_type = NULL; 3534 3535 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, 3536 &xbb->dev_mode, NULL); 3537 if (error != 0) { 3538 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3539 xenbus_get_node(dev)); 3540 return; 3541 } 3542 3543 xbb_attach_disk(dev); 3544 } 3545 3546 /** 3547 * Attach to a XenBus device that has been claimed by our probe routine. 3548 * 3549 * \param dev NewBus device object representing this Xen Block Back instance. 3550 * 3551 * \return 0 for success, errno codes for failure. 3552 */ 3553 static int 3554 xbb_attach(device_t dev) 3555 { 3556 struct xbb_softc *xbb; 3557 int error; 3558 u_int max_ring_page_order; 3559 struct sbuf *watch_path; 3560 3561 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3562 3563 /* 3564 * Basic initialization. 3565 * After this block it is safe to call xbb_detach() 3566 * to clean up any allocated data for this instance. 
3567 */ 3568 xbb = device_get_softc(dev); 3569 xbb->dev = dev; 3570 xbb->otherend_id = xenbus_get_otherend_id(dev); 3571 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3572 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3573 3574 /* 3575 * Publish protocol capabilities for consumption by the 3576 * front-end. 3577 */ 3578 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3579 "feature-barrier", "1"); 3580 if (error) { 3581 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3582 xenbus_get_node(xbb->dev)); 3583 return (error); 3584 } 3585 3586 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3587 "feature-flush-cache", "1"); 3588 if (error) { 3589 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3590 xenbus_get_node(xbb->dev)); 3591 return (error); 3592 } 3593 3594 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3595 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3596 "max-ring-page-order", "%u", max_ring_page_order); 3597 if (error) { 3598 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3599 xenbus_get_node(xbb->dev)); 3600 return (error); 3601 } 3602 3603 /* Tell the toolstack blkback has attached. */ 3604 xenbus_set_state(dev, XenbusStateInitWait); 3605 3606 if (xbb->hotplug_done) { 3607 xbb_attach_disk(dev); 3608 return (0); 3609 } 3610 3611 /* 3612 * We need to wait for hotplug script execution before 3613 * moving forward. 3614 */ 3615 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3616 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3617 xbb->hotplug_watch.callback = xbb_attach_cb; 3618 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3619 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3620 /* 3621 * We don't care about the path updated, just about the value changes 3622 * on that single node, hence there's no need to queue more that one 3623 * event. 
3624 */ 3625 xbb->hotplug_watch.max_pending = 1; 3626 sbuf_delete(watch_path); 3627 error = xs_register_watch(&xbb->hotplug_watch); 3628 if (error != 0) { 3629 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3630 xbb->hotplug_watch.node); 3631 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3632 return (error); 3633 } 3634 3635 return (0); 3636 } 3637 3638 /** 3639 * Detach from a block back device instance. 3640 * 3641 * \param dev NewBus device object representing this Xen Block Back instance. 3642 * 3643 * \return 0 for success, errno codes for failure. 3644 * 3645 * \note A block back device may be detached at any time in its life-cycle, 3646 * including part way through the attach process. For this reason, 3647 * initialization order and the initialization state checks in this 3648 * routine must be carefully coupled so that attach time failures 3649 * are gracefully handled. 3650 */ 3651 static int 3652 xbb_detach(device_t dev) 3653 { 3654 struct xbb_softc *xbb; 3655 3656 DPRINTF("\n"); 3657 3658 xbb = device_get_softc(dev); 3659 mtx_lock(&xbb->lock); 3660 while (xbb_shutdown(xbb) == EAGAIN) { 3661 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3662 "xbb_shutdown", 0); 3663 } 3664 mtx_unlock(&xbb->lock); 3665 3666 DPRINTF("\n"); 3667 3668 if (xbb->io_taskqueue != NULL) 3669 taskqueue_free(xbb->io_taskqueue); 3670 3671 if (xbb->xbb_stats != NULL) 3672 devstat_remove_entry(xbb->xbb_stats); 3673 3674 if (xbb->xbb_stats_in != NULL) 3675 devstat_remove_entry(xbb->xbb_stats_in); 3676 3677 xbb_close_backend(xbb); 3678 3679 if (xbb->dev_mode != NULL) { 3680 free(xbb->dev_mode, M_XENSTORE); 3681 xbb->dev_mode = NULL; 3682 } 3683 3684 if (xbb->dev_type != NULL) { 3685 free(xbb->dev_type, M_XENSTORE); 3686 xbb->dev_type = NULL; 3687 } 3688 3689 if (xbb->dev_name != NULL) { 3690 free(xbb->dev_name, M_XENSTORE); 3691 xbb->dev_name = NULL; 3692 } 3693 3694 mtx_destroy(&xbb->lock); 3695 return (0); 3696 } 3697 3698 /** 3699 * Prepare this block back device 
for suspension of this VM. 3700 * 3701 * \param dev NewBus device object representing this Xen Block Back instance. 3702 * 3703 * \return 0 for success, errno codes for failure. 3704 */ 3705 static int 3706 xbb_suspend(device_t dev) 3707 { 3708 #ifdef NOT_YET 3709 struct xbb_softc *sc = device_get_softc(dev); 3710 3711 /* Prevent new requests being issued until we fix things up. */ 3712 mtx_lock(&sc->xb_io_lock); 3713 sc->connected = BLKIF_STATE_SUSPENDED; 3714 mtx_unlock(&sc->xb_io_lock); 3715 #endif 3716 3717 return (0); 3718 } 3719 3720 /** 3721 * Perform any processing required to recover from a suspended state. 3722 * 3723 * \param dev NewBus device object representing this Xen Block Back instance. 3724 * 3725 * \return 0 for success, errno codes for failure. 3726 */ 3727 static int 3728 xbb_resume(device_t dev) 3729 { 3730 return (0); 3731 } 3732 3733 /** 3734 * Handle state changes expressed via the XenStore by our front-end peer. 3735 * 3736 * \param dev NewBus device object representing this Xen 3737 * Block Back instance. 3738 * \param frontend_state The new state of the front-end. 3739 * 3740 * \return 0 for success, errno codes for failure. 
3741 */ 3742 static void 3743 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3744 { 3745 struct xbb_softc *xbb = device_get_softc(dev); 3746 3747 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3748 xenbus_strstate(frontend_state), 3749 xenbus_strstate(xenbus_get_state(xbb->dev))); 3750 3751 switch (frontend_state) { 3752 case XenbusStateInitialising: 3753 break; 3754 case XenbusStateInitialised: 3755 case XenbusStateConnected: 3756 xbb_connect(xbb); 3757 break; 3758 case XenbusStateClosing: 3759 case XenbusStateClosed: 3760 mtx_lock(&xbb->lock); 3761 xbb_shutdown(xbb); 3762 mtx_unlock(&xbb->lock); 3763 if (frontend_state == XenbusStateClosed) 3764 xenbus_set_state(xbb->dev, XenbusStateClosed); 3765 break; 3766 default: 3767 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3768 frontend_state); 3769 break; 3770 } 3771 } 3772 3773 /*---------------------------- NewBus Registration ---------------------------*/ 3774 static device_method_t xbb_methods[] = { 3775 /* Device interface */ 3776 DEVMETHOD(device_probe, xbb_probe), 3777 DEVMETHOD(device_attach, xbb_attach), 3778 DEVMETHOD(device_detach, xbb_detach), 3779 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3780 DEVMETHOD(device_suspend, xbb_suspend), 3781 DEVMETHOD(device_resume, xbb_resume), 3782 3783 /* Xenbus interface */ 3784 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3785 { 0, 0 } 3786 }; 3787 3788 static driver_t xbb_driver = { 3789 "xbbd", 3790 xbb_methods, 3791 sizeof(struct xbb_softc), 3792 }; 3793 3794 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0); 3795