1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35 #include <sys/cdefs.h> 36 /** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/malloc.h> 47 48 #include <sys/bio.h> 49 #include <sys/bus.h> 50 #include <sys/conf.h> 51 #include <sys/devicestat.h> 52 #include <sys/disk.h> 53 #include <sys/fcntl.h> 54 #include <sys/filedesc.h> 55 #include <sys/kdb.h> 56 #include <sys/module.h> 57 #include <sys/namei.h> 58 #include <sys/proc.h> 59 #include <sys/rman.h> 60 #include <sys/taskqueue.h> 61 #include <sys/types.h> 62 #include <sys/vnode.h> 63 #include <sys/mount.h> 64 #include <sys/sysctl.h> 65 #include <sys/bitstring.h> 66 #include <sys/sdt.h> 67 68 #include <geom/geom.h> 69 70 #include <machine/_inttypes.h> 71 72 #include <vm/vm.h> 73 #include <vm/vm_extern.h> 74 #include <vm/vm_kern.h> 75 76 #include <xen/xen-os.h> 77 #include <xen/blkif.h> 78 #include <xen/gnttab.h> 79 #include <xen/xen_intr.h> 80 81 #include <contrib/xen/event_channel.h> 82 #include <contrib/xen/grant_table.h> 83 84 #include <xen/xenbus/xenbusvar.h> 85 86 /*--------------------------- Compile-time Tunables --------------------------*/ 87 /** 88 * The maximum number of shared memory ring pages we will allow in a 89 * negotiated block-front/back communication channel. Allow enough 90 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 91 */ 92 #define XBB_MAX_RING_PAGES 32 93 94 /** 95 * The maximum number of outstanding request blocks (request headers plus 96 * additional segment blocks) we will allow in a negotiated block-front/back 97 * communication channel. 
98 */ 99 #define XBB_MAX_REQUESTS \ 100 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 101 102 /** 103 * \brief Define to enable rudimentary request logging to the console. 104 */ 105 #undef XBB_DEBUG 106 107 /*---------------------------------- Macros ----------------------------------*/ 108 /** 109 * Custom malloc type for all driver allocations. 110 */ 111 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 112 113 #ifdef XBB_DEBUG 114 #define DPRINTF(fmt, args...) \ 115 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 116 #else 117 #define DPRINTF(fmt, args...) do {} while(0) 118 #endif 119 120 /** 121 * The maximum mapped region size per request we will allow in a negotiated 122 * block-front/back communication channel. 123 * Use old default of MAXPHYS == 128K. 124 */ 125 #define XBB_MAX_REQUEST_SIZE \ 126 MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 127 128 /** 129 * The maximum number of segments (within a request header and accompanying 130 * segment blocks) per request we will allow in a negotiated block-front/back 131 * communication channel. 132 */ 133 #define XBB_MAX_SEGMENTS_PER_REQUEST \ 134 (MIN(UIO_MAXIOV, \ 135 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 136 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 137 138 /** 139 * The maximum number of ring pages that we can allow per request list. 140 * We limit this to the maximum number of segments per request, because 141 * that is already a reasonable number of segments to aggregate. This 142 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 143 * because that would leave situations where we can't dispatch even one 144 * large request. 145 */ 146 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 147 148 /*--------------------------- Forward Declarations ---------------------------*/ 149 struct xbb_softc; 150 struct xbb_xen_req; 151 152 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 153 ...) __attribute__((format(printf, 3, 4))); 154 static int xbb_shutdown(struct xbb_softc *xbb); 155 156 /*------------------------------ Data Structures -----------------------------*/ 157 158 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 159 160 typedef enum { 161 XBB_REQLIST_NONE = 0x00, 162 XBB_REQLIST_MAPPED = 0x01 163 } xbb_reqlist_flags; 164 165 struct xbb_xen_reqlist { 166 /** 167 * Back reference to the parent block back instance for this 168 * request. Used during bio_done handling. 169 */ 170 struct xbb_softc *xbb; 171 172 /** 173 * BLKIF_OP code for this request. 174 */ 175 int operation; 176 177 /** 178 * Set to BLKIF_RSP_* to indicate request status. 179 * 180 * This field allows an error status to be recorded even if the 181 * delivery of this status must be deferred. Deferred reporting 182 * is necessary, for example, when an error is detected during 183 * completion processing of one bio when other bios for this 184 * request are still outstanding. 185 */ 186 int status; 187 188 /** 189 * Number of 512 byte sectors not transferred. 190 */ 191 int residual_512b_sectors; 192 193 /** 194 * Starting sector number of the first request in the list. 195 */ 196 off_t starting_sector_number; 197 198 /** 199 * If we're going to coalesce, the next contiguous sector would be 200 * this one. 201 */ 202 off_t next_contig_sector; 203 204 /** 205 * Number of child requests in the list. 206 */ 207 int num_children; 208 209 /** 210 * Number of I/O requests still pending on the backend. 
211 */ 212 int pendcnt; 213 214 /** 215 * Total number of segments for requests in the list. 216 */ 217 int nr_segments; 218 219 /** 220 * Flags for this particular request list. 221 */ 222 xbb_reqlist_flags flags; 223 224 /** 225 * Kernel virtual address space reserved for this request 226 * list structure and used to map the remote domain's pages for 227 * this I/O, into our domain's address space. 228 */ 229 uint8_t *kva; 230 231 /** 232 * Base, pseudo-physical address, corresponding to the start 233 * of this request's kva region. 234 */ 235 uint64_t gnt_base; 236 237 /** 238 * Array of grant handles (one per page) used to map this request. 239 */ 240 grant_handle_t *gnt_handles; 241 242 /** 243 * Device statistics request ordering type (ordered or simple). 244 */ 245 devstat_tag_type ds_tag_type; 246 247 /** 248 * Device statistics request type (read, write, no_data). 249 */ 250 devstat_trans_flags ds_trans_type; 251 252 /** 253 * The start time for this request. 254 */ 255 struct bintime ds_t0; 256 257 /** 258 * Linked list of contiguous requests with the same operation type. 259 */ 260 struct xbb_xen_req_list contig_req_list; 261 262 /** 263 * Linked list links used to aggregate idle requests in the 264 * request list free pool (xbb->reqlist_free_stailq) and pending 265 * requests waiting for execution (xbb->reqlist_pending_stailq). 266 */ 267 STAILQ_ENTRY(xbb_xen_reqlist) links; 268 }; 269 270 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 271 272 /** 273 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 274 */ 275 struct xbb_xen_req { 276 /** 277 * Linked list links used to aggregate requests into a reqlist 278 * and to store them in the request free pool. 279 */ 280 STAILQ_ENTRY(xbb_xen_req) links; 281 282 /** 283 * The remote domain's identifier for this I/O request. 284 */ 285 uint64_t id; 286 287 /** 288 * The number of pages currently mapped for this request. 289 */ 290 int nr_pages; 291 292 /** 293 * The number of 512 byte sectors comprising this requests. 294 */ 295 int nr_512b_sectors; 296 297 /** 298 * BLKIF_OP code for this request. 299 */ 300 int operation; 301 302 /** 303 * Storage used for non-native ring requests. 304 */ 305 blkif_request_t ring_req_storage; 306 307 /** 308 * Pointer to the Xen request in the ring. 309 */ 310 blkif_request_t *ring_req; 311 312 /** 313 * Consumer index for this request. 314 */ 315 RING_IDX req_ring_idx; 316 317 /** 318 * The start time for this request. 319 */ 320 struct bintime ds_t0; 321 322 /** 323 * Pointer back to our parent request list. 324 */ 325 struct xbb_xen_reqlist *reqlist; 326 }; 327 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 328 329 /** 330 * \brief Configuration data for the shared memory request ring 331 * used to communicate with the front-end client of this 332 * this driver. 333 */ 334 struct xbb_ring_config { 335 /** KVA address where ring memory is mapped. */ 336 vm_offset_t va; 337 338 /** The pseudo-physical address where ring memory is mapped.*/ 339 uint64_t gnt_addr; 340 341 /** 342 * Grant table handles, one per-ring page, returned by the 343 * hyperpervisor upon mapping of the ring and required to 344 * unmap it when a connection is torn down. 345 */ 346 grant_handle_t handle[XBB_MAX_RING_PAGES]; 347 348 /** 349 * The device bus address returned by the hypervisor when 350 * mapping the ring and required to unmap it when a connection 351 * is torn down. 352 */ 353 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 354 355 /** The number of ring pages mapped for the current connection. 
*/ 356 u_int ring_pages; 357 358 /** 359 * The grant references, one per-ring page, supplied by the 360 * front-end, allowing us to reference the ring pages in the 361 * front-end's domain and to map these pages into our own domain. 362 */ 363 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 364 365 /** The interrupt driven even channel used to signal ring events. */ 366 evtchn_port_t evtchn; 367 }; 368 369 /** 370 * Per-instance connection state flags. 371 */ 372 typedef enum 373 { 374 /** 375 * The front-end requested a read-only mount of the 376 * back-end device/file. 377 */ 378 XBBF_READ_ONLY = 0x01, 379 380 /** Communication with the front-end has been established. */ 381 XBBF_RING_CONNECTED = 0x02, 382 383 /** 384 * Front-end requests exist in the ring and are waiting for 385 * xbb_xen_req objects to free up. 386 */ 387 XBBF_RESOURCE_SHORTAGE = 0x04, 388 389 /** Connection teardown in progress. */ 390 XBBF_SHUTDOWN = 0x08, 391 392 /** A thread is already performing shutdown processing. */ 393 XBBF_IN_SHUTDOWN = 0x10 394 } xbb_flag_t; 395 396 /** Backend device type. */ 397 typedef enum { 398 /** Backend type unknown. */ 399 XBB_TYPE_NONE = 0x00, 400 401 /** 402 * Backend type disk (access via cdev switch 403 * strategy routine). 404 */ 405 XBB_TYPE_DISK = 0x01, 406 407 /** Backend type file (access vnode operations.). */ 408 XBB_TYPE_FILE = 0x02 409 } xbb_type; 410 411 /** 412 * \brief Structure used to memoize information about a per-request 413 * scatter-gather list. 414 * 415 * The chief benefit of using this data structure is it avoids having 416 * to reparse the possibly discontiguous S/G list in the original 417 * request. Due to the way that the mapping of the memory backing an 418 * I/O transaction is handled by Xen, a second pass is unavoidable. 419 * At least this way the second walk is a simple array traversal. 420 * 421 * \note A single Scatter/Gather element in the block interface covers 422 * at most 1 machine page. In this context a sector (blkif 423 * nomenclature, not what I'd choose) is a 512b aligned unit 424 * of mapping within the machine page referenced by an S/G 425 * element. 426 */ 427 struct xbb_sg { 428 /** The number of 512b data chunks mapped in this S/G element. */ 429 int16_t nsect; 430 431 /** 432 * The index (0 based) of the first 512b data chunk mapped 433 * in this S/G element. 434 */ 435 uint8_t first_sect; 436 437 /** 438 * The index (0 based) of the last 512b data chunk mapped 439 * in this S/G element. 440 */ 441 uint8_t last_sect; 442 }; 443 444 /** 445 * Character device backend specific configuration data. 446 */ 447 struct xbb_dev_data { 448 /** Cdev used for device backend access. */ 449 struct cdev *cdev; 450 451 /** Cdev switch used for device backend access. */ 452 struct cdevsw *csw; 453 454 /** Used to hold a reference on opened cdev backend devices. */ 455 int dev_ref; 456 }; 457 458 /** 459 * File backend specific configuration data. 460 */ 461 struct xbb_file_data { 462 /** Credentials to use for vnode backed (file based) I/O. */ 463 struct ucred *cred; 464 465 /** 466 * \brief Array of io vectors used to process file based I/O. 467 * 468 * Only a single file based request is outstanding per-xbb instance, 469 * so we only need one of these. 470 */ 471 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 472 }; 473 474 /** 475 * Collection of backend type specific data. 476 */ 477 union xbb_backend_data { 478 struct xbb_dev_data dev; 479 struct xbb_file_data file; 480 }; 481 482 /** 483 * Function signature of backend specific I/O handlers. 
484 */ 485 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 486 struct xbb_xen_reqlist *reqlist, int operation, 487 int flags); 488 489 /** 490 * Per-instance configuration data. 491 */ 492 struct xbb_softc { 493 /** 494 * Task-queue used to process I/O requests. 495 */ 496 struct taskqueue *io_taskqueue; 497 498 /** 499 * Single "run the request queue" task enqueued 500 * on io_taskqueue. 501 */ 502 struct task io_task; 503 504 /** Device type for this instance. */ 505 xbb_type device_type; 506 507 /** NewBus device corresponding to this instance. */ 508 device_t dev; 509 510 /** Backend specific dispatch routine for this instance. */ 511 xbb_dispatch_t dispatch_io; 512 513 /** The number of requests outstanding on the backend device/file. */ 514 int active_request_count; 515 516 /** Free pool of request tracking structures. */ 517 struct xbb_xen_req_list request_free_stailq; 518 519 /** Array, sized at connection time, of request tracking structures. */ 520 struct xbb_xen_req *requests; 521 522 /** Free pool of request list structures. */ 523 struct xbb_xen_reqlist_list reqlist_free_stailq; 524 525 /** List of pending request lists awaiting execution. */ 526 struct xbb_xen_reqlist_list reqlist_pending_stailq; 527 528 /** Array, sized at connection time, of request list structures. */ 529 struct xbb_xen_reqlist *request_lists; 530 531 /** 532 * Global pool of kva used for mapping remote domain ring 533 * and I/O transaction data. 534 */ 535 vm_offset_t kva; 536 537 /** Pseudo-physical address corresponding to kva. */ 538 uint64_t gnt_base_addr; 539 540 /** The size of the global kva pool. */ 541 int kva_size; 542 543 /** The size of the KVA area used for request lists. */ 544 int reqlist_kva_size; 545 546 /** The number of pages of KVA used for request lists */ 547 int reqlist_kva_pages; 548 549 /** Bitmap of free KVA pages */ 550 bitstr_t *kva_free; 551 552 /** 553 * \brief Cached value of the front-end's domain id. 554 * 555 * This value is used at once for each mapped page in 556 * a transaction. We cache it to avoid incuring the 557 * cost of an ivar access every time this is needed. 558 */ 559 domid_t otherend_id; 560 561 /** 562 * \brief The blkif protocol abi in effect. 563 * 564 * There are situations where the back and front ends can 565 * have a different, native abi (e.g. intel x86_64 and 566 * 32bit x86 domains on the same machine). The back-end 567 * always accommodates the front-end's native abi. That 568 * value is pulled from the XenStore and recorded here. 569 */ 570 int abi; 571 572 /** 573 * \brief The maximum number of requests and request lists allowed 574 * to be in flight at a time. 575 * 576 * This value is negotiated via the XenStore. 577 */ 578 u_int max_requests; 579 580 /** 581 * \brief The maximum number of segments (1 page per segment) 582 * that can be mapped by a request. 583 * 584 * This value is negotiated via the XenStore. 585 */ 586 u_int max_request_segments; 587 588 /** 589 * \brief Maximum number of segments per request list. 590 * 591 * This value is derived from and will generally be larger than 592 * max_request_segments. 593 */ 594 u_int max_reqlist_segments; 595 596 /** 597 * The maximum size of any request to this back-end 598 * device. 599 * 600 * This value is negotiated via the XenStore. 601 */ 602 u_int max_request_size; 603 604 /** 605 * The maximum size of any request list. This is derived directly 606 * from max_reqlist_segments. 607 */ 608 u_int max_reqlist_size; 609 610 /** Various configuration and state bit flags. 
*/ 611 xbb_flag_t flags; 612 613 /** Ring mapping and interrupt configuration data. */ 614 struct xbb_ring_config ring_config; 615 616 /** Runtime, cross-abi safe, structures for ring access. */ 617 blkif_back_rings_t rings; 618 619 /** IRQ mapping for the communication ring event channel. */ 620 xen_intr_handle_t xen_intr_handle; 621 622 /** 623 * \brief Backend access mode flags (e.g. write, or read-only). 624 * 625 * This value is passed to us by the front-end via the XenStore. 626 */ 627 char *dev_mode; 628 629 /** 630 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 631 * 632 * This value is passed to us by the front-end via the XenStore. 633 * Currently unused. 634 */ 635 char *dev_type; 636 637 /** 638 * \brief Backend device/file identifier. 639 * 640 * This value is passed to us by the front-end via the XenStore. 641 * We expect this to be a POSIX path indicating the file or 642 * device to open. 643 */ 644 char *dev_name; 645 646 /** 647 * Vnode corresponding to the backend device node or file 648 * we are acessing. 649 */ 650 struct vnode *vn; 651 652 union xbb_backend_data backend; 653 654 /** The native sector size of the backend. */ 655 u_int sector_size; 656 657 /** log2 of sector_size. */ 658 u_int sector_size_shift; 659 660 /** Size in bytes of the backend device or file. */ 661 off_t media_size; 662 663 /** 664 * \brief media_size expressed in terms of the backend native 665 * sector size. 666 * 667 * (e.g. xbb->media_size >> xbb->sector_size_shift). 668 */ 669 uint64_t media_num_sectors; 670 671 /** 672 * \brief Array of memoized scatter gather data computed during the 673 * conversion of blkif ring requests to internal xbb_xen_req 674 * structures. 675 * 676 * Ring processing is serialized so we only need one of these. 677 */ 678 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 679 680 /** 681 * Temporary grant table map used in xbb_dispatch_io(). When 682 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 683 * stack could cause a stack overflow. 684 */ 685 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 686 687 /** Mutex protecting per-instance data. */ 688 struct mtx lock; 689 690 /** 691 * Resource representing allocated physical address space 692 * associated with our per-instance kva region. 693 */ 694 struct resource *pseudo_phys_res; 695 696 /** Resource id for allocated physical address space. */ 697 int pseudo_phys_res_id; 698 699 /** 700 * I/O statistics from BlockBack dispatch down. These are 701 * coalesced requests, and we start them right before execution. 702 */ 703 struct devstat *xbb_stats; 704 705 /** 706 * I/O statistics coming into BlockBack. These are the requests as 707 * we get them from BlockFront. They are started as soon as we 708 * receive a request, and completed when the I/O is complete. 
709 */ 710 struct devstat *xbb_stats_in; 711 712 /** Disable sending flush to the backend */ 713 int disable_flush; 714 715 /** Send a real flush for every N flush requests */ 716 int flush_interval; 717 718 /** Count of flush requests in the interval */ 719 int flush_count; 720 721 /** Don't coalesce requests if this is set */ 722 int no_coalesce_reqs; 723 724 /** Number of requests we have received */ 725 uint64_t reqs_received; 726 727 /** Number of requests we have completed*/ 728 uint64_t reqs_completed; 729 730 /** Number of requests we queued but not pushed*/ 731 uint64_t reqs_queued_for_completion; 732 733 /** Number of requests we completed with an error status*/ 734 uint64_t reqs_completed_with_error; 735 736 /** How many forced dispatches (i.e. without coalescing) have happened */ 737 uint64_t forced_dispatch; 738 739 /** How many normal dispatches have happened */ 740 uint64_t normal_dispatch; 741 742 /** How many total dispatches have happened */ 743 uint64_t total_dispatch; 744 745 /** How many times we have run out of KVA */ 746 uint64_t kva_shortages; 747 748 /** How many times we have run out of request structures */ 749 uint64_t request_shortages; 750 751 /** Watch to wait for hotplug script execution */ 752 struct xs_watch hotplug_watch; 753 754 /** Got the needed data from hotplug scripts? */ 755 bool hotplug_done; 756 }; 757 758 /*---------------------------- Request Processing ----------------------------*/ 759 /** 760 * Allocate an internal transaction tracking structure from the free pool. 761 * 762 * \param xbb Per-instance xbb configuration structure. 763 * 764 * \return On success, a pointer to the allocated xbb_xen_req structure. 765 * Otherwise NULL. 766 */ 767 static inline struct xbb_xen_req * 768 xbb_get_req(struct xbb_softc *xbb) 769 { 770 struct xbb_xen_req *req; 771 772 req = NULL; 773 774 mtx_assert(&xbb->lock, MA_OWNED); 775 776 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 777 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 778 xbb->active_request_count++; 779 } 780 781 return (req); 782 } 783 784 /** 785 * Return an allocated transaction tracking structure to the free pool. 786 * 787 * \param xbb Per-instance xbb configuration structure. 788 * \param req The request structure to free. 789 */ 790 static inline void 791 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 792 { 793 mtx_assert(&xbb->lock, MA_OWNED); 794 795 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 796 xbb->active_request_count--; 797 798 KASSERT(xbb->active_request_count >= 0, 799 ("xbb_release_req: negative active count")); 800 } 801 802 /** 803 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 804 * 805 * \param xbb Per-instance xbb configuration structure. 806 * \param req_list The list of requests to free. 807 * \param nreqs The number of items in the list. 808 */ 809 static inline void 810 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 811 int nreqs) 812 { 813 mtx_assert(&xbb->lock, MA_OWNED); 814 815 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 816 xbb->active_request_count -= nreqs; 817 818 KASSERT(xbb->active_request_count >= 0, 819 ("xbb_release_reqs: negative active count")); 820 } 821 822 /** 823 * Given a page index and 512b sector offset within that page, 824 * calculate an offset into a request's kva region. 825 * 826 * \param reqlist The request structure whose kva region will be accessed. 827 * \param pagenr The page index used to compute the kva offset. 
828 * \param sector The 512b sector index used to compute the page relative 829 * kva offset. 830 * 831 * \return The computed global KVA offset. 832 */ 833 static inline uint8_t * 834 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 835 { 836 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 837 } 838 839 /** 840 * Given a page number and 512b sector offset within that page, 841 * calculate an offset into the request's memory region that the 842 * underlying backend device/file should use for I/O. 843 * 844 * \param reqlist The request structure whose I/O region will be accessed. 845 * \param pagenr The page index used to compute the I/O offset. 846 * \param sector The 512b sector index used to compute the page relative 847 * I/O offset. 848 * 849 * \return The computed global I/O address. 850 * 851 * Depending on configuration, this will either be a local bounce buffer 852 * or a pointer to the memory mapped in from the front-end domain for 853 * this request. 854 */ 855 static inline uint8_t * 856 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 857 { 858 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 859 } 860 861 /** 862 * Given a page index and 512b sector offset within that page, calculate 863 * an offset into the local pseudo-physical address space used to map a 864 * front-end's request data into a request. 865 * 866 * \param reqlist The request list structure whose pseudo-physical region 867 * will be accessed. 868 * \param pagenr The page index used to compute the pseudo-physical offset. 869 * \param sector The 512b sector index used to compute the page relative 870 * pseudo-physical offset. 871 * 872 * \return The computed global pseudo-phsyical address. 873 * 874 * Depending on configuration, this will either be a local bounce buffer 875 * or a pointer to the memory mapped in from the front-end domain for 876 * this request. 877 */ 878 static inline uintptr_t 879 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 880 { 881 struct xbb_softc *xbb; 882 883 xbb = reqlist->xbb; 884 885 return ((uintptr_t)(xbb->gnt_base_addr + 886 (uintptr_t)(reqlist->kva - xbb->kva) + 887 (PAGE_SIZE * pagenr) + (sector << 9))); 888 } 889 890 /** 891 * Get Kernel Virtual Address space for mapping requests. 892 * 893 * \param xbb Per-instance xbb configuration structure. 894 * \param nr_pages Number of pages needed. 895 * \param check_only If set, check for free KVA but don't allocate it. 896 * \param have_lock If set, xbb lock is already held. 897 * 898 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 899 * 900 * Note: This should be unnecessary once we have either chaining or 901 * scatter/gather support for struct bio. At that point we'll be able to 902 * put multiple addresses and lengths in one bio/bio chain and won't need 903 * to map everything into one virtual segment. 904 */ 905 static uint8_t * 906 xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 907 { 908 int first_clear; 909 int num_clear; 910 uint8_t *free_kva; 911 int i; 912 913 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 914 915 first_clear = 0; 916 free_kva = NULL; 917 918 mtx_lock(&xbb->lock); 919 920 /* 921 * Look for the first available page. If there are none, we're done. 
922 */ 923 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 924 925 if (first_clear == -1) 926 goto bailout; 927 928 /* 929 * Starting at the first available page, look for consecutive free 930 * pages that will satisfy the user's request. 931 */ 932 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 933 /* 934 * If this is true, the page is used, so we have to reset 935 * the number of clear pages and the first clear page 936 * (since it pointed to a region with an insufficient number 937 * of clear pages). 938 */ 939 if (bit_test(xbb->kva_free, i)) { 940 num_clear = 0; 941 first_clear = -1; 942 continue; 943 } 944 945 if (first_clear == -1) 946 first_clear = i; 947 948 /* 949 * If this is true, we've found a large enough free region 950 * to satisfy the request. 951 */ 952 if (++num_clear == nr_pages) { 953 bit_nset(xbb->kva_free, first_clear, 954 first_clear + nr_pages - 1); 955 956 free_kva = xbb->kva + 957 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 958 959 KASSERT(free_kva >= (uint8_t *)xbb->kva && 960 free_kva + (nr_pages * PAGE_SIZE) <= 961 (uint8_t *)xbb->ring_config.va, 962 ("Free KVA %p len %d out of range, " 963 "kva = %#jx, ring VA = %#jx\n", free_kva, 964 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 965 (uintmax_t)xbb->ring_config.va)); 966 break; 967 } 968 } 969 970 bailout: 971 972 if (free_kva == NULL) { 973 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 974 xbb->kva_shortages++; 975 } 976 977 mtx_unlock(&xbb->lock); 978 979 return (free_kva); 980 } 981 982 /** 983 * Free allocated KVA. 984 * 985 * \param xbb Per-instance xbb configuration structure. 986 * \param kva_ptr Pointer to allocated KVA region. 987 * \param nr_pages Number of pages in the KVA region. 988 */ 989 static void 990 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 991 { 992 intptr_t start_page; 993 994 mtx_assert(&xbb->lock, MA_OWNED); 995 996 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 997 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 998 999 } 1000 1001 /** 1002 * Unmap the front-end pages associated with this I/O request. 1003 * 1004 * \param req The request structure to unmap. 1005 */ 1006 static void 1007 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1008 { 1009 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1010 u_int i; 1011 u_int invcount; 1012 int error __diagused; 1013 1014 invcount = 0; 1015 for (i = 0; i < reqlist->nr_segments; i++) { 1016 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1017 continue; 1018 1019 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1020 unmap[invcount].dev_bus_addr = 0; 1021 unmap[invcount].handle = reqlist->gnt_handles[i]; 1022 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1023 invcount++; 1024 } 1025 1026 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1027 unmap, invcount); 1028 KASSERT(error == 0, ("Grant table operation failed")); 1029 } 1030 1031 /** 1032 * Allocate an internal transaction tracking structure from the free pool. 1033 * 1034 * \param xbb Per-instance xbb configuration structure. 1035 * 1036 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1037 * Otherwise NULL. 
1038 */ 1039 static inline struct xbb_xen_reqlist * 1040 xbb_get_reqlist(struct xbb_softc *xbb) 1041 { 1042 struct xbb_xen_reqlist *reqlist; 1043 1044 reqlist = NULL; 1045 1046 mtx_assert(&xbb->lock, MA_OWNED); 1047 1048 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1049 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1050 reqlist->flags = XBB_REQLIST_NONE; 1051 reqlist->kva = NULL; 1052 reqlist->status = BLKIF_RSP_OKAY; 1053 reqlist->residual_512b_sectors = 0; 1054 reqlist->num_children = 0; 1055 reqlist->nr_segments = 0; 1056 STAILQ_INIT(&reqlist->contig_req_list); 1057 } 1058 1059 return (reqlist); 1060 } 1061 1062 /** 1063 * Return an allocated transaction tracking structure to the free pool. 1064 * 1065 * \param xbb Per-instance xbb configuration structure. 1066 * \param req The request list structure to free. 1067 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1068 * during a resource shortage condition. 1069 */ 1070 static inline void 1071 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1072 int wakeup) 1073 { 1074 1075 mtx_assert(&xbb->lock, MA_OWNED); 1076 1077 if (wakeup) { 1078 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1079 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1080 } 1081 1082 if (reqlist->kva != NULL) 1083 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1084 1085 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1086 1087 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1088 1089 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1090 /* 1091 * Shutdown is in progress. See if we can 1092 * progress further now that one more request 1093 * has completed and been returned to the 1094 * free pool. 1095 */ 1096 xbb_shutdown(xbb); 1097 } 1098 1099 if (wakeup != 0) 1100 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1101 } 1102 1103 /** 1104 * Request resources and do basic request setup. 1105 * 1106 * \param xbb Per-instance xbb configuration structure. 1107 * \param reqlist Pointer to reqlist pointer. 1108 * \param ring_req Pointer to a block ring request. 1109 * \param ring_index The ring index of this request. 1110 * 1111 * \return 0 for success, non-zero for failure. 1112 */ 1113 static int 1114 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1115 blkif_request_t *ring_req, RING_IDX ring_idx) 1116 { 1117 struct xbb_xen_reqlist *nreqlist; 1118 struct xbb_xen_req *nreq; 1119 1120 nreqlist = NULL; 1121 nreq = NULL; 1122 1123 mtx_lock(&xbb->lock); 1124 1125 /* 1126 * We don't allow new resources to be allocated if we're in the 1127 * process of shutting down. 1128 */ 1129 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1130 mtx_unlock(&xbb->lock); 1131 return (1); 1132 } 1133 1134 /* 1135 * Allocate a reqlist if the caller doesn't have one already. 1136 */ 1137 if (*reqlist == NULL) { 1138 nreqlist = xbb_get_reqlist(xbb); 1139 if (nreqlist == NULL) 1140 goto bailout_error; 1141 } 1142 1143 /* We always allocate a request. 
*/ 1144 nreq = xbb_get_req(xbb); 1145 if (nreq == NULL) 1146 goto bailout_error; 1147 1148 mtx_unlock(&xbb->lock); 1149 1150 if (*reqlist == NULL) { 1151 *reqlist = nreqlist; 1152 nreqlist->operation = ring_req->operation; 1153 nreqlist->starting_sector_number = ring_req->sector_number; 1154 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1155 links); 1156 } 1157 1158 nreq->reqlist = *reqlist; 1159 nreq->req_ring_idx = ring_idx; 1160 nreq->id = ring_req->id; 1161 nreq->operation = ring_req->operation; 1162 1163 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1164 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1165 nreq->ring_req = &nreq->ring_req_storage; 1166 } else { 1167 nreq->ring_req = ring_req; 1168 } 1169 1170 binuptime(&nreq->ds_t0); 1171 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1172 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1173 (*reqlist)->num_children++; 1174 (*reqlist)->nr_segments += ring_req->nr_segments; 1175 1176 return (0); 1177 1178 bailout_error: 1179 1180 /* 1181 * We're out of resources, so set the shortage flag. The next time 1182 * a request is released, we'll try waking up the work thread to 1183 * see if we can allocate more resources. 1184 */ 1185 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1186 xbb->request_shortages++; 1187 1188 if (nreq != NULL) 1189 xbb_release_req(xbb, nreq); 1190 1191 if (nreqlist != NULL) 1192 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1193 1194 mtx_unlock(&xbb->lock); 1195 1196 return (1); 1197 } 1198 1199 /** 1200 * Create and queue a response to a blkif request. 1201 * 1202 * \param xbb Per-instance xbb configuration structure. 1203 * \param req The request structure to which to respond. 1204 * \param status The status code to report. See BLKIF_RSP_* 1205 * in sys/contrib/xen/io/blkif.h. 1206 */ 1207 static void 1208 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1209 { 1210 blkif_response_t *resp; 1211 1212 /* 1213 * The mutex is required here, and should be held across this call 1214 * until after the subsequent call to xbb_push_responses(). This 1215 * is to guarantee that another context won't queue responses and 1216 * push them while we're active. 1217 * 1218 * That could lead to the other end being notified of responses 1219 * before the resources have been freed on this end. The other end 1220 * would then be able to queue additional I/O, and we may run out 1221 * of resources because we haven't freed them all yet. 1222 */ 1223 mtx_assert(&xbb->lock, MA_OWNED); 1224 1225 /* 1226 * Place on the response ring for the relevant domain. 1227 * For now, only the spacing between entries is different 1228 * in the different ABIs, not the response entry layout. 
1229 */ 1230 switch (xbb->abi) { 1231 case BLKIF_PROTOCOL_NATIVE: 1232 resp = RING_GET_RESPONSE(&xbb->rings.native, 1233 xbb->rings.native.rsp_prod_pvt); 1234 break; 1235 case BLKIF_PROTOCOL_X86_32: 1236 resp = (blkif_response_t *) 1237 RING_GET_RESPONSE(&xbb->rings.x86_32, 1238 xbb->rings.x86_32.rsp_prod_pvt); 1239 break; 1240 case BLKIF_PROTOCOL_X86_64: 1241 resp = (blkif_response_t *) 1242 RING_GET_RESPONSE(&xbb->rings.x86_64, 1243 xbb->rings.x86_64.rsp_prod_pvt); 1244 break; 1245 default: 1246 panic("Unexpected blkif protocol ABI."); 1247 } 1248 1249 resp->id = req->id; 1250 resp->operation = req->operation; 1251 resp->status = status; 1252 1253 if (status != BLKIF_RSP_OKAY) 1254 xbb->reqs_completed_with_error++; 1255 1256 xbb->rings.common.rsp_prod_pvt++; 1257 1258 xbb->reqs_queued_for_completion++; 1259 1260 } 1261 1262 /** 1263 * Send queued responses to blkif requests. 1264 * 1265 * \param xbb Per-instance xbb configuration structure. 1266 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1267 * should be run, 0 if it does not need to be run. 1268 * \param notify Flag that is set to 1 if the other end should be 1269 * notified via irq, 0 if the other end should not be 1270 * notified. 1271 */ 1272 static void 1273 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1274 { 1275 int more_to_do; 1276 1277 /* 1278 * The mutex is required here. 1279 */ 1280 mtx_assert(&xbb->lock, MA_OWNED); 1281 1282 more_to_do = 0; 1283 1284 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1285 1286 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1287 /* 1288 * Tail check for pending requests. Allows frontend to avoid 1289 * notifications if requests are already in flight (lower 1290 * overheads and promotes batching). 1291 */ 1292 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1293 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1294 more_to_do = 1; 1295 } 1296 1297 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1298 xbb->reqs_queued_for_completion = 0; 1299 1300 *run_taskqueue = more_to_do; 1301 } 1302 1303 /** 1304 * Complete a request list. 1305 * 1306 * \param xbb Per-instance xbb configuration structure. 1307 * \param reqlist Allocated internal request list structure. 1308 */ 1309 static void 1310 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1311 { 1312 struct xbb_xen_req *nreq; 1313 off_t sectors_sent; 1314 int notify, run_taskqueue; 1315 1316 sectors_sent = 0; 1317 1318 if (reqlist->flags & XBB_REQLIST_MAPPED) 1319 xbb_unmap_reqlist(reqlist); 1320 1321 mtx_lock(&xbb->lock); 1322 1323 /* 1324 * All I/O is done, send the response. A lock is not necessary 1325 * to protect the request list, because all requests have 1326 * completed. Therefore this is the only context accessing this 1327 * reqlist right now. However, in order to make sure that no one 1328 * else queues responses onto the queue or pushes them to the other 1329 * side while we're active, we need to hold the lock across the 1330 * calls to xbb_queue_response() and xbb_push_responses(). 1331 */ 1332 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1333 off_t cur_sectors_sent; 1334 1335 /* Put this response on the ring, but don't push yet */ 1336 xbb_queue_response(xbb, nreq, reqlist->status); 1337 1338 /* We don't report bytes sent if there is an error. 
*/ 1339 if (reqlist->status == BLKIF_RSP_OKAY) 1340 cur_sectors_sent = nreq->nr_512b_sectors; 1341 else 1342 cur_sectors_sent = 0; 1343 1344 sectors_sent += cur_sectors_sent; 1345 1346 devstat_end_transaction(xbb->xbb_stats_in, 1347 /*bytes*/cur_sectors_sent << 9, 1348 reqlist->ds_tag_type, 1349 reqlist->ds_trans_type, 1350 /*now*/NULL, 1351 /*then*/&nreq->ds_t0); 1352 } 1353 1354 /* 1355 * Take out any sectors not sent. If we wind up negative (which 1356 * might happen if an error is reported as well as a residual), just 1357 * report 0 sectors sent. 1358 */ 1359 sectors_sent -= reqlist->residual_512b_sectors; 1360 if (sectors_sent < 0) 1361 sectors_sent = 0; 1362 1363 devstat_end_transaction(xbb->xbb_stats, 1364 /*bytes*/ sectors_sent << 9, 1365 reqlist->ds_tag_type, 1366 reqlist->ds_trans_type, 1367 /*now*/NULL, 1368 /*then*/&reqlist->ds_t0); 1369 1370 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1371 1372 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1373 1374 mtx_unlock(&xbb->lock); 1375 1376 if (run_taskqueue) 1377 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1378 1379 if (notify) 1380 xen_intr_signal(xbb->xen_intr_handle); 1381 } 1382 1383 /** 1384 * Completion handler for buffer I/O requests issued by the device 1385 * backend driver. 1386 * 1387 * \param bio The buffer I/O request on which to perform completion 1388 * processing. 1389 */ 1390 static void 1391 xbb_bio_done(struct bio *bio) 1392 { 1393 struct xbb_softc *xbb; 1394 struct xbb_xen_reqlist *reqlist; 1395 1396 reqlist = bio->bio_caller1; 1397 xbb = reqlist->xbb; 1398 1399 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1400 1401 /* 1402 * This is a bit imprecise. With aggregated I/O a single 1403 * request list can contain multiple front-end requests and 1404 * a multiple bios may point to a single request. By carefully 1405 * walking the request list, we could map residuals and errors 1406 * back to the original front-end request, but the interface 1407 * isn't sufficiently rich for us to properly report the error. 1408 * So, we just treat the entire request list as having failed if an 1409 * error occurs on any part. And, if an error occurs, we treat 1410 * the amount of data transferred as 0. 1411 * 1412 * For residuals, we report it on the overall aggregated device, 1413 * but not on the individual requests, since we don't currently 1414 * do the work to determine which front-end request to which the 1415 * residual applies. 1416 */ 1417 if (bio->bio_error) { 1418 DPRINTF("BIO returned error %d for operation on device %s\n", 1419 bio->bio_error, xbb->dev_name); 1420 reqlist->status = BLKIF_RSP_ERROR; 1421 1422 if (bio->bio_error == ENXIO 1423 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1424 /* 1425 * Backend device has disappeared. Signal the 1426 * front-end that we (the device proxy) want to 1427 * go away. 1428 */ 1429 xenbus_set_state(xbb->dev, XenbusStateClosing); 1430 } 1431 } 1432 1433 /* 1434 * Decrement the pending count for the request list. When we're 1435 * done with the requests, send status back for all of them. 1436 */ 1437 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1438 xbb_complete_reqlist(xbb, reqlist); 1439 1440 g_destroy_bio(bio); 1441 } 1442 1443 /** 1444 * Parse a blkif request into an internal request structure and send 1445 * it to the backend for processing. 1446 * 1447 * \param xbb Per-instance xbb configuration structure. 1448 * \param reqlist Allocated internal request list structure. 1449 * 1450 * \return On success, 0. 
For resource shortages, non-zero. 1451 * 1452 * This routine performs the backend common aspects of request parsing 1453 * including compiling an internal request structure, parsing the S/G 1454 * list and any secondary ring requests in which they may reside, and 1455 * the mapping of front-end I/O pages into our domain. 1456 */ 1457 static int 1458 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1459 { 1460 struct xbb_sg *xbb_sg; 1461 struct gnttab_map_grant_ref *map; 1462 struct blkif_request_segment *sg; 1463 struct blkif_request_segment *last_block_sg; 1464 struct xbb_xen_req *nreq; 1465 u_int nseg; 1466 u_int seg_idx; 1467 u_int block_segs; 1468 int nr_sects; 1469 int total_sects; 1470 int operation; 1471 uint8_t bio_flags; 1472 int error; 1473 1474 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1475 bio_flags = 0; 1476 total_sects = 0; 1477 nr_sects = 0; 1478 1479 /* 1480 * First determine whether we have enough free KVA to satisfy this 1481 * request list. If not, tell xbb_run_queue() so it can go to 1482 * sleep until we have more KVA. 1483 */ 1484 reqlist->kva = NULL; 1485 if (reqlist->nr_segments != 0) { 1486 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1487 if (reqlist->kva == NULL) { 1488 /* 1489 * If we're out of KVA, return ENOMEM. 1490 */ 1491 return (ENOMEM); 1492 } 1493 } 1494 1495 binuptime(&reqlist->ds_t0); 1496 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1497 1498 switch (reqlist->operation) { 1499 case BLKIF_OP_WRITE_BARRIER: 1500 bio_flags |= BIO_ORDERED; 1501 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1502 /* FALLTHROUGH */ 1503 case BLKIF_OP_WRITE: 1504 operation = BIO_WRITE; 1505 reqlist->ds_trans_type = DEVSTAT_WRITE; 1506 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1507 DPRINTF("Attempt to write to read only device %s\n", 1508 xbb->dev_name); 1509 reqlist->status = BLKIF_RSP_ERROR; 1510 goto send_response; 1511 } 1512 break; 1513 case BLKIF_OP_READ: 1514 operation = BIO_READ; 1515 reqlist->ds_trans_type = DEVSTAT_READ; 1516 break; 1517 case BLKIF_OP_FLUSH_DISKCACHE: 1518 /* 1519 * If this is true, the user has requested that we disable 1520 * flush support. So we just complete the requests 1521 * successfully. 1522 */ 1523 if (xbb->disable_flush != 0) { 1524 goto send_response; 1525 } 1526 1527 /* 1528 * The user has requested that we only send a real flush 1529 * for every N flush requests. So keep count, and either 1530 * complete the request immediately or queue it for the 1531 * backend. 1532 */ 1533 if (xbb->flush_interval != 0) { 1534 if (++(xbb->flush_count) < xbb->flush_interval) { 1535 goto send_response; 1536 } else 1537 xbb->flush_count = 0; 1538 } 1539 1540 operation = BIO_FLUSH; 1541 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1542 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1543 goto do_dispatch; 1544 /*NOTREACHED*/ 1545 default: 1546 DPRINTF("error: unknown block io operation [%d]\n", 1547 reqlist->operation); 1548 reqlist->status = BLKIF_RSP_ERROR; 1549 goto send_response; 1550 } 1551 1552 reqlist->xbb = xbb; 1553 xbb_sg = xbb->xbb_sgs; 1554 map = xbb->maps; 1555 seg_idx = 0; 1556 1557 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1558 blkif_request_t *ring_req; 1559 1560 ring_req = nreq->ring_req; 1561 nr_sects = 0; 1562 nseg = ring_req->nr_segments; 1563 nreq->nr_pages = nseg; 1564 nreq->nr_512b_sectors = 0; 1565 sg = NULL; 1566 1567 /* Check that number of segments is sane. 
*/ 1568 if (__predict_false(nseg == 0) 1569 || __predict_false(nseg > xbb->max_request_segments)) { 1570 DPRINTF("Bad number of segments in request (%d)\n", 1571 nseg); 1572 reqlist->status = BLKIF_RSP_ERROR; 1573 goto send_response; 1574 } 1575 1576 block_segs = nseg; 1577 sg = ring_req->seg; 1578 last_block_sg = sg + block_segs; 1579 1580 while (sg < last_block_sg) { 1581 KASSERT(seg_idx < 1582 XBB_MAX_SEGMENTS_PER_REQLIST, 1583 ("seg_idx %d is too large, max " 1584 "segs %d\n", seg_idx, 1585 XBB_MAX_SEGMENTS_PER_REQLIST)); 1586 1587 xbb_sg->first_sect = sg->first_sect; 1588 xbb_sg->last_sect = sg->last_sect; 1589 xbb_sg->nsect = 1590 (int8_t)(sg->last_sect - 1591 sg->first_sect + 1); 1592 1593 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1594 || (xbb_sg->nsect <= 0)) { 1595 reqlist->status = BLKIF_RSP_ERROR; 1596 goto send_response; 1597 } 1598 1599 nr_sects += xbb_sg->nsect; 1600 map->host_addr = xbb_get_gntaddr(reqlist, 1601 seg_idx, /*sector*/0); 1602 KASSERT(map->host_addr + PAGE_SIZE <= 1603 xbb->ring_config.gnt_addr, 1604 ("Host address %#jx len %d overlaps " 1605 "ring address %#jx\n", 1606 (uintmax_t)map->host_addr, PAGE_SIZE, 1607 (uintmax_t)xbb->ring_config.gnt_addr)); 1608 1609 map->flags = GNTMAP_host_map; 1610 map->ref = sg->gref; 1611 map->dom = xbb->otherend_id; 1612 if (operation == BIO_WRITE) 1613 map->flags |= GNTMAP_readonly; 1614 sg++; 1615 map++; 1616 xbb_sg++; 1617 seg_idx++; 1618 } 1619 1620 /* Convert to the disk's sector size */ 1621 nreq->nr_512b_sectors = nr_sects; 1622 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1623 total_sects += nr_sects; 1624 1625 if ((nreq->nr_512b_sectors & 1626 ((xbb->sector_size >> 9) - 1)) != 0) { 1627 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1628 "a multiple of the backing store sector " 1629 "size (%d)\n", __func__, 1630 nreq->nr_512b_sectors << 9, 1631 xbb->sector_size); 1632 reqlist->status = BLKIF_RSP_ERROR; 1633 goto send_response; 1634 } 1635 } 1636 1637 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1638 xbb->maps, reqlist->nr_segments); 1639 if (error != 0) 1640 panic("Grant table operation failed (%d)", error); 1641 1642 reqlist->flags |= XBB_REQLIST_MAPPED; 1643 1644 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1645 seg_idx++, map++){ 1646 if (__predict_false(map->status != 0)) { 1647 DPRINTF("invalid buffer -- could not remap " 1648 "it (%d)\n", map->status); 1649 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1650 "0x%x ref 0x%x, dom %d\n", seg_idx, 1651 map->host_addr, map->flags, map->ref, 1652 map->dom); 1653 reqlist->status = BLKIF_RSP_ERROR; 1654 goto send_response; 1655 } 1656 1657 reqlist->gnt_handles[seg_idx] = map->handle; 1658 } 1659 if (reqlist->starting_sector_number + total_sects > 1660 xbb->media_num_sectors) { 1661 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1662 "extends past end of device %s\n", 1663 operation == BIO_READ ? 
"read" : "write", 1664 reqlist->starting_sector_number, 1665 reqlist->starting_sector_number + total_sects, 1666 xbb->dev_name); 1667 reqlist->status = BLKIF_RSP_ERROR; 1668 goto send_response; 1669 } 1670 1671 do_dispatch: 1672 1673 error = xbb->dispatch_io(xbb, 1674 reqlist, 1675 operation, 1676 bio_flags); 1677 1678 if (error != 0) { 1679 reqlist->status = BLKIF_RSP_ERROR; 1680 goto send_response; 1681 } 1682 1683 return (0); 1684 1685 send_response: 1686 1687 xbb_complete_reqlist(xbb, reqlist); 1688 1689 return (0); 1690 } 1691 1692 static __inline int 1693 xbb_count_sects(blkif_request_t *ring_req) 1694 { 1695 int i; 1696 int cur_size = 0; 1697 1698 for (i = 0; i < ring_req->nr_segments; i++) { 1699 int nsect; 1700 1701 nsect = (int8_t)(ring_req->seg[i].last_sect - 1702 ring_req->seg[i].first_sect + 1); 1703 if (nsect <= 0) 1704 break; 1705 1706 cur_size += nsect; 1707 } 1708 1709 return (cur_size); 1710 } 1711 1712 /** 1713 * Process incoming requests from the shared communication ring in response 1714 * to a signal on the ring's event channel. 1715 * 1716 * \param context Callback argument registerd during task initialization - 1717 * the xbb_softc for this instance. 1718 * \param pending The number of taskqueue_enqueue events that have 1719 * occurred since this handler was last run. 1720 */ 1721 static void 1722 xbb_run_queue(void *context, int pending) 1723 { 1724 struct xbb_softc *xbb; 1725 blkif_back_rings_t *rings; 1726 RING_IDX rp; 1727 uint64_t cur_sector; 1728 int cur_operation; 1729 struct xbb_xen_reqlist *reqlist; 1730 1731 xbb = (struct xbb_softc *)context; 1732 rings = &xbb->rings; 1733 1734 /* 1735 * Work gather and dispatch loop. Note that we have a bias here 1736 * towards gathering I/O sent by blockfront. We first gather up 1737 * everything in the ring, as long as we have resources. Then we 1738 * dispatch one request, and then attempt to gather up any 1739 * additional requests that have come in while we were dispatching 1740 * the request. 1741 * 1742 * This allows us to get a clearer picture (via devstat) of how 1743 * many requests blockfront is queueing to us at any given time. 1744 */ 1745 for (;;) { 1746 int retval; 1747 1748 /* 1749 * Initialize reqlist to the last element in the pending 1750 * queue, if there is one. This allows us to add more 1751 * requests to that request list, if we have room. 1752 */ 1753 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1754 xbb_xen_reqlist, links); 1755 if (reqlist != NULL) { 1756 cur_sector = reqlist->next_contig_sector; 1757 cur_operation = reqlist->operation; 1758 } else { 1759 cur_operation = 0; 1760 cur_sector = 0; 1761 } 1762 1763 /* 1764 * Cache req_prod to avoid accessing a cache line shared 1765 * with the frontend. 1766 */ 1767 rp = rings->common.sring->req_prod; 1768 1769 /* Ensure we see queued requests up to 'rp'. */ 1770 rmb(); 1771 1772 /** 1773 * Run so long as there is work to consume and the generation 1774 * of a response will not overflow the ring. 1775 * 1776 * @note There's a 1 to 1 relationship between requests and 1777 * responses, so an overflow should never occur. This 1778 * test is to protect our domain from digesting bogus 1779 * data. Shouldn't we log this? 
1780 */ 1781 while (rings->common.req_cons != rp 1782 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1783 rings->common.req_cons) == 0){ 1784 blkif_request_t ring_req_storage; 1785 blkif_request_t *ring_req; 1786 int cur_size; 1787 1788 switch (xbb->abi) { 1789 case BLKIF_PROTOCOL_NATIVE: 1790 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1791 rings->common.req_cons); 1792 break; 1793 case BLKIF_PROTOCOL_X86_32: 1794 { 1795 struct blkif_x86_32_request *ring_req32; 1796 1797 ring_req32 = RING_GET_REQUEST( 1798 &xbb->rings.x86_32, rings->common.req_cons); 1799 blkif_get_x86_32_req(&ring_req_storage, 1800 ring_req32); 1801 ring_req = &ring_req_storage; 1802 break; 1803 } 1804 case BLKIF_PROTOCOL_X86_64: 1805 { 1806 struct blkif_x86_64_request *ring_req64; 1807 1808 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1809 rings->common.req_cons); 1810 blkif_get_x86_64_req(&ring_req_storage, 1811 ring_req64); 1812 ring_req = &ring_req_storage; 1813 break; 1814 } 1815 default: 1816 panic("Unexpected blkif protocol ABI."); 1817 /* NOTREACHED */ 1818 } 1819 1820 /* 1821 * Check for situations that would require closing 1822 * off this I/O for further coalescing: 1823 * - Coalescing is turned off. 1824 * - Current I/O is out of sequence with the previous 1825 * I/O. 1826 * - Coalesced I/O would be too large. 1827 */ 1828 if ((reqlist != NULL) 1829 && ((xbb->no_coalesce_reqs != 0) 1830 || ((xbb->no_coalesce_reqs == 0) 1831 && ((ring_req->sector_number != cur_sector) 1832 || (ring_req->operation != cur_operation) 1833 || ((ring_req->nr_segments + reqlist->nr_segments) > 1834 xbb->max_reqlist_segments))))) { 1835 reqlist = NULL; 1836 } 1837 1838 /* 1839 * Grab and check for all resources in one shot. 1840 * If we can't get all of the resources we need, 1841 * the shortage is noted and the thread will get 1842 * woken up when more resources are available. 1843 */ 1844 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1845 xbb->rings.common.req_cons); 1846 1847 if (retval != 0) { 1848 /* 1849 * Resource shortage has been recorded. 1850 * We'll be scheduled to run once a request 1851 * object frees up due to a completion. 1852 */ 1853 break; 1854 } 1855 1856 /* 1857 * Signify that we can overwrite this request with 1858 * a response by incrementing our consumer index. 1859 * The response won't be generated until after 1860 * we've already consumed all necessary data out 1861 * of the version of the request in the ring buffer 1862 * (for native mode). We must update the consumer 1863 * index before issuing back-end I/O so there is 1864 * no possibility that it will complete and a 1865 * response be generated before we make room in 1866 * the queue for that response. 1867 */ 1868 xbb->rings.common.req_cons++; 1869 xbb->reqs_received++; 1870 1871 cur_size = xbb_count_sects(ring_req); 1872 cur_sector = ring_req->sector_number + cur_size; 1873 reqlist->next_contig_sector = cur_sector; 1874 cur_operation = ring_req->operation; 1875 } 1876 1877 /* Check for I/O to dispatch */ 1878 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1879 if (reqlist == NULL) { 1880 /* 1881 * We're out of work to do, put the task queue to 1882 * sleep. 1883 */ 1884 break; 1885 } 1886 1887 /* 1888 * Grab the first request off the queue and attempt 1889 * to dispatch it. 1890 */ 1891 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1892 1893 retval = xbb_dispatch_io(xbb, reqlist); 1894 if (retval != 0) { 1895 /* 1896 * xbb_dispatch_io() returns non-zero only when 1897 * there is a resource shortage. 
If that's the 1898 * case, re-queue this request on the head of the 1899 * queue, and go to sleep until we have more 1900 * resources. 1901 */ 1902 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 1903 reqlist, links); 1904 break; 1905 } else { 1906 /* 1907 * If we still have anything on the queue after 1908 * removing the head entry, that is because we 1909 * met one of the criteria to create a new 1910 * request list (outlined above), and we'll call 1911 * that a forced dispatch for statistical purposes. 1912 * 1913 * Otherwise, if there is only one element on the 1914 * queue, we coalesced everything available on 1915 * the ring and we'll call that a normal dispatch. 1916 */ 1917 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1918 1919 if (reqlist != NULL) 1920 xbb->forced_dispatch++; 1921 else 1922 xbb->normal_dispatch++; 1923 1924 xbb->total_dispatch++; 1925 } 1926 } 1927 } 1928 1929 /** 1930 * Interrupt handler bound to the shared ring's event channel. 1931 * 1932 * \param arg Callback argument registerd during event channel 1933 * binding - the xbb_softc for this instance. 1934 */ 1935 static int 1936 xbb_filter(void *arg) 1937 { 1938 struct xbb_softc *xbb; 1939 1940 /* Defer to taskqueue thread. */ 1941 xbb = (struct xbb_softc *)arg; 1942 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1943 1944 return (FILTER_HANDLED); 1945 } 1946 1947 SDT_PROVIDER_DEFINE(xbb); 1948 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 1949 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 1950 "uint64_t"); 1951 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 1952 "uint64_t", "uint64_t"); 1953 1954 /*----------------------------- Backend Handlers -----------------------------*/ 1955 /** 1956 * Backend handler for character device access. 1957 * 1958 * \param xbb Per-instance xbb configuration structure. 1959 * \param reqlist Allocated internal request list structure. 1960 * \param operation BIO_* I/O operation code. 1961 * \param bio_flags Additional bio_flag data to pass to any generated 1962 * bios (e.g. BIO_ORDERED).. 1963 * 1964 * \return 0 for success, errno codes for failure. 
1965 */ 1966 static int 1967 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1968 int operation, int bio_flags) 1969 { 1970 struct xbb_dev_data *dev_data; 1971 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 1972 off_t bio_offset; 1973 struct bio *bio; 1974 struct xbb_sg *xbb_sg; 1975 u_int nbio; 1976 u_int bio_idx; 1977 u_int nseg; 1978 u_int seg_idx; 1979 int error; 1980 1981 dev_data = &xbb->backend.dev; 1982 bio_offset = (off_t)reqlist->starting_sector_number 1983 << xbb->sector_size_shift; 1984 error = 0; 1985 nbio = 0; 1986 bio_idx = 0; 1987 1988 if (operation == BIO_FLUSH) { 1989 bio = g_new_bio(); 1990 if (__predict_false(bio == NULL)) { 1991 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 1992 error = ENOMEM; 1993 return (error); 1994 } 1995 1996 bio->bio_cmd = BIO_FLUSH; 1997 bio->bio_flags |= BIO_ORDERED; 1998 bio->bio_dev = dev_data->cdev; 1999 bio->bio_offset = 0; 2000 bio->bio_data = 0; 2001 bio->bio_done = xbb_bio_done; 2002 bio->bio_caller1 = reqlist; 2003 bio->bio_pblkno = 0; 2004 2005 reqlist->pendcnt = 1; 2006 2007 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2008 device_get_unit(xbb->dev)); 2009 2010 (*dev_data->csw->d_strategy)(bio); 2011 2012 return (0); 2013 } 2014 2015 xbb_sg = xbb->xbb_sgs; 2016 bio = NULL; 2017 nseg = reqlist->nr_segments; 2018 2019 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2020 /* 2021 * KVA will not be contiguous, so any additional 2022 * I/O will need to be represented in a new bio. 2023 */ 2024 if ((bio != NULL) 2025 && (xbb_sg->first_sect != 0)) { 2026 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2027 printf("%s: Discontiguous I/O request " 2028 "from domain %d ends on " 2029 "non-sector boundary\n", 2030 __func__, xbb->otherend_id); 2031 error = EINVAL; 2032 goto fail_free_bios; 2033 } 2034 bio = NULL; 2035 } 2036 2037 if (bio == NULL) { 2038 /* 2039 * Make sure that the start of this bio is 2040 * aligned to a device sector. 2041 */ 2042 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2043 printf("%s: Misaligned I/O request " 2044 "from domain %d\n", __func__, 2045 xbb->otherend_id); 2046 error = EINVAL; 2047 goto fail_free_bios; 2048 } 2049 2050 bio = bios[nbio++] = g_new_bio(); 2051 if (__predict_false(bio == NULL)) { 2052 error = ENOMEM; 2053 goto fail_free_bios; 2054 } 2055 bio->bio_cmd = operation; 2056 bio->bio_flags |= bio_flags; 2057 bio->bio_dev = dev_data->cdev; 2058 bio->bio_offset = bio_offset; 2059 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2060 xbb_sg->first_sect); 2061 bio->bio_done = xbb_bio_done; 2062 bio->bio_caller1 = reqlist; 2063 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2064 } 2065 2066 bio->bio_length += xbb_sg->nsect << 9; 2067 bio->bio_bcount = bio->bio_length; 2068 bio_offset += xbb_sg->nsect << 9; 2069 2070 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2071 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2072 printf("%s: Discontiguous I/O request " 2073 "from domain %d ends on " 2074 "non-sector boundary\n", 2075 __func__, xbb->otherend_id); 2076 error = EINVAL; 2077 goto fail_free_bios; 2078 } 2079 /* 2080 * KVA will not be contiguous, so any additional 2081 * I/O will need to be represented in a new bio. 
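 * (With 4 KiB pages, (PAGE_SIZE - 512) >> 9 == 7; a last_sect of anything
 * less means the segment ends short of its page, so the next segment's
 * data cannot be virtually adjacent to it.)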
2082 */ 2083 bio = NULL; 2084 } 2085 } 2086 2087 reqlist->pendcnt = nbio; 2088 2089 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2090 { 2091 if (operation == BIO_READ) { 2092 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2093 device_get_unit(xbb->dev), 2094 bios[bio_idx]->bio_offset, 2095 bios[bio_idx]->bio_length); 2096 } else if (operation == BIO_WRITE) { 2097 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2098 device_get_unit(xbb->dev), 2099 bios[bio_idx]->bio_offset, 2100 bios[bio_idx]->bio_length); 2101 } 2102 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2103 } 2104 2105 return (error); 2106 2107 fail_free_bios: 2108 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2109 g_destroy_bio(bios[bio_idx]); 2110 2111 return (error); 2112 } 2113 2114 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2115 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2116 "uint64_t"); 2117 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2118 "uint64_t", "uint64_t"); 2119 2120 /** 2121 * Backend handler for file access. 2122 * 2123 * \param xbb Per-instance xbb configuration structure. 2124 * \param reqlist Allocated internal request list. 2125 * \param operation BIO_* I/O operation code. 2126 * \param flags Additional bio_flag data to pass to any generated bios 2127 * (e.g. BIO_ORDERED).. 2128 * 2129 * \return 0 for success, errno codes for failure. 2130 */ 2131 static int 2132 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2133 int operation, int flags) 2134 { 2135 struct xbb_file_data *file_data; 2136 u_int seg_idx; 2137 u_int nseg; 2138 struct uio xuio; 2139 struct xbb_sg *xbb_sg; 2140 struct iovec *xiovec; 2141 int error; 2142 2143 file_data = &xbb->backend.file; 2144 error = 0; 2145 bzero(&xuio, sizeof(xuio)); 2146 2147 switch (operation) { 2148 case BIO_READ: 2149 xuio.uio_rw = UIO_READ; 2150 break; 2151 case BIO_WRITE: 2152 xuio.uio_rw = UIO_WRITE; 2153 break; 2154 case BIO_FLUSH: { 2155 struct mount *mountpoint; 2156 2157 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2158 device_get_unit(xbb->dev)); 2159 2160 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2161 2162 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2163 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2164 VOP_UNLOCK(xbb->vn); 2165 2166 vn_finished_write(mountpoint); 2167 2168 goto bailout_send_response; 2169 /* NOTREACHED */ 2170 } 2171 default: 2172 panic("invalid operation %d", operation); 2173 /* NOTREACHED */ 2174 } 2175 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2176 << xbb->sector_size_shift; 2177 xuio.uio_segflg = UIO_SYSSPACE; 2178 xuio.uio_iov = file_data->xiovecs; 2179 xuio.uio_iovcnt = 0; 2180 xbb_sg = xbb->xbb_sgs; 2181 nseg = reqlist->nr_segments; 2182 2183 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2184 /* 2185 * If the first sector is not 0, the KVA will 2186 * not be contiguous and we'll need to go on 2187 * to another segment. 2188 */ 2189 if (xbb_sg->first_sect != 0) 2190 xiovec = NULL; 2191 2192 if (xiovec == NULL) { 2193 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2194 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2195 seg_idx, xbb_sg->first_sect); 2196 xiovec->iov_len = 0; 2197 xuio.uio_iovcnt++; 2198 } 2199 2200 xiovec->iov_len += xbb_sg->nsect << 9; 2201 2202 xuio.uio_resid += xbb_sg->nsect << 9; 2203 2204 /* 2205 * If the last sector is not the full page 2206 * size count, the next segment will not be 2207 * contiguous in KVA and we need a new iovec. 
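 * For example (illustrative numbers only): three virtually contiguous,
 * fully populated 4 KiB segments collapse into a single iovec with
 * iov_len == 12288, so VOP_READ()/VOP_WRITE() sees one 12 KiB transfer
 * rather than three separate ones.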
2208 */ 2209 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2210 xiovec = NULL; 2211 } 2212 2213 xuio.uio_td = curthread; 2214 2215 switch (operation) { 2216 case BIO_READ: 2217 2218 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2219 device_get_unit(xbb->dev), xuio.uio_offset, 2220 xuio.uio_resid); 2221 2222 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2223 2224 /* 2225 * UFS pays attention to IO_DIRECT for reads. If the 2226 * DIRECTIO option is configured into the kernel, it calls 2227 * ffs_rawread(). But that only works for single-segment 2228 * uios with user space addresses. In our case, with a 2229 * kernel uio, it still reads into the buffer cache, but it 2230 * will just try to release the buffer from the cache later 2231 * on in ffs_read(). 2232 * 2233 * ZFS does not pay attention to IO_DIRECT for reads. 2234 * 2235 * UFS does not pay attention to IO_SYNC for reads. 2236 * 2237 * ZFS pays attention to IO_SYNC (which translates into the 2238 * Solaris define FRSYNC for zfs_read()) for reads. It 2239 * attempts to sync the file before reading. 2240 * 2241 * So, to attempt to provide some barrier semantics in the 2242 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2243 */ 2244 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2245 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2246 2247 VOP_UNLOCK(xbb->vn); 2248 break; 2249 case BIO_WRITE: { 2250 struct mount *mountpoint; 2251 2252 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2253 device_get_unit(xbb->dev), xuio.uio_offset, 2254 xuio.uio_resid); 2255 2256 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2257 2258 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2259 2260 /* 2261 * UFS pays attention to IO_DIRECT for writes. The write 2262 * is done asynchronously. (Normally the write would just 2263 * get put into cache. 2264 * 2265 * UFS pays attention to IO_SYNC for writes. It will 2266 * attempt to write the buffer out synchronously if that 2267 * flag is set. 2268 * 2269 * ZFS does not pay attention to IO_DIRECT for writes. 2270 * 2271 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2272 * for writes. It will flush the transaction from the 2273 * cache before returning. 2274 * 2275 * So if we've got the BIO_ORDERED flag set, we want 2276 * IO_SYNC in either the UFS or ZFS case. 2277 */ 2278 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2279 IO_SYNC : 0, file_data->cred); 2280 VOP_UNLOCK(xbb->vn); 2281 2282 vn_finished_write(mountpoint); 2283 2284 break; 2285 } 2286 default: 2287 panic("invalid operation %d", operation); 2288 /* NOTREACHED */ 2289 } 2290 2291 bailout_send_response: 2292 2293 if (error != 0) 2294 reqlist->status = BLKIF_RSP_ERROR; 2295 2296 xbb_complete_reqlist(xbb, reqlist); 2297 2298 return (0); 2299 } 2300 2301 /*--------------------------- Backend Configuration --------------------------*/ 2302 /** 2303 * Close and cleanup any backend device/file specific state for this 2304 * block back instance. 2305 * 2306 * \param xbb Per-instance xbb configuration structure. 
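 *
 * This is safe to call even if the backend was only partially opened:
 * the vnode, the character device reference, and the file credential
 * are each released only if they were actually acquired.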
2307 */ 2308 static void 2309 xbb_close_backend(struct xbb_softc *xbb) 2310 { 2311 DROP_GIANT(); 2312 DPRINTF("closing dev=%s\n", xbb->dev_name); 2313 if (xbb->vn) { 2314 int flags = FREAD; 2315 2316 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2317 flags |= FWRITE; 2318 2319 switch (xbb->device_type) { 2320 case XBB_TYPE_DISK: 2321 if (xbb->backend.dev.csw) { 2322 dev_relthread(xbb->backend.dev.cdev, 2323 xbb->backend.dev.dev_ref); 2324 xbb->backend.dev.csw = NULL; 2325 xbb->backend.dev.cdev = NULL; 2326 } 2327 break; 2328 case XBB_TYPE_FILE: 2329 break; 2330 case XBB_TYPE_NONE: 2331 default: 2332 panic("Unexpected backend type."); 2333 break; 2334 } 2335 2336 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2337 xbb->vn = NULL; 2338 2339 switch (xbb->device_type) { 2340 case XBB_TYPE_DISK: 2341 break; 2342 case XBB_TYPE_FILE: 2343 if (xbb->backend.file.cred != NULL) { 2344 crfree(xbb->backend.file.cred); 2345 xbb->backend.file.cred = NULL; 2346 } 2347 break; 2348 case XBB_TYPE_NONE: 2349 default: 2350 panic("Unexpected backend type."); 2351 break; 2352 } 2353 } 2354 PICKUP_GIANT(); 2355 } 2356 2357 /** 2358 * Open a character device to be used for backend I/O. 2359 * 2360 * \param xbb Per-instance xbb configuration structure. 2361 * 2362 * \return 0 for success, errno codes for failure. 2363 */ 2364 static int 2365 xbb_open_dev(struct xbb_softc *xbb) 2366 { 2367 struct vattr vattr; 2368 struct cdev *dev; 2369 struct cdevsw *devsw; 2370 int error; 2371 2372 xbb->device_type = XBB_TYPE_DISK; 2373 xbb->dispatch_io = xbb_dispatch_dev; 2374 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2375 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2376 &xbb->backend.dev.dev_ref); 2377 if (xbb->backend.dev.csw == NULL) 2378 panic("Unable to retrieve device switch"); 2379 2380 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2381 if (error) { 2382 xenbus_dev_fatal(xbb->dev, error, "error getting " 2383 "vnode attributes for device %s", 2384 xbb->dev_name); 2385 return (error); 2386 } 2387 2388 dev = xbb->vn->v_rdev; 2389 devsw = dev->si_devsw; 2390 if (!devsw->d_ioctl) { 2391 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2392 "device %s!", xbb->dev_name); 2393 return (ENODEV); 2394 } 2395 2396 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2397 (caddr_t)&xbb->sector_size, FREAD, 2398 curthread); 2399 if (error) { 2400 xenbus_dev_fatal(xbb->dev, error, 2401 "error calling ioctl DIOCGSECTORSIZE " 2402 "for device %s", xbb->dev_name); 2403 return (error); 2404 } 2405 2406 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2407 (caddr_t)&xbb->media_size, FREAD, 2408 curthread); 2409 if (error) { 2410 xenbus_dev_fatal(xbb->dev, error, 2411 "error calling ioctl DIOCGMEDIASIZE " 2412 "for device %s", xbb->dev_name); 2413 return (error); 2414 } 2415 2416 return (0); 2417 } 2418 2419 /** 2420 * Open a file to be used for backend I/O. 2421 * 2422 * \param xbb Per-instance xbb configuration structure. 2423 * 2424 * \return 0 for success, errno codes for failure. 
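 *
 * Note that file-backed devices are always exported with a 512 byte
 * sector size regardless of the backing file system's block size; see
 * the discussion of vattr.va_blocksize below.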
2425 */ 2426 static int 2427 xbb_open_file(struct xbb_softc *xbb) 2428 { 2429 struct xbb_file_data *file_data; 2430 struct vattr vattr; 2431 int error; 2432 2433 file_data = &xbb->backend.file; 2434 xbb->device_type = XBB_TYPE_FILE; 2435 xbb->dispatch_io = xbb_dispatch_file; 2436 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2437 if (error != 0) { 2438 xenbus_dev_fatal(xbb->dev, error, 2439 "error calling VOP_GETATTR()" 2440 "for file %s", xbb->dev_name); 2441 return (error); 2442 } 2443 2444 /* 2445 * Verify that we have the ability to upgrade to exclusive 2446 * access on this file so we can trap errors at open instead 2447 * of reporting them during first access. 2448 */ 2449 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2450 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2451 if (VN_IS_DOOMED(xbb->vn)) { 2452 error = EBADF; 2453 xenbus_dev_fatal(xbb->dev, error, 2454 "error locking file %s", 2455 xbb->dev_name); 2456 2457 return (error); 2458 } 2459 } 2460 2461 file_data->cred = crhold(curthread->td_ucred); 2462 xbb->media_size = vattr.va_size; 2463 2464 /* 2465 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2466 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2467 * with disklabel and UFS on FreeBSD at least. Large block sizes 2468 * may not work with other OSes as well. So just export a sector 2469 * size of 512 bytes, which should work with any OS or 2470 * application. Since our backing is a file, any block size will 2471 * work fine for the backing store. 2472 */ 2473 #if 0 2474 xbb->sector_size = vattr.va_blocksize; 2475 #endif 2476 xbb->sector_size = 512; 2477 2478 /* 2479 * Sanity check. The media size has to be at least one 2480 * sector long. 2481 */ 2482 if (xbb->media_size < xbb->sector_size) { 2483 error = EINVAL; 2484 xenbus_dev_fatal(xbb->dev, error, 2485 "file %s size %ju < block size %u", 2486 xbb->dev_name, 2487 (uintmax_t)xbb->media_size, 2488 xbb->sector_size); 2489 } 2490 return (error); 2491 } 2492 2493 /** 2494 * Open the backend provider for this connection. 2495 * 2496 * \param xbb Per-instance xbb configuration structure. 2497 * 2498 * \return 0 for success, errno codes for failure. 2499 */ 2500 static int 2501 xbb_open_backend(struct xbb_softc *xbb) 2502 { 2503 struct nameidata nd; 2504 int flags; 2505 int error; 2506 2507 flags = FREAD; 2508 error = 0; 2509 2510 DPRINTF("opening dev=%s\n", xbb->dev_name); 2511 2512 if (rootvnode == NULL) { 2513 xenbus_dev_fatal(xbb->dev, ENOENT, 2514 "Root file system not mounted"); 2515 return (ENOENT); 2516 } 2517 2518 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2519 flags |= FWRITE; 2520 2521 pwd_ensure_dirs(); 2522 2523 again: 2524 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); 2525 error = vn_open(&nd, &flags, 0, NULL); 2526 if (error) { 2527 /* 2528 * This is the only reasonable guess we can make as far as 2529 * path if the user doesn't give us a fully qualified path. 2530 * If they want to specify a file, they need to specify the 2531 * full path. 
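 * For example (names here are purely illustrative): a dev_name of "md0"
 * is retried as "/dev/md0", while a file backend must be given as an
 * absolute path such as "/var/guest/disk0.img".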
2532 */ 2533 if (xbb->dev_name[0] != '/') { 2534 char *dev_path = "/dev/"; 2535 char *dev_name; 2536 2537 /* Try adding device path at beginning of name */ 2538 dev_name = malloc(strlen(xbb->dev_name) 2539 + strlen(dev_path) + 1, 2540 M_XENBLOCKBACK, M_NOWAIT); 2541 if (dev_name) { 2542 sprintf(dev_name, "%s%s", dev_path, 2543 xbb->dev_name); 2544 free(xbb->dev_name, M_XENBLOCKBACK); 2545 xbb->dev_name = dev_name; 2546 goto again; 2547 } 2548 } 2549 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2550 xbb->dev_name); 2551 return (error); 2552 } 2553 2554 NDFREE_PNBUF(&nd); 2555 2556 xbb->vn = nd.ni_vp; 2557 2558 /* We only support disks and files. */ 2559 if (vn_isdisk_error(xbb->vn, &error)) { 2560 error = xbb_open_dev(xbb); 2561 } else if (xbb->vn->v_type == VREG) { 2562 error = xbb_open_file(xbb); 2563 } else { 2564 error = EINVAL; 2565 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2566 "or file", xbb->dev_name); 2567 } 2568 VOP_UNLOCK(xbb->vn); 2569 2570 if (error != 0) { 2571 xbb_close_backend(xbb); 2572 return (error); 2573 } 2574 2575 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2576 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2577 2578 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2579 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2580 xbb->dev_name, xbb->sector_size, xbb->media_size); 2581 2582 return (0); 2583 } 2584 2585 /*------------------------ Inter-Domain Communication ------------------------*/ 2586 /** 2587 * Free dynamically allocated KVA or pseudo-physical address allocations. 2588 * 2589 * \param xbb Per-instance xbb configuration structure. 2590 */ 2591 static void 2592 xbb_free_communication_mem(struct xbb_softc *xbb) 2593 { 2594 if (xbb->kva != 0) { 2595 if (xbb->pseudo_phys_res != NULL) { 2596 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2597 xbb->pseudo_phys_res); 2598 xbb->pseudo_phys_res = NULL; 2599 } 2600 } 2601 xbb->kva = 0; 2602 xbb->gnt_base_addr = 0; 2603 if (xbb->kva_free != NULL) { 2604 free(xbb->kva_free, M_XENBLOCKBACK); 2605 xbb->kva_free = NULL; 2606 } 2607 } 2608 2609 /** 2610 * Cleanup all inter-domain communication mechanisms. 2611 * 2612 * \param xbb Per-instance xbb configuration structure. 2613 */ 2614 static int 2615 xbb_disconnect(struct xbb_softc *xbb) 2616 { 2617 DPRINTF("\n"); 2618 2619 mtx_unlock(&xbb->lock); 2620 xen_intr_unbind(&xbb->xen_intr_handle); 2621 if (xbb->io_taskqueue != NULL) 2622 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2623 mtx_lock(&xbb->lock); 2624 2625 /* 2626 * No new interrupts can generate work, but we must wait 2627 * for all currently active requests to drain. 
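 * xbb_disconnect() is entered with xbb->lock held (it is dropped and
 * re-taken around the unbind and taskqueue drain above).  If requests
 * are still active we return EAGAIN below and rely on the completion
 * path to call xbb_shutdown(), and hence this function, again once the
 * count drops to zero.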
2628 */ 2629 if (xbb->active_request_count != 0) 2630 return (EAGAIN); 2631 2632 if (xbb->flags & XBBF_RING_CONNECTED) { 2633 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2634 struct gnttab_unmap_grant_ref *op; 2635 unsigned int ring_idx; 2636 int error; 2637 2638 for (ring_idx = 0, op = ops; 2639 ring_idx < xbb->ring_config.ring_pages; 2640 ring_idx++, op++) { 2641 op->host_addr = xbb->ring_config.gnt_addr 2642 + (ring_idx * PAGE_SIZE); 2643 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2644 op->handle = xbb->ring_config.handle[ring_idx]; 2645 } 2646 2647 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2648 xbb->ring_config.ring_pages); 2649 if (error != 0) 2650 panic("Grant table op failed (%d)", error); 2651 2652 xbb->flags &= ~XBBF_RING_CONNECTED; 2653 } 2654 2655 xbb_free_communication_mem(xbb); 2656 2657 if (xbb->requests != NULL) { 2658 free(xbb->requests, M_XENBLOCKBACK); 2659 xbb->requests = NULL; 2660 } 2661 2662 if (xbb->request_lists != NULL) { 2663 struct xbb_xen_reqlist *reqlist; 2664 int i; 2665 2666 /* There is one request list for ever allocated request. */ 2667 for (i = 0, reqlist = xbb->request_lists; 2668 i < xbb->max_requests; i++, reqlist++){ 2669 if (reqlist->gnt_handles != NULL) { 2670 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2671 reqlist->gnt_handles = NULL; 2672 } 2673 } 2674 free(xbb->request_lists, M_XENBLOCKBACK); 2675 xbb->request_lists = NULL; 2676 } 2677 2678 return (0); 2679 } 2680 2681 /** 2682 * Map shared memory ring into domain local address space, initialize 2683 * ring control structures, and bind an interrupt to the event channel 2684 * used to notify us of ring changes. 2685 * 2686 * \param xbb Per-instance xbb configuration structure. 2687 */ 2688 static int 2689 xbb_connect_ring(struct xbb_softc *xbb) 2690 { 2691 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2692 struct gnttab_map_grant_ref *gnt; 2693 u_int ring_idx; 2694 int error; 2695 2696 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2697 return (0); 2698 2699 /* 2700 * Kva for our ring is at the tail of the region of kva allocated 2701 * by xbb_alloc_communication_mem(). 2702 */ 2703 xbb->ring_config.va = xbb->kva 2704 + (xbb->kva_size 2705 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2706 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2707 + (xbb->kva_size 2708 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2709 2710 for (ring_idx = 0, gnt = gnts; 2711 ring_idx < xbb->ring_config.ring_pages; 2712 ring_idx++, gnt++) { 2713 gnt->host_addr = xbb->ring_config.gnt_addr 2714 + (ring_idx * PAGE_SIZE); 2715 gnt->flags = GNTMAP_host_map; 2716 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2717 gnt->dom = xbb->otherend_id; 2718 } 2719 2720 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2721 xbb->ring_config.ring_pages); 2722 if (error) 2723 panic("blkback: Ring page grant table op failed (%d)", error); 2724 2725 for (ring_idx = 0, gnt = gnts; 2726 ring_idx < xbb->ring_config.ring_pages; 2727 ring_idx++, gnt++) { 2728 if (gnt->status != 0) { 2729 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; 2730 unsigned int i, j; 2731 2732 xbb->ring_config.va = 0; 2733 xenbus_dev_fatal(xbb->dev, EACCES, 2734 "Ring shared page mapping failed. 
" 2735 "Status %d.", gnt->status); 2736 2737 /* Unmap everything to avoid leaking grant table maps */ 2738 for (i = 0, j = 0; i < xbb->ring_config.ring_pages; 2739 i++) { 2740 if (gnts[i].status != GNTST_okay) 2741 continue; 2742 2743 unmap[j].host_addr = gnts[i].host_addr; 2744 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; 2745 unmap[j++].handle = gnts[i].handle; 2746 } 2747 if (j != 0) { 2748 error = HYPERVISOR_grant_table_op( 2749 GNTTABOP_unmap_grant_ref, unmap, j); 2750 if (error != 0) 2751 panic("Unable to unmap grants (%d)", 2752 error); 2753 } 2754 return (EACCES); 2755 } 2756 xbb->ring_config.handle[ring_idx] = gnt->handle; 2757 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2758 } 2759 2760 /* Initialize the ring based on ABI. */ 2761 switch (xbb->abi) { 2762 case BLKIF_PROTOCOL_NATIVE: 2763 { 2764 blkif_sring_t *sring; 2765 sring = (blkif_sring_t *)xbb->ring_config.va; 2766 BACK_RING_INIT(&xbb->rings.native, sring, 2767 xbb->ring_config.ring_pages * PAGE_SIZE); 2768 break; 2769 } 2770 case BLKIF_PROTOCOL_X86_32: 2771 { 2772 blkif_x86_32_sring_t *sring_x86_32; 2773 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2774 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2775 xbb->ring_config.ring_pages * PAGE_SIZE); 2776 break; 2777 } 2778 case BLKIF_PROTOCOL_X86_64: 2779 { 2780 blkif_x86_64_sring_t *sring_x86_64; 2781 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2782 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2783 xbb->ring_config.ring_pages * PAGE_SIZE); 2784 break; 2785 } 2786 default: 2787 panic("Unexpected blkif protocol ABI."); 2788 } 2789 2790 xbb->flags |= XBBF_RING_CONNECTED; 2791 2792 error = xen_intr_bind_remote_port(xbb->dev, 2793 xbb->otherend_id, 2794 xbb->ring_config.evtchn, 2795 xbb_filter, 2796 /*ithread_handler*/NULL, 2797 /*arg*/xbb, 2798 INTR_TYPE_BIO | INTR_MPSAFE, 2799 &xbb->xen_intr_handle); 2800 if (error) { 2801 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2802 return (error); 2803 } 2804 2805 DPRINTF("rings connected!\n"); 2806 2807 return 0; 2808 } 2809 2810 /** 2811 * Size KVA and pseudo-physical address allocations based on negotiated 2812 * values for the size and number of I/O requests, and the size of our 2813 * communication ring. 2814 * 2815 * \param xbb Per-instance xbb configuration structure. 2816 * 2817 * These address spaces are used to dynamically map pages in the 2818 * front-end's domain into our own. 2819 */ 2820 static int 2821 xbb_alloc_communication_mem(struct xbb_softc *xbb) 2822 { 2823 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2824 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2825 xbb->kva_size = xbb->reqlist_kva_size + 2826 (xbb->ring_config.ring_pages * PAGE_SIZE); 2827 2828 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2829 if (xbb->kva_free == NULL) 2830 return (ENOMEM); 2831 2832 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2833 device_get_nameunit(xbb->dev), xbb->kva_size, 2834 xbb->reqlist_kva_size); 2835 /* 2836 * Reserve a range of pseudo physical memory that we can map 2837 * into kva. These pages will only be backed by machine 2838 * pages ("real memory") during the lifetime of front-end requests 2839 * via grant table operations. 
2840 */ 2841 xbb->pseudo_phys_res_id = 0; 2842 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 2843 xbb->kva_size); 2844 if (xbb->pseudo_phys_res == NULL) { 2845 xbb->kva = 0; 2846 return (ENOMEM); 2847 } 2848 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 2849 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 2850 2851 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 2852 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 2853 (uintmax_t)xbb->gnt_base_addr); 2854 return (0); 2855 } 2856 2857 /** 2858 * Collect front-end information from the XenStore. 2859 * 2860 * \param xbb Per-instance xbb configuration structure. 2861 */ 2862 static int 2863 xbb_collect_frontend_info(struct xbb_softc *xbb) 2864 { 2865 char protocol_abi[64]; 2866 const char *otherend_path; 2867 int error; 2868 u_int ring_idx; 2869 u_int ring_page_order; 2870 size_t ring_size; 2871 2872 otherend_path = xenbus_get_otherend_path(xbb->dev); 2873 2874 /* 2875 * Protocol defaults valid even if all negotiation fails. 2876 */ 2877 xbb->ring_config.ring_pages = 1; 2878 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2879 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 2880 2881 /* 2882 * Mandatory data (used in all versions of the protocol) first. 2883 */ 2884 error = xs_scanf(XST_NIL, otherend_path, 2885 "event-channel", NULL, "%" PRIu32, 2886 &xbb->ring_config.evtchn); 2887 if (error != 0) { 2888 xenbus_dev_fatal(xbb->dev, error, 2889 "Unable to retrieve event-channel information " 2890 "from frontend %s. Unable to connect.", 2891 xenbus_get_otherend_path(xbb->dev)); 2892 return (error); 2893 } 2894 2895 /* 2896 * These fields are initialized to legacy protocol defaults 2897 * so we only need to fail if reading the updated value succeeds 2898 * and the new value is outside of its allowed range. 2899 * 2900 * \note xs_gather() returns on the first encountered error, so 2901 * we must use independent calls in order to guarantee 2902 * we don't miss information in a sparsly populated front-end 2903 * tree. 2904 * 2905 * \note xs_scanf() does not update variables for unmatched 2906 * fields. 2907 */ 2908 ring_page_order = 0; 2909 xbb->max_requests = 32; 2910 2911 (void)xs_scanf(XST_NIL, otherend_path, 2912 "ring-page-order", NULL, "%u", 2913 &ring_page_order); 2914 xbb->ring_config.ring_pages = 1 << ring_page_order; 2915 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 2916 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 2917 2918 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 2919 xenbus_dev_fatal(xbb->dev, EINVAL, 2920 "Front-end specified ring-pages of %u " 2921 "exceeds backend limit of %u. " 2922 "Unable to connect.", 2923 xbb->ring_config.ring_pages, 2924 XBB_MAX_RING_PAGES); 2925 return (EINVAL); 2926 } 2927 2928 if (xbb->ring_config.ring_pages == 1) { 2929 error = xs_gather(XST_NIL, otherend_path, 2930 "ring-ref", "%" PRIu32, 2931 &xbb->ring_config.ring_ref[0], 2932 NULL); 2933 if (error != 0) { 2934 xenbus_dev_fatal(xbb->dev, error, 2935 "Unable to retrieve ring information " 2936 "from frontend %s. Unable to " 2937 "connect.", 2938 xenbus_get_otherend_path(xbb->dev)); 2939 return (error); 2940 } 2941 } else { 2942 /* Multi-page ring format. 
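Each page of the shared ring is published by the front end under its own XenStore key ("ring-ref0", "ring-ref1", ...), so the grant references are read back one node at a time.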
*/ 2943 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 2944 ring_idx++) { 2945 char ring_ref_name[]= "ring_refXX"; 2946 2947 snprintf(ring_ref_name, sizeof(ring_ref_name), 2948 "ring-ref%u", ring_idx); 2949 error = xs_scanf(XST_NIL, otherend_path, 2950 ring_ref_name, NULL, "%" PRIu32, 2951 &xbb->ring_config.ring_ref[ring_idx]); 2952 if (error != 0) { 2953 xenbus_dev_fatal(xbb->dev, error, 2954 "Failed to retriev grant " 2955 "reference for page %u of " 2956 "shared ring. Unable " 2957 "to connect.", ring_idx); 2958 return (error); 2959 } 2960 } 2961 } 2962 2963 error = xs_gather(XST_NIL, otherend_path, 2964 "protocol", "%63s", protocol_abi, 2965 NULL); 2966 if (error != 0 2967 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 2968 /* 2969 * Assume native if the frontend has not 2970 * published ABI data or it has published and 2971 * matches our own ABI. 2972 */ 2973 xbb->abi = BLKIF_PROTOCOL_NATIVE; 2974 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 2975 xbb->abi = BLKIF_PROTOCOL_X86_32; 2976 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 2977 xbb->abi = BLKIF_PROTOCOL_X86_64; 2978 } else { 2979 xenbus_dev_fatal(xbb->dev, EINVAL, 2980 "Unknown protocol ABI (%s) published by " 2981 "frontend. Unable to connect.", protocol_abi); 2982 return (EINVAL); 2983 } 2984 return (0); 2985 } 2986 2987 /** 2988 * Allocate per-request data structures given request size and number 2989 * information negotiated with the front-end. 2990 * 2991 * \param xbb Per-instance xbb configuration structure. 2992 */ 2993 static int 2994 xbb_alloc_requests(struct xbb_softc *xbb) 2995 { 2996 struct xbb_xen_req *req; 2997 struct xbb_xen_req *last_req; 2998 2999 /* 3000 * Allocate request book keeping datastructures. 3001 */ 3002 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3003 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3004 if (xbb->requests == NULL) { 3005 xenbus_dev_fatal(xbb->dev, ENOMEM, 3006 "Unable to allocate request structures"); 3007 return (ENOMEM); 3008 } 3009 3010 req = xbb->requests; 3011 last_req = &xbb->requests[xbb->max_requests - 1]; 3012 STAILQ_INIT(&xbb->request_free_stailq); 3013 while (req <= last_req) { 3014 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3015 req++; 3016 } 3017 return (0); 3018 } 3019 3020 static int 3021 xbb_alloc_request_lists(struct xbb_softc *xbb) 3022 { 3023 struct xbb_xen_reqlist *reqlist; 3024 int i; 3025 3026 /* 3027 * If no requests can be merged, we need 1 request list per 3028 * in flight request. 
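 * This is the worst case; when coalescing does occur, several ring
 * requests share a single list and the surplus lists simply remain on
 * the free queue.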
3029 */ 3030 xbb->request_lists = malloc(xbb->max_requests * 3031 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3032 if (xbb->request_lists == NULL) { 3033 xenbus_dev_fatal(xbb->dev, ENOMEM, 3034 "Unable to allocate request list structures"); 3035 return (ENOMEM); 3036 } 3037 3038 STAILQ_INIT(&xbb->reqlist_free_stailq); 3039 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3040 for (i = 0; i < xbb->max_requests; i++) { 3041 int seg; 3042 3043 reqlist = &xbb->request_lists[i]; 3044 3045 reqlist->xbb = xbb; 3046 3047 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3048 sizeof(*reqlist->gnt_handles), 3049 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3050 if (reqlist->gnt_handles == NULL) { 3051 xenbus_dev_fatal(xbb->dev, ENOMEM, 3052 "Unable to allocate request " 3053 "grant references"); 3054 return (ENOMEM); 3055 } 3056 3057 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3058 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3059 3060 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3061 } 3062 return (0); 3063 } 3064 3065 /** 3066 * Supply information about the physical device to the frontend 3067 * via XenBus. 3068 * 3069 * \param xbb Per-instance xbb configuration structure. 3070 */ 3071 static int 3072 xbb_publish_backend_info(struct xbb_softc *xbb) 3073 { 3074 struct xs_transaction xst; 3075 const char *our_path; 3076 const char *leaf; 3077 int error; 3078 3079 our_path = xenbus_get_node(xbb->dev); 3080 while (1) { 3081 error = xs_transaction_start(&xst); 3082 if (error != 0) { 3083 xenbus_dev_fatal(xbb->dev, error, 3084 "Error publishing backend info " 3085 "(start transaction)"); 3086 return (error); 3087 } 3088 3089 leaf = "sectors"; 3090 error = xs_printf(xst, our_path, leaf, 3091 "%"PRIu64, xbb->media_num_sectors); 3092 if (error != 0) 3093 break; 3094 3095 /* XXX Support all VBD attributes here. */ 3096 leaf = "info"; 3097 error = xs_printf(xst, our_path, leaf, "%u", 3098 xbb->flags & XBBF_READ_ONLY 3099 ? VDISK_READONLY : 0); 3100 if (error != 0) 3101 break; 3102 3103 leaf = "sector-size"; 3104 error = xs_printf(xst, our_path, leaf, "%u", 3105 xbb->sector_size); 3106 if (error != 0) 3107 break; 3108 3109 error = xs_transaction_end(xst, 0); 3110 if (error == 0) { 3111 return (0); 3112 } else if (error != EAGAIN) { 3113 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3114 return (error); 3115 } 3116 } 3117 3118 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3119 our_path, leaf); 3120 xs_transaction_end(xst, 1); 3121 return (error); 3122 } 3123 3124 /** 3125 * Connect to our blkfront peer now that it has completed publishing 3126 * its configuration into the XenStore. 3127 * 3128 * \param xbb Per-instance xbb configuration structure. 3129 */ 3130 static void 3131 xbb_connect(struct xbb_softc *xbb) 3132 { 3133 int error; 3134 3135 if (!xbb->hotplug_done || 3136 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3137 (xbb_collect_frontend_info(xbb) != 0)) 3138 return; 3139 3140 xbb->flags &= ~XBBF_SHUTDOWN; 3141 3142 /* 3143 * We limit the maximum number of reqlist segments to the maximum 3144 * number of segments in the ring, or our absolute maximum, 3145 * whichever is smaller. 3146 */ 3147 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3148 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3149 3150 /* 3151 * The maximum size is simply a function of the number of segments 3152 * we can handle. 
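 * (For example, assuming 4 KiB pages and the standard 11-segment blkif
 * request limit, the compile-time cap works out to 11 segments, i.e. a
 * 44 KiB maximum per request list.)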
3153 */ 3154 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3155 3156 /* Allocate resources whose size depends on front-end configuration. */ 3157 error = xbb_alloc_communication_mem(xbb); 3158 if (error != 0) { 3159 xenbus_dev_fatal(xbb->dev, error, 3160 "Unable to allocate communication memory"); 3161 return; 3162 } 3163 3164 error = xbb_publish_backend_info(xbb); 3165 if (error != 0) { 3166 xenbus_dev_fatal(xbb->dev, error, 3167 "Unable to publish device information"); 3168 return; 3169 } 3170 3171 error = xbb_alloc_requests(xbb); 3172 if (error != 0) { 3173 /* Specific errors are reported by xbb_alloc_requests(). */ 3174 return; 3175 } 3176 3177 error = xbb_alloc_request_lists(xbb); 3178 if (error != 0) { 3179 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3180 return; 3181 } 3182 3183 /* 3184 * Connect communication channel. 3185 */ 3186 error = xbb_connect_ring(xbb); 3187 if (error != 0) { 3188 /* Specific errors are reported by xbb_connect_ring(). */ 3189 return; 3190 } 3191 3192 /* Ready for I/O. */ 3193 xenbus_set_state(xbb->dev, XenbusStateConnected); 3194 } 3195 3196 /*-------------------------- Device Teardown Support -------------------------*/ 3197 /** 3198 * Perform device shutdown functions. 3199 * 3200 * \param xbb Per-instance xbb configuration structure. 3201 * 3202 * Mark this instance as shutting down, wait for any active I/O on the 3203 * backend device/file to drain, disconnect from the front-end, and notify 3204 * any waiters (e.g. a thread invoking our detach method) that detach can 3205 * now proceed. 3206 */ 3207 static int 3208 xbb_shutdown(struct xbb_softc *xbb) 3209 { 3210 XenbusState frontState; 3211 int error; 3212 3213 DPRINTF("\n"); 3214 3215 /* 3216 * Due to the need to drop our mutex during some 3217 * xenbus operations, it is possible for two threads 3218 * to attempt to close out shutdown processing at 3219 * the same time. Tell the caller that hits this 3220 * race to try back later. 3221 */ 3222 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3223 return (EAGAIN); 3224 3225 xbb->flags |= XBBF_IN_SHUTDOWN; 3226 mtx_unlock(&xbb->lock); 3227 3228 if (xbb->hotplug_watch.node != NULL) { 3229 xs_unregister_watch(&xbb->hotplug_watch); 3230 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3231 xbb->hotplug_watch.node = NULL; 3232 } 3233 3234 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3235 xenbus_set_state(xbb->dev, XenbusStateClosing); 3236 3237 frontState = xenbus_get_otherend_state(xbb->dev); 3238 mtx_lock(&xbb->lock); 3239 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3240 3241 /* Wait for the frontend to disconnect (if it's connected). */ 3242 if (frontState == XenbusStateConnected) 3243 return (EAGAIN); 3244 3245 DPRINTF("\n"); 3246 3247 /* Indicate shutdown is in progress. */ 3248 xbb->flags |= XBBF_SHUTDOWN; 3249 3250 /* Disconnect from the front-end. */ 3251 error = xbb_disconnect(xbb); 3252 if (error != 0) { 3253 /* 3254 * Requests still outstanding. We'll be called again 3255 * once they complete. 3256 */ 3257 KASSERT(error == EAGAIN, 3258 ("%s: Unexpected xbb_disconnect() failure %d", 3259 __func__, error)); 3260 3261 return (error); 3262 } 3263 3264 DPRINTF("\n"); 3265 3266 /* Indicate to xbb_detach() that is it safe to proceed. */ 3267 wakeup(xbb); 3268 3269 return (0); 3270 } 3271 3272 /** 3273 * Report an attach time error to the console and Xen, and cleanup 3274 * this instance by forcing immediate detach processing. 3275 * 3276 * \param xbb Per-instance xbb configuration structure. 
3277 * \param err Errno describing the error. 3278 * \param fmt Printf style format and arguments 3279 */ 3280 static void 3281 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3282 { 3283 va_list ap; 3284 va_list ap_hotplug; 3285 3286 va_start(ap, fmt); 3287 va_copy(ap_hotplug, ap); 3288 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3289 "hotplug-error", fmt, ap_hotplug); 3290 va_end(ap_hotplug); 3291 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3292 "hotplug-status", "error"); 3293 3294 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3295 va_end(ap); 3296 3297 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3298 "online", "0"); 3299 mtx_lock(&xbb->lock); 3300 xbb_shutdown(xbb); 3301 mtx_unlock(&xbb->lock); 3302 } 3303 3304 /*---------------------------- NewBus Entrypoints ----------------------------*/ 3305 /** 3306 * Inspect a XenBus device and claim it if is of the appropriate type. 3307 * 3308 * \param dev NewBus device object representing a candidate XenBus device. 3309 * 3310 * \return 0 for success, errno codes for failure. 3311 */ 3312 static int 3313 xbb_probe(device_t dev) 3314 { 3315 3316 if (strcmp(xenbus_get_type(dev), "vbd")) 3317 return (ENXIO); 3318 3319 /* Only attach if Xen creates IOMMU entries for grant mapped pages. */ 3320 if (!xen_has_iommu_maps()) { 3321 static bool warned; 3322 3323 if (!warned) { 3324 warned = true; 3325 printf( 3326 "xen-blkback disabled due to grant maps lacking IOMMU entries\n"); 3327 } 3328 return (ENXIO); 3329 } 3330 3331 device_set_desc(dev, "Backend Virtual Block Device"); 3332 device_quiet(dev); 3333 return (0); 3334 } 3335 3336 /** 3337 * Setup sysctl variables to control various Block Back parameters. 3338 * 3339 * \param xbb Xen Block Back softc. 3340 * 3341 */ 3342 static void 3343 xbb_setup_sysctl(struct xbb_softc *xbb) 3344 { 3345 struct sysctl_ctx_list *sysctl_ctx = NULL; 3346 struct sysctl_oid *sysctl_tree = NULL; 3347 3348 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3349 if (sysctl_ctx == NULL) 3350 return; 3351 3352 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3353 if (sysctl_tree == NULL) 3354 return; 3355 3356 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3357 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3358 "fake the flush command"); 3359 3360 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3361 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3362 "send a real flush for N flush requests"); 3363 3364 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3365 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3366 "Don't coalesce contiguous requests"); 3367 3368 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3369 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3370 "how many I/O requests we have received"); 3371 3372 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3373 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3374 "how many I/O requests have been completed"); 3375 3376 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3377 "reqs_queued_for_completion", CTLFLAG_RW, 3378 &xbb->reqs_queued_for_completion, 3379 "how many I/O requests queued but not yet pushed"); 3380 3381 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3382 "reqs_completed_with_error", CTLFLAG_RW, 3383 &xbb->reqs_completed_with_error, 3384 "how many I/O requests completed with error status"); 3385 3386 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 
3387 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3388 "how many I/O dispatches were forced"); 3389 3390 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3391 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3392 "how many I/O dispatches were normal"); 3393 3394 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3395 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3396 "total number of I/O dispatches"); 3397 3398 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3399 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3400 "how many times we have run out of KVA"); 3401 3402 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3403 "request_shortages", CTLFLAG_RW, 3404 &xbb->request_shortages, 3405 "how many times we have run out of requests"); 3406 3407 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3408 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3409 "maximum outstanding requests (negotiated)"); 3410 3411 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3412 "max_request_segments", CTLFLAG_RD, 3413 &xbb->max_request_segments, 0, 3414 "maximum number of pages per requests (negotiated)"); 3415 3416 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3417 "max_request_size", CTLFLAG_RD, 3418 &xbb->max_request_size, 0, 3419 "maximum size in bytes of a request (negotiated)"); 3420 3421 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3422 "ring_pages", CTLFLAG_RD, 3423 &xbb->ring_config.ring_pages, 0, 3424 "communication channel pages (negotiated)"); 3425 } 3426 3427 static void 3428 xbb_attach_disk(device_t dev) 3429 { 3430 struct xbb_softc *xbb; 3431 int error; 3432 3433 xbb = device_get_softc(dev); 3434 3435 KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); 3436 3437 /* Parse fopen style mode flags. */ 3438 if (strchr(xbb->dev_mode, 'w') == NULL) 3439 xbb->flags |= XBBF_READ_ONLY; 3440 3441 /* 3442 * Verify the physical device is present and can support 3443 * the desired I/O mode. 3444 */ 3445 error = xbb_open_backend(xbb); 3446 if (error != 0) { 3447 xbb_attach_failed(xbb, error, "Unable to open %s", 3448 xbb->dev_name); 3449 return; 3450 } 3451 3452 /* Use devstat(9) for recording statistics. */ 3453 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3454 xbb->sector_size, 3455 DEVSTAT_ALL_SUPPORTED, 3456 DEVSTAT_TYPE_DIRECT 3457 | DEVSTAT_TYPE_IF_OTHER, 3458 DEVSTAT_PRIORITY_OTHER); 3459 3460 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3461 xbb->sector_size, 3462 DEVSTAT_ALL_SUPPORTED, 3463 DEVSTAT_TYPE_DIRECT 3464 | DEVSTAT_TYPE_IF_OTHER, 3465 DEVSTAT_PRIORITY_OTHER); 3466 /* 3467 * Setup sysctl variables. 3468 */ 3469 xbb_setup_sysctl(xbb); 3470 3471 /* 3472 * Create a taskqueue for doing work that must occur from a 3473 * thread context. 3474 */ 3475 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3476 M_NOWAIT, 3477 taskqueue_thread_enqueue, 3478 /*contxt*/&xbb->io_taskqueue); 3479 if (xbb->io_taskqueue == NULL) { 3480 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3481 return; 3482 } 3483 3484 taskqueue_start_threads(&xbb->io_taskqueue, 3485 /*num threads*/1, 3486 /*priority*/PWAIT, 3487 /*thread name*/ 3488 "%s taskq", device_get_nameunit(dev)); 3489 3490 /* Update hot-plug status to satisfy xend. 
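The toolstack watches this node and expects to see "connected" here once the backend is ready.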
*/ 3491 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3492 "hotplug-status", "connected"); 3493 if (error) { 3494 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3495 xenbus_get_node(xbb->dev)); 3496 return; 3497 } 3498 3499 /* The front end might be waiting for the backend, attach if so. */ 3500 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3501 xbb_connect(xbb); 3502 } 3503 3504 static void 3505 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) 3506 { 3507 device_t dev; 3508 struct xbb_softc *xbb; 3509 int error; 3510 3511 dev = (device_t)watch->callback_data; 3512 xbb = device_get_softc(dev); 3513 3514 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3515 NULL, &xbb->dev_name, NULL); 3516 if (error != 0) 3517 return; 3518 3519 xs_unregister_watch(watch); 3520 free(watch->node, M_XENBLOCKBACK); 3521 watch->node = NULL; 3522 xbb->hotplug_done = true; 3523 3524 /* Collect physical device information. */ 3525 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", 3526 NULL, &xbb->dev_type, NULL); 3527 if (error != 0) 3528 xbb->dev_type = NULL; 3529 3530 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, 3531 &xbb->dev_mode, NULL); 3532 if (error != 0) { 3533 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3534 xenbus_get_node(dev)); 3535 return; 3536 } 3537 3538 xbb_attach_disk(dev); 3539 } 3540 3541 /** 3542 * Attach to a XenBus device that has been claimed by our probe routine. 3543 * 3544 * \param dev NewBus device object representing this Xen Block Back instance. 3545 * 3546 * \return 0 for success, errno codes for failure. 3547 */ 3548 static int 3549 xbb_attach(device_t dev) 3550 { 3551 struct xbb_softc *xbb; 3552 int error; 3553 u_int max_ring_page_order; 3554 struct sbuf *watch_path; 3555 3556 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3557 3558 /* 3559 * Basic initialization. 3560 * After this block it is safe to call xbb_detach() 3561 * to clean up any allocated data for this instance. 3562 */ 3563 xbb = device_get_softc(dev); 3564 xbb->dev = dev; 3565 xbb->otherend_id = xenbus_get_otherend_id(dev); 3566 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3567 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3568 3569 /* 3570 * Publish protocol capabilities for consumption by the 3571 * front-end. 3572 */ 3573 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3574 "feature-barrier", "1"); 3575 if (error) { 3576 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3577 xenbus_get_node(xbb->dev)); 3578 return (error); 3579 } 3580 3581 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3582 "feature-flush-cache", "1"); 3583 if (error) { 3584 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3585 xenbus_get_node(xbb->dev)); 3586 return (error); 3587 } 3588 3589 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3590 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3591 "max-ring-page-order", "%u", max_ring_page_order); 3592 if (error) { 3593 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3594 xenbus_get_node(xbb->dev)); 3595 return (error); 3596 } 3597 3598 /* Tell the toolstack blkback has attached. */ 3599 xenbus_set_state(dev, XenbusStateInitWait); 3600 3601 if (xbb->hotplug_done) { 3602 xbb_attach_disk(dev); 3603 return (0); 3604 } 3605 3606 /* 3607 * We need to wait for hotplug script execution before 3608 * moving forward. 
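 * The watch fires on this node's "physical-device-path" key; once the
 * hotplug script populates it, xbb_attach_cb() runs and completes the
 * attach via xbb_attach_disk().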
3609 */ 3610 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3611 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3612 xbb->hotplug_watch.callback = xbb_attach_cb; 3613 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3614 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3615 /* 3616 * We don't care about the path updated, just about the value changes 3617 * on that single node, hence there's no need to queue more that one 3618 * event. 3619 */ 3620 xbb->hotplug_watch.max_pending = 1; 3621 sbuf_delete(watch_path); 3622 error = xs_register_watch(&xbb->hotplug_watch); 3623 if (error != 0) { 3624 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3625 xbb->hotplug_watch.node); 3626 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3627 return (error); 3628 } 3629 3630 return (0); 3631 } 3632 3633 /** 3634 * Detach from a block back device instance. 3635 * 3636 * \param dev NewBus device object representing this Xen Block Back instance. 3637 * 3638 * \return 0 for success, errno codes for failure. 3639 * 3640 * \note A block back device may be detached at any time in its life-cycle, 3641 * including part way through the attach process. For this reason, 3642 * initialization order and the initialization state checks in this 3643 * routine must be carefully coupled so that attach time failures 3644 * are gracefully handled. 3645 */ 3646 static int 3647 xbb_detach(device_t dev) 3648 { 3649 struct xbb_softc *xbb; 3650 3651 DPRINTF("\n"); 3652 3653 xbb = device_get_softc(dev); 3654 mtx_lock(&xbb->lock); 3655 while (xbb_shutdown(xbb) == EAGAIN) { 3656 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3657 "xbb_shutdown", 0); 3658 } 3659 mtx_unlock(&xbb->lock); 3660 3661 DPRINTF("\n"); 3662 3663 if (xbb->io_taskqueue != NULL) 3664 taskqueue_free(xbb->io_taskqueue); 3665 3666 if (xbb->xbb_stats != NULL) 3667 devstat_remove_entry(xbb->xbb_stats); 3668 3669 if (xbb->xbb_stats_in != NULL) 3670 devstat_remove_entry(xbb->xbb_stats_in); 3671 3672 xbb_close_backend(xbb); 3673 3674 if (xbb->dev_mode != NULL) { 3675 free(xbb->dev_mode, M_XENSTORE); 3676 xbb->dev_mode = NULL; 3677 } 3678 3679 if (xbb->dev_type != NULL) { 3680 free(xbb->dev_type, M_XENSTORE); 3681 xbb->dev_type = NULL; 3682 } 3683 3684 if (xbb->dev_name != NULL) { 3685 free(xbb->dev_name, M_XENSTORE); 3686 xbb->dev_name = NULL; 3687 } 3688 3689 mtx_destroy(&xbb->lock); 3690 return (0); 3691 } 3692 3693 /** 3694 * Prepare this block back device for suspension of this VM. 3695 * 3696 * \param dev NewBus device object representing this Xen Block Back instance. 3697 * 3698 * \return 0 for success, errno codes for failure. 3699 */ 3700 static int 3701 xbb_suspend(device_t dev) 3702 { 3703 #ifdef NOT_YET 3704 struct xbb_softc *sc = device_get_softc(dev); 3705 3706 /* Prevent new requests being issued until we fix things up. */ 3707 mtx_lock(&sc->xb_io_lock); 3708 sc->connected = BLKIF_STATE_SUSPENDED; 3709 mtx_unlock(&sc->xb_io_lock); 3710 #endif 3711 3712 return (0); 3713 } 3714 3715 /** 3716 * Perform any processing required to recover from a suspended state. 3717 * 3718 * \param dev NewBus device object representing this Xen Block Back instance. 3719 * 3720 * \return 0 for success, errno codes for failure. 3721 */ 3722 static int 3723 xbb_resume(device_t dev) 3724 { 3725 return (0); 3726 } 3727 3728 /** 3729 * Handle state changes expressed via the XenStore by our front-end peer. 
3730 * 3731 * \param dev NewBus device object representing this Xen 3732 * Block Back instance. 3733 * \param frontend_state The new state of the front-end. 3734 * 3735 * This handler returns no status; invalid front-end states are reported via xenbus_dev_fatal(). 3736 */ 3737 static void 3738 xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3739 { 3740 struct xbb_softc *xbb = device_get_softc(dev); 3741 3742 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3743 xenbus_strstate(frontend_state), 3744 xenbus_strstate(xenbus_get_state(xbb->dev))); 3745 3746 switch (frontend_state) { 3747 case XenbusStateInitialising: 3748 break; 3749 case XenbusStateInitialised: 3750 case XenbusStateConnected: 3751 xbb_connect(xbb); 3752 break; 3753 case XenbusStateClosing: 3754 case XenbusStateClosed: 3755 mtx_lock(&xbb->lock); 3756 xbb_shutdown(xbb); 3757 mtx_unlock(&xbb->lock); 3758 if (frontend_state == XenbusStateClosed) 3759 xenbus_set_state(xbb->dev, XenbusStateClosed); 3760 break; 3761 default: 3762 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3763 frontend_state); 3764 break; 3765 } 3766 } 3767 3768 /*---------------------------- NewBus Registration ---------------------------*/ 3769 static device_method_t xbb_methods[] = { 3770 /* Device interface */ 3771 DEVMETHOD(device_probe, xbb_probe), 3772 DEVMETHOD(device_attach, xbb_attach), 3773 DEVMETHOD(device_detach, xbb_detach), 3774 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3775 DEVMETHOD(device_suspend, xbb_suspend), 3776 DEVMETHOD(device_resume, xbb_resume), 3777 3778 /* Xenbus interface */ 3779 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3780 3781 DEVMETHOD_END 3782 }; 3783 3784 static driver_t xbb_driver = { 3785 "xbbd", 3786 xbb_methods, 3787 sizeof(struct xbb_softc), 3788 }; 3789 3790 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0); 3791
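/*
 * Usage note (illustrative only, not part of the driver): once an instance
 * is connected, the statistics and tunables registered in
 * xbb_setup_sysctl() hang off the device's sysctl tree and can be examined
 * from the control domain with sysctl(8).  Assuming the first unit of the
 * "xbbd" driver, for example:
 *
 *   sysctl dev.xbbd.0.total_dispatch
 *   sysctl dev.xbbd.0.reqs_completed_with_error
 *   sysctl dev.xbbd.0.no_coalesce_reqs=1
 *
 * The read/write nodes (disable_flush, flush_interval, no_coalesce_reqs)
 * may be set the same way.
 */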