/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * xdf.c - Xen Virtual Block Device Driver
 * TODO:
 *	- support alternate block size (currently only DEV_BSIZE supported)
 *	- revalidate geometry for removable devices
 *
 * This driver exports Solaris disk device nodes, accepts IO requests from
 * those nodes, and services those requests by talking to a backend device
 * in another domain.
 *
 * Communication with the backend device is done via a ringbuffer (which is
 * managed via xvdi interfaces) and dma memory (which is managed via ddi
 * interfaces).
 *
 * Communication with the backend device is dependent upon establishing a
 * connection to the backend device. This connection process involves
 * reading device configuration information from xenbus and publishing
 * some frontend runtime configuration parameters via the xenbus (for
 * consumption by the backend). Once we've published runtime configuration
 * information via the xenbus, the backend device can enter the connected
 * state and we'll enter the XD_CONNECTED state.
 * But before we can allow random IO to begin, we need to do IO to the
 * backend device to determine the device label and if flush operations
 * are supported. Once this is done we enter the XD_READY state and can
 * process any IO operations.
 *
 * We receive notifications of xenbus state changes for the backend device
 * (aka, the "other end") via the xdf_oe_change() callback. This callback
 * is single threaded, meaning that we can't receive new notifications of
 * other end state changes while we're processing an outstanding
 * notification of an other end state change. Therefore we can't do any
 * blocking operations from the xdf_oe_change() callback. This is why we
 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
 * IO to get us from the XD_CONNECTED to the XD_READY state. All IO
 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
 * through xdf_lb_rdwr(), which is a synchronous IO interface. IOs
 * generated by the xdf_ready_tq_thread thread have priority over all
 * other IO requests.
 *
 * We also communicate with the backend device via the xenbus "media-req"
 * (XBP_MEDIA_REQ) property. For more information on this see the
 * comments in blkif.h.
*/

/*
 * NOTE(review): every #include directive below is missing its header name
 * in this copy of the file.  The operands must be restored from the
 * upstream source before the file can be preprocessed; they are left
 * untouched here rather than guessed.
 */
#include
#include
#include
#include
#include
#include
#include
#ifdef XPV_HVM_DRIVER
#include
#include
#else /* !XPV_HVM_DRIVER */
#include
#endif /* !XPV_HVM_DRIVER */
#include
#include
#include
#include
#include
#include

/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements
 */
#ifdef DEBUG
#define	DEBUG_EVAL(x)	(x)
#else /* !DEBUG */
#define	DEBUG_EVAL(x)
#endif /* !DEBUG */

/*
 * Polling parameters for xdf_ring_drain_locked().  Despite the "MSEC" in
 * its name, XDF_DRAIN_MSEC_DELAY is passed to drv_usectohz() and is
 * therefore a value in microseconds (50*1000 usec = 0.05 sec).  The retry
 * count times the delay gives the overall 10 second drain budget.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */

/* value stored in xdf_peer when there is no backend domain */
#define	INVALID_DOMID	((domid_t)-1)

/* values stored in v_req_t.v_flush_diskcache by vreq_setup() */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/*
 * Both flush mechanisms require xdf_feature_barrier; xdf_flush_supported
 * then selects between an explicit cache flush and a write barrier.
 */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A write is treated as a barrier when its data pointer is the driver's
 * own xdf_cache_flush_block buffer.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A zero-length write is treated as a cache flush request.
 * NOTE: this macro expands a reference to `vdp' that is not one of its
 * parameters, so an xdf_t pointer named vdp must be in scope wherever it
 * is used.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * A vreq is done once its current DMA window has been handed to the ring
 * and either it is a cache flush (which has no data windows) or that
 * window was the last one.
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&	\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||	\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the buf's av_back field is used to carry a pointer to its v_req_t */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))

/* when set, xdf_intr() does not restart I/O after processing responses */
extern int do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int xdf_debug = 0;
volatile boolean_t xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t xdf_major;
static void *xdf_ssp;
static kmem_cache_t *xdf_vreq_cache;	/* v_req_t allocations */
static kmem_cache_t *xdf_gs_cache;	/* ge_slot_t allocations */
static int xdf_maxphys = XB_MAXPHYS;	/* b_bcount cap applied by xdfmin() */
static diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int xdf_fbrewrites;	/* flush block re-write count */

/* misc public functions (used by xdf_shell.c) */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *,
diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/* misc private functions */
static void xdf_io_start(xdf_t *);

/* callbacks from common label */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};

/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER,		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};

/* access attributes used when allocating the bounce buffer in vreq_setup() */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

/*
 * timeout(9F) handler armed by gs_get() and vreq_get() when a KM_NOSLEEP
 * allocation fails.  It clears xdf_timeout_id under xdf_dev_lk, so that a
 * later failure can arm a fresh timeout, and then retries I/O submission.
 *
 * arg is the xdf_t for the device.
 */
static void
xdf_timeout_handler(void *arg)
{
	xdf_t *vdp = arg;

	mutex_enter(&vdp->xdf_dev_lk);
	vdp->xdf_timeout_id = 0;
	mutex_exit(&vdp->xdf_dev_lk);

	/* new timeout thread could be re-scheduled */
	xdf_io_start(vdp);
}

/*
 * callback func when DMA/GTE resources are available
 *
 * Note: we only register one callback function to grant table subsystem
 * since we only have one 'struct gnttab_free_callback' in xdf_t.
*/
/*
 * The callback does no work itself: it triggers the soft interrupt
 * identified by xdf_softintr_id and always returns DDI_DMA_CALLBACK_DONE.
 * arg is the xdf_t for the device.
 */
static int
xdf_dmacallback(caddr_t arg)
{
	xdf_t *vdp = (xdf_t *)arg;
	ASSERT(vdp != NULL);

	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
	    vdp->xdf_addr));

	ddi_trigger_softintr(vdp->xdf_softintr_id);
	return (DDI_DMA_CALLBACK_DONE);
}

/*
 * Allocate a grant entry slot (ge_slot_t) together with a pool of
 * BLKIF_MAX_SEGMENTS_PER_REQUEST grant references.
 *
 * isread is recorded in the slot and later decides the access mode used
 * when pages are granted (see gs_grant()).
 *
 * Returns the initialized slot, or NULL if either allocation fails:
 *  - if no grant references are available, a grant table free callback to
 *    xdf_dmacallback() is requested, unless xdf_gnt_callback.next is
 *    already non-NULL (presumably meaning one is already queued);
 *  - if the ge_slot_t cannot be allocated, the grant references are
 *    returned and, unless a timeout is already pending, a retry is
 *    scheduled through xdf_timeout_handler() in hz ticks.
 */
static ge_slot_t *
gs_get(xdf_t *vdp, int isread)
{
	grant_ref_t gh;
	ge_slot_t *gs;

	/* try to alloc GTEs needed in this slot, first */
	if (gnttab_alloc_grant_references(
	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
		if (vdp->xdf_gnt_callback.next == NULL) {
			SETDMACBON(vdp);
			gnttab_request_free_callback(
			    &vdp->xdf_gnt_callback,
			    (void (*)(void *))xdf_dmacallback,
			    (void *)vdp,
			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
		}
		return (NULL);
	}

	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
	if (gs == NULL) {
		gnttab_free_grant_references(gh);
		if (vdp->xdf_timeout_id == 0)
			/* restart I/O after one second */
			vdp->xdf_timeout_id =
			    timeout(xdf_timeout_handler, vdp, hz);
		return (NULL);
	}

	/* init gs_slot */
	gs->gs_oeid = vdp->xdf_peer;
	gs->gs_isread = isread;
	gs->gs_ghead = gh;
	gs->gs_ngrefs = 0;

	return (gs);
}

/*
 * Release a grant entry slot: end foreign access on every grant that was
 * handed out through gs_grant(), return the remaining references in the
 * pool, unlink the slot from its owning vreq's v_gs list, and free it.
 *
 * The slot must already be linked on gs->gs_vreq->v_gs.
 */
static void
gs_free(ge_slot_t *gs)
{
	int		i;

	/* release all grant table entry resources used in this slot */
	for (i = 0; i < gs->gs_ngrefs; i++)
		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
	gnttab_free_grant_references(gs->gs_ghead);
	list_remove(&gs->gs_vreq->v_gs, gs);
	kmem_cache_free(xdf_gs_cache, gs);
}

/*
 * Claim one grant reference from the slot's pool, record it in gs_ge[] so
 * gs_free() can revoke it later, and grant the peer domain (gs_oeid)
 * access to the machine frame mfn.
 *
 * The last argument passed to gnttab_grant_foreign_access_ref() is
 * !gs_isread; presumably this is a read-only flag, so the backend gets
 * write access only for reads from the device -- TODO confirm against the
 * gnttab interface.
 *
 * Returns the grant reference to be placed in the ring request.
 */
static grant_ref_t
gs_grant(ge_slot_t *gs, mfn_t mfn)
{
	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);

	ASSERT(gr != -1);
	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
	gs->gs_ge[gs->gs_ngrefs++] = gr;
	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);

	return (gr);
}

/*
 * Alloc a vreq for this bp
 * bp->av_back contains the pointer to the vreq upon return
 *
 * The new vreq is zeroed, put in the VREQ_INIT state and inserted on
 * vdp->xdf_vreq_act.  Returns NULL if the allocation fails, in which case
 * a retry is scheduled through xdf_timeout_handler() unless a timeout is
 * already pending.
 */
static v_req_t *
vreq_get(xdf_t *vdp, buf_t *bp)
{
	v_req_t *vreq = NULL;

	ASSERT(BP_VREQ(bp) == NULL);

	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
	if (vreq == NULL) {
		if (vdp->xdf_timeout_id == 0)
			/* restart I/O after one second */
			vdp->xdf_timeout_id =
			    timeout(xdf_timeout_handler, vdp, hz);
		return (NULL);
	}
	bzero(vreq, sizeof (v_req_t));
	list_create(&vreq->v_gs, sizeof (ge_slot_t),
	    offsetof(ge_slot_t, gs_vreq_link));
	vreq->v_buf = bp;
	vreq->v_status = VREQ_INIT;
	vreq->v_runq = B_FALSE;
	BP_VREQ_SET(bp, vreq);
	/* init of other fields in vreq is up to the caller */

	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);

	return (vreq);
}

/*
 * Tear down a vreq and release the DMA resources it holds.
 *
 * v_status records how far vreq_setup() got, so the switch enters at that
 * stage and falls through, undoing each earlier stage in reverse order.
 * Cache flush requests never allocate DMA resources and skip the switch.
 * For an unaligned read that completed without error, the bounce buffer
 * contents are copied back into the caller's buffer before it is freed.
 *
 * The caller must hold xdf_dev_lk, must already have freed every ge_slot
 * on v_gs (list_destroy() is called on it), and must have cleared v_runq.
 * bp->av_back is not reset here.
 */
static void
vreq_free(xdf_t *vdp, v_req_t *vreq)
{
	buf_t	*bp = vreq->v_buf;

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(BP_VREQ(bp) == vreq);

	list_remove(&vdp->xdf_vreq_act, vreq);

	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
		goto done;

	switch (vreq->v_status) {
	case VREQ_DMAWIN_DONE:
	case VREQ_GS_ALLOCED:
	case VREQ_DMABUF_BOUND:
		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
		/*FALLTHRU*/
	case VREQ_DMAMEM_ALLOCED:
		if (!ALIGNED_XFER(bp)) {
			ASSERT(vreq->v_abuf != NULL);
			if (!IS_ERROR(bp) && IS_READ(bp))
				bcopy(vreq->v_abuf, bp->b_un.b_addr,
				    bp->b_bcount);
			ddi_dma_mem_free(&vreq->v_align);
		}
		/*FALLTHRU*/
	case VREQ_MEMDMAHDL_ALLOCED:
		if (!ALIGNED_XFER(bp))
			ddi_dma_free_handle(&vreq->v_memdmahdl);
		/*FALLTHRU*/
	case VREQ_DMAHDL_ALLOCED:
		ddi_dma_free_handle(&vreq->v_dmahdl);
		break;
	default:
		break;
	}
done:
	ASSERT(!vreq->v_runq);
	list_destroy(&vreq->v_gs);
	kmem_cache_free(xdf_vreq_cache, vreq);
}

/*
 * Snarf new data if our flush block was re-written
 *
 * Called for writes while write barriers are in use.  If the block range
 * [blkno, blkno + nblks) covers xdf_flush_block, the DEV_BSIZE bytes
 * destined for that block are copied into vdp->xdf_cache_flush_block and
 * xdf_fbrewrites is incremented.  Bufs flagged B_PAGEIO or B_PHYS are
 * mapped in for the copy and mapped out again afterwards.
 */
static void
check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
{
	int nblks;
	boolean_t mapin;

	if (IS_WRITE_BARRIER(vdp, bp))
		return; /* write was a flush write */

	mapin = B_FALSE;
	nblks = bp->b_bcount >> DEV_BSHIFT;
	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
		xdf_fbrewrites++;
		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
			mapin = B_TRUE;
			bp_mapin(bp);
		}
		bcopy(bp->b_un.b_addr +
		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
		    vdp->xdf_cache_flush_block, DEV_BSIZE);
		if (mapin)
			bp_mapout(bp);
	}
}

/*
 * Initialize the DMA and grant table resources for the buf
 */
static int
vreq_setup(xdf_t *vdp, v_req_t *vreq)
{
int rc;
	ddi_dma_attr_t dmaattr;
	uint_t ndcs, ndws;
	ddi_dma_handle_t dh;
	ddi_dma_handle_t mdh;
	ddi_dma_cookie_t dc;
	ddi_acc_handle_t abh;
	caddr_t	aba;
	ge_slot_t *gs;
	size_t bufsz;
	off_t off;
	size_t sz;
	buf_t *bp = vreq->v_buf;
	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;

	/*
	 * v_status records the last stage that completed.  Each case
	 * advances v_status and falls through to the next stage, so a call
	 * that returned DDI_FAILURE for lack of resources can simply be
	 * repeated later and will resume where it stopped.
	 */
	switch (vreq->v_status) {
	case VREQ_INIT:
		/* a cache flush carries no data: it only needs a ge_slot */
		if (IS_FLUSH_DISKCACHE(bp)) {
			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "get ge_slotfailed\n", vdp->xdf_addr));
				return (DDI_FAILURE);
			}
			vreq->v_blkno = 0;
			vreq->v_nslots = 1;
			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
			vreq->v_status = VREQ_GS_ALLOCED;
			gs->gs_vreq = vreq;
			list_insert_head(&vreq->v_gs, gs);
			return (DDI_SUCCESS);
		}

		if (IS_WRITE_BARRIER(vdp, bp))
			vreq->v_flush_diskcache = WRITE_BARRIER;
		/* b_private is added to b_blkno as a block offset */
		vreq->v_blkno = bp->b_blkno +
		    (diskaddr_t)(uintptr_t)bp->b_private;
		/* See if we wrote new data to our flush block */
		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
			check_fbwrite(vdp, bp, vreq->v_blkno);
		vreq->v_status = VREQ_INIT_DONE;
		/*FALLTHRU*/

	case VREQ_INIT_DONE:
		/*
		 * alloc DMA handle
		 */
		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
		    xdf_dmacallback, (caddr_t)vdp, &dh);
		if (rc != DDI_SUCCESS) {
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		vreq->v_dmahdl = dh;
		vreq->v_status = VREQ_DMAHDL_ALLOCED;
		/*FALLTHRU*/

	case VREQ_DMAHDL_ALLOCED:
		/*
		 * alloc dma handle for 512-byte aligned buf
		 */
		if (!ALIGNED_XFER(bp)) {
			/*
			 * XXPV: we need to temporarily enlarge the seg
			 * boundary and s/g length to work round CR6381968
			 */
			dmaattr = xb_dma_attr;
			dmaattr.dma_attr_seg = (uint64_t)-1;
			dmaattr.dma_attr_sgllen = INT_MAX;
			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
			    xdf_dmacallback, (caddr_t)vdp, &mdh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "unaligned buf DMAhandle alloc failed\n",
				    vdp->xdf_addr));
				return (DDI_FAILURE);
			}
			vreq->v_memdmahdl = mdh;
			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
		}
		/*FALLTHRU*/

	case VREQ_MEMDMAHDL_ALLOCED:
		/*
		 * alloc 512-byte aligned buf
		 */
		if (!ALIGNED_XFER(bp)) {
			if (bp->b_flags & (B_PAGEIO | B_PHYS))
				bp_mapin(bp);

			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
			    &aba, &bufsz, &abh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, ("xdf@%s: "
				    "DMA mem allocation failed\n",
				    vdp->xdf_addr));
				return (DDI_FAILURE);
			}

			vreq->v_abuf = aba;
			vreq->v_align = abh;
			vreq->v_status = VREQ_DMAMEM_ALLOCED;

			ASSERT(bufsz >= bp->b_bcount);
			/* writes are staged into the bounce buffer now */
			if (!IS_READ(bp))
				bcopy(bp->b_un.b_addr, vreq->v_abuf,
				    bp->b_bcount);
		}
		/*FALLTHRU*/

	case VREQ_DMAMEM_ALLOCED:
		/*
		 * dma bind
		 */
		if (ALIGNED_XFER(bp)) {
			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
			    &dc, &ndcs);
		} else {
			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
		}
		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
			/* get num of dma windows */
			if (rc == DDI_DMA_PARTIAL_MAP) {
				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
				ASSERT(rc == DDI_SUCCESS);
			} else {
				ndws = 1;
			}
		} else {
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		/* one ring request (and one ge_slot) is used per DMA window */
		vreq->v_dmac = dc;
		vreq->v_dmaw = 0;
		vreq->v_ndmacs = ndcs;
		vreq->v_ndmaws = ndws;
		vreq->v_nslots = ndws;
		vreq->v_status = VREQ_DMABUF_BOUND;
		/*FALLTHRU*/

	case VREQ_DMABUF_BOUND:
		/*
		 * get ge_slot, callback is set upon failure from gs_get(),
		 * if not set previously
		 */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		vreq->v_status = VREQ_GS_ALLOCED;
		gs->gs_vreq = vreq;
		list_insert_head(&vreq->v_gs, gs);
		break;

	case VREQ_GS_ALLOCED:
		/* nothing need to be done */
		break;

	case VREQ_DMAWIN_DONE:
		/*
		 * move to the next dma window
		 */
		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);

		/* get a ge_slot for this DMA window */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    vdp->xdf_addr));
			return (DDI_FAILURE);
		}

		vreq->v_dmaw++;
		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
		vreq->v_status = VREQ_GS_ALLOCED;
		gs->gs_vreq = vreq;
		list_insert_head(&vreq->v_gs, gs);
		break;

	default:
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Attach the common label (cmlb) layer for this device using xdf_lb_ops
 * as the target callbacks.  The device type, node type and cmlb flags all
 * depend on whether the device is a CD (XD_IS_CD) and on whether this is
 * the XPV_HVM_DRIVER build.  Returns the value of cmlb_attach().
 */
static int
xdf_cmlb_attach(xdf_t *vdp)
{
	dev_info_t	*dip = vdp->xdf_dip;

	return (cmlb_attach(dip, &xdf_lb_ops,
	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
	    XD_IS_RM(vdp),
	    B_TRUE,
	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
#if defined(XPV_HVM_DRIVER)
	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
	    CMLB_INTERNAL_MINOR_NODES,
#else /* !XPV_HVM_DRIVER */
	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
#endif /* !XPV_HVM_DRIVER */
	    vdp->xdf_vd_lbl, NULL));
}

/*
 * Complete a buf with error err.  When resid is 0 the whole request is
 * reported as untransferred (b_resid = b_bcount); for any other value
 * b_resid is left exactly as the caller set it.
 */
static void
xdf_io_err(buf_t *bp, int err, size_t resid)
{
	bioerror(bp, err);
	if (resid == 0)
		bp->b_resid = bp->b_bcount;
	biodone(bp);
}

/*
 * Account a buf entering the I/O kstat: on the run queue if it has a vreq
 * with v_runq set, otherwise on the wait queue.  A no-op when no kstat
 * exists.  The caller must hold xdf_dev_lk.
 */
static void
xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
{
	v_req_t *vreq = BP_VREQ(bp);

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (vdp->xdf_xdev_iostat == NULL)
		return;
	if ((vreq != NULL) && vreq->v_runq) {
		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
	} else {
		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
	}
}

/*
 * Counterpart of xdf_kstat_enter(): account a buf leaving whichever kstat
 * queue its v_runq flag says it is on.  A no-op when no kstat exists.  The
 * caller must hold xdf_dev_lk.
 */
static void
xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
{
	v_req_t *vreq = BP_VREQ(bp);

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (vdp->xdf_xdev_iostat == NULL)
		return;
	if ((vreq != NULL) && vreq->v_runq) {
		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
	} else {
		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
	}
}

/*
 * Mark the buf's vreq as being on the run queue and, if a kstat exists,
 * move the accounting from the wait queue to the run queue.  v_runq is
 * updated even when there is no kstat, so that a kstat created later can
 * be seeded correctly.  The buf must have a vreq that is not already on
 * the run queue, and the caller must hold xdf_dev_lk.
 */
static void
xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
{
	v_req_t *vreq = BP_VREQ(bp);

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(!vreq->v_runq);

	vreq->v_runq = B_TRUE;
	if (vdp->xdf_xdev_iostat == NULL)
		return;
	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
}

static
void
/*
 * Inverse of xdf_kstat_waitq_to_runq(): clear v_runq on the buf's vreq
 * and, if a kstat exists, move the accounting back to the wait queue.
 * The caller must hold xdf_dev_lk.
 */
xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
{
	v_req_t *vreq = BP_VREQ(bp);

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(vreq->v_runq);

	vreq->v_runq = B_FALSE;
	if (vdp->xdf_xdev_iostat == NULL)
		return;
	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
}

/*
 * Create and install the I/O kstat for the device behind dip, using
 * ks_module and instance as the kstat module name and instance, and seed
 * its queue counts from the bufs that are currently outstanding.
 *
 * Returns 0 on success, or -1 if the kstat could not be created or if one
 * already exists for this device.
 */
int
xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
{
	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
	kstat_t		*kstat;
	buf_t		*bp;

	if ((kstat = kstat_create(
	    ks_module, instance, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
		return (-1);

	/* See comment about locking in xdf_kstat_delete(). */
	mutex_enter(&vdp->xdf_iostat_lk);
	mutex_enter(&vdp->xdf_dev_lk);

	/* only one kstat can exist at a time */
	if (vdp->xdf_xdev_iostat != NULL) {
		mutex_exit(&vdp->xdf_dev_lk);
		mutex_exit(&vdp->xdf_iostat_lk);
		kstat_delete(kstat);
		return (-1);
	}

	vdp->xdf_xdev_iostat = kstat;
	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
	kstat_install(vdp->xdf_xdev_iostat);

	/*
	 * Now that we've created a kstat, we need to update the waitq and
	 * runq counts for the kstat to reflect our current state.
	 *
	 * For a buf_t structure to be on the runq, it must have a ring
	 * buffer slot associated with it.  To get a ring buffer slot the
	 * buf must first have a v_req_t and a ge_slot_t associated with it.
	 * Then when it is granted a ring buffer slot, v_runq will be set to
	 * true.
	 *
	 * For a buf_t structure to be on the waitq, it must not be on the
	 * runq.  So to find all the buf_t's that should be on waitq, we
	 * walk the active buf list and add any buf_t's which aren't on the
	 * runq to the waitq.
	 */
	bp = vdp->xdf_f_act;
	while (bp != NULL) {
		xdf_kstat_enter(vdp, bp);
		bp = bp->av_forw;
	}
	/* the ready thread's buf is tracked outside the active list */
	if (vdp->xdf_ready_tq_bp != NULL)
		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);

	mutex_exit(&vdp->xdf_dev_lk);
	mutex_exit(&vdp->xdf_iostat_lk);
	return (0);
}

/*
 * Remove and delete the I/O kstat for the device behind dip, if one
 * exists.  The kstat pointer is cleared under xdf_dev_lk, but
 * kstat_delete() itself is called with only xdf_iostat_lk held.
 */
void
xdf_kstat_delete(dev_info_t *dip)
{
	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
	kstat_t		*kstat;
	buf_t		*bp;

	/*
	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
	 * and the contents of the our kstat.  xdf_iostat_lk is used
	 * to protect the allocation and freeing of the actual kstat.
	 * xdf_dev_lk can't be used for this purpose because kstat
	 * readers use it to access the contents of the kstat and
	 * hence it can't be held when calling kstat_delete().
	 */
	mutex_enter(&vdp->xdf_iostat_lk);
	mutex_enter(&vdp->xdf_dev_lk);

	if (vdp->xdf_xdev_iostat == NULL) {
		mutex_exit(&vdp->xdf_dev_lk);
		mutex_exit(&vdp->xdf_iostat_lk);
		return;
	}

	/*
	 * We're about to destroy the kstat structures, so it isn't really
	 * necessary to update the runq and waitq counts.  But, since this
	 * isn't a hot code path we can afford to be a little pedantic and
	 * go ahead and decrement the runq and waitq kstat counters to zero
	 * before free'ing them.  This helps us ensure that we've gotten all
	 * our accounting correct.
	 *
	 * For an explanation of how we determine which buffers go on the
	 * runq vs which go on the waitq, see the comments in
	 * xdf_kstat_create().
	 */
	bp = vdp->xdf_f_act;
	while (bp != NULL) {
		xdf_kstat_exit(vdp, bp);
		bp = bp->av_forw;
	}
	if (vdp->xdf_ready_tq_bp != NULL)
		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);

	kstat = vdp->xdf_xdev_iostat;
	vdp->xdf_xdev_iostat = NULL;
	mutex_exit(&vdp->xdf_dev_lk);
	kstat_delete(kstat);
	mutex_exit(&vdp->xdf_iostat_lk);
}

/*
 * Add an IO request onto the active queue.
 *
 * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
 * are used to establish a connection to the backend, so they receive
 * priority over all other IOs.
Since xdf_ready_tq_thread only does
 * synchronous IO, there can only be one xdf_ready_tq_thread request at any
 * given time and we record the buf associated with that request in
 * xdf_ready_tq_bp.
 *
 * Normal requests are appended to the singly linked active queue, which
 * is threaded through av_forw: xdf_f_act is its head, xdf_l_act its tail,
 * and xdf_i_act the first buf that still needs to be submitted (see
 * xdf_bp_next()).  The caller must hold xdf_dev_lk.
 */
static void
xdf_bp_push(xdf_t *vdp, buf_t *bp)
{
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(bp->av_forw == NULL);

	xdf_kstat_enter(vdp, bp);

	if (curthread == vdp->xdf_ready_tq_thread) {
		/* new IO requests from the ready thread */
		ASSERT(vdp->xdf_ready_tq_bp == NULL);
		vdp->xdf_ready_tq_bp = bp;
		return;
	}

	/* this is normal IO request */
	ASSERT(bp != vdp->xdf_ready_tq_bp);

	if (vdp->xdf_f_act == NULL) {
		/* this is the only IO on the active queue */
		ASSERT(vdp->xdf_l_act == NULL);
		ASSERT(vdp->xdf_i_act == NULL);
		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
		return;
	}

	/* add this IO to the tail of the active queue */
	vdp->xdf_l_act->av_forw = bp;
	vdp->xdf_l_act = bp;
	/* everything queued earlier was already submitted; resume here */
	if (vdp->xdf_i_act == NULL)
		vdp->xdf_i_act = bp;
}

/*
 * Remove a fully submitted buf (VREQ_DONE) from wherever xdf_bp_push()
 * put it: either the ready thread slot or the active queue.  Requests may
 * complete out of order, so the buf is not necessarily at the head of the
 * queue; in that case the queue is walked to find its predecessor.  The
 * caller must hold xdf_dev_lk.
 */
static void
xdf_bp_pop(xdf_t *vdp, buf_t *bp)
{
	buf_t	*bp_iter;

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(VREQ_DONE(BP_VREQ(bp)));

	if (vdp->xdf_ready_tq_bp == bp) {
		/* we're done with a ready thread IO request */
		ASSERT(bp->av_forw == NULL);
		vdp->xdf_ready_tq_bp = NULL;
		return;
	}

	/* we're done with a normal IO request */
	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);

	if (bp == vdp->xdf_f_act) {
		/* This IO was at the head of our active queue. */
		vdp->xdf_f_act = bp->av_forw;
		if (bp == vdp->xdf_l_act)
			vdp->xdf_l_act = NULL;
	} else {
		/* This IO finished before some other pending IOs. */
		bp_iter = vdp->xdf_f_act;
		while (bp != bp_iter->av_forw) {
			bp_iter = bp_iter->av_forw;
			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
			ASSERT(bp_iter != vdp->xdf_i_act);
		}
		bp_iter->av_forw = bp->av_forw;
		if (bp == vdp->xdf_l_act)
			vdp->xdf_l_act = bp_iter;
	}
	bp->av_forw = NULL;
}

/*
 * Return the next buf that still has work to submit to the ring, or NULL
 * if there is none or the device state does not permit IO.
 *
 * A buf needs work if it has no vreq yet or its vreq is not VREQ_DONE
 * (for example it still has DMA windows left to send).  In the XD_READY
 * state xdf_i_act is advanced past bufs that are already fully submitted.
 */
static buf_t *
xdf_bp_next(xdf_t *vdp)
{
	v_req_t	*vreq;
	buf_t	*bp;

	if (vdp->xdf_state == XD_CONNECTED) {
		/*
		 * If we're in the XD_CONNECTED state, we only service IOs
		 * from the xdf_ready_tq_thread thread.
		 */
		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
			return (NULL);
		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
			return (bp);
		return (NULL);
	}

	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
	if (vdp->xdf_state != XD_READY)
		return (NULL);

	ASSERT(vdp->xdf_ready_tq_bp == NULL);
	for (;;) {
		if ((bp = vdp->xdf_i_act) == NULL)
			return (NULL);
		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
			return (bp);

		/* advance the active buf index pointer */
		vdp->xdf_i_act = bp->av_forw;
	}
}

/*
 * Handle the completion of one ring request.
 *
 * id is the request id echoed back by the backend; it is the address of
 * the ge_slot_t that xdf_process_rreq() stored in the request.  bioerr is
 * 0 or the errno to record against the buf.
 *
 * The slot is freed and the vreq's outstanding slot count decremented.
 * Only when the last slot of the vreq completes is the buf removed from
 * the active queue, its vreq freed, and the buf completed.  The caller
 * must hold xdf_dev_lk.
 */
static void
xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
{
	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
	v_req_t		*vreq = gs->gs_vreq;
	buf_t		*bp = vreq->v_buf;

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(BP_VREQ(bp) == vreq);

	gs_free(gs);

	if (bioerr != 0)
		bioerror(bp, bioerr);
	ASSERT(vreq->v_nslots > 0);
	if (--vreq->v_nslots > 0)
		return;

	/* remove this IO from our active queue */
	xdf_bp_pop(vdp, bp);

	ASSERT(vreq->v_runq);
	xdf_kstat_exit(vdp, bp);
	/* vreq_free() asserts that v_runq has been cleared */
	vreq->v_runq = B_FALSE;
	vreq_free(vdp, vreq);

	if (IS_ERROR(bp)) {
		xdf_io_err(bp, geterror(bp), 0);
	} else if (bp->b_resid != 0) {
		/* Partial transfers are an error */
		xdf_io_err(bp, EIO, bp->b_resid);
	} else {
		biodone(bp);
	}
}

/*
 * xdf interrupt handler
 *
 * Consumes every response currently available on the ring and completes
 * the corresponding requests.  Returns DDI_INTR_UNCLAIMED if there is no
 * ring, DDI_INTR_CLAIMED otherwise.  The caller must hold xdf_dev_lk.
 */
static uint_t
xdf_intr_locked(xdf_t *vdp)
{
	xendev_ring_t *xbr;
	blkif_response_t *resp;
	int bioerr;
	uint64_t id;
	uint8_t op;
	uint16_t status;
	ddi_acc_handle_t acchdl;

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if ((xbr = vdp->xdf_xb_ring) == NULL)
		return (DDI_INTR_UNCLAIMED);

	acchdl =
vdp->xdf_xb_ring_hdl;

	/*
	 * complete all requests which have a response
	 */
	while (resp = xvdi_ring_get_response(xbr)) {
		id = ddi_get64(acchdl, &resp->id);
		op = ddi_get8(acchdl, &resp->operation);
		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
		    op, id, status));

		/* any status other than BLKIF_RSP_OKAY is reported as EIO */
		if (status != BLKIF_RSP_OKAY) {
			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
			    vdp->xdf_addr,
			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
			bioerr = EIO;
		} else {
			bioerr = 0;
		}

		xdf_io_fini(vdp, id, bioerr);
	}
	return (DDI_INTR_CLAIMED);
}

/*
 * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
 * block at a lower pil.
 *
 * After the responses are processed under xdf_dev_lk, further IO is
 * started with the lock dropped, unless do_polled_io is set.
 */
static uint_t
xdf_intr(caddr_t arg)
{
	xdf_t *vdp = (xdf_t *)arg;
	int rv;

	mutex_enter(&vdp->xdf_dev_lk);
	rv = xdf_intr_locked(vdp);
	mutex_exit(&vdp->xdf_dev_lk);

	if (!do_polled_io)
		xdf_io_start(vdp);

	return (rv);
}

/*
 * Publish any queued ring requests to the backend and, if an event
 * channel is bound, notify the other end.  A no-op when there is no ring.
 * The caller must hold xdf_dev_lk.
 */
static void
xdf_ring_push(xdf_t *vdp)
{
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (vdp->xdf_xb_ring == NULL)
		return;

	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
		DPRINTF(IO_DBG, (
		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
		    vdp->xdf_addr));
	}

	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
		xvdi_notify_oe(vdp->xdf_dip);
}

/*
 * Poll the ring until the backend has completed every request on it.
 *
 * Up to XDF_DRAIN_RETRY_COUNT iterations are made.  Each one processes
 * any available responses, stops if nothing is left incomplete, and
 * otherwise pushes the ring and sleeps for XDF_DRAIN_MSEC_DELAY
 * microseconds.  xdf_dev_lk is dropped for the duration of each sleep, so
 * state protected by it may change across this call.
 *
 * Returns 0 if the ring is gone or fully drained, or EIO if requests are
 * still incomplete or responses are still unconsumed when polling ends.
 * The caller must hold xdf_dev_lk.
 */
static int
xdf_ring_drain_locked(xdf_t *vdp)
{
	int		pollc, rv = 0;

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (xdf_debug & SUSRES_DBG)
		xen_printf("xdf_ring_drain: start\n");

	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
		if (vdp->xdf_xb_ring == NULL)
			goto out;

		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
			(void) xdf_intr_locked(vdp);
		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
			goto out;
		xdf_ring_push(vdp);

		/* file-backed devices can be slow */
		mutex_exit(&vdp->xdf_dev_lk);
#ifdef XPV_HVM_DRIVER
		(void) HYPERVISOR_yield();
#endif /* XPV_HVM_DRIVER */
		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
		mutex_enter(&vdp->xdf_dev_lk);
	}
	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);

out:
	if (vdp->xdf_xb_ring != NULL) {
		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
			rv = EIO;
	}
	if (xdf_debug & SUSRES_DBG)
		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
		    vdp->xdf_addr, rv);
	return (rv);
}

/* Locking wrapper around xdf_ring_drain_locked(); returns its result. */
static int
xdf_ring_drain(xdf_t *vdp)
{
	int rv;
	mutex_enter(&vdp->xdf_dev_lk);
	rv = xdf_ring_drain_locked(vdp);
	mutex_exit(&vdp->xdf_dev_lk);
	return (rv);
}

/*
 * Destroy all v_req_t, grant table entries, and our ring buffer.
 *
 * Nothing is done unless the device is in the XD_INIT, XD_CONNECTED or
 * XD_READY state.  The bufs themselves are not completed: they stay on
 * the active queue with their vreq detached and error state cleared, and
 * xdf_i_act is rewound to the head so they will all be submitted again.
 * The caller must hold both xdf_cb_lk and xdf_dev_lk.
 */
static void
xdf_ring_destroy(xdf_t *vdp)
{
	v_req_t		*vreq;
	buf_t		*bp;
	ge_slot_t	*gs;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if ((vdp->xdf_state != XD_INIT) &&
	    (vdp->xdf_state != XD_CONNECTED) &&
	    (vdp->xdf_state != XD_READY)) {
		ASSERT(vdp->xdf_xb_ring == NULL);
		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
		ASSERT(vdp->xdf_peer == INVALID_DOMID);
		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
		return;
	}

	/*
	 * We don't want to receive async notifications from the backend
	 * when it finishes processing ring entries.
	 */
#ifdef XPV_HVM_DRIVER
	ec_unbind_evtchn(vdp->xdf_evtchn);
#else /* !XPV_HVM_DRIVER */
	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
#endif /* !XPV_HVM_DRIVER */

	/*
	 * Drain any requests in the ring.  We need to do this before we
	 * can free grant table entries, because if active ring entries
	 * point to grants, then the backend could be trying to access
	 * those grants.
	 */
	(void) xdf_ring_drain_locked(vdp);

	/* We're done talking to the backend so free up our event channel */
	xvdi_free_evtchn(vdp->xdf_dip);
	vdp->xdf_evtchn = INVALID_EVTCHN;

	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
		bp = vreq->v_buf;
		ASSERT(BP_VREQ(bp) == vreq);

		/* Free up any grant table entries associated with this IO */
		while ((gs = list_head(&vreq->v_gs)) != NULL)
			gs_free(gs);

		/* If this IO was on the runq, move it back to the waitq. */
		if (vreq->v_runq)
			xdf_kstat_runq_to_waitq(vdp, bp);

		/*
		 * Reset any buf IO state since we're going to re-issue the
		 * IO when we reconnect.
		 */
		vreq_free(vdp, vreq);
		BP_VREQ_SET(bp, NULL);
		bioerror(bp, 0);
	}

	/* reset the active queue index pointer */
	vdp->xdf_i_act = vdp->xdf_f_act;

	/* Destroy the ring */
	xvdi_free_ring(vdp->xdf_xb_ring);
	vdp->xdf_xb_ring = NULL;
	vdp->xdf_xb_ring_hdl = NULL;
	vdp->xdf_peer = INVALID_DOMID;
}

/* Clamp the transfer size of bp to at most xdf_maxphys bytes. */
void
xdfmin(struct buf *bp)
{
	if (bp->b_bcount > xdf_maxphys)
		bp->b_bcount = xdf_maxphys;
}

/*
 * Check if we have a pending "eject" media request.
 *
 * Returns B_TRUE only if media requests are supported and the xenbus
 * XBP_MEDIA_REQ property can be read and equals XBV_MEDIA_REQ_EJECT.
 * Every failure along the way is reported as B_FALSE.
 */
static int
xdf_eject_pending(xdf_t *vdp)
{
	dev_info_t	*dip = vdp->xdf_dip;
	char		*xsname, *str;

	if (!vdp->xdf_media_req_supported)
		return (B_FALSE);

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
		return (B_FALSE);

	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
		strfree(str);
		return (B_FALSE);
	}
	strfree(str);
	return (B_TRUE);
}

/*
 * Generate a media request.
 *
 * Writes req to the xenbus XBP_MEDIA_REQ property.  If media_required is
 * set, the request is refused when the device reports zero blocks.
 *
 * Returns 0 on success; ENXIO if the xenbus name is unavailable, an eject
 * is already pending, or required media is absent; ENOTTY if the device
 * is not a CD or does not support media requests; EIO if the device is
 * not in the XD_READY state or the xenbus write fails.
 */
static int
xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
{
	dev_info_t	*dip = vdp->xdf_dip;
	char		*xsname;

	/*
	 * we can't be holding xdf_dev_lk because xenbus_printf() can
	 * block while waiting for a PIL 1 interrupt message.  this
	 * would cause a deadlock with xdf_intr() which needs to grab
	 * xdf_dev_lk as well and runs at PIL 5.
*/
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));

	if ((xsname = xvdi_get_xsname(dip)) == NULL)
		return (ENXIO);

	/* Check if we support media requests */
	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
		return (ENOTTY);

	/* If an eject is pending then don't allow any new requests */
	if (xdf_eject_pending(vdp))
		return (ENXIO);

	/* Make sure that there is media present */
	if (media_required && (vdp->xdf_xdev_nblocks == 0))
		return (ENXIO);

	/* We only allow operations when the device is ready and connected */
	if (vdp->xdf_state != XD_READY)
		return (EIO);

	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
		return (EIO);

	return (0);
}

/*
 * populate a single blkif_request_t w/ a buf
 *
 * Fills rreq from the current DMA window of bp's vreq, one segment per
 * DMA cookie, granting the backend access to each page through the
 * vreq's most recently allocated ge_slot (the head of v_gs).  The address
 * of that slot is used as the request id, which is how xdf_io_fini()
 * finds it again.  On return v_status is VREQ_DMAWIN_DONE and v_blkno has
 * been advanced past the blocks covered by this window.
 *
 * The vreq must be in the VREQ_GS_ALLOCED state and the caller must hold
 * xdf_dev_lk.
 */
static void
xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
{
	grant_ref_t	gr;
	uint8_t		fsect, lsect;
	size_t		bcnt;
	paddr_t		dma_addr;
	off_t		blk_off;
	dev_info_t	*dip = vdp->xdf_dip;
	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
	v_req_t		*vreq = BP_VREQ(bp);
	uint64_t	blkno = vreq->v_blkno;
	uint_t		ndmacs = vreq->v_ndmacs;
	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
	int		seg = 0;
	int		isread = IS_READ(bp);
	ge_slot_t	*gs = list_head(&vreq->v_gs);

	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);

	if (isread)
		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
	else {
		switch (vreq->v_flush_diskcache) {
		case FLUSH_DISKCACHE:
			/* a cache flush has no segments; finish it here */
			ddi_put8(acchdl, &rreq->operation,
			    BLKIF_OP_FLUSH_DISKCACHE);
			ddi_put16(acchdl, &rreq->handle, vdev);
			ddi_put64(acchdl, &rreq->id,
			    (uint64_t)(uintptr_t)(gs));
			ddi_put8(acchdl, &rreq->nr_segments, 0);
			vreq->v_status = VREQ_DMAWIN_DONE;
			return;
		case WRITE_BARRIER:
			ddi_put8(acchdl, &rreq->operation,
			    BLKIF_OP_WRITE_BARRIER);
			break;
		default:
			/*
			 * With xdf_wce clear, every ordinary write is also
			 * issued as a write barrier.
			 */
			if (!vdp->xdf_wce)
				ddi_put8(acchdl, &rreq->operation,
				    BLKIF_OP_WRITE_BARRIER);
			else
				ddi_put8(acchdl, &rreq->operation,
				    BLKIF_OP_WRITE);
			break;
		}
	}

	ddi_put16(acchdl, &rreq->handle, vdev);
	ddi_put64(acchdl, &rreq->sector_number, blkno);
	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));

	/*
	 * loop until all segments are populated or no more dma cookie in buf
	 */
	for (;;) {
		/*
		 * Each segment of a blkif request can transfer up to
		 * one 4K page of data.
		 */
		bcnt = vreq->v_dmac.dmac_size;
		dma_addr = vreq->v_dmac.dmac_laddress;
		/* first and last XB_BSIZE sector of the page that are used */
		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
		fsect = blk_off >> XB_BSHIFT;
		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;

		ASSERT(bcnt <= PAGESIZE);
		ASSERT((bcnt % XB_BSIZE) == 0);
		ASSERT((blk_off & XB_BMASK) == 0);
		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
		    lsect < XB_MAX_SEGLEN / XB_BSIZE);

		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);

		DPRINTF(IO_DBG, (
		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
		DPRINTF(IO_DBG, (
		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));

		blkno += (bcnt >> XB_BSHIFT);
		seg++;
		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
		if (--ndmacs) {
			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
			continue;
		}

		/* last cookie of this window: record where the next starts */
		vreq->v_status = VREQ_DMAWIN_DONE;
		vreq->v_blkno = blkno;
		break;
	}
	ddi_put8(acchdl,  &rreq->nr_segments, seg);
	DPRINTF(IO_DBG, (
	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
	    vdp->xdf_addr, rreq->id));
}

/*
 * Submit as much queued IO to the backend as resources allow.
 *
 * Takes and releases xdf_dev_lk itself, so the caller must not hold it.
 */
static void
xdf_io_start(xdf_t *vdp)
{
	struct buf	*bp;
	v_req_t		*vreq;
	blkif_request_t	*rreq;
	boolean_t	rreqready = B_FALSE;

	mutex_enter(&vdp->xdf_dev_lk);

	/*
	 * Populate the ring request(s).  Loop until there is no buf to
	 * transfer or no free slot available in I/O ring.
*/ for (;;) { /* don't start any new IO if we're suspending */ if (vdp->xdf_suspending) break; if ((bp = xdf_bp_next(vdp)) == NULL) break; /* if the buf doesn't already have a vreq, allocate one */ if (((vreq = BP_VREQ(bp)) == NULL) && ((vreq = vreq_get(vdp, bp)) == NULL)) break; /* alloc DMA/GTE resources */ if (vreq_setup(vdp, vreq) != DDI_SUCCESS) break; /* get next blkif_request in the ring */ if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL) break; bzero(rreq, sizeof (blkif_request_t)); rreqready = B_TRUE; /* populate blkif_request with this buf */ xdf_process_rreq(vdp, bp, rreq); /* * This buffer/vreq pair is has been allocated a ring buffer * resources, so if it isn't already in our runq, add it. */ if (!vreq->v_runq) xdf_kstat_waitq_to_runq(vdp, bp); } /* Send the request(s) to the backend */ if (rreqready) xdf_ring_push(vdp); mutex_exit(&vdp->xdf_dev_lk); } /* check if partition is open, -1 - check all partitions on the disk */ static boolean_t xdf_isopen(xdf_t *vdp, int partition) { int i; ulong_t parbit; boolean_t rval = B_FALSE; ASSERT((partition == -1) || ((partition >= 0) || (partition < XDF_PEXT))); if (partition == -1) parbit = (ulong_t)-1; else parbit = 1 << partition; for (i = 0; i < OTYPCNT; i++) { if (vdp->xdf_vd_open[i] & parbit) rval = B_TRUE; } return (rval); } /* * The connection should never be closed as long as someone is holding * us open, there is pending IO, or someone is waiting waiting for a * connection. 
*/
/*
 * xdf_busy() requires xdf_dev_lk.  It returns B_TRUE when any of the
 * following hold: the ring has unconsumed responses, there are active
 * vreqs (or xdf_f_act is set), any partition is open, or at least one
 * connect request is outstanding.  In each of those cases the device
 * is asserted not to be in the XD_CLOSED state.
 */
static boolean_t
xdf_busy(xdf_t *vdp)
{
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if ((vdp->xdf_xb_ring != NULL) &&
	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
		ASSERT(vdp->xdf_state != XD_CLOSED);
		return (B_TRUE);
	}

	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
		ASSERT(vdp->xdf_state != XD_CLOSED);
		return (B_TRUE);
	}

	if (xdf_isopen(vdp, -1)) {
		ASSERT(vdp->xdf_state != XD_CLOSED);
		return (B_TRUE);
	}

	if (vdp->xdf_connect_req > 0) {
		ASSERT(vdp->xdf_state != XD_CLOSED);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Record a new driver state and wake every thread sleeping on
 * xdf_dev_cv.  Both xdf_cb_lk and xdf_dev_lk must be held.
 */
static void
xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
	    vdp->xdf_addr, vdp->xdf_state, new_state));
	vdp->xdf_state = new_state;
	cv_broadcast(&vdp->xdf_dev_cv);
}

/*
 * Tear down the connection to the backend.
 *
 * new_state - requested state, XD_UNKNOWN or XD_CLOSED.  If the device
 *             is busy (see xdf_busy()) it ends up in XD_UNKNOWN
 *             regardless of what was requested.
 * quiet     - suppress the DEBUG-only "disconnected while in use"
 *             warning.
 *
 * Must be called with xdf_cb_lk held and xdf_dev_lk not held;
 * xdf_dev_lk is taken and dropped internally.
 */
static void
xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
{
	dev_info_t *dip = vdp->xdf_dip;
	boolean_t busy;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));

	/* Check if we're already there. */
	if (vdp->xdf_state == new_state)
		return;

	mutex_enter(&vdp->xdf_dev_lk);
	busy = xdf_busy(vdp);

	/* If we're already closed then there's nothing to do. */
	if (vdp->xdf_state == XD_CLOSED) {
		ASSERT(!busy);
		xdf_set_state(vdp, new_state);
		mutex_exit(&vdp->xdf_dev_lk);
		return;
	}

#ifdef DEBUG
	/* UhOh.  Warn the user that something bad has happened. */
	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
	    (vdp->xdf_xdev_nblocks != 0)) {
		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
		    vdp->xdf_addr);
	}
#endif /* DEBUG */

	xdf_ring_destroy(vdp);

	/* If we're busy then we can only go into the unknown state */
	xdf_set_state(vdp, (busy) ?
XD_UNKNOWN : new_state);
	mutex_exit(&vdp->xdf_dev_lk);

	/* if we're closed now, let the other end know */
	if (vdp->xdf_state == XD_CLOSED)
		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
}


/*
 * Kick-off connect process
 * Status should be XD_UNKNOWN or XD_CLOSED
 * On success, status will be changed to XD_INIT
 * On error, it will be changed to XD_UNKNOWN
 *
 * Must be called with xdf_cb_lk held and xdf_dev_lk not held.
 * Returns DDI_SUCCESS or DDI_FAILURE.
 *
 * Note: the "eject pending" early return below fails without going
 * through the error path, so in that one case the state is left as it
 * was on entry.
 */
static int
xdf_setstate_init(xdf_t *vdp)
{
	dev_info_t		*dip = vdp->xdf_dip;
	xenbus_transaction_t	xbt;
	grant_ref_t		gref;
	char			*xsname, *str;
	int			rv;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
	    (vdp->xdf_state == XD_CLOSED));

	DPRINTF(DDI_DBG,
	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));

	/*
	 * If an eject is pending then don't allow a new connection.
	 * (Only the backend can clear media request eject request.)
	 */
	if (xdf_eject_pending(vdp))
		return (DDI_FAILURE);

	if ((xsname = xvdi_get_xsname(dip)) == NULL)
		goto errout;

	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
		goto errout;

	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);

	/*
	 * Sanity check for the existance of the xenbus device-type property.
	 * This property might not exist if we our xenbus device nodes was
	 * force destroyed while we were still connected to the backend.
*/
	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
		goto errout;
	/* only the property's existence matters; discard the value */
	strfree(str);

	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
		goto errout;

	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
	/* hook xdf_intr up to the event channel */
#ifdef XPV_HVM_DRIVER
	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
#else /* !XPV_HVM_DRIVER */
	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
	    DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
		    "failed to add intr handler", vdp->xdf_addr);
		goto errout1;
	}
#endif /* !XPV_HVM_DRIVER */

	/* allocate the shared ring; gref is published to the backend below */
	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
	    DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
		    vdp->xdf_addr);
		goto errout2;
	}
	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */

	/*
	 * Write into xenstore the info needed by backend
	 */
trans_retry:
	if (xenbus_transaction_start(&xbt)) {
		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
		    vdp->xdf_addr);
		xvdi_fatal_error(dip, EIO, "connect transaction init");
		goto fail_trans;
	}

	/*
	 * XBP_PROTOCOL is written by the domain builder in the case of PV
	 * domains. However, it is not written for HVM domains, so let's
	 * write it here.
*/
	if (((rv = xenbus_printf(xbt, xsname,
	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
	    ((rv = xenbus_printf(xbt, xsname,
	    XBP_RING_REF, "%u", gref)) != 0) ||
	    ((rv = xenbus_printf(xbt, xsname,
	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
	    ((rv = xenbus_printf(xbt, xsname,
	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
		/* abort the transaction before reporting the failure */
		(void) xenbus_transaction_end(xbt, 1);
		xvdi_fatal_error(dip, rv, "connect transaction setup");
		goto fail_trans;
	}

	/* kick-off connect process */
	if (rv = xenbus_transaction_end(xbt, 0)) {
		/* EAGAIN on commit: redo the whole transaction */
		if (rv == EAGAIN)
			goto trans_retry;
		xvdi_fatal_error(dip, rv, "connect transaction commit");
		goto fail_trans;
	}

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	mutex_enter(&vdp->xdf_dev_lk);
	xdf_set_state(vdp, XD_INIT);
	mutex_exit(&vdp->xdf_dev_lk);

	return (DDI_SUCCESS);

	/* error unwind: each label undoes one more setup step */
fail_trans:
	xvdi_free_ring(vdp->xdf_xb_ring);
errout2:
#ifdef XPV_HVM_DRIVER
	ec_unbind_evtchn(vdp->xdf_evtchn);
#else /* !XPV_HVM_DRIVER */
	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
#endif /* !XPV_HVM_DRIVER */
errout1:
	xvdi_free_evtchn(dip);
	vdp->xdf_evtchn = INVALID_EVTCHN;
errout:
	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
	    vdp->xdf_addr);
	return (DDI_FAILURE);
}

/*
 * Allocate the buffer used for barrier-write based cache flushes and
 * fill it by reading block xdf_flush_block from the backend.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if the read fails.
 *
 * NOTE(review): on read failure xdf_flush_mem is not freed here;
 * presumably it is released on a teardown path outside this chunk --
 * confirm.
 */
int
xdf_get_flush_block(xdf_t *vdp)
{
	/*
	 * Get a buffer aligned to the device sector size: allocate twice
	 * the sector size and round the start address up.
	 */
	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
	vdp->xdf_cache_flush_block =
	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
	    (int)vdp->xdf_xdev_secsize);

	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
		return (DDI_FAILURE);
	return (DDI_SUCCESS);
}

/*
 * Taskq callback (dispatched on xdf_ready_tq) that performs the
 * blocking IO needed to move the device from XD_CONNECTED to XD_READY.
 * arg is the xdf_t for the instance.
 */
static void
xdf_setstate_ready(void *arg)
{
	xdf_t	*vdp = (xdf_t *)arg;

	/* xdf_lb_rdwr() checks this to recognise IO issued from this thread */
	vdp->xdf_ready_tq_thread = curthread;

	/*
	 * We've created all the minor nodes via cmlb_attach() using default
	 * value in xdf_attach() to make it possible to block in xdf_open(),
	 * in case there's anyone (say, booting
thread) ever trying to open
	 * it before connected to backend. We will refresh all those minor
	 * nodes w/ latest info we've got now when we are almost connected.
	 */
	mutex_enter(&vdp->xdf_dev_lk);
	if (vdp->xdf_cmbl_reattach) {
		vdp->xdf_cmbl_reattach = B_FALSE;

		/* xdf_dev_lk is dropped around the re-attach */
		mutex_exit(&vdp->xdf_dev_lk);
		if (xdf_cmlb_attach(vdp) != 0) {
			/*
			 * NOTE(review): xdf_disconnect() asserts that
			 * xdf_cb_lk is held, but this function has not
			 * acquired it at this point -- confirm.
			 */
			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
			return;
		}
		mutex_enter(&vdp->xdf_dev_lk);
	}

	/* If we're not still trying to get to the ready state, then bail. */
	if (vdp->xdf_state != XD_CONNECTED) {
		mutex_exit(&vdp->xdf_dev_lk);
		return;
	}
	mutex_exit(&vdp->xdf_dev_lk);

	/*
	 * If backend has feature-barrier, see if it supports disk
	 * cache flush op.
	 */
	vdp->xdf_flush_supported = B_FALSE;
	if (vdp->xdf_feature_barrier) {
		/*
		 * Pretend we already know flush is supported so probe
		 * will attempt the correct op.
		 */
		vdp->xdf_flush_supported = B_TRUE;
		/* zero-length write with no buffer serves as the probe */
		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
			vdp->xdf_flush_supported = B_TRUE;
		} else {
			vdp->xdf_flush_supported = B_FALSE;
			/*
			 * If the other end does not support the cache flush op
			 * then we must use a barrier-write to force disk
			 * cache flushing.  Barrier writes require that a data
			 * block actually be written.
			 * Cache a block to barrier-write when we are
			 * asked to perform a flush.
			 * XXX - would it be better to just copy 1 block
			 * (512 bytes) from whatever write we did last
			 * and rewrite that block?
*/
			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
				return;
			}
		}
	}

	/* lock order: xdf_cb_lk first, then xdf_dev_lk */
	mutex_enter(&vdp->xdf_cb_lk);
	mutex_enter(&vdp->xdf_dev_lk);
	/* only promote if nothing changed the state while we did IO */
	if (vdp->xdf_state == XD_CONNECTED)
		xdf_set_state(vdp, XD_READY);
	mutex_exit(&vdp->xdf_dev_lk);

	/* Restart any currently queued up io */
	xdf_io_start(vdp);

	mutex_exit(&vdp->xdf_cb_lk);
}

/*
 * synthetic geometry
 */
/* sectors per track and heads used to fabricate a physical geometry */
#define	XDF_NSECTS	256
#define	XDF_NHEADS	16

/*
 * Fill *geomp with a fabricated physical geometry for the device:
 * XDF_NHEADS heads, XDF_NSECTS sectors per track, and a cylinder count
 * derived from xdf_xdev_nblocks (at least 1).  Sector size and
 * capacity are taken from the values read from the backend.
 */
static void
xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	xdf_t *vdp;
	uint_t ncyl;

	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));

	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);

	bzero(geomp, sizeof (*geomp));
	/* never report zero cylinders, even for tiny or empty devices */
	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
	geomp->g_acyl = 0;
	geomp->g_nhead = XDF_NHEADS;
	geomp->g_nsect = XDF_NSECTS;
	geomp->g_secsize = vdp->xdf_xdev_secsize;
	geomp->g_capacity = vdp->xdf_xdev_nblocks;
	geomp->g_intrlv = 0;
	geomp->g_rpm = 7200;
}

/*
 * Finish other initialization after we've connected to backend
 * Status should be XD_INIT before calling this routine
 * On success, status should be changed to XD_CONNECTED.
 * On error, status should stay XD_INIT
 *
 * Must be called with xdf_cb_lk held and xdf_dev_lk not held.
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
xdf_setstate_connected(xdf_t *vdp)
{
	dev_info_t	*dip = vdp->xdf_dip;
	cmlb_geom_t	pgeom;
	diskaddr_t	nblocks = 0;
	uint_t		secsize = 0;
	char		*oename, *xsname, *str;
	uint_t		dinfo;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
	ASSERT(vdp->xdf_state == XD_INIT);

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    ((oename = xvdi_get_oename(dip)) == NULL))
		return (DDI_FAILURE);

	/* Make sure the other end is XenbusStateConnected */
	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
		return (DDI_FAILURE);

	/* Determine if feature barrier is supported by backend */
	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
		    vdp->xdf_addr);

	/*
	 * Probe backend.
Read the device size into xdf_xdev_nblocks
	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
	 * we always set VDISK_CDROM, regardless of if it's present in
	 * the xenbus info parameter.
	 */
	if (xenbus_gather(XBT_NULL, oename,
	    XBP_SECTORS, "%"SCNu64, &nblocks,
	    XBP_SECTOR_SIZE, "%u", &secsize,
	    XBP_INFO, "%u", &dinfo,
	    NULL) != 0) {
		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
		    "cannot read backend info", vdp->xdf_addr);
		return (DDI_FAILURE);
	}
	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
		    vdp->xdf_addr);
		return (DDI_FAILURE);
	}
	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
		dinfo |= VDISK_CDROM;
	strfree(str);

	/*
	 * Fall back to DEV_BSIZE when the backend reports no sector size
	 * or one whose ratio to DEV_BSIZE is not a power of two.
	 */
	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
		secsize = DEV_BSIZE;
	vdp->xdf_xdev_nblocks = nblocks;
	vdp->xdf_xdev_secsize = secsize;
#ifdef _ILP32
	/* a 32-bit kernel cannot address more than DK_MAX_BLOCKS blocks */
	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
		    "backend disk device too large with %llu blocks for"
		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
		xvdi_fatal_error(dip, EFBIG, "reading backend info");
		return (DDI_FAILURE);
	}
#endif

	/*
	 * If the physical geometry for a fixed disk has been explicity
	 * set then make sure that the specified physical geometry isn't
	 * larger than the device we connected to.
*/
	if (vdp->xdf_pgeom_fixed &&
	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
		cmn_err(CE_WARN,
		    "xdf@%s: connect failed, fixed geometry too large",
		    vdp->xdf_addr);
		return (DDI_FAILURE);
	}

	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);

	/* mark vbd is ready for I/O */
	mutex_enter(&vdp->xdf_dev_lk);
	xdf_set_state(vdp, XD_CONNECTED);

	/* check if the cmlb label should be updated */
	xdf_synthetic_pgeom(dip, &pgeom);
	if ((vdp->xdf_dinfo != dinfo) ||
	    (!vdp->xdf_pgeom_fixed &&
	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
		/* xdf_setstate_ready() redoes the cmlb attach when this is set */
		vdp->xdf_cmbl_reattach = B_TRUE;

		vdp->xdf_dinfo = dinfo;
		if (!vdp->xdf_pgeom_fixed)
			vdp->xdf_pgeom = pgeom;
	}

	/*
	 * Publish the media state and wake anyone waiting on it (see
	 * xdf_dkstate()).  CD/removable devices report EJECTED when the
	 * backend exposes zero blocks and INSERTED otherwise; all other
	 * devices report DKIO_NONE.
	 */
	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
		if (vdp->xdf_xdev_nblocks == 0) {
			vdp->xdf_mstate = DKIO_EJECTED;
			cv_broadcast(&vdp->xdf_mstate_cv);
		} else {
			vdp->xdf_mstate = DKIO_INSERTED;
			cv_broadcast(&vdp->xdf_mstate_cv);
		}
	} else {
		if (vdp->xdf_mstate != DKIO_NONE) {
			vdp->xdf_mstate = DKIO_NONE;
			cv_broadcast(&vdp->xdf_mstate_cv);
		}
	}

	mutex_exit(&vdp->xdf_dev_lk);

	/* the leading '?' in the format is part of the cmn_err() message */
	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
	    (uint64_t)vdp->xdf_xdev_nblocks);

	/* Restart any currently queued up io */
	xdf_io_start(vdp);

	/*
	 * To get to the ready state we have to do IO to the backend device,
	 * but we can't initiate IO from the other end change callback thread
	 * (which is the current context we're executing in.)  This is because
	 * if the other end disconnects while we're doing IO from the callback
	 * thread, then we can't recieve that disconnect event and we hang
	 * waiting for an IO that can never complete.
*/
	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
	    DDI_SLEEP);

	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
	return (DDI_SUCCESS);
}

/*
 * Callback invoked when the xenbus state of the other end (the
 * backend) changes.  impl_data points at the new XenbusState.
 *
 * Runs with xdf_cb_lk held for its whole duration and drives the
 * frontend state machine:
 *   Unknown/Initialising/InitWait/Initialised
 *       - unless we are already in XD_INIT, disconnect and restart
 *         the connection process.
 *   Connected
 *       - unless already XD_CONNECTED/XD_READY, bring the frontend to
 *         XD_INIT if necessary and then to XD_CONNECTED.
 *   Closing
 *       - refused while any partition is open; otherwise handled
 *         like Closed.
 *   Closed
 *       - disconnect and move to XD_CLOSED.
 *
 * State changes are ignored entirely while suspending or suspended.
 * On the way out, waiters on xdf_dev_cv are woken.
 */
/*ARGSUSED*/
static void
xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
{
	XenbusState new_state = *(XenbusState *)impl_data;
	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);

	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
	    vdp->xdf_addr, new_state));

	mutex_enter(&vdp->xdf_cb_lk);

	/* We assume that this callback is single threaded */
	ASSERT(vdp->xdf_oe_change_thread == NULL);
	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);

	/* ignore any backend state changes if we're suspending/suspended */
	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
		mutex_exit(&vdp->xdf_cb_lk);
		return;
	}

	switch (new_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
		if (vdp->xdf_state == XD_INIT)
			break;

		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
			break;
		ASSERT(vdp->xdf_state == XD_INIT);
		break;

	case XenbusStateConnected:
		if ((vdp->xdf_state == XD_CONNECTED) ||
		    (vdp->xdf_state == XD_READY))
			break;

		if (vdp->xdf_state != XD_INIT) {
			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
				break;
			ASSERT(vdp->xdf_state == XD_INIT);
		}

		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
			break;
		}
		ASSERT(vdp->xdf_state == XD_CONNECTED);
		break;

	case XenbusStateClosing:
		if (xdf_isopen(vdp, -1)) {
			cmn_err(CE_NOTE,
			    "xdf@%s: hot-unplug failed, still in use",
			    vdp->xdf_addr);
			break;
		}
		/*FALLTHROUGH*/
	case XenbusStateClosed:
		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
		break;
	}

	/* notify anybody waiting for oe state change */
	cv_broadcast(&vdp->xdf_dev_cv);
	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
	mutex_exit(&vdp->xdf_cb_lk);
}

static int
xdf_connect_locked(xdf_t *vdp, boolean_t wait)
{
	/*
	 * Try to bring the device to XD_READY.
	 *
	 * Called (and returns) with both xdf_cb_lk and xdf_dev_lk held;
	 * both are dropped and re-taken internally.  If 'wait' is
	 * B_FALSE a single attempt is made; otherwise we sleep on
	 * xdf_dev_cv between attempts until the device is ready or the
	 * wait returns 0 (elsewhere in this file a 0 return from
	 * cv_wait_sig() is treated as "interrupted by a signal").
	 *
	 * Returns the resulting xdf_state (XD_CLOSED immediately if the
	 * device is already closed).
	 *
	 * 'reset' is the number of 0.1 sec timeouts after which the
	 * connection thread disconnects and starts over.
	 */
	int	rv, timeouts = 0, reset = 20;

	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	/* we can't connect once we're in the closed state */
	if (vdp->xdf_state == XD_CLOSED)
		return (XD_CLOSED);

	/* a non-zero xdf_connect_req makes xdf_busy() report busy */
	vdp->xdf_connect_req++;
	while (vdp->xdf_state != XD_READY) {
		/* the setstate routines require xdf_dev_lk to be dropped */
		mutex_exit(&vdp->xdf_dev_lk);

		/* only one thread at a time can be the connection thread */
		if (vdp->xdf_connect_thread == NULL)
			vdp->xdf_connect_thread = curthread;

		if (vdp->xdf_connect_thread == curthread) {
			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
				/*
				 * If we haven't established a connection
				 * within the reset time, then disconnect
				 * so we can try again, and double the reset
				 * time.  The reset time starts at 2 sec.
				 */
				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
				reset *= 2;
			}
			if (vdp->xdf_state == XD_UNKNOWN)
				(void) xdf_setstate_init(vdp);
			if (vdp->xdf_state == XD_INIT)
				(void) xdf_setstate_connected(vdp);
		}

		mutex_enter(&vdp->xdf_dev_lk);
		if (!wait || (vdp->xdf_state == XD_READY))
			goto out;

		/* drop xdf_cb_lk while sleeping; only xdf_dev_lk is kept */
		mutex_exit((&vdp->xdf_cb_lk));
		if (vdp->xdf_connect_thread != curthread) {
			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
		} else {
			/* delay for 0.1 sec */
			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
			    TR_CLOCK_TICK);
			/* -1 is counted as a timeout */
			if (rv == -1)
				timeouts++;
		}
		/* re-take the locks in order: xdf_cb_lk, then xdf_dev_lk */
		mutex_exit((&vdp->xdf_dev_lk));
		mutex_enter((&vdp->xdf_cb_lk));
		mutex_enter((&vdp->xdf_dev_lk));
		if (rv == 0)
			goto out;
	}

out:
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

	if (vdp->xdf_connect_thread == curthread) {
		/*
		 * wake up someone else so they can become the connection
		 * thread.
*/ cv_signal(&vdp->xdf_dev_cv); vdp->xdf_connect_thread = NULL; } /* Try to lock the media */ mutex_exit((&vdp->xdf_dev_lk)); (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE); mutex_enter((&vdp->xdf_dev_lk)); vdp->xdf_connect_req--; return (vdp->xdf_state); } static uint_t xdf_iorestart(caddr_t arg) { xdf_t *vdp = (xdf_t *)arg; ASSERT(vdp != NULL); mutex_enter(&vdp->xdf_dev_lk); ASSERT(ISDMACBON(vdp)); SETDMACBOFF(vdp); mutex_exit(&vdp->xdf_dev_lk); xdf_io_start(vdp); return (DDI_INTR_CLAIMED); } #if defined(XPV_HVM_DRIVER) typedef struct xdf_hvm_entry { list_node_t xdf_he_list; char *xdf_he_path; dev_info_t *xdf_he_dip; } xdf_hvm_entry_t; static list_t xdf_hvm_list; static kmutex_t xdf_hvm_list_lock; static xdf_hvm_entry_t * i_xdf_hvm_find(const char *path, dev_info_t *dip) { xdf_hvm_entry_t *i; ASSERT((path != NULL) || (dip != NULL)); ASSERT(MUTEX_HELD(&xdf_hvm_list_lock)); i = list_head(&xdf_hvm_list); while (i != NULL) { if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) { i = list_next(&xdf_hvm_list, i); continue; } if ((dip != NULL) && (i->xdf_he_dip != dip)) { i = list_next(&xdf_hvm_list, i); continue; } break; } return (i); } dev_info_t * xdf_hvm_hold(const char *path) { xdf_hvm_entry_t *i; dev_info_t *dip; mutex_enter(&xdf_hvm_list_lock); i = i_xdf_hvm_find(path, NULL); if (i == NULL) { mutex_exit(&xdf_hvm_list_lock); return (B_FALSE); } ndi_hold_devi(dip = i->xdf_he_dip); mutex_exit(&xdf_hvm_list_lock); return (dip); } static void xdf_hvm_add(dev_info_t *dip) { xdf_hvm_entry_t *i; char *path; /* figure out the path for the dip */ path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) ddi_pathname(dip, path); i = kmem_alloc(sizeof (*i), KM_SLEEP); i->xdf_he_dip = dip; i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP); mutex_enter(&xdf_hvm_list_lock); ASSERT(i_xdf_hvm_find(path, NULL) == NULL); ASSERT(i_xdf_hvm_find(NULL, dip) == NULL); list_insert_head(&xdf_hvm_list, i); mutex_exit(&xdf_hvm_list_lock); kmem_free(path, MAXPATHLEN); } static void 
xdf_hvm_rm(dev_info_t *dip)
{
	/*
	 * Remove the list entry registered for 'dip' and free it.  The
	 * entry must exist (enforced with VERIFY).
	 */
	xdf_hvm_entry_t	*i;

	mutex_enter(&xdf_hvm_list_lock);
	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
	list_remove(&xdf_hvm_list, i);
	mutex_exit(&xdf_hvm_list_lock);

	/* the path was allocated as a string: free strlen + 1 bytes */
	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
	kmem_free(i, sizeof (*i));
}

/* Create the (empty) dip list and the lock protecting it. */
static void
xdf_hvm_init(void)
{
	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
	    offsetof(xdf_hvm_entry_t, xdf_he_list));
	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* Destroy the dip list and its lock; the list must already be empty. */
static void
xdf_hvm_fini(void)
{
	ASSERT(list_head(&xdf_hvm_list) == NULL);
	list_destroy(&xdf_hvm_list);
	mutex_destroy(&xdf_hvm_list_lock);
}

/*
 * Wait for the backend hotplug scripts to finish and then connect to
 * the backend, blocking until the device is ready.
 *
 * Returns B_TRUE if the device reached XD_READY.  Returns B_FALSE if
 * the backend path cannot be determined, the wait is interrupted by a
 * signal, the device is a CD whose backend lacks media request
 * support, or the connection attempt ends in any other state.
 */
boolean_t
xdf_hvm_connect(dev_info_t *dip)
{
	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
	char	*oename, *str;
	int	rv;

	mutex_enter(&vdp->xdf_cb_lk);

	/*
	 * Before trying to establish a connection we need to wait for the
	 * backend hotplug scripts to have run.  Once they are run the
	 * "<oename>/hotplug-status" property will be set to "connected".
	 */
	for (;;) {
		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));

		/*
		 * Get the xenbus path to the backend device.  Note that
		 * we can't cache this path (and we look it up on each pass
		 * through this loop) because it could change during
		 * suspend, resume, and migration operations.
		 */
		if ((oename = xvdi_get_oename(dip)) == NULL) {
			mutex_exit(&vdp->xdf_cb_lk);
			return (B_FALSE);
		}

		/* on a match we leave the loop still owning str */
		str = NULL;
		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
			break;

		if (str != NULL)
			strfree(str);

		/* wait for an update to "<oename>/hotplug-status" */
		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
			/* we got interrupted by a signal */
			mutex_exit(&vdp->xdf_cb_lk);
			return (B_FALSE);
		}
	}

	/* Good news.  The backend hotplug scripts have been run.
*/
	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
	strfree(str);

	/*
	 * If we're emulating a cd device and if the backend doesn't support
	 * media request operations, then we're not going to bother trying
	 * to establish a connection for a couple reasons.  First off, media
	 * requests support is required to support operations like eject and
	 * media locking.  Second, other backend platforms like Linux don't
	 * support hvm pv cdrom access.  They don't even have a backend pv
	 * driver for cdrom device nodes, so we don't want to block forever
	 * waiting for a connection to a backend driver that doesn't exist.
	 */
	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
		mutex_exit(&vdp->xdf_cb_lk);
		return (B_FALSE);
	}

	/* xdf_connect_locked() needs both locks held */
	mutex_enter(&vdp->xdf_dev_lk);
	rv = xdf_connect_locked(vdp, B_TRUE);
	mutex_exit(&vdp->xdf_dev_lk);
	mutex_exit(&vdp->xdf_cb_lk);

	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
}

/*
 * Fix the physical geometry reported for this device to *geomp.
 *
 * Returns EINVAL if the sector size is not XB_BSIZE, the capacity is
 * zero, or (once the backend size is known) the capacity exceeds the
 * backend device.  On success the geometry is copied into xdf_pgeom,
 * marked as fixed, the cmlb label is invalidated so it gets
 * re-validated, and 0 is returned.
 */
int
xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);

	/* sanity check the requested physical geometry */
	mutex_enter(&vdp->xdf_dev_lk);
	if ((geomp->g_secsize != XB_BSIZE) ||
	    (geomp->g_capacity == 0)) {
		mutex_exit(&vdp->xdf_dev_lk);
		return (EINVAL);
	}

	/*
	 * If we've already connected to the backend device then make sure
	 * we're not defining a physical geometry larger than our backend
	 * device.
*/
	if ((vdp->xdf_xdev_nblocks != 0) &&
	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
		mutex_exit(&vdp->xdf_dev_lk);
		return (EINVAL);
	}

	/* copy field by field into a zeroed structure */
	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;

	/* stops xdf_setstate_connected() overwriting the geometry */
	vdp->xdf_pgeom_fixed = B_TRUE;
	mutex_exit(&vdp->xdf_dev_lk);

	/* force a re-validation */
	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);

	return (0);
}

/* Return the XD_IS_CD() flag for this device, sampled under xdf_cb_lk. */
boolean_t
xdf_is_cd(dev_info_t *dip)
{
	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
	boolean_t	rv;

	mutex_enter(&vdp->xdf_cb_lk);
	rv = XD_IS_CD(vdp);
	mutex_exit(&vdp->xdf_cb_lk);
	return (rv);
}

/* Return the XD_IS_RM() flag for this device, sampled under xdf_cb_lk. */
boolean_t
xdf_is_rm(dev_info_t *dip)
{
	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
	boolean_t	rv;

	mutex_enter(&vdp->xdf_cb_lk);
	rv = XD_IS_RM(vdp);
	mutex_exit(&vdp->xdf_cb_lk);
	return (rv);
}

/*
 * Return the xdf_media_req_supported flag for this device, sampled
 * under xdf_cb_lk.
 */
boolean_t
xdf_media_req_supported(dev_info_t *dip)
{
	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
	boolean_t	rv;

	mutex_enter(&vdp->xdf_cb_lk);
	rv = vdp->xdf_media_req_supported;
	mutex_exit(&vdp->xdf_cb_lk);
	return (rv);
}

#endif /* XPV_HVM_DRIVER */

/*
 * TG_GETCAPACITY handler: store the capacity of the current physical
 * geometry in *capp.  Returns ENXIO if there is no soft state for the
 * instance, 0 otherwise.
 */
static int
xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
{
	xdf_t *vdp;

	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));

	if (vdp == NULL)
		return (ENXIO);

	mutex_enter(&vdp->xdf_dev_lk);
	*capp = vdp->xdf_pgeom.g_capacity;
	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
	mutex_exit(&vdp->xdf_dev_lk);
	return (0);
}

/*
 * TG_GETPHYGEOM handler: copy the current physical geometry into
 * *geomp.  Returns ENXIO if there is no soft state for the instance,
 * 0 otherwise.
 *
 * NOTE(review): unlike xdf_lb_getcap(), xdf_pgeom is read here without
 * taking xdf_dev_lk, although writers update it under that lock --
 * confirm this is intentional.
 */
static int
xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	xdf_t *vdp;

	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
		return (ENXIO);
	*geomp = vdp->xdf_pgeom;
	return (0);
}

/*
 * No real HBA, no geometry available from it
 */
/*ARGSUSED*/
static int
xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
{
	return
(EINVAL);
}

/*
 * TG_GETATTR handler: report the media as writable unless the device
 * is read-only, and always as non-rotational.  Returns ENXIO if there
 * is no soft state for the instance, 0 otherwise.
 */
static int
xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
{
	xdf_t *vdp;

	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
		return (ENXIO);

	if (XD_IS_RO(vdp))
		tgattributep->media_is_writable = 0;
	else
		tgattributep->media_is_writable = 1;
	tgattributep->media_is_rotational = 0;
	return (0);
}

/*
 * Target-geometry "getinfo" entry point: dispatch on cmd to the
 * matching handler.  'arg' is the command-specific output buffer.
 * Returns ENXIO if there is no soft state for the instance and ENOTTY
 * for an unrecognised command; otherwise the handler's result.
 */
/* ARGSUSED3 */
int
xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	int instance;
	xdf_t *vdp;

	instance = ddi_get_instance(dip);

	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
		return (ENXIO);

	switch (cmd) {
	case TG_GETPHYGEOM:
		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
	case TG_GETVIRTGEOM:
		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
	case TG_GETCAPACITY:
		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
	case TG_GETBLOCKSIZE:
		/* the sector size is sampled under xdf_cb_lk */
		mutex_enter(&vdp->xdf_cb_lk);
		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
		mutex_exit(&vdp->xdf_cb_lk);
		return (0);
	case TG_GETATTR:
		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
	default:
		return (ENOTTY);
	}
}

/*
 * Synchronous read/write entry point.
 *
 * cmd    - TG_READ for a read; anything else is issued as a write.
 * bufp   - data buffer (NULL is passed by the flush probe).
 * start  - starting block, in units of the device sector size.
 * reqlen - transfer length in bytes.
 *
 * Builds a raw buf, queues it, starts IO and waits for completion.
 * Returns EINVAL if the request extends past the capacity of the
 * current physical geometry, otherwise the result of biowait().
 *
 * NOTE(review): vdp is used without a NULL check, and the range check
 * divides by (xdf_xdev_secsize / DEV_BSIZE), which would be zero
 * should the sector size ever be smaller than DEV_BSIZE -- confirm
 * callers guarantee otherwise.
 */
/* ARGSUSED5 */
int
xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
    diskaddr_t start, size_t reqlen, void *tg_cookie)
{
	xdf_t *vdp;
	struct buf *bp;
	int err = 0;

	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));

	/* We don't allow IO from the oe_change callback thread */
	ASSERT(curthread != vdp->xdf_oe_change_thread);

	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
		return (EINVAL);

	bp = getrbuf(KM_SLEEP);
	if (cmd == TG_READ)
		bp->b_flags = B_BUSY | B_READ;
	else
		bp->b_flags = B_BUSY | B_WRITE;

	bp->b_un.b_addr = bufp;
	bp->b_bcount = reqlen;
	/* b_blkno is in DEV_BSIZE units */
	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */

	mutex_enter(&vdp->xdf_dev_lk);
	xdf_bp_push(vdp, bp);
	mutex_exit(&vdp->xdf_dev_lk);
	xdf_io_start(vdp);
	/* on the ready taskq thread, drain the ring ourselves */
	if (curthread == vdp->xdf_ready_tq_thread)
		(void) xdf_ring_drain(vdp);
	err = biowait(bp);
	ASSERT(bp->b_flags & B_DONE);
	freerbuf(bp);
	return
(err);
}

/*
 * Lock the current media.  Set the media state to "lock".
 * (Media locks are only respected by the backend driver.)
 *
 * Takes xdf_cb_lk around the request and returns the result of
 * xdf_media_req(); media must be present.
 */
static int
xdf_ioctl_mlock(xdf_t *vdp)
{
	int rv;
	mutex_enter(&vdp->xdf_cb_lk);
	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
	mutex_exit(&vdp->xdf_cb_lk);
	return (rv);
}

/*
 * Release a media lock.  Set the media state to "none".
 *
 * Takes xdf_cb_lk around the request and returns the result of
 * xdf_media_req(); media must be present.
 */
static int
xdf_ioctl_munlock(xdf_t *vdp)
{
	int rv;
	mutex_enter(&vdp->xdf_cb_lk);
	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
	mutex_exit(&vdp->xdf_cb_lk);
	return (rv);
}

/*
 * Eject the current media.  Ignores any media locks.  (Media locks
 * are only for the benefit of the backend.)
 *
 * Returns the xdf_media_req() error if the eject request cannot be
 * posted, EIO if the device does not come back to XD_READY afterwards,
 * and 0 on success.
 */
static int
xdf_ioctl_eject(xdf_t *vdp)
{
	int rv;

	mutex_enter(&vdp->xdf_cb_lk);
	/* media need not be present for an eject request */
	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
		mutex_exit(&vdp->xdf_cb_lk);
		return (rv);
	}

	/*
	 * We've set the media request xenbus parameter to eject, so now
	 * disconnect from the backend, wait for the backend to clear
	 * the media request xenbus parameter, and then we can reconnect
	 * to the backend.
	 */
	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
	mutex_enter(&vdp->xdf_dev_lk);
	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
		mutex_exit(&vdp->xdf_dev_lk);
		mutex_exit(&vdp->xdf_cb_lk);
		return (EIO);
	}
	mutex_exit(&vdp->xdf_dev_lk);
	mutex_exit(&vdp->xdf_cb_lk);
	return (0);
}

/*
 * Watch for media state changes.  This can be an insertion of a device
 * (triggered by a 'xm block-configure' request in another domain) or
 * the ejection of a device (triggered by a local "eject" operation).
 * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
*/
/*
 * xdf_dkstate(): if the current media state equals 'mstate', sleep on
 * xdf_mstate_cv (under xdf_cb_lk) until it changes.  When the state
 * has moved to DKIO_INSERTED from something else, try to lock the
 * media.  Returns EINTR if the wait is interrupted by a signal, 0
 * otherwise.
 */
static int
xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
{
	enum dkio_state		prev_state;

	mutex_enter(&vdp->xdf_cb_lk);
	prev_state = vdp->xdf_mstate;

	if (vdp->xdf_mstate == mstate) {
		/* loop to cope with wakeups that did not change the state */
		while (vdp->xdf_mstate == prev_state) {
			if (cv_wait_sig(&vdp->xdf_mstate_cv,
			    &vdp->xdf_cb_lk) == 0) {
				mutex_exit(&vdp->xdf_cb_lk);
				return (EINTR);
			}
		}
	}

	if ((prev_state != DKIO_INSERTED) &&
	    (vdp->xdf_mstate == DKIO_INSERTED)) {
		/* new media: best-effort lock, result ignored */
		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
		mutex_exit(&vdp->xdf_cb_lk);
		return (0);
	}

	mutex_exit(&vdp->xdf_cb_lk);
	return (0);
}

/*
 * ioctl(9E) entry point.
 *
 * Returns ENXIO if there is no soft state for the instance or the
 * partition addressed by 'dev' is not open, and ENOTTY for commands
 * that are not handled.  Label/partition commands are forwarded to
 * cmlb_ioctl(); the remaining commands are handled inline below.
 */
/*ARGSUSED*/
static int
xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	minor_t		minor = getminor(dev);
	int		part = XDF_PART(minor);
	xdf_t		*vdp;
	int		rv;

	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
	    (!xdf_isopen(vdp, part)))
		return (ENXIO);

	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
	    vdp->xdf_addr, cmd, cmd));

	switch (cmd) {
	default:
		return (ENOTTY);
	/* label and partition management is delegated to cmlb */
	case DKIOCG_PHYGEOM:
	case DKIOCG_VIRTGEOM:
	case DKIOCGGEOM:
	case DKIOCSGEOM:
	case DKIOCGAPART:
	case DKIOCSAPART:
	case DKIOCGVTOC:
	case DKIOCSVTOC:
	case DKIOCPARTINFO:
	case DKIOCGEXTVTOC:
	case DKIOCSEXTVTOC:
	case DKIOCEXTPARTINFO:
	case DKIOCGMBOOT:
	case DKIOCSMBOOT:
	case DKIOCGETEFI:
	case DKIOCSETEFI:
	case DKIOCSETEXTPART:
	case DKIOCPARTITION:
		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
		    rvalp, NULL));
	case FDEJECT:
	case DKIOCEJECT:
	case CDROMEJECT:
		return (xdf_ioctl_eject(vdp));
	case DKIOCLOCK:
		return (xdf_ioctl_mlock(vdp));
	case DKIOCUNLOCK:
		return (xdf_ioctl_munlock(vdp));
	case CDROMREADOFFSET: {
		/* CD devices only; the offset reported is always 0 */
		int offset = 0;
		if (!XD_IS_CD(vdp))
			return (ENOTTY);
		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
			return (EFAULT);
		return (0);
	}
	case DKIOCGMEDIAINFO: {
		struct dk_minfo media_info;

		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
		if (XD_IS_CD(vdp))
			media_info.dki_media_type = DK_CDROM;
		else
			media_info.dki_media_type = DK_FIXED_DISK;
if (ddi_copyout(&media_info, (void *)arg, sizeof (struct dk_minfo), mode)) return (EFAULT); return (0); } case DKIOCINFO: { struct dk_cinfo info; /* controller information */ if (XD_IS_CD(vdp)) info.dki_ctype = DKC_CDROM; else info.dki_ctype = DKC_VBD; info.dki_cnum = 0; (void) strncpy((char *)(&info.dki_cname), "xdf", 8); /* unit information */ info.dki_unit = ddi_get_instance(vdp->xdf_dip); (void) strncpy((char *)(&info.dki_dname), "xdf", 8); info.dki_flags = DKI_FMTVOL; info.dki_partition = part; info.dki_maxtransfer = maxphys / DEV_BSIZE; info.dki_addr = 0; info.dki_space = 0; info.dki_prio = 0; info.dki_vec = 0; if (ddi_copyout(&info, (void *)arg, sizeof (info), mode)) return (EFAULT); return (0); } case DKIOCSTATE: { enum dkio_state mstate; if (ddi_copyin((void *)arg, &mstate, sizeof (mstate), mode) != 0) return (EFAULT); if ((rv = xdf_dkstate(vdp, mstate)) != 0) return (rv); mstate = vdp->xdf_mstate; if (ddi_copyout(&mstate, (void *)arg, sizeof (mstate), mode) != 0) return (EFAULT); return (0); } case DKIOCREMOVABLE: { int i = BOOLEAN2VOID(XD_IS_RM(vdp)); if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode)) return (EFAULT); return (0); } case DKIOCGETWCE: { int i = BOOLEAN2VOID(XD_IS_RM(vdp)); if (ddi_copyout(&i, (void *)arg, sizeof (i), mode)) return (EFAULT); return (0); } case DKIOCSETWCE: { int i; if (ddi_copyin((void *)arg, &i, sizeof (i), mode)) return (EFAULT); vdp->xdf_wce = VOID2BOOLEAN(i); return (0); } case DKIOCFLUSHWRITECACHE: { struct dk_callback *dkc = (struct dk_callback *)arg; if (vdp->xdf_flush_supported) { rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, (void *)dev); } else if (vdp->xdf_feature_barrier && !xdf_barrier_flush_disable) { rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, vdp->xdf_cache_flush_block, xdf_flush_block, vdp->xdf_xdev_secsize, (void *)dev); } else { return (ENOTTY); } if ((mode & FKIOCTL) && (dkc != NULL) && (dkc->dkc_callback != NULL)) { (*dkc->dkc_callback)(dkc->dkc_cookie, rv); /* need to return 0 after calling 
callback */ rv = 0; } return (rv); } } /*NOTREACHED*/ } static int xdf_strategy(struct buf *bp) { xdf_t *vdp; minor_t minor; diskaddr_t p_blkct, p_blkst; daddr_t blkno; ulong_t nblks; int part; minor = getminor(bp->b_edev); part = XDF_PART(minor); vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor)); mutex_enter(&vdp->xdf_dev_lk); if (!xdf_isopen(vdp, part)) { mutex_exit(&vdp->xdf_dev_lk); xdf_io_err(bp, ENXIO, 0); return (0); } /* We don't allow IO from the oe_change callback thread */ ASSERT(curthread != vdp->xdf_oe_change_thread); /* Check for writes to a read only device */ if (!IS_READ(bp) && XD_IS_RO(vdp)) { mutex_exit(&vdp->xdf_dev_lk); xdf_io_err(bp, EROFS, 0); return (0); } /* Check if this I/O is accessing a partition or the entire disk */ if ((long)bp->b_private == XB_SLICE_NONE) { /* This I/O is using an absolute offset */ p_blkct = vdp->xdf_xdev_nblocks; p_blkst = 0; } else { /* This I/O is using a partition relative offset */ mutex_exit(&vdp->xdf_dev_lk); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct, &p_blkst, NULL, NULL, NULL)) { xdf_io_err(bp, ENXIO, 0); return (0); } mutex_enter(&vdp->xdf_dev_lk); } /* * Adjust the real blkno and bcount according to the underline * physical sector size. 
*/ blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE); /* check for a starting block beyond the disk or partition limit */ if (blkno > p_blkct) { DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64, vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct)); mutex_exit(&vdp->xdf_dev_lk); xdf_io_err(bp, EINVAL, 0); return (0); } /* Legacy: don't set error flag at this case */ if (blkno == p_blkct) { mutex_exit(&vdp->xdf_dev_lk); bp->b_resid = bp->b_bcount; biodone(bp); return (0); } /* sanitize the input buf */ bioerror(bp, 0); bp->b_resid = 0; bp->av_back = bp->av_forw = NULL; /* Adjust for partial transfer, this will result in an error later */ if (vdp->xdf_xdev_secsize != 0 && vdp->xdf_xdev_secsize != XB_BSIZE) { nblks = bp->b_bcount / vdp->xdf_xdev_secsize; } else { nblks = bp->b_bcount >> XB_BSHIFT; } if ((blkno + nblks) > p_blkct) { if (vdp->xdf_xdev_secsize != 0 && vdp->xdf_xdev_secsize != XB_BSIZE) { bp->b_resid = ((blkno + nblks) - p_blkct) * vdp->xdf_xdev_secsize; } else { bp->b_resid = ((blkno + nblks) - p_blkct) << XB_BSHIFT; } bp->b_bcount -= bp->b_resid; } DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n", vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount)); /* Fix up the buf struct */ bp->b_flags |= B_BUSY; bp->b_private = (void *)(uintptr_t)p_blkst; xdf_bp_push(vdp, bp); mutex_exit(&vdp->xdf_dev_lk); xdf_io_start(vdp); if (do_polled_io) (void) xdf_ring_drain(vdp); return (0); } /*ARGSUSED*/ static int xdf_read(dev_t dev, struct uio *uiop, cred_t *credp) { xdf_t *vdp; minor_t minor; diskaddr_t p_blkcnt; int part; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n", vdp->xdf_addr, (int64_t)uiop->uio_offset)); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) return (ENXIO); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, NULL, NULL, NULL, NULL)) return (ENXIO); if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp)) return 
(ENOSPC); if (U_INVAL(uiop)) return (EINVAL); return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop)); } /*ARGSUSED*/ static int xdf_write(dev_t dev, struct uio *uiop, cred_t *credp) { xdf_t *vdp; minor_t minor; diskaddr_t p_blkcnt; int part; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n", vdp->xdf_addr, (int64_t)uiop->uio_offset)); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) return (ENXIO); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, NULL, NULL, NULL, NULL)) return (ENXIO); if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp)) return (ENOSPC); if (U_INVAL(uiop)) return (EINVAL); return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop)); } /*ARGSUSED*/ static int xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp) { xdf_t *vdp; minor_t minor; struct uio *uiop = aiop->aio_uio; diskaddr_t p_blkcnt; int part; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) return (ENXIO); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, NULL, NULL, NULL, NULL)) return (ENXIO); if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp)) return (ENOSPC); if (U_INVAL(uiop)) return (EINVAL); return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop)); } /*ARGSUSED*/ static int xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp) { xdf_t *vdp; minor_t minor; struct uio *uiop = aiop->aio_uio; diskaddr_t p_blkcnt; int part; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) return (ENXIO); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, NULL, NULL, NULL, NULL)) return (ENXIO); if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp)) return (ENOSPC); if (U_INVAL(uiop)) return (EINVAL); return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, 
xdfmin, aiop)); } static int xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) { struct buf dumpbuf, *dbp = &dumpbuf; xdf_t *vdp; minor_t minor; int err = 0; int part; diskaddr_t p_blkcnt, p_blkst; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n", vdp->xdf_addr, (void *)addr, blkno, nblk)); /* We don't allow IO from the oe_change callback thread */ ASSERT(curthread != vdp->xdf_oe_change_thread); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) return (ENXIO); if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst, NULL, NULL, NULL)) return (ENXIO); if ((blkno + nblk) > (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) { cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64, vdp->xdf_addr, (daddr_t)((blkno + nblk) / (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt); return (EINVAL); } bioinit(dbp); dbp->b_flags = B_BUSY; dbp->b_un.b_addr = addr; dbp->b_bcount = nblk << DEV_BSHIFT; dbp->b_blkno = blkno; dbp->b_edev = dev; dbp->b_private = (void *)(uintptr_t)p_blkst; mutex_enter(&vdp->xdf_dev_lk); xdf_bp_push(vdp, dbp); mutex_exit(&vdp->xdf_dev_lk); xdf_io_start(vdp); err = xdf_ring_drain(vdp); biofini(dbp); return (err); } /*ARGSUSED*/ static int xdf_close(dev_t dev, int flag, int otyp, struct cred *credp) { minor_t minor; xdf_t *vdp; int part; ulong_t parbit; minor = getminor(dev); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); mutex_enter(&vdp->xdf_dev_lk); part = XDF_PART(minor); if (!xdf_isopen(vdp, part)) { mutex_exit(&vdp->xdf_dev_lk); return (ENXIO); } parbit = 1 << part; ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0); if (otyp == OTYP_LYR) { ASSERT(vdp->xdf_vd_lyropen[part] > 0); if (--vdp->xdf_vd_lyropen[part] == 0) vdp->xdf_vd_open[otyp] &= ~parbit; } else { vdp->xdf_vd_open[otyp] &= ~parbit; } vdp->xdf_vd_exclopen &= ~parbit; mutex_exit(&vdp->xdf_dev_lk); return (0); } 
static int xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp) { minor_t minor; xdf_t *vdp; int part; ulong_t parbit; diskaddr_t p_blkct = 0; boolean_t firstopen; boolean_t nodelay; minor = getminor(*devp); if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) return (ENXIO); nodelay = (flag & (FNDELAY | FNONBLOCK)); DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr)); /* do cv_wait until connected or failed */ mutex_enter(&vdp->xdf_cb_lk); mutex_enter(&vdp->xdf_dev_lk); if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) { mutex_exit(&vdp->xdf_dev_lk); mutex_exit(&vdp->xdf_cb_lk); return (ENXIO); } mutex_exit(&vdp->xdf_cb_lk); if ((flag & FWRITE) && XD_IS_RO(vdp)) { mutex_exit(&vdp->xdf_dev_lk); return (EROFS); } part = XDF_PART(minor); parbit = 1 << part; if ((vdp->xdf_vd_exclopen & parbit) || ((flag & FEXCL) && xdf_isopen(vdp, part))) { mutex_exit(&vdp->xdf_dev_lk); return (EBUSY); } /* are we the first one to open this node? */ firstopen = !xdf_isopen(vdp, -1); if (otyp == OTYP_LYR) vdp->xdf_vd_lyropen[part]++; vdp->xdf_vd_open[otyp] |= parbit; if (flag & FEXCL) vdp->xdf_vd_exclopen |= parbit; mutex_exit(&vdp->xdf_dev_lk); /* force a re-validation */ if (firstopen) cmlb_invalidate(vdp->xdf_vd_lbl, NULL); /* If this is a non-blocking open then we're done */ if (nodelay) return (0); /* * This is a blocking open, so we require: * - that the disk have a valid label on it * - that the size of the partition that we're opening is non-zero */ if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct, NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) { (void) xdf_close(*devp, flag, otyp, credp); return (ENXIO); } return (0); } /*ARGSUSED*/ static void xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg) { xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip); cv_broadcast(&vdp->xdf_hp_status_cv); } static int xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags, char *name, caddr_t valuep, int *lengthp) { xdf_t *vdp 
= ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip)); /* * Sanity check that if a dev_t or dip were specified that they * correspond to this device driver. On debug kernels we'll * panic and on non-debug kernels we'll return failure. */ ASSERT(ddi_driver_major(dip) == xdf_major); ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major)); if ((ddi_driver_major(dip) != xdf_major) || ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major))) return (DDI_PROP_NOT_FOUND); if (vdp == NULL) return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp)); return (cmlb_prop_op(vdp->xdf_vd_lbl, dev, dip, prop_op, flags, name, valuep, lengthp, XDF_PART(getminor(dev)), NULL)); } /*ARGSUSED*/ static int xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp) { int instance = XDF_INST(getminor((dev_t)arg)); xdf_t *vbdp; switch (cmd) { case DDI_INFO_DEVT2DEVINFO: if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) { *rp = NULL; return (DDI_FAILURE); } *rp = vbdp->xdf_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: *rp = (void *)(uintptr_t)instance; return (DDI_SUCCESS); default: return (DDI_FAILURE); } } /*ARGSUSED*/ static int xdf_resume(dev_info_t *dip) { xdf_t *vdp; char *oename; if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL) goto err; if (xdf_debug & SUSRES_DBG) xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr); mutex_enter(&vdp->xdf_cb_lk); if (xvdi_resume(dip) != DDI_SUCCESS) { mutex_exit(&vdp->xdf_cb_lk); goto err; } if (((oename = xvdi_get_oename(dip)) == NULL) || (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS, xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) { mutex_exit(&vdp->xdf_cb_lk); goto err; } mutex_enter(&vdp->xdf_dev_lk); ASSERT(vdp->xdf_state != XD_READY); xdf_set_state(vdp, XD_UNKNOWN); mutex_exit(&vdp->xdf_dev_lk); if (xdf_setstate_init(vdp) != DDI_SUCCESS) { mutex_exit(&vdp->xdf_cb_lk); goto err; } mutex_exit(&vdp->xdf_cb_lk); if (xdf_debug & SUSRES_DBG) xen_printf("xdf@%s: xdf_resume: 
done\n", vdp->xdf_addr); return (DDI_SUCCESS); err: if (xdf_debug & SUSRES_DBG) xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr); return (DDI_FAILURE); } static int xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int n, instance = ddi_get_instance(dip); ddi_iblock_cookie_t ibc, softibc; boolean_t dev_iscd = B_FALSE; xdf_t *vdp; char *oename, *xsname, *str; if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM, "xdf_debug", 0)) != 0) xdf_debug = n; switch (cmd) { case DDI_RESUME: return (xdf_resume(dip)); case DDI_ATTACH: break; default: return (DDI_FAILURE); } /* DDI_ATTACH */ if (((xsname = xvdi_get_xsname(dip)) == NULL) || ((oename = xvdi_get_oename(dip)) == NULL)) return (DDI_FAILURE); /* * Disable auto-detach. This is necessary so that we don't get * detached while we're disconnected from the back end. */ if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS)) return (DDI_FAILURE); /* driver handles kernel-issued IOCTLs */ if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) return (DDI_FAILURE); if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS) return (DDI_FAILURE); if (ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS) return (DDI_FAILURE); if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) { cmn_err(CE_WARN, "xdf@%s: cannot read device-type", ddi_get_name_addr(dip)); return (DDI_FAILURE); } if (strcmp(str, XBV_DEV_TYPE_CD) == 0) dev_iscd = B_TRUE; strfree(str); if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS) return (DDI_FAILURE); DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip))); vdp = ddi_get_soft_state(xdf_ssp, instance); ddi_set_driver_private(dip, vdp); vdp->xdf_dip = dip; vdp->xdf_addr = ddi_get_name_addr(dip); vdp->xdf_suspending = B_FALSE; vdp->xdf_media_req_supported = B_FALSE; vdp->xdf_peer = INVALID_DOMID; vdp->xdf_evtchn = INVALID_EVTCHN; list_create(&vdp->xdf_vreq_act, sizeof 
(v_req_t), offsetof(v_req_t, v_link)); cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL); cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL); cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc); mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc); mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc); vdp->xdf_cmbl_reattach = B_TRUE; if (dev_iscd) { vdp->xdf_dinfo |= VDISK_CDROM; vdp->xdf_mstate = DKIO_EJECTED; } else { vdp->xdf_mstate = DKIO_NONE; } if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq", 1, TASKQ_DEFAULTPRI, 0)) == NULL) goto errout0; if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS, xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS) goto errout0; if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id, &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) { cmn_err(CE_WARN, "xdf@%s: failed to add softintr", ddi_get_name_addr(dip)); goto errout0; } /* * Initialize the physical geometry stucture. Note that currently * we don't know the size of the backend device so the number * of blocks on the device will be initialized to zero. Once * we connect to the backend device we'll update the physical * geometry to reflect the real size of the device. 
*/ xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom); vdp->xdf_pgeom_fixed = B_FALSE; /* * create default device minor nodes: non-removable disk * we will adjust minor nodes after we are connected w/ backend */ cmlb_alloc_handle(&vdp->xdf_vd_lbl); if (xdf_cmlb_attach(vdp) != 0) { cmn_err(CE_WARN, "xdf@%s: attach failed, cmlb attach failed", ddi_get_name_addr(dip)); goto errout0; } /* * We ship with cache-enabled disks */ vdp->xdf_wce = B_TRUE; mutex_enter(&vdp->xdf_cb_lk); /* Watch backend XenbusState change */ if (xvdi_add_event_handler(dip, XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) { mutex_exit(&vdp->xdf_cb_lk); goto errout0; } if (xdf_setstate_init(vdp) != DDI_SUCCESS) { cmn_err(CE_WARN, "xdf@%s: start connection failed", ddi_get_name_addr(dip)); mutex_exit(&vdp->xdf_cb_lk); goto errout1; } mutex_exit(&vdp->xdf_cb_lk); #if defined(XPV_HVM_DRIVER) xdf_hvm_add(dip); /* Report our version to dom0. */ if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d", HVMPV_XDF_VERS)) cmn_err(CE_WARN, "xdf: couldn't write version\n"); #else /* !XPV_HVM_DRIVER */ /* create kstat for iostat(1M) */ if (xdf_kstat_create(dip, "xdf", instance) != 0) { cmn_err(CE_WARN, "xdf@%s: failed to create kstat", ddi_get_name_addr(dip)); goto errout1; } #endif /* !XPV_HVM_DRIVER */ ddi_report_dev(dip); DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr)); return (DDI_SUCCESS); errout1: (void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed); xvdi_remove_event_handler(dip, XS_OE_STATE); errout0: if (vdp->xdf_vd_lbl != NULL) { cmlb_detach(vdp->xdf_vd_lbl, NULL); cmlb_free_handle(&vdp->xdf_vd_lbl); vdp->xdf_vd_lbl = NULL; } if (vdp->xdf_softintr_id != NULL) ddi_remove_softintr(vdp->xdf_softintr_id); xvdi_remove_xb_watch_handlers(dip); if (vdp->xdf_ready_tq != NULL) ddi_taskq_destroy(vdp->xdf_ready_tq); mutex_destroy(&vdp->xdf_cb_lk); mutex_destroy(&vdp->xdf_dev_lk); cv_destroy(&vdp->xdf_dev_cv); cv_destroy(&vdp->xdf_hp_status_cv); ddi_soft_state_free(xdf_ssp, instance); 
ddi_set_driver_private(dip, NULL); ddi_prop_remove_all(dip); cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip)); return (DDI_FAILURE); } static int xdf_suspend(dev_info_t *dip) { int instance = ddi_get_instance(dip); xdf_t *vdp; if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) return (DDI_FAILURE); if (xdf_debug & SUSRES_DBG) xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr); xvdi_suspend(dip); mutex_enter(&vdp->xdf_cb_lk); mutex_enter(&vdp->xdf_dev_lk); vdp->xdf_suspending = B_TRUE; xdf_ring_destroy(vdp); xdf_set_state(vdp, XD_SUSPEND); vdp->xdf_suspending = B_FALSE; mutex_exit(&vdp->xdf_dev_lk); mutex_exit(&vdp->xdf_cb_lk); if (xdf_debug & SUSRES_DBG) xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr); return (DDI_SUCCESS); } static int xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { xdf_t *vdp; int instance; switch (cmd) { case DDI_PM_SUSPEND: break; case DDI_SUSPEND: return (xdf_suspend(dip)); case DDI_DETACH: break; default: return (DDI_FAILURE); } instance = ddi_get_instance(dip); DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip))); vdp = ddi_get_soft_state(xdf_ssp, instance); if (vdp == NULL) return (DDI_FAILURE); mutex_enter(&vdp->xdf_cb_lk); xdf_disconnect(vdp, XD_CLOSED, B_FALSE); if (vdp->xdf_state != XD_CLOSED) { mutex_exit(&vdp->xdf_cb_lk); return (DDI_FAILURE); } mutex_exit(&vdp->xdf_cb_lk); ASSERT(!ISDMACBON(vdp)); #if defined(XPV_HVM_DRIVER) xdf_hvm_rm(dip); #endif /* XPV_HVM_DRIVER */ if (vdp->xdf_timeout_id != 0) (void) untimeout(vdp->xdf_timeout_id); xvdi_remove_event_handler(dip, XS_OE_STATE); ddi_taskq_destroy(vdp->xdf_ready_tq); cmlb_detach(vdp->xdf_vd_lbl, NULL); cmlb_free_handle(&vdp->xdf_vd_lbl); /* we'll support backend running in domU later */ #ifdef DOMU_BACKEND (void) xvdi_post_event(dip, XEN_HP_REMOVE); #endif list_destroy(&vdp->xdf_vreq_act); ddi_prop_remove_all(dip); xdf_kstat_delete(dip); ddi_remove_softintr(vdp->xdf_softintr_id); xvdi_remove_xb_watch_handlers(dip); 
ddi_set_driver_private(dip, NULL); cv_destroy(&vdp->xdf_dev_cv); mutex_destroy(&vdp->xdf_cb_lk); mutex_destroy(&vdp->xdf_dev_lk); if (vdp->xdf_cache_flush_block != NULL) kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize); ddi_soft_state_free(xdf_ssp, instance); return (DDI_SUCCESS); } /* * Driver linkage structures. */ static struct cb_ops xdf_cbops = { xdf_open, xdf_close, xdf_strategy, nodev, xdf_dump, xdf_read, xdf_write, xdf_ioctl, nodev, nodev, nodev, nochpoll, xdf_prop_op, NULL, D_MP | D_NEW | D_64BIT, CB_REV, xdf_aread, xdf_awrite }; struct dev_ops xdf_devops = { DEVO_REV, /* devo_rev */ 0, /* devo_refcnt */ xdf_getinfo, /* devo_getinfo */ nulldev, /* devo_identify */ nulldev, /* devo_probe */ xdf_attach, /* devo_attach */ xdf_detach, /* devo_detach */ nodev, /* devo_reset */ &xdf_cbops, /* devo_cb_ops */ NULL, /* devo_bus_ops */ NULL, /* devo_power */ ddi_quiesce_not_supported, /* devo_quiesce */ }; /* * Module linkage structures. */ static struct modldrv modldrv = { &mod_driverops, /* Type of module. 
This one is a driver */ "virtual block driver", /* short description */ &xdf_devops /* driver specific ops */ }; static struct modlinkage xdf_modlinkage = { MODREV_1, (void *)&modldrv, NULL }; /* * standard module entry points */ int _init(void) { int rc; xdf_major = ddi_name_to_major("xdf"); if (xdf_major == (major_t)-1) return (EINVAL); if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0) return (rc); xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache", sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0); xdf_gs_cache = kmem_cache_create("xdf_gs_cache", sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0); #if defined(XPV_HVM_DRIVER) xdf_hvm_init(); #endif /* XPV_HVM_DRIVER */ if ((rc = mod_install(&xdf_modlinkage)) != 0) { #if defined(XPV_HVM_DRIVER) xdf_hvm_fini(); #endif /* XPV_HVM_DRIVER */ kmem_cache_destroy(xdf_vreq_cache); kmem_cache_destroy(xdf_gs_cache); ddi_soft_state_fini(&xdf_ssp); return (rc); } return (rc); } int _fini(void) { int err; if ((err = mod_remove(&xdf_modlinkage)) != 0) return (err); #if defined(XPV_HVM_DRIVER) xdf_hvm_fini(); #endif /* XPV_HVM_DRIVER */ kmem_cache_destroy(xdf_vreq_cache); kmem_cache_destroy(xdf_gs_cache); ddi_soft_state_fini(&xdf_ssp); return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&xdf_modlinkage, modinfop)); }