1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29 * Copyright 2017 Nexenta Systems, Inc.
30 */
31
32 /*
33 * xdf.c - Xen Virtual Block Device Driver
34 * TODO:
35 * - support alternate block size (currently only DEV_BSIZE supported)
36 * - revalidate geometry for removable devices
37 *
38 * This driver exports disk device nodes, accepts IO requests from those
39 * nodes, and services those requests by talking to a backend device
40 * in another domain.
41 *
42 * Communication with the backend device is done via a ringbuffer (which is
43 * managed via xvdi interfaces) and dma memory (which is managed via ddi
44 * interfaces).
45 *
46 * Communication with the backend device is dependent upon establishing a
47 * connection to the backend device. This connection process involves
48 * reading device configuration information from xenbus and publishing
49 * some frontend runtime configuration parameters via the xenbus (for
50 * consumption by the backend). Once we've published runtime configuration
51 * information via the xenbus, the backend device can enter the connected
52 * state and we'll enter the XD_CONNECTED state. But before we can allow
53 * random IO to begin, we need to do IO to the backend device to determine
54 * the device label and if flush operations are supported. Once this is
55 * done we enter the XD_READY state and can process any IO operations.
56 *
57 * We receive notifications of xenbus state changes for the backend device
58 * (aka, the "other end") via the xdf_oe_change() callback. This callback
59 * is single threaded, meaning that we can't receive new notifications of
60 * other end state changes while we're processing an outstanding
61 * notification of an other end state change. Therefore we can't do any
62 * blocking operations from the xdf_oe_change() callback. This is why we
63 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
64 * IO to get us from the XD_CONNECTED to the XD_READY state. All IO
65 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
66 * through xdf_lb_rdwr(), which is a synchronous IO interface. IOs
67 * generated by the xdf_ready_tq_thread thread have priority over all
68 * other IO requests.
69 *
70 * We also communicate with the backend device via the xenbus "media-req"
71 * (XBP_MEDIA_REQ) property. For more information on this see the
72 * comments in blkif.h.
73 */
74
75 #include <io/xdf.h>
76
77 #include <sys/conf.h>
78 #include <sys/dkio.h>
79 #include <sys/promif.h>
80 #include <sys/sysmacros.h>
81 #include <sys/kstat.h>
82 #include <sys/mach_mmu.h>
83 #ifdef XPV_HVM_DRIVER
84 #include <sys/xpv_support.h>
85 #else /* !XPV_HVM_DRIVER */
86 #include <sys/evtchn_impl.h>
87 #endif /* !XPV_HVM_DRIVER */
88 #include <sys/sunndi.h>
89 #include <public/io/xenbus.h>
90 #include <xen/sys/xenbus_impl.h>
91 #include <sys/scsi/generic/inquiry.h>
92 #include <xen/io/blkif_impl.h>
93 #include <sys/fdio.h>
94 #include <sys/cdio.h>
95
96 /*
97 * DEBUG_EVAL can be used to include debug only statements without
98 * having to use '#ifdef DEBUG' statements
99 */
100 #ifdef DEBUG
101 #define DEBUG_EVAL(x) (x)
102 #else /* !DEBUG */
103 #define DEBUG_EVAL(x)
104 #endif /* !DEBUG */
105
106 #define XDF_DRAIN_MSEC_DELAY (50*1000) /* 00.05 sec */
107 #define XDF_DRAIN_RETRY_COUNT 200 /* 10.00 sec */
108 #define XDF_STATE_TIMEOUT (30*1000*1000) /* 30.00 sec */
109
110 #define INVALID_DOMID ((domid_t)-1)
111 #define FLUSH_DISKCACHE 0x1
112 #define WRITE_BARRIER 0x2
113 #define DEFAULT_FLUSH_BLOCK 156 /* block to write to cause a cache flush */
114 #define USE_WRITE_BARRIER(vdp) \
115 ((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
116 #define USE_FLUSH_DISKCACHE(vdp) \
117 ((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
118 #define IS_WRITE_BARRIER(vdp, bp) \
119 (!IS_READ(bp) && USE_WRITE_BARRIER(vdp) && \
120 ((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
121 #define IS_FLUSH_DISKCACHE(bp) \
122 (!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
123
124 #define VREQ_DONE(vreq) \
125 VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) && \
126 (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) || \
127 (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))
128
129 #define BP_VREQ(bp) ((v_req_t *)((bp)->av_back))
130 #define BP_VREQ_SET(bp, vreq) (((bp)->av_back = (buf_t *)(vreq)))
131
132 extern int do_polled_io;
133
134 /* run-time tunables that we don't want the compiler to optimize away */
135 volatile int xdf_debug = 0;
136 volatile boolean_t xdf_barrier_flush_disable = B_FALSE;
137
138 /* per module globals */
139 major_t xdf_major;
140 static void *xdf_ssp;
141 static kmem_cache_t *xdf_vreq_cache;
142 static kmem_cache_t *xdf_gs_cache;
143 static int xdf_maxphys = XB_MAXPHYS;
144 static diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
145 static int xdf_fbrewrites; /* flush block re-write count */
146
147 /* misc public functions */
148 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
149 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
150
151 /* misc private functions */
152 static void xdf_io_start(xdf_t *);
153 static void xdf_devid_setup(xdf_t *);
154
155 /* callbacks from common label */
156 static cmlb_tg_ops_t xdf_lb_ops = {
157 TG_DK_OPS_VERSION_1,
158 xdf_lb_rdwr,
159 xdf_lb_getinfo
160 };
161
162 /*
163 * I/O buffer DMA attributes
164 * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST segments
165 */
166 static ddi_dma_attr_t xb_dma_attr = {
167 DMA_ATTR_V0,
168 (uint64_t)0, /* lowest address */
169 (uint64_t)0xffffffffffffffff, /* highest usable address */
170 (uint64_t)0xffffff, /* DMA counter limit max */
171 (uint64_t)XB_BSIZE, /* alignment in bytes */
172 XB_BSIZE - 1, /* bitmap of burst sizes */
173 XB_BSIZE, /* min transfer */
174 (uint64_t)XB_MAX_XFER, /* maximum transfer */
175 (uint64_t)PAGEOFFSET, /* 1 page segment length */
176 BLKIF_MAX_SEGMENTS_PER_REQUEST, /* maximum number of segments */
177 XB_BSIZE, /* granularity */
178 0, /* flags (reserved) */
179 };
180
181 static ddi_device_acc_attr_t xc_acc_attr = {
182 DDI_DEVICE_ATTR_V0,
183 DDI_NEVERSWAP_ACC,
184 DDI_STRICTORDER_ACC
185 };
186
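/*
 * Timeout handler scheduled via timeout(9F) when we fail to allocate a
 * v_req_t or ge_slot_t.  Clear the pending timeout id and restart any
 * queued I/O.
 */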
187 static void
188 xdf_timeout_handler(void *arg)
189 {
190 xdf_t *vdp = arg;
191
192 mutex_enter(&vdp->xdf_dev_lk);
193 vdp->xdf_timeout_id = 0;
194 mutex_exit(&vdp->xdf_dev_lk);
195
196 /* new timeout thread could be re-scheduled */
197 xdf_io_start(vdp);
198 }
199
200 /*
201 * callback func invoked when DMA/GTE resources become available
202 *
203 * Note: we only register one callback function with the grant table subsystem
204 * since we only have one 'struct gnttab_free_callback' in xdf_t.
205 */
206 static void
207 xdf_gncallback(void *arg)
208 {
209 xdf_t *vdp = arg;
210 ASSERT(vdp != NULL);
211
212 DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
213 vdp->xdf_addr));
214
215 ddi_trigger_softintr(vdp->xdf_softintr_id);
216 }
217
218 static int
219 xdf_dmacallback(caddr_t arg)
220 {
221 xdf_gncallback(arg);
222 return (DDI_DMA_CALLBACK_DONE);
223 }
224
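/*
 * Allocate a grant table entry slot (ge_slot_t) and reserve the grant
 * references it will need.  If either allocation fails, arrange for I/O
 * to be restarted later (via the grant table free callback or a timeout)
 * and return NULL.
 */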
225 static ge_slot_t *
226 gs_get(xdf_t *vdp, int isread)
227 {
228 grant_ref_t gh;
229 ge_slot_t *gs;
230
231 /* try to alloc GTEs needed in this slot, first */
232 if (gnttab_alloc_grant_references(
233 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
234 if (vdp->xdf_gnt_callback.next == NULL) {
235 SETDMACBON(vdp);
236 gnttab_request_free_callback(
237 &vdp->xdf_gnt_callback,
238 xdf_gncallback,
239 (void *)vdp,
240 BLKIF_MAX_SEGMENTS_PER_REQUEST);
241 }
242 return (NULL);
243 }
244
245 gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
246 if (gs == NULL) {
247 gnttab_free_grant_references(gh);
248 if (vdp->xdf_timeout_id == 0)
249 /* restart I/O after one second */
250 vdp->xdf_timeout_id =
251 timeout(xdf_timeout_handler, vdp, hz);
252 return (NULL);
253 }
254
255 /* init gs_slot */
256 gs->gs_oeid = vdp->xdf_peer;
257 gs->gs_isread = isread;
258 gs->gs_ghead = gh;
259 gs->gs_ngrefs = 0;
260
261 return (gs);
262 }
263
264 static void
265 gs_free(ge_slot_t *gs)
266 {
267 int i;
268
269 /* release all grant table entry resources used in this slot */
270 for (i = 0; i < gs->gs_ngrefs; i++)
271 gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
272 gnttab_free_grant_references(gs->gs_ghead);
273 list_remove(&gs->gs_vreq->v_gs, gs);
274 kmem_cache_free(xdf_gs_cache, gs);
275 }
276
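/*
 * Claim one of this slot's reserved grant references and grant the
 * backend access to the page at 'mfn' (read-only when the backend only
 * needs to read the page, i.e. for writes).
 */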
277 static grant_ref_t
278 gs_grant(ge_slot_t *gs, mfn_t mfn)
279 {
280 grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
281
282 ASSERT(gr != -1);
283 ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
284 gs->gs_ge[gs->gs_ngrefs++] = gr;
285 gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
286
287 return (gr);
288 }
289
290 /*
291 * Alloc a vreq for this bp
292 * bp->av_back contains the pointer to the vreq upon return
293 */
294 static v_req_t *
295 vreq_get(xdf_t *vdp, buf_t *bp)
296 {
297 v_req_t *vreq = NULL;
298
299 ASSERT(BP_VREQ(bp) == NULL);
300
301 vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
302 if (vreq == NULL) {
303 if (vdp->xdf_timeout_id == 0)
304 /* restart I/O after one second */
305 vdp->xdf_timeout_id =
306 timeout(xdf_timeout_handler, vdp, hz);
307 return (NULL);
308 }
309 bzero(vreq, sizeof (v_req_t));
310 list_create(&vreq->v_gs, sizeof (ge_slot_t),
311 offsetof(ge_slot_t, gs_vreq_link));
312 vreq->v_buf = bp;
313 vreq->v_status = VREQ_INIT;
314 vreq->v_runq = B_FALSE;
315 BP_VREQ_SET(bp, vreq);
316 /* init of other fields in vreq is up to the caller */
317
318 list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
319
320 return (vreq);
321 }
322
323 static void
324 vreq_free(xdf_t *vdp, v_req_t *vreq)
325 {
326 buf_t *bp = vreq->v_buf;
327
328 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
329 ASSERT(BP_VREQ(bp) == vreq);
330
331 list_remove(&vdp->xdf_vreq_act, vreq);
332
333 if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
334 goto done;
335
336 switch (vreq->v_status) {
337 case VREQ_DMAWIN_DONE:
338 case VREQ_GS_ALLOCED:
339 case VREQ_DMABUF_BOUND:
340 (void) ddi_dma_unbind_handle(vreq->v_dmahdl);
341 /*FALLTHRU*/
342 case VREQ_DMAMEM_ALLOCED:
343 if (!ALIGNED_XFER(bp)) {
344 ASSERT(vreq->v_abuf != NULL);
345 if (!IS_ERROR(bp) && IS_READ(bp))
346 bcopy(vreq->v_abuf, bp->b_un.b_addr,
347 bp->b_bcount);
348 ddi_dma_mem_free(&vreq->v_align);
349 }
350 /*FALLTHRU*/
351 case VREQ_MEMDMAHDL_ALLOCED:
352 if (!ALIGNED_XFER(bp))
353 ddi_dma_free_handle(&vreq->v_memdmahdl);
354 /*FALLTHRU*/
355 case VREQ_DMAHDL_ALLOCED:
356 ddi_dma_free_handle(&vreq->v_dmahdl);
357 break;
358 default:
359 break;
360 }
361 done:
362 ASSERT(!vreq->v_runq);
363 list_destroy(&vreq->v_gs);
364 kmem_cache_free(xdf_vreq_cache, vreq);
365 }
366
367 /*
368 * Snarf new data if our flush block was re-written
369 */
370 static void
371 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
372 {
373 int nblks;
374 boolean_t mapin;
375
376 if (IS_WRITE_BARRIER(vdp, bp))
377 return; /* write was a flush write */
378
379 mapin = B_FALSE;
380 nblks = bp->b_bcount >> DEV_BSHIFT;
381 if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
382 xdf_fbrewrites++;
383 if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
384 mapin = B_TRUE;
385 bp_mapin(bp);
386 }
387 bcopy(bp->b_un.b_addr +
388 ((xdf_flush_block - blkno) << DEV_BSHIFT),
389 vdp->xdf_cache_flush_block, DEV_BSIZE);
390 if (mapin)
391 bp_mapout(bp);
392 }
393 }
394
395 /*
396 * Initialize the DMA and grant table resources for the buf
397 */
398 static int
399 vreq_setup(xdf_t *vdp, v_req_t *vreq)
400 {
401 int rc;
402 ddi_dma_attr_t dmaattr;
403 uint_t ndcs, ndws;
404 ddi_dma_handle_t dh;
405 ddi_dma_handle_t mdh;
406 ddi_dma_cookie_t dc;
407 ddi_acc_handle_t abh;
408 caddr_t aba;
409 ge_slot_t *gs;
410 size_t bufsz;
411 off_t off;
412 size_t sz;
413 buf_t *bp = vreq->v_buf;
414 int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
415 DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
416
417 switch (vreq->v_status) {
418 case VREQ_INIT:
419 if (IS_FLUSH_DISKCACHE(bp)) {
420 if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
421 DPRINTF(DMA_DBG, ("xdf@%s: "
422 "get ge_slotfailed\n", vdp->xdf_addr));
423 return (DDI_FAILURE);
424 }
425 vreq->v_blkno = 0;
426 vreq->v_nslots = 1;
427 vreq->v_flush_diskcache = FLUSH_DISKCACHE;
428 vreq->v_status = VREQ_GS_ALLOCED;
429 gs->gs_vreq = vreq;
430 list_insert_head(&vreq->v_gs, gs);
431 return (DDI_SUCCESS);
432 }
433
434 if (IS_WRITE_BARRIER(vdp, bp))
435 vreq->v_flush_diskcache = WRITE_BARRIER;
436 vreq->v_blkno = bp->b_blkno +
437 (diskaddr_t)(uintptr_t)bp->b_private;
438 /* See if we wrote new data to our flush block */
439 if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
440 check_fbwrite(vdp, bp, vreq->v_blkno);
441 vreq->v_status = VREQ_INIT_DONE;
442 /*FALLTHRU*/
443
444 case VREQ_INIT_DONE:
445 /*
446 * alloc DMA handle
447 */
448 rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
449 xdf_dmacallback, (caddr_t)vdp, &dh);
450 if (rc != DDI_SUCCESS) {
451 SETDMACBON(vdp);
452 DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
453 vdp->xdf_addr));
454 return (DDI_FAILURE);
455 }
456
457 vreq->v_dmahdl = dh;
458 vreq->v_status = VREQ_DMAHDL_ALLOCED;
459 /*FALLTHRU*/
460
461 case VREQ_DMAHDL_ALLOCED:
462 /*
463 * alloc dma handle for 512-byte aligned buf
464 */
465 if (!ALIGNED_XFER(bp)) {
466 /*
467 * XXPV: we need to temporarily enlarge the seg
468 * boundary and s/g length to work around CR6381968
469 */
470 dmaattr = xb_dma_attr;
471 dmaattr.dma_attr_seg = (uint64_t)-1;
472 dmaattr.dma_attr_sgllen = INT_MAX;
473 rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
474 xdf_dmacallback, (caddr_t)vdp, &mdh);
475 if (rc != DDI_SUCCESS) {
476 SETDMACBON(vdp);
477 DPRINTF(DMA_DBG, ("xdf@%s: "
478 "unaligned buf DMAhandle alloc failed\n",
479 vdp->xdf_addr));
480 return (DDI_FAILURE);
481 }
482 vreq->v_memdmahdl = mdh;
483 vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
484 }
485 /*FALLTHRU*/
486
487 case VREQ_MEMDMAHDL_ALLOCED:
488 /*
489 * alloc 512-byte aligned buf
490 */
491 if (!ALIGNED_XFER(bp)) {
492 if (bp->b_flags & (B_PAGEIO | B_PHYS))
493 bp_mapin(bp);
494 rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
495 roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
496 DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
497 &aba, &bufsz, &abh);
498 if (rc != DDI_SUCCESS) {
499 SETDMACBON(vdp);
500 DPRINTF(DMA_DBG, ("xdf@%s: "
501 "DMA mem allocation failed\n",
502 vdp->xdf_addr));
503 return (DDI_FAILURE);
504 }
505
506 vreq->v_abuf = aba;
507 vreq->v_align = abh;
508 vreq->v_status = VREQ_DMAMEM_ALLOCED;
509
510 ASSERT(bufsz >= bp->b_bcount);
511 if (!IS_READ(bp))
512 bcopy(bp->b_un.b_addr, vreq->v_abuf,
513 bp->b_bcount);
514 }
515 /*FALLTHRU*/
516
517 case VREQ_DMAMEM_ALLOCED:
518 /*
519 * dma bind
520 */
521 if (ALIGNED_XFER(bp)) {
522 rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
523 dma_flags, xdf_dmacallback, (caddr_t)vdp,
524 &dc, &ndcs);
525 } else {
526 rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
527 NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
528 xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
529 }
530 if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
531 /* get num of dma windows */
532 if (rc == DDI_DMA_PARTIAL_MAP) {
533 rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
534 ASSERT(rc == DDI_SUCCESS);
535 } else {
536 ndws = 1;
537 }
538 } else {
539 SETDMACBON(vdp);
540 DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
541 vdp->xdf_addr));
542 return (DDI_FAILURE);
543 }
544
545 vreq->v_dmac = dc;
546 vreq->v_dmaw = 0;
547 vreq->v_ndmacs = ndcs;
548 vreq->v_ndmaws = ndws;
549 vreq->v_nslots = ndws;
550 vreq->v_status = VREQ_DMABUF_BOUND;
551 /*FALLTHRU*/
552
553 case VREQ_DMABUF_BOUND:
554 /*
555 		 * get a ge_slot; upon failure gs_get() will have registered a
556 		 * callback if one wasn't set previously
557 */
558 if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
559 DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
560 vdp->xdf_addr));
561 return (DDI_FAILURE);
562 }
563
564 vreq->v_status = VREQ_GS_ALLOCED;
565 gs->gs_vreq = vreq;
566 list_insert_head(&vreq->v_gs, gs);
567 break;
568
569 case VREQ_GS_ALLOCED:
570 		/* nothing needs to be done */
571 break;
572
573 case VREQ_DMAWIN_DONE:
574 /*
575 * move to the next dma window
576 */
577 ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
578
579 /* get a ge_slot for this DMA window */
580 if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
581 DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
582 vdp->xdf_addr));
583 return (DDI_FAILURE);
584 }
585
586 vreq->v_dmaw++;
587 VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
588 &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
589 vreq->v_status = VREQ_GS_ALLOCED;
590 gs->gs_vreq = vreq;
591 list_insert_head(&vreq->v_gs, gs);
592 break;
593
594 default:
595 return (DDI_FAILURE);
596 }
597
598 return (DDI_SUCCESS);
599 }
600
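/*
 * Register this device with the common disk label (cmlb) framework,
 * using the device and node types appropriate for CD-ROM vs disk
 * devices.
 */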
601 static int
602 xdf_cmlb_attach(xdf_t *vdp)
603 {
604 dev_info_t *dip = vdp->xdf_dip;
605
606 return (cmlb_attach(dip, &xdf_lb_ops,
607 XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
608 XD_IS_RM(vdp), B_TRUE,
609 XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
610 0, vdp->xdf_vd_lbl, NULL));
611 }
612
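/*
 * Complete a buf with the given error.  A resid of zero indicates that
 * no data was transferred, so the full b_bcount is reported as residual.
 */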
613 static void
614 xdf_io_err(buf_t *bp, int err, size_t resid)
615 {
616 bioerror(bp, err);
617 if (resid == 0)
618 bp->b_resid = bp->b_bcount;
619 biodone(bp);
620 }
621
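/*
 * Account for a buf entering the driver: add it to the kstat runq if it
 * already holds ring buffer resources (v_runq set), otherwise to the
 * waitq.
 */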
622 static void
623 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
624 {
625 v_req_t *vreq = BP_VREQ(bp);
626
627 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
628
629 if (vdp->xdf_xdev_iostat == NULL)
630 return;
631 if ((vreq != NULL) && vreq->v_runq) {
632 kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
633 } else {
634 kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
635 }
636 }
637
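/*
 * Account for a buf leaving the driver: remove it from the kstat runq or
 * waitq and update the read/write counters.
 */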
638 static void
639 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
640 {
641 v_req_t *vreq = BP_VREQ(bp);
642
643 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
644
645 if (vdp->xdf_xdev_iostat == NULL)
646 return;
647
648 if ((vreq != NULL) && vreq->v_runq) {
649 kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
650 } else {
651 kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
652 }
653
654 if (bp->b_flags & B_READ) {
655 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
656 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
657 } else if (bp->b_flags & B_WRITE) {
658 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
659 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
660 }
661 }
662
663 static void
664 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
665 {
666 v_req_t *vreq = BP_VREQ(bp);
667
668 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
669 ASSERT(!vreq->v_runq);
670
671 vreq->v_runq = B_TRUE;
672 if (vdp->xdf_xdev_iostat == NULL)
673 return;
674 kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
675 }
676
677 static void
678 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
679 {
680 v_req_t *vreq = BP_VREQ(bp);
681
682 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
683 ASSERT(vreq->v_runq);
684
685 vreq->v_runq = B_FALSE;
686 if (vdp->xdf_xdev_iostat == NULL)
687 return;
688 kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
689 }
690
691 int
692 xdf_kstat_create(dev_info_t *dip)
693 {
694 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
695 kstat_t *kstat;
696 buf_t *bp;
697
698 if ((kstat = kstat_create("xdf", ddi_get_instance(dip), NULL, "disk",
699 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
700 return (-1);
701
702 /* See comment about locking in xdf_kstat_delete(). */
703 mutex_enter(&vdp->xdf_iostat_lk);
704 mutex_enter(&vdp->xdf_dev_lk);
705
706 /* only one kstat can exist at a time */
707 if (vdp->xdf_xdev_iostat != NULL) {
708 mutex_exit(&vdp->xdf_dev_lk);
709 mutex_exit(&vdp->xdf_iostat_lk);
710 kstat_delete(kstat);
711 return (-1);
712 }
713
714 vdp->xdf_xdev_iostat = kstat;
715 vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
716 kstat_install(vdp->xdf_xdev_iostat);
717
718 /*
719 * Now that we've created a kstat, we need to update the waitq and
720 * runq counts for the kstat to reflect our current state.
721 *
722 * For a buf_t structure to be on the runq, it must have a ring
723 * buffer slot associated with it. To get a ring buffer slot the
724 * buf must first have a v_req_t and a ge_slot_t associated with it.
725 * Then when it is granted a ring buffer slot, v_runq will be set to
726 * true.
727 *
728 * For a buf_t structure to be on the waitq, it must not be on the
729 * runq. So to find all the buf_t's that should be on waitq, we
730 * walk the active buf list and add any buf_t's which aren't on the
731 * runq to the waitq.
732 */
733 bp = vdp->xdf_f_act;
734 while (bp != NULL) {
735 xdf_kstat_enter(vdp, bp);
736 bp = bp->av_forw;
737 }
738 if (vdp->xdf_ready_tq_bp != NULL)
739 xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
740
741 mutex_exit(&vdp->xdf_dev_lk);
742 mutex_exit(&vdp->xdf_iostat_lk);
743 return (0);
744 }
745
746 void
747 xdf_kstat_delete(dev_info_t *dip)
748 {
749 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
750 kstat_t *kstat;
751 buf_t *bp;
752
753 /*
754 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
755 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
756 * and the contents of the our kstat. xdf_iostat_lk is used
757 * to protect the allocation and freeing of the actual kstat.
758 * xdf_dev_lk can't be used for this purpose because kstat
759 * readers use it to access the contents of the kstat and
760 * hence it can't be held when calling kstat_delete().
761 */
762 mutex_enter(&vdp->xdf_iostat_lk);
763 mutex_enter(&vdp->xdf_dev_lk);
764
765 if (vdp->xdf_xdev_iostat == NULL) {
766 mutex_exit(&vdp->xdf_dev_lk);
767 mutex_exit(&vdp->xdf_iostat_lk);
768 return;
769 }
770
771 /*
772 * We're about to destroy the kstat structures, so it isn't really
773 * necessary to update the runq and waitq counts. But, since this
774 * isn't a hot code path we can afford to be a little pedantic and
775 * go ahead and decrement the runq and waitq kstat counters to zero
776 * before free'ing them. This helps us ensure that we've gotten all
777 * our accounting correct.
778 *
779 * For an explanation of how we determine which buffers go on the
780 * runq vs which go on the waitq, see the comments in
781 * xdf_kstat_create().
782 */
783 bp = vdp->xdf_f_act;
784 while (bp != NULL) {
785 xdf_kstat_exit(vdp, bp);
786 bp = bp->av_forw;
787 }
788 if (vdp->xdf_ready_tq_bp != NULL)
789 xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
790
791 kstat = vdp->xdf_xdev_iostat;
792 vdp->xdf_xdev_iostat = NULL;
793 mutex_exit(&vdp->xdf_dev_lk);
794 kstat_delete(kstat);
795 mutex_exit(&vdp->xdf_iostat_lk);
796 }
797
798 /*
799 * Add an IO request onto the active queue.
800 *
801 * We have to detect IOs generated by xdf_ready_tq_thread. These IOs
802 * are used to establish a connection to the backend, so they receive
803 * priority over all other IOs. Since xdf_ready_tq_thread only does
804 * synchronous IO, there can only be one xdf_ready_tq_thread request at any
805 * given time and we record the buf associated with that request in
806 * xdf_ready_tq_bp.
807 */
808 static void
809 xdf_bp_push(xdf_t *vdp, buf_t *bp)
810 {
811 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
812 ASSERT(bp->av_forw == NULL);
813
814 xdf_kstat_enter(vdp, bp);
815
816 if (curthread == vdp->xdf_ready_tq_thread) {
817 /* new IO requests from the ready thread */
818 ASSERT(vdp->xdf_ready_tq_bp == NULL);
819 vdp->xdf_ready_tq_bp = bp;
820 return;
821 }
822
823 	/* this is a normal IO request */
824 ASSERT(bp != vdp->xdf_ready_tq_bp);
825
826 if (vdp->xdf_f_act == NULL) {
827 		/* this is the only IO on the active queue */
828 ASSERT(vdp->xdf_l_act == NULL);
829 ASSERT(vdp->xdf_i_act == NULL);
830 vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
831 return;
832 }
833
834 /* add this IO to the tail of the active queue */
835 vdp->xdf_l_act->av_forw = bp;
836 vdp->xdf_l_act = bp;
837 if (vdp->xdf_i_act == NULL)
838 vdp->xdf_i_act = bp;
839 }
840
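/*
 * Remove a completed IO from the active queue (the counterpart of
 * xdf_bp_push()).
 */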
841 static void
842 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
843 {
844 buf_t *bp_iter;
845
846 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
847 ASSERT(VREQ_DONE(BP_VREQ(bp)));
848
849 if (vdp->xdf_ready_tq_bp == bp) {
850 /* we're done with a ready thread IO request */
851 ASSERT(bp->av_forw == NULL);
852 vdp->xdf_ready_tq_bp = NULL;
853 return;
854 }
855
856 /* we're done with a normal IO request */
857 ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
858 ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
859 ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
860 ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
861
862 if (bp == vdp->xdf_f_act) {
863 /* This IO was at the head of our active queue. */
864 vdp->xdf_f_act = bp->av_forw;
865 if (bp == vdp->xdf_l_act)
866 vdp->xdf_l_act = NULL;
867 } else {
868 		/* This IO finished before some other pending IOs. */
869 bp_iter = vdp->xdf_f_act;
870 while (bp != bp_iter->av_forw) {
871 bp_iter = bp_iter->av_forw;
872 ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
873 ASSERT(bp_iter != vdp->xdf_i_act);
874 }
875 bp_iter->av_forw = bp->av_forw;
876 if (bp == vdp->xdf_l_act)
877 vdp->xdf_l_act = bp_iter;
878 }
879 bp->av_forw = NULL;
880 }
881
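/*
 * Return the next buf on the active queue which still needs to be
 * processed, or NULL if there is currently no IO we can issue.
 */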
882 static buf_t *
883 xdf_bp_next(xdf_t *vdp)
884 {
885 v_req_t *vreq;
886 buf_t *bp;
887
888 if (vdp->xdf_state == XD_CONNECTED) {
889 /*
890 * If we're in the XD_CONNECTED state, we only service IOs
891 * from the xdf_ready_tq_thread thread.
892 */
893 if ((bp = vdp->xdf_ready_tq_bp) == NULL)
894 return (NULL);
895 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
896 return (bp);
897 return (NULL);
898 }
899
900 /* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
901 if (vdp->xdf_state != XD_READY)
902 return (NULL);
903
904 ASSERT(vdp->xdf_ready_tq_bp == NULL);
905 for (;;) {
906 if ((bp = vdp->xdf_i_act) == NULL)
907 return (NULL);
908 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
909 return (bp);
910
911 /* advance the active buf index pointer */
912 vdp->xdf_i_act = bp->av_forw;
913 }
914 }
915
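/*
 * Complete one ring buffer slot's worth of an IO.  Free the associated
 * ge_slot_t and, once the last outstanding slot for this request has
 * completed, remove the buf from the active queue and biodone() it.
 */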
916 static void
917 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
918 {
919 ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
920 v_req_t *vreq = gs->gs_vreq;
921 buf_t *bp = vreq->v_buf;
922
923 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
924 ASSERT(BP_VREQ(bp) == vreq);
925
926 gs_free(gs);
927
928 if (bioerr != 0)
929 bioerror(bp, bioerr);
930 ASSERT(vreq->v_nslots > 0);
931 if (--vreq->v_nslots > 0)
932 return;
933
934 /* remove this IO from our active queue */
935 xdf_bp_pop(vdp, bp);
936
937 ASSERT(vreq->v_runq);
938 xdf_kstat_exit(vdp, bp);
939 vreq->v_runq = B_FALSE;
940 vreq_free(vdp, vreq);
941
942 if (IS_ERROR(bp)) {
943 xdf_io_err(bp, geterror(bp), 0);
944 } else if (bp->b_resid != 0) {
945 /* Partial transfers are an error */
946 xdf_io_err(bp, EIO, bp->b_resid);
947 } else {
948 biodone(bp);
949 }
950 }
951
952 /*
953 * xdf interrupt handler
954 */
955 static uint_t
956 xdf_intr_locked(xdf_t *vdp)
957 {
958 xendev_ring_t *xbr;
959 blkif_response_t *resp;
960 int bioerr;
961 uint64_t id;
962 uint8_t op;
963 uint16_t status;
964 ddi_acc_handle_t acchdl;
965
966 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
967
968 if ((xbr = vdp->xdf_xb_ring) == NULL)
969 return (DDI_INTR_UNCLAIMED);
970
971 acchdl = vdp->xdf_xb_ring_hdl;
972
973 /*
974 * complete all requests which have a response
975 */
976 while (resp = xvdi_ring_get_response(xbr)) {
977 id = ddi_get64(acchdl, &resp->id);
978 op = ddi_get8(acchdl, &resp->operation);
979 status = ddi_get16(acchdl, (uint16_t *)&resp->status);
980 DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
981 op, id, status));
982
983 if (status != BLKIF_RSP_OKAY) {
984 DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
985 vdp->xdf_addr,
986 (op == BLKIF_OP_READ) ? "reading" : "writing"));
987 bioerr = EIO;
988 } else {
989 bioerr = 0;
990 }
991
992 xdf_io_fini(vdp, id, bioerr);
993 }
994 return (DDI_INTR_CLAIMED);
995 }
996
997 /*
998 * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
999 * block at a lower pil.
1000 */
1001 static uint_t
1002 xdf_intr(caddr_t arg)
1003 {
1004 xdf_t *vdp = (xdf_t *)arg;
1005 int rv;
1006
1007 mutex_enter(&vdp->xdf_dev_lk);
1008 rv = xdf_intr_locked(vdp);
1009 mutex_exit(&vdp->xdf_dev_lk);
1010
1011 if (!do_polled_io)
1012 xdf_io_start(vdp);
1013
1014 return (rv);
1015 }
1016
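/*
 * Push any newly produced requests onto the shared ring and, if we have
 * a valid event channel, notify the backend.
 */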
1017 static void
1018 xdf_ring_push(xdf_t *vdp)
1019 {
1020 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1021
1022 if (vdp->xdf_xb_ring == NULL)
1023 return;
1024
1025 if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1026 DPRINTF(IO_DBG, (
1027 "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1028 vdp->xdf_addr));
1029 }
1030
1031 if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1032 xvdi_notify_oe(vdp->xdf_dip);
1033 }
1034
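/*
 * Poll until the backend has consumed and responded to all outstanding
 * ring requests, processing responses as they arrive.  Returns EIO if
 * the ring fails to drain within the retry limit.
 */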
1035 static int
1036 xdf_ring_drain_locked(xdf_t *vdp)
1037 {
1038 int pollc, rv = 0;
1039
1040 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1041
1042 if (xdf_debug & SUSRES_DBG)
1043 xen_printf("xdf_ring_drain: start\n");
1044
1045 for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1046 if (vdp->xdf_xb_ring == NULL)
1047 goto out;
1048
1049 if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1050 (void) xdf_intr_locked(vdp);
1051 if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1052 goto out;
1053 xdf_ring_push(vdp);
1054
1055 /* file-backed devices can be slow */
1056 mutex_exit(&vdp->xdf_dev_lk);
1057 #ifdef XPV_HVM_DRIVER
1058 (void) HYPERVISOR_yield();
1059 #endif /* XPV_HVM_DRIVER */
1060 delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1061 mutex_enter(&vdp->xdf_dev_lk);
1062 }
1063 cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1064
1065 out:
1066 if (vdp->xdf_xb_ring != NULL) {
1067 if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1068 xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1069 rv = EIO;
1070 }
1071 if (xdf_debug & SUSRES_DBG)
1072 xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1073 vdp->xdf_addr, rv);
1074 return (rv);
1075 }
1076
1077 static int
1078 xdf_ring_drain(xdf_t *vdp)
1079 {
1080 int rv;
1081 mutex_enter(&vdp->xdf_dev_lk);
1082 rv = xdf_ring_drain_locked(vdp);
1083 mutex_exit(&vdp->xdf_dev_lk);
1084 return (rv);
1085 }
1086
1087 /*
1088 * Destroy all v_req_t, grant table entries, and our ring buffer.
1089 */
1090 static void
1091 xdf_ring_destroy(xdf_t *vdp)
1092 {
1093 v_req_t *vreq;
1094 buf_t *bp;
1095 ge_slot_t *gs;
1096
1097 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1098 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1099
1100 if ((vdp->xdf_state != XD_INIT) &&
1101 (vdp->xdf_state != XD_CONNECTED) &&
1102 (vdp->xdf_state != XD_READY)) {
1103 ASSERT(vdp->xdf_xb_ring == NULL);
1104 ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1105 ASSERT(vdp->xdf_peer == INVALID_DOMID);
1106 ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1107 ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1108 return;
1109 }
1110
1111 /*
1112 * We don't want to receive async notifications from the backend
1113 * when it finishes processing ring entries.
1114 */
1115 #ifdef XPV_HVM_DRIVER
1116 ec_unbind_evtchn(vdp->xdf_evtchn);
1117 #else /* !XPV_HVM_DRIVER */
1118 (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1119 #endif /* !XPV_HVM_DRIVER */
1120
1121 /*
1122 * Drain any requests in the ring. We need to do this before we
1123 * can free grant table entries, because if active ring entries
1124 * point to grants, then the backend could be trying to access
1125 * those grants.
1126 */
1127 (void) xdf_ring_drain_locked(vdp);
1128
1129 /* We're done talking to the backend so free up our event channel */
1130 xvdi_free_evtchn(vdp->xdf_dip);
1131 vdp->xdf_evtchn = INVALID_EVTCHN;
1132
1133 while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1134 bp = vreq->v_buf;
1135 ASSERT(BP_VREQ(bp) == vreq);
1136
1137 		/* Free up any grant table entries associated with this IO */
1138 while ((gs = list_head(&vreq->v_gs)) != NULL)
1139 gs_free(gs);
1140
1141 /* If this IO was on the runq, move it back to the waitq. */
1142 if (vreq->v_runq)
1143 xdf_kstat_runq_to_waitq(vdp, bp);
1144
1145 /*
1146 * Reset any buf IO state since we're going to re-issue the
1147 * IO when we reconnect.
1148 */
1149 vreq_free(vdp, vreq);
1150 BP_VREQ_SET(bp, NULL);
1151 bioerror(bp, 0);
1152 }
1153
1154 /* reset the active queue index pointer */
1155 vdp->xdf_i_act = vdp->xdf_f_act;
1156
1157 /* Destroy the ring */
1158 xvdi_free_ring(vdp->xdf_xb_ring);
1159 vdp->xdf_xb_ring = NULL;
1160 vdp->xdf_xb_ring_hdl = NULL;
1161 vdp->xdf_peer = INVALID_DOMID;
1162 }
1163
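/*
 * minphys(9F) style routine which clamps the transfer size to
 * xdf_maxphys.
 */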
1164 void
1165 xdfmin(struct buf *bp)
1166 {
1167 if (bp->b_bcount > xdf_maxphys)
1168 bp->b_bcount = xdf_maxphys;
1169 }
1170
1171 /*
1172 * Check if we have a pending "eject" media request.
1173 */
1174 static int
1175 xdf_eject_pending(xdf_t *vdp)
1176 {
1177 dev_info_t *dip = vdp->xdf_dip;
1178 char *xsname, *str;
1179
1180 if (!vdp->xdf_media_req_supported)
1181 return (B_FALSE);
1182
1183 if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1184 (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1185 return (B_FALSE);
1186
1187 if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1188 strfree(str);
1189 return (B_FALSE);
1190 }
1191 strfree(str);
1192 return (B_TRUE);
1193 }
1194
1195 /*
1196 * Generate a media request.
1197 */
1198 static int
1199 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1200 {
1201 dev_info_t *dip = vdp->xdf_dip;
1202 char *xsname;
1203
1204 /*
1205 * we can't be holding xdf_dev_lk because xenbus_printf() can
1206 * block while waiting for a PIL 1 interrupt message. This
1207 * would cause a deadlock with xdf_intr() which needs to grab
1208 * xdf_dev_lk as well and runs at PIL 5.
1209 */
1210 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1211 ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1212
1213 if ((xsname = xvdi_get_xsname(dip)) == NULL)
1214 return (ENXIO);
1215
1216 /* Check if we support media requests */
1217 if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1218 return (ENOTTY);
1219
1220 /* If an eject is pending then don't allow any new requests */
1221 if (xdf_eject_pending(vdp))
1222 return (ENXIO);
1223
1224 /* Make sure that there is media present */
1225 if (media_required && (vdp->xdf_xdev_nblocks == 0))
1226 return (ENXIO);
1227
1228 /* We only allow operations when the device is ready and connected */
1229 if (vdp->xdf_state != XD_READY)
1230 return (EIO);
1231
1232 if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1233 return (EIO);
1234
1235 return (0);
1236 }
1237
1238 /*
1239 * populate a single blkif_request_t w/ a buf
1240 */
1241 static void
1242 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1243 {
1244 grant_ref_t gr;
1245 uint8_t fsect, lsect;
1246 size_t bcnt;
1247 paddr_t dma_addr;
1248 off_t blk_off;
1249 dev_info_t *dip = vdp->xdf_dip;
1250 blkif_vdev_t vdev = xvdi_get_vdevnum(dip);
1251 v_req_t *vreq = BP_VREQ(bp);
1252 uint64_t blkno = vreq->v_blkno;
1253 uint_t ndmacs = vreq->v_ndmacs;
1254 ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1255 int seg = 0;
1256 int isread = IS_READ(bp);
1257 ge_slot_t *gs = list_head(&vreq->v_gs);
1258
1259 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1260 ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1261
1262 if (isread)
1263 ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1264 else {
1265 switch (vreq->v_flush_diskcache) {
1266 case FLUSH_DISKCACHE:
1267 ddi_put8(acchdl, &rreq->operation,
1268 BLKIF_OP_FLUSH_DISKCACHE);
1269 ddi_put16(acchdl, &rreq->handle, vdev);
1270 ddi_put64(acchdl, &rreq->id,
1271 (uint64_t)(uintptr_t)(gs));
1272 ddi_put8(acchdl, &rreq->nr_segments, 0);
1273 vreq->v_status = VREQ_DMAWIN_DONE;
1274 return;
1275 case WRITE_BARRIER:
1276 ddi_put8(acchdl, &rreq->operation,
1277 BLKIF_OP_WRITE_BARRIER);
1278 break;
1279 default:
1280 if (!vdp->xdf_wce)
1281 ddi_put8(acchdl, &rreq->operation,
1282 BLKIF_OP_WRITE_BARRIER);
1283 else
1284 ddi_put8(acchdl, &rreq->operation,
1285 BLKIF_OP_WRITE);
1286 break;
1287 }
1288 }
1289
1290 ddi_put16(acchdl, &rreq->handle, vdev);
1291 ddi_put64(acchdl, &rreq->sector_number, blkno);
1292 ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1293
1294 /*
1295 	 * loop until all segments are populated or there are no more dma cookies in the buf
1296 */
1297 for (;;) {
1298 /*
1299 * Each segment of a blkif request can transfer up to
1300 * one 4K page of data.
1301 */
1302 bcnt = vreq->v_dmac.dmac_size;
1303 dma_addr = vreq->v_dmac.dmac_laddress;
1304 blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1305 fsect = blk_off >> XB_BSHIFT;
1306 lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1307
1308 ASSERT(bcnt <= PAGESIZE);
1309 ASSERT((bcnt % XB_BSIZE) == 0);
1310 ASSERT((blk_off & XB_BMASK) == 0);
1311 ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1312 lsect < XB_MAX_SEGLEN / XB_BSIZE);
1313
1314 gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1315 ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1316 ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1317 ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1318
1319 DPRINTF(IO_DBG, (
1320 "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1321 vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1322 DPRINTF(IO_DBG, (
1323 "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1324 vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1325
1326 blkno += (bcnt >> XB_BSHIFT);
1327 seg++;
1328 ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1329 if (--ndmacs) {
1330 ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1331 continue;
1332 }
1333
1334 vreq->v_status = VREQ_DMAWIN_DONE;
1335 vreq->v_blkno = blkno;
1336 break;
1337 }
1338 ddi_put8(acchdl, &rreq->nr_segments, seg);
1339 DPRINTF(IO_DBG, (
1340 "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1341 vdp->xdf_addr, rreq->id));
1342 }
1343
1344 static void
1345 xdf_io_start(xdf_t *vdp)
1346 {
1347 struct buf *bp;
1348 v_req_t *vreq;
1349 blkif_request_t *rreq;
1350 boolean_t rreqready = B_FALSE;
1351
1352 mutex_enter(&vdp->xdf_dev_lk);
1353
1354 /*
1355 * Populate the ring request(s). Loop until there is no buf to
1356 * transfer or no free slot available in I/O ring.
1357 */
1358 for (;;) {
1359 /* don't start any new IO if we're suspending */
1360 if (vdp->xdf_suspending)
1361 break;
1362 if ((bp = xdf_bp_next(vdp)) == NULL)
1363 break;
1364
1365 /* if the buf doesn't already have a vreq, allocate one */
1366 if (((vreq = BP_VREQ(bp)) == NULL) &&
1367 ((vreq = vreq_get(vdp, bp)) == NULL))
1368 break;
1369
1370 /* alloc DMA/GTE resources */
1371 if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1372 break;
1373
1374 /* get next blkif_request in the ring */
1375 if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1376 break;
1377 bzero(rreq, sizeof (blkif_request_t));
1378 rreqready = B_TRUE;
1379
1380 /* populate blkif_request with this buf */
1381 xdf_process_rreq(vdp, bp, rreq);
1382
1383 /*
1384 		 * This buffer/vreq pair has been allocated ring buffer
1385 		 * resources, so if it isn't already on our runq, add it.
1386 */
1387 if (!vreq->v_runq)
1388 xdf_kstat_waitq_to_runq(vdp, bp);
1389 }
1390
1391 /* Send the request(s) to the backend */
1392 if (rreqready)
1393 xdf_ring_push(vdp);
1394
1395 mutex_exit(&vdp->xdf_dev_lk);
1396 }
1397
1398
1399 /* check if a partition is open; -1 means check all partitions on the disk */
1400 static boolean_t
1401 xdf_isopen(xdf_t *vdp, int partition)
1402 {
1403 int i;
1404 ulong_t parbit;
1405 boolean_t rval = B_FALSE;
1406
1407 ASSERT((partition == -1) ||
1408 	    ((partition >= 0) && (partition < XDF_PEXT)));
1409
1410 if (partition == -1)
1411 parbit = (ulong_t)-1;
1412 else
1413 parbit = 1 << partition;
1414
1415 for (i = 0; i < OTYPCNT; i++) {
1416 if (vdp->xdf_vd_open[i] & parbit)
1417 rval = B_TRUE;
1418 }
1419
1420 return (rval);
1421 }
1422
1423 /*
1424 * The connection should never be closed as long as someone is holding
1425 * us open, there is pending IO, or someone is waiting for a
1426 * connection.
1427 */
1428 static boolean_t
1429 xdf_busy(xdf_t *vdp)
1430 {
1431 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1432
1433 if ((vdp->xdf_xb_ring != NULL) &&
1434 xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1435 ASSERT(vdp->xdf_state != XD_CLOSED);
1436 return (B_TRUE);
1437 }
1438
1439 if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1440 ASSERT(vdp->xdf_state != XD_CLOSED);
1441 return (B_TRUE);
1442 }
1443
1444 if (xdf_isopen(vdp, -1)) {
1445 ASSERT(vdp->xdf_state != XD_CLOSED);
1446 return (B_TRUE);
1447 }
1448
1449 if (vdp->xdf_connect_req > 0) {
1450 ASSERT(vdp->xdf_state != XD_CLOSED);
1451 return (B_TRUE);
1452 }
1453
1454 return (B_FALSE);
1455 }
1456
1457 static void
1458 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1459 {
1460 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1461 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1462 DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1463 vdp->xdf_addr, vdp->xdf_state, new_state));
1464 vdp->xdf_state = new_state;
1465 cv_broadcast(&vdp->xdf_dev_cv);
1466 }
1467
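/*
 * Tear down our connection to the backend and transition to new_state
 * (XD_UNKNOWN or XD_CLOSED).  If the device is still busy we can only
 * move to the XD_UNKNOWN state.
 */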
1468 static void
1469 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1470 {
1471 dev_info_t *dip = vdp->xdf_dip;
1472 boolean_t busy;
1473
1474 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1475 ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1476 ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1477
1478 /* Check if we're already there. */
1479 if (vdp->xdf_state == new_state)
1480 return;
1481
1482 mutex_enter(&vdp->xdf_dev_lk);
1483 busy = xdf_busy(vdp);
1484
1485 	/* If we're already closed then there's nothing to do. */
1486 if (vdp->xdf_state == XD_CLOSED) {
1487 ASSERT(!busy);
1488 xdf_set_state(vdp, new_state);
1489 mutex_exit(&vdp->xdf_dev_lk);
1490 return;
1491 }
1492
1493 #ifdef DEBUG
1494 /* UhOh. Warn the user that something bad has happened. */
1495 if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1496 (vdp->xdf_xdev_nblocks != 0)) {
1497 cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1498 vdp->xdf_addr);
1499 }
1500 #endif /* DEBUG */
1501
1502 xdf_ring_destroy(vdp);
1503
1504 /* If we're busy then we can only go into the unknown state */
1505 xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1506 mutex_exit(&vdp->xdf_dev_lk);
1507
1508 /* if we're closed now, let the other end know */
1509 if (vdp->xdf_state == XD_CLOSED)
1510 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1511 }
1512
1513
1514 /*
1515 * Kick-off connect process
1516 * Status should be XD_UNKNOWN or XD_CLOSED
1517 * On success, status will be changed to XD_INIT
1518 * On error, it will be changed to XD_UNKNOWN
1519 */
1520 static int
1521 xdf_setstate_init(xdf_t *vdp)
1522 {
1523 dev_info_t *dip = vdp->xdf_dip;
1524 xenbus_transaction_t xbt;
1525 grant_ref_t gref;
1526 char *xsname, *str;
1527 int rv;
1528
1529 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1530 ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1531 ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1532 (vdp->xdf_state == XD_CLOSED));
1533
1534 DPRINTF(DDI_DBG,
1535 ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1536
1537 /*
1538 * If an eject is pending then don't allow a new connection.
1539 	 * (Only the backend can clear a media eject request.)
1540 */
1541 if (xdf_eject_pending(vdp))
1542 return (DDI_FAILURE);
1543
1544 if ((xsname = xvdi_get_xsname(dip)) == NULL)
1545 goto errout;
1546
1547 if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1548 goto errout;
1549
1550 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1551
1552 /*
1553 	 * Sanity check for the existence of the xenbus device-type property.
1554 * This property might not exist if our xenbus device nodes were
1555 * force destroyed while we were still connected to the backend.
1556 */
1557 if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1558 goto errout;
1559 strfree(str);
1560
1561 if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1562 goto errout;
1563
1564 vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1565 #ifdef XPV_HVM_DRIVER
1566 ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1567 #else /* !XPV_HVM_DRIVER */
1568 if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1569 DDI_SUCCESS) {
1570 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1571 "failed to add intr handler", vdp->xdf_addr);
1572 goto errout1;
1573 }
1574 #endif /* !XPV_HVM_DRIVER */
1575
1576 if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1577 sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1578 DDI_SUCCESS) {
1579 cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1580 vdp->xdf_addr);
1581 goto errout2;
1582 }
1583 vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1584
1585 /*
1586 * Write into xenstore the info needed by backend
1587 */
1588 trans_retry:
1589 if (xenbus_transaction_start(&xbt)) {
1590 cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1591 vdp->xdf_addr);
1592 xvdi_fatal_error(dip, EIO, "connect transaction init");
1593 goto fail_trans;
1594 }
1595
1596 /*
1597 * XBP_PROTOCOL is written by the domain builder in the case of PV
1598 * domains. However, it is not written for HVM domains, so let's
1599 * write it here.
1600 */
1601 if (((rv = xenbus_printf(xbt, xsname,
1602 XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1603 ((rv = xenbus_printf(xbt, xsname,
1604 XBP_RING_REF, "%u", gref)) != 0) ||
1605 ((rv = xenbus_printf(xbt, xsname,
1606 XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1607 ((rv = xenbus_printf(xbt, xsname,
1608 XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1609 ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1610 (void) xenbus_transaction_end(xbt, 1);
1611 xvdi_fatal_error(dip, rv, "connect transaction setup");
1612 goto fail_trans;
1613 }
1614
1615 /* kick-off connect process */
1616 if (rv = xenbus_transaction_end(xbt, 0)) {
1617 if (rv == EAGAIN)
1618 goto trans_retry;
1619 xvdi_fatal_error(dip, rv, "connect transaction commit");
1620 goto fail_trans;
1621 }
1622
1623 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1624 mutex_enter(&vdp->xdf_dev_lk);
1625 xdf_set_state(vdp, XD_INIT);
1626 mutex_exit(&vdp->xdf_dev_lk);
1627
1628 return (DDI_SUCCESS);
1629
1630 fail_trans:
1631 xvdi_free_ring(vdp->xdf_xb_ring);
1632 errout2:
1633 #ifdef XPV_HVM_DRIVER
1634 ec_unbind_evtchn(vdp->xdf_evtchn);
1635 #else /* !XPV_HVM_DRIVER */
1636 (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1637 #endif /* !XPV_HVM_DRIVER */
1638 errout1:
1639 xvdi_free_evtchn(dip);
1640 vdp->xdf_evtchn = INVALID_EVTCHN;
1641 errout:
1642 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1643 cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1644 vdp->xdf_addr);
1645 return (DDI_FAILURE);
1646 }
1647
1648 int
1649 xdf_get_flush_block(xdf_t *vdp)
1650 {
1651 /*
1652 	 * Get a sector-size aligned buffer
1653 */
1654 vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1655 vdp->xdf_cache_flush_block =
1656 (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1657 (int)vdp->xdf_xdev_secsize);
1658
1659 if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1660 xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1661 return (DDI_FAILURE);
1662 return (DDI_SUCCESS);
1663 }
1664
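/*
 * Taskq (xdf_ready_tq) callback which does the IO required to move from
 * the XD_CONNECTED to the XD_READY state: re-attach the cmlb label if
 * needed and determine whether the backend supports disk cache flush
 * operations.
 */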
1665 static void
1666 xdf_setstate_ready(void *arg)
1667 {
1668 xdf_t *vdp = (xdf_t *)arg;
1669 dev_info_t *dip = vdp->xdf_dip;
1670
1671 vdp->xdf_ready_tq_thread = curthread;
1672
1673 	/* Create minor nodes now that we are almost connected */
1674 mutex_enter(&vdp->xdf_dev_lk);
1675 if (vdp->xdf_cmlb_reattach) {
1676 vdp->xdf_cmlb_reattach = B_FALSE;
1677 mutex_exit(&vdp->xdf_dev_lk);
1678 if (xdf_cmlb_attach(vdp) != 0) {
1679 cmn_err(CE_WARN,
1680 "xdf@%s: cmlb attach failed",
1681 ddi_get_name_addr(dip));
1682 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1683 return;
1684 }
1685 mutex_enter(&vdp->xdf_dev_lk);
1686 }
1687
1688 /* If we're not still trying to get to the ready state, then bail. */
1689 if (vdp->xdf_state != XD_CONNECTED) {
1690 mutex_exit(&vdp->xdf_dev_lk);
1691 return;
1692 }
1693 mutex_exit(&vdp->xdf_dev_lk);
1694
1695 /*
1696 * If backend has feature-barrier, see if it supports disk
1697 * cache flush op.
1698 */
1699 vdp->xdf_flush_supported = B_FALSE;
1700 if (vdp->xdf_feature_barrier) {
1701 /*
1702 * Pretend we already know flush is supported so probe
1703 * will attempt the correct op.
1704 */
1705 vdp->xdf_flush_supported = B_TRUE;
1706 if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1707 vdp->xdf_flush_supported = B_TRUE;
1708 } else {
1709 vdp->xdf_flush_supported = B_FALSE;
1710 /*
1711 * If the other end does not support the cache flush op
1712 * then we must use a barrier-write to force disk
1713 * cache flushing. Barrier writes require that a data
1714 * block actually be written.
1715 * Cache a block to barrier-write when we are
1716 * asked to perform a flush.
1717 * XXX - would it be better to just copy 1 block
1718 * (512 bytes) from whatever write we did last
1719 * and rewrite that block?
1720 */
1721 if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1722 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1723 return;
1724 }
1725 }
1726 }
1727
1728 mutex_enter(&vdp->xdf_cb_lk);
1729 mutex_enter(&vdp->xdf_dev_lk);
1730 if (vdp->xdf_state == XD_CONNECTED)
1731 xdf_set_state(vdp, XD_READY);
1732 mutex_exit(&vdp->xdf_dev_lk);
1733
1734 /* Restart any currently queued up io */
1735 xdf_io_start(vdp);
1736
1737 mutex_exit(&vdp->xdf_cb_lk);
1738 }
1739
1740 /*
1741 * synthetic geometry
1742 */
1743 #define XDF_NSECTS 256
1744 #define XDF_NHEADS 16
1745
1746 static void
1747 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1748 {
1749 xdf_t *vdp;
1750 uint_t ncyl;
1751
1752 vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1753
1754 ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1755
1756 bzero(geomp, sizeof (*geomp));
1757 geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1758 geomp->g_acyl = 0;
1759 geomp->g_nhead = XDF_NHEADS;
1760 geomp->g_nsect = XDF_NSECTS;
1761 geomp->g_secsize = vdp->xdf_xdev_secsize;
1762 geomp->g_capacity = vdp->xdf_xdev_nblocks;
1763 geomp->g_intrlv = 0;
1764 geomp->g_rpm = 7200;
1765 }
1766
1767 /*
1768 * Finish other initialization after we've connected to backend
1769 * Status should be XD_INIT before calling this routine
1770 * On success, status should be changed to XD_CONNECTED.
1771 * On error, status should stay XD_INIT
1772 */
1773 static int
1774 xdf_setstate_connected(xdf_t *vdp)
1775 {
1776 dev_info_t *dip = vdp->xdf_dip;
1777 cmlb_geom_t pgeom;
1778 diskaddr_t nblocks = 0;
1779 uint_t secsize = 0;
1780 char *oename, *xsname, *str;
1781 uint_t dinfo;
1782
1783 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1784 ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1785 ASSERT(vdp->xdf_state == XD_INIT);
1786
1787 if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1788 ((oename = xvdi_get_oename(dip)) == NULL))
1789 return (DDI_FAILURE);
1790
1791 /* Make sure the other end is XenbusStateConnected */
1792 if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1793 return (DDI_FAILURE);
1794
1795 /* Determine if feature barrier is supported by backend */
1796 if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1797 cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1798 vdp->xdf_addr);
1799
1800 /*
1801 * Probe backend. Read the device size into xdf_xdev_nblocks
1802 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1803 * flags in xdf_dinfo. If the emulated device type is "cdrom",
1804 * we always set VDISK_CDROM, regardless of whether it's present in
1805 * the xenbus info parameter.
1806 */
1807 if (xenbus_gather(XBT_NULL, oename,
1808 XBP_SECTORS, "%"SCNu64, &nblocks,
1809 XBP_SECTOR_SIZE, "%u", &secsize,
1810 XBP_INFO, "%u", &dinfo,
1811 NULL) != 0) {
1812 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1813 "cannot read backend info", vdp->xdf_addr);
1814 return (DDI_FAILURE);
1815 }
1816 if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1817 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1818 vdp->xdf_addr);
1819 return (DDI_FAILURE);
1820 }
1821 if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1822 dinfo |= VDISK_CDROM;
1823 strfree(str);
1824
1825 if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1826 secsize = DEV_BSIZE;
1827 vdp->xdf_xdev_nblocks = nblocks;
1828 vdp->xdf_xdev_secsize = secsize;
1829 #ifdef _ILP32
1830 if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1831 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1832 "backend disk device too large with %llu blocks for"
1833 " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1834 xvdi_fatal_error(dip, EFBIG, "reading backend info");
1835 return (DDI_FAILURE);
1836 }
1837 #endif
1838
1839 /*
1840 * If the physical geometry for a fixed disk has been explicitly
1841 * set then make sure that the specified physical geometry isn't
1842 * larger than the device we connected to.
1843 */
1844 if (vdp->xdf_pgeom_fixed &&
1845 (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1846 cmn_err(CE_WARN,
1847 "xdf@%s: connect failed, fixed geometry too large",
1848 vdp->xdf_addr);
1849 return (DDI_FAILURE);
1850 }
1851
1852 vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1853
1854 	/* mark the vbd as ready for I/O */
1855 mutex_enter(&vdp->xdf_dev_lk);
1856 xdf_set_state(vdp, XD_CONNECTED);
1857
1858 /* check if the cmlb label should be updated */
1859 xdf_synthetic_pgeom(dip, &pgeom);
1860 if ((vdp->xdf_dinfo != dinfo) ||
1861 (!vdp->xdf_pgeom_fixed &&
1862 (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1863 vdp->xdf_cmlb_reattach = B_TRUE;
1864
1865 vdp->xdf_dinfo = dinfo;
1866 if (!vdp->xdf_pgeom_fixed)
1867 vdp->xdf_pgeom = pgeom;
1868 }
1869
1870 if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1871 if (vdp->xdf_xdev_nblocks == 0) {
1872 vdp->xdf_mstate = DKIO_EJECTED;
1873 cv_broadcast(&vdp->xdf_mstate_cv);
1874 } else {
1875 vdp->xdf_mstate = DKIO_INSERTED;
1876 cv_broadcast(&vdp->xdf_mstate_cv);
1877 }
1878 } else {
1879 if (vdp->xdf_mstate != DKIO_NONE) {
1880 vdp->xdf_mstate = DKIO_NONE;
1881 cv_broadcast(&vdp->xdf_mstate_cv);
1882 }
1883 }
1884
1885 mutex_exit(&vdp->xdf_dev_lk);
1886
1887 cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1888 (uint64_t)vdp->xdf_xdev_nblocks);
1889
1890 /* Restart any currently queued up io */
1891 xdf_io_start(vdp);
1892
1893 /*
1894 * To get to the ready state we have to do IO to the backend device,
1895 * but we can't initiate IO from the other end change callback thread
1896 * (which is the current context we're executing in.) This is because
1897 * if the other end disconnects while we're doing IO from the callback
1898 * thread, then we can't receive that disconnect event and we hang
1899 * waiting for an IO that can never complete.
1900 */
1901 (void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1902 DDI_SLEEP);
1903
1904 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1905 return (DDI_SUCCESS);
1906 }
1907
1908 /*ARGSUSED*/
1909 static void
1910 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1911 {
1912 XenbusState new_state = *(XenbusState *)impl_data;
1913 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1914
1915 DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1916 vdp->xdf_addr, new_state));
1917
1918 mutex_enter(&vdp->xdf_cb_lk);
1919
1920 /* We assume that this callback is single threaded */
1921 ASSERT(vdp->xdf_oe_change_thread == NULL);
1922 DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1923
1924 /* ignore any backend state changes if we're suspending/suspended */
1925 if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1926 DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1927 mutex_exit(&vdp->xdf_cb_lk);
1928 return;
1929 }
1930
1931 switch (new_state) {
1932 case XenbusStateUnknown:
1933 case XenbusStateInitialising:
1934 case XenbusStateInitWait:
1935 case XenbusStateInitialised:
1936 if (vdp->xdf_state == XD_INIT)
1937 break;
1938
1939 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1940 if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1941 break;
1942 ASSERT(vdp->xdf_state == XD_INIT);
1943 break;
1944
1945 case XenbusStateConnected:
1946 if ((vdp->xdf_state == XD_CONNECTED) ||
1947 (vdp->xdf_state == XD_READY))
1948 break;
1949
1950 if (vdp->xdf_state != XD_INIT) {
1951 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1952 if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1953 break;
1954 ASSERT(vdp->xdf_state == XD_INIT);
1955 }
1956
1957 if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1958 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1959 break;
1960 }
1961 ASSERT(vdp->xdf_state == XD_CONNECTED);
1962 break;
1963
1964 case XenbusStateClosing:
1965 if (xdf_isopen(vdp, -1)) {
1966 cmn_err(CE_NOTE,
1967 "xdf@%s: hot-unplug failed, still in use",
1968 vdp->xdf_addr);
1969 break;
1970 }
1971 /*FALLTHROUGH*/
1972 case XenbusStateClosed:
1973 xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1974 break;
1975 }
1976
1977 /* notify anybody waiting for oe state change */
1978 cv_broadcast(&vdp->xdf_dev_cv);
1979 DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1980 mutex_exit(&vdp->xdf_cb_lk);
1981 }
1982
1983 static int
1984 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1985 {
1986 int rv, timeouts = 0, reset = 20;
1987
1988 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1989 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1990
1991 /* we can't connect once we're in the closed state */
1992 if (vdp->xdf_state == XD_CLOSED)
1993 return (XD_CLOSED);
1994
1995 vdp->xdf_connect_req++;
1996 while (vdp->xdf_state != XD_READY) {
1997 mutex_exit(&vdp->xdf_dev_lk);
1998
1999 /* only one thread at a time can be the connection thread */
2000 if (vdp->xdf_connect_thread == NULL)
2001 vdp->xdf_connect_thread = curthread;
2002
2003 if (vdp->xdf_connect_thread == curthread) {
2004 if ((timeouts > 0) && ((timeouts % reset) == 0)) {
2005 /*
2006 * If we haven't established a connection
2007 * within the reset time, then disconnect
2008 * so we can try again, and double the reset
2009 * time. The reset time starts at 2 sec.
2010 */
2011 (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2012 reset *= 2;
2013 }
2014 if (vdp->xdf_state == XD_UNKNOWN)
2015 (void) xdf_setstate_init(vdp);
2016 if (vdp->xdf_state == XD_INIT)
2017 (void) xdf_setstate_connected(vdp);
2018 }
2019
2020 mutex_enter(&vdp->xdf_dev_lk);
2021 if (!wait || (vdp->xdf_state == XD_READY))
2022 goto out;
2023
2024 mutex_exit((&vdp->xdf_cb_lk));
2025 if (vdp->xdf_connect_thread != curthread) {
2026 rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2027 } else {
2028 /* delay for 0.1 sec */
2029 rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2030 &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2031 TR_CLOCK_TICK);
2032 if (rv == -1)
2033 timeouts++;
2034 }
2035 mutex_exit((&vdp->xdf_dev_lk));
2036 mutex_enter((&vdp->xdf_cb_lk));
2037 mutex_enter((&vdp->xdf_dev_lk));
2038 if (rv == 0)
2039 goto out;
2040 }
2041
2042 out:
2043 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2044 ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2045
2046 if (vdp->xdf_connect_thread == curthread) {
2047 /*
2048 * wake up someone else so they can become the connection
2049 * thread.
2050 */
2051 cv_signal(&vdp->xdf_dev_cv);
2052 vdp->xdf_connect_thread = NULL;
2053 }
2054
2055 /* Try to lock the media */
2056 mutex_exit((&vdp->xdf_dev_lk));
2057 (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2058 mutex_enter((&vdp->xdf_dev_lk));
2059
2060 vdp->xdf_connect_req--;
2061 return (vdp->xdf_state);
2062 }
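/*
 * Illustrative sketch (not part of the driver): callers of
 * xdf_connect_locked() are expected to follow the same locking pattern
 * used by xdf_hvm_connect() and xdf_open() below, taking xdf_cb_lk
 * before xdf_dev_lk and then checking for XD_READY:
 *
 *	mutex_enter(&vdp->xdf_cb_lk);
 *	mutex_enter(&vdp->xdf_dev_lk);
 *	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
 *		mutex_exit(&vdp->xdf_dev_lk);
 *		mutex_exit(&vdp->xdf_cb_lk);
 *		return (ENXIO);
 *	}
 *	mutex_exit(&vdp->xdf_dev_lk);
 *	mutex_exit(&vdp->xdf_cb_lk);
 */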
2063
2064 static uint_t
2065 xdf_iorestart(caddr_t arg)
2066 {
2067 xdf_t *vdp = (xdf_t *)arg;
2068
2069 ASSERT(vdp != NULL);
2070
2071 mutex_enter(&vdp->xdf_dev_lk);
2072 ASSERT(ISDMACBON(vdp));
2073 SETDMACBOFF(vdp);
2074 mutex_exit(&vdp->xdf_dev_lk);
2075
2076 xdf_io_start(vdp);
2077
2078 return (DDI_INTR_CLAIMED);
2079 }
2080
2081 #ifdef XPV_HVM_DRIVER
2082
2083 typedef struct xdf_hvm_entry {
2084 list_node_t xdf_he_list;
2085 char *xdf_he_path;
2086 dev_info_t *xdf_he_dip;
2087 } xdf_hvm_entry_t;
2088
2089 static list_t xdf_hvm_list;
2090 static kmutex_t xdf_hvm_list_lock;
2091
2092 static xdf_hvm_entry_t *
2093 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2094 {
2095 xdf_hvm_entry_t *i;
2096
2097 ASSERT((path != NULL) || (dip != NULL));
2098 ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2099
2100 i = list_head(&xdf_hvm_list);
2101 while (i != NULL) {
2102 if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2103 i = list_next(&xdf_hvm_list, i);
2104 continue;
2105 }
2106 if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2107 i = list_next(&xdf_hvm_list, i);
2108 continue;
2109 }
2110 break;
2111 }
2112 return (i);
2113 }
2114
2115 dev_info_t *
2116 xdf_hvm_hold(const char *path)
2117 {
2118 xdf_hvm_entry_t *i;
2119 dev_info_t *dip;
2120
2121 mutex_enter(&xdf_hvm_list_lock);
2122 i = i_xdf_hvm_find(path, NULL);
2123 if (i == NULL) {
2124 mutex_exit(&xdf_hvm_list_lock);
2125 return (NULL);
2126 }
2127 ndi_hold_devi(dip = i->xdf_he_dip);
2128 mutex_exit(&xdf_hvm_list_lock);
2129 return (dip);
2130 }
2131
2132 static void
2133 xdf_hvm_add(dev_info_t *dip)
2134 {
2135 xdf_hvm_entry_t *i;
2136 char *path;
2137
2138 /* figure out the path for the dip */
2139 path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2140 (void) ddi_pathname(dip, path);
2141
2142 i = kmem_alloc(sizeof (*i), KM_SLEEP);
2143 i->xdf_he_dip = dip;
2144 i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2145
2146 mutex_enter(&xdf_hvm_list_lock);
2147 ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2148 ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2149 list_insert_head(&xdf_hvm_list, i);
2150 mutex_exit(&xdf_hvm_list_lock);
2151
2152 kmem_free(path, MAXPATHLEN);
2153 }
2154
2155 static void
2156 xdf_hvm_rm(dev_info_t *dip)
2157 {
2158 xdf_hvm_entry_t *i;
2159
2160 mutex_enter(&xdf_hvm_list_lock);
2161 VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2162 list_remove(&xdf_hvm_list, i);
2163 mutex_exit(&xdf_hvm_list_lock);
2164
2165 kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2166 kmem_free(i, sizeof (*i));
2167 }
2168
2169 static void
2170 xdf_hvm_init(void)
2171 {
2172 list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2173 offsetof(xdf_hvm_entry_t, xdf_he_list));
2174 mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2175 }
2176
2177 static void
2178 xdf_hvm_fini(void)
2179 {
2180 ASSERT(list_head(&xdf_hvm_list) == NULL);
2181 list_destroy(&xdf_hvm_list);
2182 mutex_destroy(&xdf_hvm_list_lock);
2183 }
2184
2185 boolean_t
2186 xdf_hvm_connect(dev_info_t *dip)
2187 {
2188 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2189 char *oename, *str;
2190 int rv;
2191
2192 mutex_enter(&vdp->xdf_cb_lk);
2193
2194 /*
2195 * Before trying to establish a connection we need to wait for the
2196 * backend hotplug scripts to have run. Once they are run the
2197 * "<oename>/hotplug-status" property will be set to "connected".
2198 */
2199 for (;;) {
2200 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2201
2202 /*
2203 * Get the xenbus path to the backend device. Note that
2204 * we can't cache this path (and we look it up on each pass
2205 * through this loop) because it could change during
2206 * suspend, resume, and migration operations.
2207 */
2208 if ((oename = xvdi_get_oename(dip)) == NULL) {
2209 mutex_exit(&vdp->xdf_cb_lk);
2210 return (B_FALSE);
2211 }
2212
2213 str = NULL;
2214 if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2215 (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2216 break;
2217
2218 if (str != NULL)
2219 strfree(str);
2220
2221 /* wait for an update to "<oename>/hotplug-status" */
2222 if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2223 /* we got interrupted by a signal */
2224 mutex_exit(&vdp->xdf_cb_lk);
2225 return (B_FALSE);
2226 }
2227 }
2228
2229 /* Good news. The backend hotplug scripts have been run. */
2230 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2231 ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2232 strfree(str);
2233
2234 /*
2235 * If we're emulating a cd device and if the backend doesn't support
2236 * media request operations, then we're not going to bother trying
2237 * to establish a connection for a couple of reasons.  First off, media
2238 * request support is required to support operations like eject and
2239 * media locking. Second, other backend platforms like Linux don't
2240 * support hvm pv cdrom access. They don't even have a backend pv
2241 * driver for cdrom device nodes, so we don't want to block forever
2242 * waiting for a connection to a backend driver that doesn't exist.
2243 */
2244 if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2245 mutex_exit(&vdp->xdf_cb_lk);
2246 return (B_FALSE);
2247 }
2248
2249 mutex_enter(&vdp->xdf_dev_lk);
2250 rv = xdf_connect_locked(vdp, B_TRUE);
2251 mutex_exit(&vdp->xdf_dev_lk);
2252 mutex_exit(&vdp->xdf_cb_lk);
2253
2254 return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2255 }
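/*
 * Illustrative sketch (not part of the driver): an HVM consumer layered
 * on top of xdf (for example a PV cmdk-style shim) would typically look
 * up and hold the xdf instance for a given device path, wait for it to
 * connect, and drop the hold when done.  The "path" variable below is a
 * hypothetical /devices path supplied by the caller:
 *
 *	dev_info_t *dip;
 *
 *	if ((dip = xdf_hvm_hold(path)) == NULL)
 *		return (ENXIO);
 *	if (!xdf_hvm_connect(dip)) {
 *		ndi_rele_devi(dip);
 *		return (ENXIO);
 *	}
 *	...
 *	ndi_rele_devi(dip);
 */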
2256
2257 int
2258 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2259 {
2260 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2261
2262 /* sanity check the requested physical geometry */
2263 mutex_enter(&vdp->xdf_dev_lk);
2264 if ((geomp->g_secsize != XB_BSIZE) ||
2265 (geomp->g_capacity == 0)) {
2266 mutex_exit(&vdp->xdf_dev_lk);
2267 return (EINVAL);
2268 }
2269
2270 /*
2271 * If we've already connected to the backend device then make sure
2272 * we're not defining a physical geometry larger than our backend
2273 * device.
2274 */
2275 if ((vdp->xdf_xdev_nblocks != 0) &&
2276 (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2277 mutex_exit(&vdp->xdf_dev_lk);
2278 return (EINVAL);
2279 }
2280
2281 bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2282 vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2283 vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2284 vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2285 vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2286 vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2287 vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2288 vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2289 vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2290
2291 vdp->xdf_pgeom_fixed = B_TRUE;
2292 mutex_exit(&vdp->xdf_dev_lk);
2293
2294 /* force a re-validation */
2295 cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2296
2297 return (0);
2298 }
2299
2300 boolean_t
2301 xdf_is_cd(dev_info_t *dip)
2302 {
2303 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2304 boolean_t rv;
2305
2306 mutex_enter(&vdp->xdf_cb_lk);
2307 rv = XD_IS_CD(vdp);
2308 mutex_exit(&vdp->xdf_cb_lk);
2309 return (rv);
2310 }
2311
2312 boolean_t
2313 xdf_is_rm(dev_info_t *dip)
2314 {
2315 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2316 boolean_t rv;
2317
2318 mutex_enter(&vdp->xdf_cb_lk);
2319 rv = XD_IS_RM(vdp);
2320 mutex_exit(&vdp->xdf_cb_lk);
2321 return (rv);
2322 }
2323
2324 boolean_t
2325 xdf_media_req_supported(dev_info_t *dip)
2326 {
2327 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2328 boolean_t rv;
2329
2330 mutex_enter(&vdp->xdf_cb_lk);
2331 rv = vdp->xdf_media_req_supported;
2332 mutex_exit(&vdp->xdf_cb_lk);
2333 return (rv);
2334 }
2335
2336 #endif /* XPV_HVM_DRIVER */
2337
2338 static int
2339 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2340 {
2341 xdf_t *vdp;
2342 vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2343
2344 if (vdp == NULL)
2345 return (ENXIO);
2346
2347 mutex_enter(&vdp->xdf_dev_lk);
2348 *capp = vdp->xdf_pgeom.g_capacity;
2349 DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2350 mutex_exit(&vdp->xdf_dev_lk);
2351 return (0);
2352 }
2353
2354 static int
2355 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2356 {
2357 xdf_t *vdp;
2358
2359 if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2360 return (ENXIO);
2361 *geomp = vdp->xdf_pgeom;
2362 return (0);
2363 }
2364
2365 /*
2366 * No real HBA, no geometry available from it
2367 */
2368 /*ARGSUSED*/
2369 static int
2370 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2371 {
2372 return (EINVAL);
2373 }
2374
2375 static int
2376 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2377 {
2378 xdf_t *vdp;
2379
2380 if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2381 return (ENXIO);
2382
2383 if (XD_IS_RO(vdp))
2384 tgattributep->media_is_writable = 0;
2385 else
2386 tgattributep->media_is_writable = 1;
2387 tgattributep->media_is_rotational = 0;
2388 return (0);
2389 }
2390
2391 /* ARGSUSED3 */
2392 int
2393 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2394 {
2395 int instance;
2396 xdf_t *vdp;
2397
2398 instance = ddi_get_instance(dip);
2399
2400 if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2401 return (ENXIO);
2402
2403 switch (cmd) {
2404 case TG_GETPHYGEOM:
2405 return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2406 case TG_GETVIRTGEOM:
2407 return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2408 case TG_GETCAPACITY:
2409 return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2410 case TG_GETBLOCKSIZE:
2411 mutex_enter(&vdp->xdf_cb_lk);
2412 *(uint32_t *)arg = vdp->xdf_xdev_secsize;
2413 mutex_exit(&vdp->xdf_cb_lk);
2414 return (0);
2415 case TG_GETATTR:
2416 return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2417 default:
2418 return (ENOTTY);
2419 }
2420 }
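/*
 * Illustrative sketch (not part of the driver): cmlb normally reaches
 * xdf_lb_getinfo() through the cmlb_tg_ops vector this driver registers
 * with cmlb, but a direct query is equivalent to, for example:
 *
 *	diskaddr_t cap;
 *
 *	if (xdf_lb_getinfo(dip, TG_GETCAPACITY, &cap, NULL) == 0)
 *		cmn_err(CE_CONT, "?%llu blocks\n", (u_longlong_t)cap);
 */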
2421
2422 /* ARGSUSED5 */
2423 int
2424 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2425 diskaddr_t start, size_t reqlen, void *tg_cookie)
2426 {
2427 xdf_t *vdp;
2428 struct buf *bp;
2429 int err = 0;
2430
2431 vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2432
2433 /* We don't allow IO from the oe_change callback thread */
2434 ASSERT(curthread != vdp->xdf_oe_change_thread);
2435
2436 /*
2437 * Having a secsize of 0 means that the device isn't connected yet.
2438 * FIXME This happens for CD devices, and there's nothing we
2439 * can do about it at the moment.
2440 */
2441 if (vdp->xdf_xdev_secsize == 0)
2442 return (EIO);
2443
2444 if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2445 >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2446 return (EINVAL);
2447
2448 bp = getrbuf(KM_SLEEP);
2449 if (cmd == TG_READ)
2450 bp->b_flags = B_BUSY | B_READ;
2451 else
2452 bp->b_flags = B_BUSY | B_WRITE;
2453
2454 bp->b_un.b_addr = bufp;
2455 bp->b_bcount = reqlen;
2456 bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2457 bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2458
2459 mutex_enter(&vdp->xdf_dev_lk);
2460 xdf_bp_push(vdp, bp);
2461 mutex_exit(&vdp->xdf_dev_lk);
2462 xdf_io_start(vdp);
2463 if (curthread == vdp->xdf_ready_tq_thread)
2464 (void) xdf_ring_drain(vdp);
2465 err = biowait(bp);
2466 ASSERT(bp->b_flags & B_DONE);
2467 freerbuf(bp);
2468 return (err);
2469 }
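/*
 * Illustrative sketch (not part of the driver): a synchronous,
 * single-sector read through xdf_lb_rdwr() looks like the following
 * (this is essentially what xdf_devid_read() does later in this file);
 * "vdp" and "blk" are assumed to be a connected xdf_t and a valid
 * block address:
 *
 *	char *buf = kmem_zalloc(NBPSCTR, KM_SLEEP);
 *	int err;
 *
 *	err = xdf_lb_rdwr(vdp->xdf_dip, TG_READ, buf, blk, NBPSCTR, NULL);
 *	if (err != 0)
 *		cmn_err(CE_WARN, "xdf@%s: read of block %llu failed (%d)",
 *		    vdp->xdf_addr, (u_longlong_t)blk, err);
 *	kmem_free(buf, NBPSCTR);
 */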
2470
2471 /*
2472 * Lock the current media. Set the media state to "lock".
2473 * (Media locks are only respected by the backend driver.)
2474 */
2475 static int
2476 xdf_ioctl_mlock(xdf_t *vdp)
2477 {
2478 int rv;
2479 mutex_enter(&vdp->xdf_cb_lk);
2480 rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2481 mutex_exit(&vdp->xdf_cb_lk);
2482 return (rv);
2483 }
2484
2485 /*
2486 * Release a media lock. Set the media state to "none".
2487 */
2488 static int
2489 xdf_ioctl_munlock(xdf_t *vdp)
2490 {
2491 int rv;
2492 mutex_enter(&vdp->xdf_cb_lk);
2493 rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2494 mutex_exit(&vdp->xdf_cb_lk);
2495 return (rv);
2496 }
2497
2498 /*
2499 * Eject the current media. Ignores any media locks. (Media locks
2500 * are only for the benefit of the backend.)
2501 */
2502 static int
2503 xdf_ioctl_eject(xdf_t *vdp)
2504 {
2505 int rv;
2506
2507 mutex_enter(&vdp->xdf_cb_lk);
2508 if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2509 mutex_exit(&vdp->xdf_cb_lk);
2510 return (rv);
2511 }
2512
2513 /*
2514 * We've set the media request xenbus parameter to eject, so now
2515 * disconnect from the backend, wait for the backend to clear
2516 * the media request xenbus parameter, and then we can reconnect
2517 * to the backend.
2518 */
2519 (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2520 mutex_enter(&vdp->xdf_dev_lk);
2521 if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2522 mutex_exit(&vdp->xdf_dev_lk);
2523 mutex_exit(&vdp->xdf_cb_lk);
2524 return (EIO);
2525 }
2526 mutex_exit(&vdp->xdf_dev_lk);
2527 mutex_exit(&vdp->xdf_cb_lk);
2528 return (0);
2529 }
2530
2531 /*
2532 * Watch for media state changes. This can be an insertion of a device
2533 * (triggered by a 'xm block-configure' request in another domain) or
2534 * the ejection of a device (triggered by a local "eject" operation).
2535 * For a full description of the DKIOCSTATE ioctl behavior see dkio(4I).
2536 */
2537 static int
2538 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2539 {
2540 enum dkio_state prev_state;
2541
2542 mutex_enter(&vdp->xdf_cb_lk);
2543 prev_state = vdp->xdf_mstate;
2544
2545 if (vdp->xdf_mstate == mstate) {
2546 while (vdp->xdf_mstate == prev_state) {
2547 if (cv_wait_sig(&vdp->xdf_mstate_cv,
2548 &vdp->xdf_cb_lk) == 0) {
2549 mutex_exit(&vdp->xdf_cb_lk);
2550 return (EINTR);
2551 }
2552 }
2553 }
2554
2555 if ((prev_state != DKIO_INSERTED) &&
2556 (vdp->xdf_mstate == DKIO_INSERTED)) {
2557 (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2558 mutex_exit(&vdp->xdf_cb_lk);
2559 return (0);
2560 }
2561
2562 mutex_exit(&vdp->xdf_cb_lk);
2563 return (0);
2564 }
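/*
 * Illustrative sketch (not part of the driver): per dkio(4I), a
 * userland consumer typically uses DKIOCSTATE as a blocking "wait for
 * media change" primitive against an open raw-device descriptor fd
 * (headers and error handling omitted):
 *
 *	enum dkio_state state = DKIO_NONE;
 *
 *	for (;;) {
 *		if (ioctl(fd, DKIOCSTATE, &state) < 0)
 *			break;
 *		if (state == DKIO_INSERTED)
 *			(void) printf("media inserted\n");
 *		else if (state == DKIO_EJECTED)
 *			(void) printf("media ejected\n");
 *	}
 */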
2565
2566 /*ARGSUSED*/
2567 static int
2568 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2569 int *rvalp)
2570 {
2571 minor_t minor = getminor(dev);
2572 int part = XDF_PART(minor);
2573 xdf_t *vdp;
2574 int rv;
2575
2576 if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2577 (!xdf_isopen(vdp, part)))
2578 return (ENXIO);
2579
2580 DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2581 vdp->xdf_addr, cmd, cmd));
2582
2583 switch (cmd) {
2584 default:
2585 return (ENOTTY);
2586 case DKIOCG_PHYGEOM:
2587 case DKIOCG_VIRTGEOM:
2588 case DKIOCGGEOM:
2589 case DKIOCSGEOM:
2590 case DKIOCGAPART:
2591 case DKIOCSAPART:
2592 case DKIOCGVTOC:
2593 case DKIOCSVTOC:
2594 case DKIOCPARTINFO:
2595 case DKIOCGEXTVTOC:
2596 case DKIOCSEXTVTOC:
2597 case DKIOCEXTPARTINFO:
2598 case DKIOCGMBOOT:
2599 case DKIOCSMBOOT:
2600 case DKIOCGETEFI:
2601 case DKIOCSETEFI:
2602 case DKIOCSETEXTPART:
2603 case DKIOCPARTITION:
2604 rv = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2605 rvalp, NULL);
2606 if (rv != 0)
2607 return (rv);
2608 /*
2609 * If we're labelling the disk, we have to update the geometry
2610 * in the cmlb data structures, and we also have to write a new
2611 * devid to the disk. Note that writing an EFI label currently
2612 * requires 4 ioctls, and devid setup will fail on all but the
2613 * last.
2614 */
2615 if (cmd == DKIOCSEXTVTOC || cmd == DKIOCSVTOC ||
2616 cmd == DKIOCSETEFI) {
2617 rv = cmlb_validate(vdp->xdf_vd_lbl, 0, 0);
2618 if (rv == 0) {
2619 xdf_devid_setup(vdp);
2620 } else {
2621 cmn_err(CE_WARN,
2622 "xdf@%s, labeling failed on validate",
2623 vdp->xdf_addr);
2624 }
2625 }
2626 return (rv);
2627 case FDEJECT:
2628 case DKIOCEJECT:
2629 case CDROMEJECT:
2630 return (xdf_ioctl_eject(vdp));
2631 case DKIOCLOCK:
2632 return (xdf_ioctl_mlock(vdp));
2633 case DKIOCUNLOCK:
2634 return (xdf_ioctl_munlock(vdp));
2635 case CDROMREADOFFSET: {
2636 int offset = 0;
2637 if (!XD_IS_CD(vdp))
2638 return (ENOTTY);
2639 if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2640 return (EFAULT);
2641 return (0);
2642 }
2643 case DKIOCGMEDIAINFO: {
2644 struct dk_minfo media_info;
2645
2646 media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2647 media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2648 if (XD_IS_CD(vdp))
2649 media_info.dki_media_type = DK_CDROM;
2650 else
2651 media_info.dki_media_type = DK_FIXED_DISK;
2652
2653 if (ddi_copyout(&media_info, (void *)arg,
2654 sizeof (struct dk_minfo), mode))
2655 return (EFAULT);
2656 return (0);
2657 }
2658 case DKIOCINFO: {
2659 struct dk_cinfo info;
2660
2661 /* controller information */
2662 if (XD_IS_CD(vdp))
2663 info.dki_ctype = DKC_CDROM;
2664 else
2665 info.dki_ctype = DKC_VBD;
2666
2667 info.dki_cnum = 0;
2668 (void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2669
2670 /* unit information */
2671 info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2672 (void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2673 info.dki_flags = DKI_FMTVOL;
2674 info.dki_partition = part;
2675 info.dki_maxtransfer = maxphys / DEV_BSIZE;
2676 info.dki_addr = 0;
2677 info.dki_space = 0;
2678 info.dki_prio = 0;
2679 info.dki_vec = 0;
2680
2681 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2682 return (EFAULT);
2683 return (0);
2684 }
2685 case DKIOCSTATE: {
2686 enum dkio_state mstate;
2687
2688 if (ddi_copyin((void *)arg, &mstate,
2689 sizeof (mstate), mode) != 0)
2690 return (EFAULT);
2691 if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2692 return (rv);
2693 mstate = vdp->xdf_mstate;
2694 if (ddi_copyout(&mstate, (void *)arg,
2695 sizeof (mstate), mode) != 0)
2696 return (EFAULT);
2697 return (0);
2698 }
2699 case DKIOCREMOVABLE: {
2700 int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2701 if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2702 return (EFAULT);
2703 return (0);
2704 }
2705 case DKIOCGETWCE: {
2706 int i = BOOLEAN2VOID(vdp->xdf_wce);
2707 if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2708 return (EFAULT);
2709 return (0);
2710 }
2711 case DKIOCSETWCE: {
2712 int i;
2713 if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2714 return (EFAULT);
2715 vdp->xdf_wce = VOID2BOOLEAN(i);
2716 return (0);
2717 }
2718 case DKIOCFLUSHWRITECACHE: {
2719 struct dk_callback *dkc = (struct dk_callback *)arg;
2720
2721 if (vdp->xdf_flush_supported) {
2722 rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2723 NULL, 0, 0, (void *)dev);
2724 } else if (vdp->xdf_feature_barrier &&
2725 !xdf_barrier_flush_disable) {
2726 rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2727 vdp->xdf_cache_flush_block, xdf_flush_block,
2728 vdp->xdf_xdev_secsize, (void *)dev);
2729 } else {
2730 return (ENOTTY);
2731 }
2732 if ((mode & FKIOCTL) && (dkc != NULL) &&
2733 (dkc->dkc_callback != NULL)) {
2734 (*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2735 /* need to return 0 after calling callback */
2736 rv = 0;
2737 }
2738 return (rv);
2739 }
2740 }
2741 /*NOTREACHED*/
2742 }
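/*
 * Illustrative sketch (not part of the driver): the DKIOCGMEDIAINFO
 * case above is how userland discovers the sector size and capacity
 * this driver reports; against an open raw-device descriptor fd
 * (headers and error handling omitted):
 *
 *	struct dk_minfo mi;
 *
 *	if (ioctl(fd, DKIOCGMEDIAINFO, &mi) == 0)
 *		(void) printf("%u byte sectors, %llu sectors\n",
 *		    mi.dki_lbsize, (u_longlong_t)mi.dki_capacity);
 */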
2743
2744 static int
2745 xdf_strategy(struct buf *bp)
2746 {
2747 xdf_t *vdp;
2748 minor_t minor;
2749 diskaddr_t p_blkct, p_blkst;
2750 daddr_t blkno;
2751 ulong_t nblks;
2752 int part;
2753
2754 minor = getminor(bp->b_edev);
2755 part = XDF_PART(minor);
2756 vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2757
2758 mutex_enter(&vdp->xdf_dev_lk);
2759 if (!xdf_isopen(vdp, part)) {
2760 mutex_exit(&vdp->xdf_dev_lk);
2761 xdf_io_err(bp, ENXIO, 0);
2762 return (0);
2763 }
2764
2765 /* We don't allow IO from the oe_change callback thread */
2766 ASSERT(curthread != vdp->xdf_oe_change_thread);
2767
2768 /* Check for writes to a read only device */
2769 if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2770 mutex_exit(&vdp->xdf_dev_lk);
2771 xdf_io_err(bp, EROFS, 0);
2772 return (0);
2773 }
2774
2775 /* Check if this I/O is accessing a partition or the entire disk */
2776 if ((long)bp->b_private == XB_SLICE_NONE) {
2777 /* This I/O is using an absolute offset */
2778 p_blkct = vdp->xdf_xdev_nblocks;
2779 p_blkst = 0;
2780 } else {
2781 /* This I/O is using a partition relative offset */
2782 mutex_exit(&vdp->xdf_dev_lk);
2783 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2784 &p_blkst, NULL, NULL, NULL)) {
2785 xdf_io_err(bp, ENXIO, 0);
2786 return (0);
2787 }
2788 mutex_enter(&vdp->xdf_dev_lk);
2789 }
2790
2791 /*
2792 * Adjust the real blkno and bcount according to the underlying
2793 * physical sector size.
2794 */
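/*
 * For example (illustrative only): with a hypothetical 4096-byte
 * backend sector size, xdf_xdev_secsize / XB_BSIZE == 8, so a
 * bp->b_blkno of 16 (in 512-byte DEV_BSIZE units) maps to backend
 * block 2, and a 64KB transfer counts as 16 backend blocks below.
 */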
2795 blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2796
2797 /* check for a starting block beyond the disk or partition limit */
2798 if (blkno > p_blkct) {
2799 DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2800 vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2801 mutex_exit(&vdp->xdf_dev_lk);
2802 xdf_io_err(bp, EINVAL, 0);
2803 return (0);
2804 }
2805
2806 /* Legacy: don't set the error flag in this case */
2807 if (blkno == p_blkct) {
2808 mutex_exit(&vdp->xdf_dev_lk);
2809 bp->b_resid = bp->b_bcount;
2810 biodone(bp);
2811 return (0);
2812 }
2813
2814 /* sanitize the input buf */
2815 bioerror(bp, 0);
2816 bp->b_resid = 0;
2817 bp->av_back = bp->av_forw = NULL;
2818
2819 /* Adjust for a partial transfer; this will result in an error later */
2820 if (vdp->xdf_xdev_secsize != 0 &&
2821 vdp->xdf_xdev_secsize != XB_BSIZE) {
2822 nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2823 } else {
2824 nblks = bp->b_bcount >> XB_BSHIFT;
2825 }
2826
2827 if ((blkno + nblks) > p_blkct) {
2828 if (vdp->xdf_xdev_secsize != 0 &&
2829 vdp->xdf_xdev_secsize != XB_BSIZE) {
2830 bp->b_resid =
2831 ((blkno + nblks) - p_blkct) *
2832 vdp->xdf_xdev_secsize;
2833 } else {
2834 bp->b_resid =
2835 ((blkno + nblks) - p_blkct) <<
2836 XB_BSHIFT;
2837 }
2838 bp->b_bcount -= bp->b_resid;
2839 }
2840
2841 DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2842 vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2843
2844 /* Fix up the buf struct */
2845 bp->b_flags |= B_BUSY;
2846 bp->b_private = (void *)(uintptr_t)p_blkst;
2847
2848 xdf_bp_push(vdp, bp);
2849 mutex_exit(&vdp->xdf_dev_lk);
2850 xdf_io_start(vdp);
2851 if (do_polled_io)
2852 (void) xdf_ring_drain(vdp);
2853 return (0);
2854 }
2855
2856 /*ARGSUSED*/
2857 static int
2858 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2859 {
2860 xdf_t *vdp;
2861 minor_t minor;
2862 diskaddr_t p_blkcnt;
2863 int part;
2864
2865 minor = getminor(dev);
2866 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2867 return (ENXIO);
2868
2869 DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2870 vdp->xdf_addr, (int64_t)uiop->uio_offset));
2871
2872 part = XDF_PART(minor);
2873 if (!xdf_isopen(vdp, part))
2874 return (ENXIO);
2875
2876 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2877 NULL, NULL, NULL, NULL))
2878 return (ENXIO);
2879
2880 if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2881 return (ENOSPC);
2882
2883 if (U_INVAL(uiop))
2884 return (EINVAL);
2885
2886 return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2887 }
2888
2889 /*ARGSUSED*/
2890 static int
2891 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2892 {
2893 xdf_t *vdp;
2894 minor_t minor;
2895 diskaddr_t p_blkcnt;
2896 int part;
2897
2898 minor = getminor(dev);
2899 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2900 return (ENXIO);
2901
2902 DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2903 vdp->xdf_addr, (int64_t)uiop->uio_offset));
2904
2905 part = XDF_PART(minor);
2906 if (!xdf_isopen(vdp, part))
2907 return (ENXIO);
2908
2909 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2910 NULL, NULL, NULL, NULL))
2911 return (ENXIO);
2912
2913 if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2914 return (ENOSPC);
2915
2916 if (U_INVAL(uiop))
2917 return (EINVAL);
2918
2919 return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2920 }
2921
2922 /*ARGSUSED*/
2923 static int
2924 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2925 {
2926 xdf_t *vdp;
2927 minor_t minor;
2928 struct uio *uiop = aiop->aio_uio;
2929 diskaddr_t p_blkcnt;
2930 int part;
2931
2932 minor = getminor(dev);
2933 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2934 return (ENXIO);
2935
2936 part = XDF_PART(minor);
2937 if (!xdf_isopen(vdp, part))
2938 return (ENXIO);
2939
2940 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2941 NULL, NULL, NULL, NULL))
2942 return (ENXIO);
2943
2944 if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2945 return (ENOSPC);
2946
2947 if (U_INVAL(uiop))
2948 return (EINVAL);
2949
2950 return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2951 }
2952
2953 /*ARGSUSED*/
2954 static int
2955 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2956 {
2957 xdf_t *vdp;
2958 minor_t minor;
2959 struct uio *uiop = aiop->aio_uio;
2960 diskaddr_t p_blkcnt;
2961 int part;
2962
2963 minor = getminor(dev);
2964 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2965 return (ENXIO);
2966
2967 part = XDF_PART(minor);
2968 if (!xdf_isopen(vdp, part))
2969 return (ENXIO);
2970
2971 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2972 NULL, NULL, NULL, NULL))
2973 return (ENXIO);
2974
2975 if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2976 return (ENOSPC);
2977
2978 if (U_INVAL(uiop))
2979 return (EINVAL);
2980
2981 return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2982 }
2983
2984 static int
2985 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2986 {
2987 struct buf dumpbuf, *dbp = &dumpbuf;
2988 xdf_t *vdp;
2989 minor_t minor;
2990 int err = 0;
2991 int part;
2992 diskaddr_t p_blkcnt, p_blkst;
2993
2994 minor = getminor(dev);
2995 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2996 return (ENXIO);
2997
2998 DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2999 vdp->xdf_addr, (void *)addr, blkno, nblk));
3000
3001 /* We don't allow IO from the oe_change callback thread */
3002 ASSERT(curthread != vdp->xdf_oe_change_thread);
3003
3004 part = XDF_PART(minor);
3005 if (!xdf_isopen(vdp, part))
3006 return (ENXIO);
3007
3008 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
3009 NULL, NULL, NULL))
3010 return (ENXIO);
3011
3012 if ((blkno + nblk) >
3013 (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
3014 cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
3015 vdp->xdf_addr, (daddr_t)((blkno + nblk) /
3016 (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
3017 return (EINVAL);
3018 }
3019
3020 bioinit(dbp);
3021 dbp->b_flags = B_BUSY;
3022 dbp->b_un.b_addr = addr;
3023 dbp->b_bcount = nblk << DEV_BSHIFT;
3024 dbp->b_blkno = blkno;
3025 dbp->b_edev = dev;
3026 dbp->b_private = (void *)(uintptr_t)p_blkst;
3027
3028 mutex_enter(&vdp->xdf_dev_lk);
3029 xdf_bp_push(vdp, dbp);
3030 mutex_exit(&vdp->xdf_dev_lk);
3031 xdf_io_start(vdp);
3032 err = xdf_ring_drain(vdp);
3033 biofini(dbp);
3034 return (err);
3035 }
3036
3037 /*ARGSUSED*/
3038 static int
3039 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3040 {
3041 minor_t minor;
3042 xdf_t *vdp;
3043 int part;
3044 ulong_t parbit;
3045
3046 minor = getminor(dev);
3047 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3048 return (ENXIO);
3049
3050 mutex_enter(&vdp->xdf_dev_lk);
3051 part = XDF_PART(minor);
3052 if (!xdf_isopen(vdp, part)) {
3053 mutex_exit(&vdp->xdf_dev_lk);
3054 return (ENXIO);
3055 }
3056 parbit = 1 << part;
3057
3058 ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3059 if (otyp == OTYP_LYR) {
3060 ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3061 if (--vdp->xdf_vd_lyropen[part] == 0)
3062 vdp->xdf_vd_open[otyp] &= ~parbit;
3063 } else {
3064 vdp->xdf_vd_open[otyp] &= ~parbit;
3065 }
3066 vdp->xdf_vd_exclopen &= ~parbit;
3067
3068 mutex_exit(&vdp->xdf_dev_lk);
3069 return (0);
3070 }
3071
3072 static int
3073 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3074 {
3075 minor_t minor;
3076 xdf_t *vdp;
3077 int part;
3078 ulong_t parbit;
3079 diskaddr_t p_blkct = 0;
3080 boolean_t firstopen;
3081 boolean_t nodelay;
3082
3083 minor = getminor(*devp);
3084 if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3085 return (ENXIO);
3086
3087 nodelay = (flag & (FNDELAY | FNONBLOCK));
3088
3089 DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3090
3091 /* do cv_wait until connected or failed */
3092 mutex_enter(&vdp->xdf_cb_lk);
3093 mutex_enter(&vdp->xdf_dev_lk);
3094 if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3095 mutex_exit(&vdp->xdf_dev_lk);
3096 mutex_exit(&vdp->xdf_cb_lk);
3097 return (ENXIO);
3098 }
3099 mutex_exit(&vdp->xdf_cb_lk);
3100
3101 if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3102 mutex_exit(&vdp->xdf_dev_lk);
3103 return (EROFS);
3104 }
3105
3106 part = XDF_PART(minor);
3107 parbit = 1 << part;
3108 if ((vdp->xdf_vd_exclopen & parbit) ||
3109 ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3110 mutex_exit(&vdp->xdf_dev_lk);
3111 return (EBUSY);
3112 }
3113
3114 /* are we the first one to open this node? */
3115 firstopen = !xdf_isopen(vdp, -1);
3116
3117 if (otyp == OTYP_LYR)
3118 vdp->xdf_vd_lyropen[part]++;
3119
3120 vdp->xdf_vd_open[otyp] |= parbit;
3121
3122 if (flag & FEXCL)
3123 vdp->xdf_vd_exclopen |= parbit;
3124
3125 mutex_exit(&vdp->xdf_dev_lk);
3126
3127 /* force a re-validation */
3128 if (firstopen)
3129 cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3130
3131 /* If this is a non-blocking open then we're done */
3132 if (nodelay)
3133 return (0);
3134
3135 /*
3136 * This is a blocking open, so we require:
3137 * - that the disk have a valid label on it
3138 * - that the size of the partition that we're opening is non-zero
3139 */
3140 if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3141 NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3142 (void) xdf_close(*devp, flag, otyp, credp);
3143 return (ENXIO);
3144 }
3145
3146 return (0);
3147 }
3148
3149 /*ARGSUSED*/
3150 static void
3151 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3152 {
3153 xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3154 cv_broadcast(&vdp->xdf_hp_status_cv);
3155 }
3156
3157 static int
3158 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3159 char *name, caddr_t valuep, int *lengthp)
3160 {
3161 xdf_t *vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3162
3163 /*
3164 * Sanity check that if a dev_t or dip was specified, it
3165 * corresponds to this device driver.  On debug kernels we'll
3166 * panic and on non-debug kernels we'll return failure.
3167 */
3168 ASSERT(ddi_driver_major(dip) == xdf_major);
3169 ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3170 if ((ddi_driver_major(dip) != xdf_major) ||
3171 ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3172 return (DDI_PROP_NOT_FOUND);
3173
3174 if (vdp == NULL)
3175 return (ddi_prop_op(dev, dip, prop_op, flags,
3176 name, valuep, lengthp));
3177
3178 return (cmlb_prop_op(vdp->xdf_vd_lbl,
3179 dev, dip, prop_op, flags, name, valuep, lengthp,
3180 XDF_PART(getminor(dev)), NULL));
3181 }
3182
3183 /*ARGSUSED*/
3184 static int
3185 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3186 {
3187 int instance = XDF_INST(getminor((dev_t)arg));
3188 xdf_t *vbdp;
3189
3190 switch (cmd) {
3191 case DDI_INFO_DEVT2DEVINFO:
3192 if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3193 *rp = NULL;
3194 return (DDI_FAILURE);
3195 }
3196 *rp = vbdp->xdf_dip;
3197 return (DDI_SUCCESS);
3198
3199 case DDI_INFO_DEVT2INSTANCE:
3200 *rp = (void *)(uintptr_t)instance;
3201 return (DDI_SUCCESS);
3202
3203 default:
3204 return (DDI_FAILURE);
3205 }
3206 }
3207
3208 /*ARGSUSED*/
3209 static int
3210 xdf_resume(dev_info_t *dip)
3211 {
3212 xdf_t *vdp;
3213 char *oename;
3214
3215 if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3216 goto err;
3217
3218 if (xdf_debug & SUSRES_DBG)
3219 xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3220
3221 mutex_enter(&vdp->xdf_cb_lk);
3222
3223 if (xvdi_resume(dip) != DDI_SUCCESS) {
3224 mutex_exit(&vdp->xdf_cb_lk);
3225 goto err;
3226 }
3227
3228 if (((oename = xvdi_get_oename(dip)) == NULL) ||
3229 (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3230 xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3231 mutex_exit(&vdp->xdf_cb_lk);
3232 goto err;
3233 }
3234
3235 mutex_enter(&vdp->xdf_dev_lk);
3236 ASSERT(vdp->xdf_state != XD_READY);
3237 xdf_set_state(vdp, XD_UNKNOWN);
3238 mutex_exit(&vdp->xdf_dev_lk);
3239
3240 if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3241 mutex_exit(&vdp->xdf_cb_lk);
3242 goto err;
3243 }
3244
3245 mutex_exit(&vdp->xdf_cb_lk);
3246
3247 if (xdf_debug & SUSRES_DBG)
3248 xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3249 return (DDI_SUCCESS);
3250 err:
3251 if (xdf_debug & SUSRES_DBG)
3252 xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3253 return (DDI_FAILURE);
3254 }
3255
3256 /*
3257 * Use the in-memory devid if one exists; otherwise create one.
3258 *
3259 * Write the devid on the first block of the last track of
3260 * the last cylinder.
3261 * Return DDI_SUCCESS or DDI_FAILURE.
3262 */
3263 static int
3264 xdf_devid_fabricate(xdf_t *vdp)
3265 {
3266 ddi_devid_t devid = vdp->xdf_tgt_devid; /* null if no devid */
3267 struct dk_devid *dkdevidp = NULL; /* devid struct stored on disk */
3268 diskaddr_t blk;
3269 uint_t *ip, chksum;
3270 int i, devid_size;
3271
3272 if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3273 goto err;
3274
3275 if (devid == NULL && ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0,
3276 NULL, &devid) != DDI_SUCCESS)
3277 goto err;
3278
3279 /* allocate a buffer */
3280 dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
3281
3282 /* Fill in the revision */
3283 dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
3284 dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
3285
3286 /* Copy in the device id */
3287 devid_size = ddi_devid_sizeof(devid);
3288 if (devid_size > DK_DEVID_SIZE)
3289 goto err;
3290 bcopy(devid, dkdevidp->dkd_devid, devid_size);
3291
3292 /* Calculate the chksum */
3293 chksum = 0;
3294 ip = (uint_t *)dkdevidp;
3295 for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3296 chksum ^= ip[i];
3297
3298 /* Fill in the checksum */
3299 DKD_FORMCHKSUM(chksum, dkdevidp);
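/*
 * Note (added for clarity): the XOR above covers every 32-bit word of
 * the sector except the last one, where DKD_FORMCHKSUM() stores the
 * result; xdf_devid_read() recomputes the same XOR when validating
 * the block.
 */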
3300
3301 if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, dkdevidp, blk,
3302 NBPSCTR, NULL) != 0)
3303 goto err;
3304
3305 kmem_free(dkdevidp, NBPSCTR);
3306
3307 vdp->xdf_tgt_devid = devid;
3308 return (DDI_SUCCESS);
3309
3310 err:
3311 if (dkdevidp != NULL)
3312 kmem_free(dkdevidp, NBPSCTR);
3313 if (devid != NULL && vdp->xdf_tgt_devid == NULL)
3314 ddi_devid_free(devid);
3315 return (DDI_FAILURE);
3316 }
3317
3318 /*
3319 * xdf_devid_read() is a local copy of xdfs_devid_read(), modified to use xdf
3320 * functions.
3321 *
3322 * Read a devid from on the first block of the last track of
3323 * the last cylinder. Make sure what we read is a valid devid.
3324 * Return DDI_SUCCESS or DDI_FAILURE.
3325 */
3326 static int
3327 xdf_devid_read(xdf_t *vdp)
3328 {
3329 diskaddr_t blk;
3330 struct dk_devid *dkdevidp;
3331 uint_t *ip, chksum;
3332 int i;
3333
3334 if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3335 return (DDI_FAILURE);
3336
3337 dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
3338 if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk,
3339 NBPSCTR, NULL) != 0)
3340 goto err;
3341
3342 /* Validate the revision */
3343 if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
3344 (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
3345 goto err;
3346
3347 /* Calculate the checksum */
3348 chksum = 0;
3349 ip = (uint_t *)dkdevidp;
3350 for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3351 chksum ^= ip[i];
3352 if (DKD_GETCHKSUM(dkdevidp) != chksum)
3353 goto err;
3354
3355 /* Validate the device id */
3356 if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
3357 goto err;
3358
3359 /* keep a copy of the device id */
3360 i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
3361 vdp->xdf_tgt_devid = kmem_alloc(i, KM_SLEEP);
3362 bcopy(dkdevidp->dkd_devid, vdp->xdf_tgt_devid, i);
3363 kmem_free(dkdevidp, NBPSCTR);
3364 return (DDI_SUCCESS);
3365
3366 err:
3367 kmem_free(dkdevidp, NBPSCTR);
3368 return (DDI_FAILURE);
3369 }
3370
3371 /*
3372 * xdf_devid_setup() is a modified copy of cmdk_devid_setup().
3373 *
3374 * This function creates a devid if we don't already have one, and
3375 * registers it. If we already have one, we make sure that it can be
3376 * read from the disk, otherwise we write it to the disk ourselves. If
3377 * we didn't already have a devid, and we create one, we also need to
3378 * register it.
3379 */
3380 void
3381 xdf_devid_setup(xdf_t *vdp)
3382 {
3383 int rc;
3384 boolean_t existed = vdp->xdf_tgt_devid != NULL;
3385
3386 /* Read devid from the disk, if present */
3387 rc = xdf_devid_read(vdp);
3388
3389 /* Otherwise write a devid (which we create if necessary) on the disk */
3390 if (rc != DDI_SUCCESS)
3391 rc = xdf_devid_fabricate(vdp);
3392
3393 /* If we created a devid or found it on the disk, register it */
3394 if (rc == DDI_SUCCESS && !existed)
3395 (void) ddi_devid_register(vdp->xdf_dip, vdp->xdf_tgt_devid);
3396 }
3397
3398 static int
3399 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3400 {
3401 int n, instance = ddi_get_instance(dip);
3402 ddi_iblock_cookie_t ibc, softibc;
3403 boolean_t dev_iscd = B_FALSE;
3404 xdf_t *vdp;
3405 char *oename, *xsname, *str;
3406 clock_t timeout;
3407 int err = 0;
3408
3409 if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3410 "xdf_debug", 0)) != 0)
3411 xdf_debug = n;
3412
3413 switch (cmd) {
3414 case DDI_RESUME:
3415 return (xdf_resume(dip));
3416 case DDI_ATTACH:
3417 break;
3418 default:
3419 return (DDI_FAILURE);
3420 }
3421 /* DDI_ATTACH */
3422
3423 if ((xsname = xvdi_get_xsname(dip)) == NULL ||
3424 (oename = xvdi_get_oename(dip)) == NULL)
3425 return (DDI_FAILURE);
3426
3427 /*
3428 * Disable auto-detach. This is necessary so that we don't get
3429 * detached while we're disconnected from the back end.
3430 */
3431 if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3432 DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3433 return (DDI_FAILURE);
3434
3435 /* driver handles kernel-issued IOCTLs */
3436 if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3437 DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3438 return (DDI_FAILURE);
3439
3440 if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3441 return (DDI_FAILURE);
3442
3443 if (ddi_get_soft_iblock_cookie(dip,
3444 DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3445 return (DDI_FAILURE);
3446
3447 if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3448 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3449 ddi_get_name_addr(dip));
3450 return (DDI_FAILURE);
3451 }
3452 if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3453 dev_iscd = B_TRUE;
3454 strfree(str);
3455
3456 if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3457 return (DDI_FAILURE);
3458
3459 DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3460 vdp = ddi_get_soft_state(xdf_ssp, instance);
3461 ddi_set_driver_private(dip, vdp);
3462 vdp->xdf_dip = dip;
3463 vdp->xdf_addr = ddi_get_name_addr(dip);
3464 vdp->xdf_suspending = B_FALSE;
3465 vdp->xdf_media_req_supported = B_FALSE;
3466 vdp->xdf_peer = INVALID_DOMID;
3467 vdp->xdf_evtchn = INVALID_EVTCHN;
3468 list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3469 offsetof(v_req_t, v_link));
3470 cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3471 cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3472 cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3473 mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3474 mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3475 mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3476 vdp->xdf_cmlb_reattach = B_TRUE;
3477 if (dev_iscd) {
3478 vdp->xdf_dinfo |= VDISK_CDROM;
3479 vdp->xdf_mstate = DKIO_EJECTED;
3480 } else {
3481 vdp->xdf_mstate = DKIO_NONE;
3482 }
3483
3484 if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3485 1, TASKQ_DEFAULTPRI, 0)) == NULL)
3486 goto errout0;
3487
3488 if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3489 xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3490 goto errout0;
3491
3492 if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3493 &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3494 cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3495 ddi_get_name_addr(dip));
3496 goto errout0;
3497 }
3498
3499 /*
3500 * Initialize the physical geometry structure.  Note that currently
3501 * we don't know the size of the backend device so the number
3502 * of blocks on the device will be initialized to zero. Once
3503 * we connect to the backend device we'll update the physical
3504 * geometry to reflect the real size of the device.
3505 */
3506 xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3507 vdp->xdf_pgeom_fixed = B_FALSE;
3508
3509 /*
3510 * Allocate the cmlb handle; minor nodes will be created once
3511 * the device is connected to the backend.
3512 */
3513 cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3514
3515 /* We ship with cache-enabled disks */
3516 vdp->xdf_wce = B_TRUE;
3517
3518 mutex_enter(&vdp->xdf_cb_lk);
3519 /* Watch backend XenbusState change */
3520 if (xvdi_add_event_handler(dip,
3521 XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3522 mutex_exit(&vdp->xdf_cb_lk);
3523 goto errout0;
3524 }
3525
3526 if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3527 cmn_err(CE_WARN, "xdf@%s: start connection failed",
3528 ddi_get_name_addr(dip));
3529 mutex_exit(&vdp->xdf_cb_lk);
3530 goto errout1;
3531 }
3532
3533 /* Nothing else to do for CD devices */
3534 if (dev_iscd) {
3535 mutex_exit(&vdp->xdf_cb_lk);
3536 goto done;
3537 }
3538
3539 /*
3540 * In order to do cmlb_validate, we have to wait for the disk to
3541 * acknowledge the attach, so we can query the backend for the disk
3542 * geometry (see xdf_setstate_connected).
3543 *
3544 * We only wait 30 seconds; if this is the root disk, the boot
3545 * will fail, but it would fail anyway if the device never
3546 * connected. If this is a non-boot disk, that disk will fail
3547 * to connect, but again, it would fail anyway.
3548 */
3549 timeout = ddi_get_lbolt() + drv_usectohz(XDF_STATE_TIMEOUT);
3550 while (vdp->xdf_state != XD_CONNECTED && vdp->xdf_state != XD_READY) {
3551 if (cv_timedwait(&vdp->xdf_dev_cv, &vdp->xdf_cb_lk,
3552 timeout) < 0) {
3553 cmn_err(CE_WARN, "xdf@%s: disk failed to connect",
3554 ddi_get_name_addr(dip));
3555 mutex_exit(&vdp->xdf_cb_lk);
3556 goto errout1;
3557 }
3558 }
3559 mutex_exit(&vdp->xdf_cb_lk);
3560
3561 /*
3562 * We call cmlb_validate so that the geometry information in
3563 * vdp->xdf_vd_lbl is correct; this fills out the number of
3564 * alternate cylinders so that we have a place to write the
3565 * devid.
3566 */
3567 if ((err = cmlb_validate(vdp->xdf_vd_lbl, 0, NULL)) != 0) {
3568 cmn_err(CE_NOTE,
3569 "xdf@%s: cmlb_validate failed: %d",
3570 ddi_get_name_addr(dip), err);
3571 /*
3572 * We can carry on even if cmlb_validate() returns EINVAL here,
3573 * as we'll rewrite the disk label anyway.
3574 */
3575 if (err != EINVAL)
3576 goto errout1;
3577 }
3578
3579 /*
3580 * xdf_devid_setup will only write a devid if one isn't
3581 * already present. If it fails to find or create one, we
3582 * create one in-memory so that when we label the disk later,
3583 * it will have a devid to use. This is helpful to deal with
3584 * cases where people use the devids of their disks before
3585 * labelling them; note that this does cause problems if
3586 * people rely on the devids of unlabelled disks to persist
3587 * across reboot.
3588 */
3589 xdf_devid_setup(vdp);
3590 if (vdp->xdf_tgt_devid == NULL) {
3591 if (ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0, NULL,
3592 &vdp->xdf_tgt_devid) != DDI_SUCCESS) {
3593 cmn_err(CE_WARN,
3594 "xdf@%s_ attach failed, devid_init failed",
3595 ddi_get_name_addr(dip));
3596 goto errout1;
3597 } else {
3598 (void) ddi_devid_register(vdp->xdf_dip,
3599 vdp->xdf_tgt_devid);
3600 }
3601 }
3602
3603 done:
3604 #ifdef XPV_HVM_DRIVER
3605 xdf_hvm_add(dip);
3606
3607 /* Report our version to dom0 */
3608 (void) xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3609 HVMPV_XDF_VERS);
3610 #endif /* XPV_HVM_DRIVER */
3611
3612 /* Create kstat for iostat(8) */
3613 if (xdf_kstat_create(dip) != 0) {
3614 cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3615 ddi_get_name_addr(dip));
3616 goto errout1;
3617 }
3618
3619 /*
3620 * Don't bother with getting real device identification
3621 * strings (is it even possible?); they are unlikely to
3622 * change often (if at all).
3623 */
3624 (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_VENDOR_ID,
3625 "Xen");
3626 (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_PRODUCT_ID,
3627 dev_iscd ? "Virtual CD" : "Virtual disk");
3628 (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_REVISION_ID,
3629 "1.0");
3630
3631 ddi_report_dev(dip);
3632 DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3633 return (DDI_SUCCESS);
3634
3635 errout1:
3636 (void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3637 xvdi_remove_event_handler(dip, XS_OE_STATE);
3638 errout0:
3639 if (vdp->xdf_vd_lbl != NULL) {
3640 cmlb_free_handle(&vdp->xdf_vd_lbl);
3641 vdp->xdf_vd_lbl = NULL;
3642 }
3643 if (vdp->xdf_softintr_id != NULL)
3644 ddi_remove_softintr(vdp->xdf_softintr_id);
3645 xvdi_remove_xb_watch_handlers(dip);
3646 if (vdp->xdf_ready_tq != NULL)
3647 ddi_taskq_destroy(vdp->xdf_ready_tq);
3648 mutex_destroy(&vdp->xdf_cb_lk);
3649 mutex_destroy(&vdp->xdf_dev_lk);
3650 cv_destroy(&vdp->xdf_dev_cv);
3651 cv_destroy(&vdp->xdf_hp_status_cv);
3652 ddi_soft_state_free(xdf_ssp, instance);
3653 ddi_set_driver_private(dip, NULL);
3654 ddi_prop_remove_all(dip);
3655 cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3656 return (DDI_FAILURE);
3657 }
3658
3659 static int
3660 xdf_suspend(dev_info_t *dip)
3661 {
3662 int instance = ddi_get_instance(dip);
3663 xdf_t *vdp;
3664
3665 if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3666 return (DDI_FAILURE);
3667
3668 if (xdf_debug & SUSRES_DBG)
3669 xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3670
3671 xvdi_suspend(dip);
3672
3673 mutex_enter(&vdp->xdf_cb_lk);
3674 mutex_enter(&vdp->xdf_dev_lk);
3675
3676 vdp->xdf_suspending = B_TRUE;
3677 xdf_ring_destroy(vdp);
3678 xdf_set_state(vdp, XD_SUSPEND);
3679 vdp->xdf_suspending = B_FALSE;
3680
3681 mutex_exit(&vdp->xdf_dev_lk);
3682 mutex_exit(&vdp->xdf_cb_lk);
3683
3684 if (xdf_debug & SUSRES_DBG)
3685 xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3686
3687 return (DDI_SUCCESS);
3688 }
3689
3690 static int
3691 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3692 {
3693 xdf_t *vdp;
3694 int instance;
3695
3696 switch (cmd) {
3697
3698 case DDI_PM_SUSPEND:
3699 break;
3700
3701 case DDI_SUSPEND:
3702 return (xdf_suspend(dip));
3703
3704 case DDI_DETACH:
3705 break;
3706
3707 default:
3708 return (DDI_FAILURE);
3709 }
3710
3711 instance = ddi_get_instance(dip);
3712 DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3713 vdp = ddi_get_soft_state(xdf_ssp, instance);
3714
3715 if (vdp == NULL)
3716 return (DDI_FAILURE);
3717
3718 mutex_enter(&vdp->xdf_cb_lk);
3719 xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3720 if (vdp->xdf_state != XD_CLOSED) {
3721 mutex_exit(&vdp->xdf_cb_lk);
3722 return (DDI_FAILURE);
3723 }
3724 mutex_exit(&vdp->xdf_cb_lk);
3725
3726 ASSERT(!ISDMACBON(vdp));
3727
3728 #ifdef XPV_HVM_DRIVER
3729 xdf_hvm_rm(dip);
3730 #endif /* XPV_HVM_DRIVER */
3731
3732 if (vdp->xdf_timeout_id != 0)
3733 (void) untimeout(vdp->xdf_timeout_id);
3734
3735 xvdi_remove_event_handler(dip, XS_OE_STATE);
3736 ddi_taskq_destroy(vdp->xdf_ready_tq);
3737
3738 cmlb_detach(vdp->xdf_vd_lbl, NULL);
3739 cmlb_free_handle(&vdp->xdf_vd_lbl);
3740
3741 /* we'll support backend running in domU later */
3742 #ifdef DOMU_BACKEND
3743 (void) xvdi_post_event(dip, XEN_HP_REMOVE);
3744 #endif
3745
3746 list_destroy(&vdp->xdf_vreq_act);
3747 ddi_prop_remove_all(dip);
3748 xdf_kstat_delete(dip);
3749 ddi_remove_softintr(vdp->xdf_softintr_id);
3750 xvdi_remove_xb_watch_handlers(dip);
3751 ddi_set_driver_private(dip, NULL);
3752 cv_destroy(&vdp->xdf_dev_cv);
3753 mutex_destroy(&vdp->xdf_cb_lk);
3754 mutex_destroy(&vdp->xdf_dev_lk);
3755 if (vdp->xdf_cache_flush_block != NULL)
3756 kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3757 ddi_soft_state_free(xdf_ssp, instance);
3758 return (DDI_SUCCESS);
3759 }
3760
3761 /*
3762 * Driver linkage structures.
3763 */
3764 static struct cb_ops xdf_cbops = {
3765 xdf_open,
3766 xdf_close,
3767 xdf_strategy,
3768 nodev,
3769 xdf_dump,
3770 xdf_read,
3771 xdf_write,
3772 xdf_ioctl,
3773 nodev,
3774 nodev,
3775 nodev,
3776 nochpoll,
3777 xdf_prop_op,
3778 NULL,
3779 D_MP | D_NEW | D_64BIT,
3780 CB_REV,
3781 xdf_aread,
3782 xdf_awrite
3783 };
3784
3785 struct dev_ops xdf_devops = {
3786 DEVO_REV, /* devo_rev */
3787 0, /* devo_refcnt */
3788 xdf_getinfo, /* devo_getinfo */
3789 nulldev, /* devo_identify */
3790 nulldev, /* devo_probe */
3791 xdf_attach, /* devo_attach */
3792 xdf_detach, /* devo_detach */
3793 nodev, /* devo_reset */
3794 &xdf_cbops, /* devo_cb_ops */
3795 NULL, /* devo_bus_ops */
3796 NULL, /* devo_power */
3797 ddi_quiesce_not_supported, /* devo_quiesce */
3798 };
3799
3800 /*
3801 * Module linkage structures.
3802 */
3803 static struct modldrv modldrv = {
3804 &mod_driverops, /* Type of module. This one is a driver */
3805 "virtual block driver", /* short description */
3806 &xdf_devops /* driver specific ops */
3807 };
3808
3809 static struct modlinkage xdf_modlinkage = {
3810 MODREV_1, (void *)&modldrv, NULL
3811 };
3812
3813 /*
3814 * standard module entry points
3815 */
3816 int
3817 _init(void)
3818 {
3819 int rc;
3820
3821 xdf_major = ddi_name_to_major("xdf");
3822 if (xdf_major == (major_t)-1)
3823 return (EINVAL);
3824
3825 if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3826 return (rc);
3827
3828 xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3829 sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3830 xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3831 sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3832
3833 #ifdef XPV_HVM_DRIVER
3834 xdf_hvm_init();
3835 #endif /* XPV_HVM_DRIVER */
3836
3837 if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3838 #ifdef XPV_HVM_DRIVER
3839 xdf_hvm_fini();
3840 #endif /* XPV_HVM_DRIVER */
3841 kmem_cache_destroy(xdf_vreq_cache);
3842 kmem_cache_destroy(xdf_gs_cache);
3843 ddi_soft_state_fini(&xdf_ssp);
3844 return (rc);
3845 }
3846
3847 return (rc);
3848 }
3849
3850 int
3851 _fini(void)
3852 {
3853 int err;
3854 if ((err = mod_remove(&xdf_modlinkage)) != 0)
3855 return (err);
3856
3857 #ifdef XPV_HVM_DRIVER
3858 xdf_hvm_fini();
3859 #endif /* XPV_HVM_DRIVER */
3860
3861 kmem_cache_destroy(xdf_vreq_cache);
3862 kmem_cache_destroy(xdf_gs_cache);
3863 ddi_soft_state_fini(&xdf_ssp);
3864
3865 return (0);
3866 }
3867
3868 int
3869 _info(struct modinfo *modinfop)
3870 {
3871 return (mod_info(&xdf_modlinkage, modinfop));
3872 }
3873