xref: /titanic_50/usr/src/uts/common/io/blkdev/blkdev.c (revision f097ef9c335ca15d17a8caba066fb2da38a3a1be)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2011, 2012 Nexenta Systems, Inc.  All rights reserved.
24  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/ksynch.h>
29 #include <sys/kmem.h>
30 #include <sys/file.h>
31 #include <sys/errno.h>
32 #include <sys/open.h>
33 #include <sys/buf.h>
34 #include <sys/uio.h>
35 #include <sys/aio_req.h>
36 #include <sys/cred.h>
37 #include <sys/modctl.h>
38 #include <sys/cmlb.h>
39 #include <sys/conf.h>
40 #include <sys/devops.h>
41 #include <sys/list.h>
42 #include <sys/sysmacros.h>
43 #include <sys/dkio.h>
44 #include <sys/vtoc.h>
45 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
46 #include <sys/kstat.h>
47 #include <sys/fs/dv_node.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/note.h>
51 #include <sys/blkdev.h>
52 
/*
 * Minor number layout: every instance owns BD_MAXPART consecutive minor
 * numbers, so the instance is the quotient and the partition the remainder
 * of the minor number divided by BD_MAXPART.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
59 
/*
 * Per-instance soft state.  Allocated in bd_attach() and looked up by
 * instance number with ddi_get_soft_state().
 */
struct bd {
	void		*d_private;	/* parent's cookie for d_ops calls */
	dev_info_t	*d_dip;
	kmutex_t	d_ocmutex;	/* held for open/close accounting */
	kmutex_t	d_iomutex;	/* held for queue and kstat updates */
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;
	cmlb_handle_t	d_cmlbh;
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* d_qactive limit (drive info) */
	uint32_t	d_qactive;	/* transfers currently on d_runq */
	uint32_t	d_maxxfer;	/* largest single transfer, bytes */
	uint32_t	d_blkshift;	/* log2 of the block size */
	uint64_t	d_numblks;
	ddi_devid_t	d_devid;

	kmem_cache_t	*d_cache;	/* cache of bd_xfer_impl_t */
	list_t		d_runq;		/* transfers handed to the driver */
	list_t		d_waitq;	/* transfers not yet submitted */
	kstat_t		*d_ksp;		/* NULL if kstat_create() failed */
	kstat_io_t	*d_kiop;	/* d_ksp data, or a scratch copy */

	boolean_t	d_rdonly;
	boolean_t	d_removable;
	boolean_t	d_hotpluggable;
	boolean_t	d_use_dma;	/* parent supplied DMA attributes */

	ddi_dma_attr_t	d_dma;		/* copy of the parent's attributes */
	bd_ops_t	d_ops;		/* copy of the parent's ops vector */
	bd_handle_t	d_handle;
};
95 
/*
 * Handle describing one block device to this driver.  bd_attach() obtains
 * it from the child's parent data and copies h_ops, h_dma and h_private
 * into the soft state.
 */
struct bd_handle {
	bd_ops_t	h_ops;
	ddi_dma_attr_t	*h_dma;		/* NULL means no DMA is used */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;
	bd_t		*h_bd;		/* set by bd_attach() */
	char		*h_name;
	char		h_addr[20];	/* enough for %X,%X */
};
106 
/*
 * Private wrapper around the public transfer structure.  The driver is
 * handed &i_public; everything else is bookkeeping local to this file.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* the part passed to i_func */
	list_node_t	i_linkage;	/* d_waitq / d_runq linkage */
	bd_t		*i_bd;
	buf_t		*i_bp;
	uint_t		i_num_win;	/* number of transfer windows */
	uint_t		i_cur_win;	/* index of the current window */
	off_t		i_offset;
	int		(*i_func)(void *, bd_xfer_t *);
	uint32_t	i_blkshift;	/* d_blkshift at allocation time */
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* starts out as b_bcount */
};
120 
/*
 * Shorthand for reaching members of the embedded public transfer
 * structure through a bd_xfer_impl_t.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
128 
129 
130 /*
131  * Private prototypes.
132  */
133 
/* dev_ops entry points */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* cb_ops entry points */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target ops and internal helpers */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
161 
/* Target operations handed to cmlb_attach() in bd_attach(). */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
167 
/* Character/block entry points; referenced from bd_dev_ops. */
static struct cb_ops bd_cb_ops = {
	bd_open, 		/* open */
	bd_close, 		/* close */
	bd_strategy, 		/* strategy */
	nodev, 			/* print */
	bd_dump,		/* dump */
	bd_read, 		/* read */
	bd_write, 		/* write */
	bd_ioctl, 		/* ioctl */
	nodev, 			/* devmap */
	nodev, 			/* mmap */
	nodev, 			/* segmap */
	nochpoll, 		/* poll */
	bd_prop_op, 		/* cb_prop_op */
	0, 			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
188 
/* Device operations; referenced from the module linkage below. */
struct dev_ops bd_dev_ops = {
	DEVO_REV, 		/* devo_rev, */
	0, 			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev, 		/* identify */
	nulldev, 		/* probe */
	bd_attach, 		/* attach */
	bd_detach,		/* detach */
	nodev, 			/* reset */
	&bd_cb_ops, 		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
203 
/* Loadable-driver description passed to mod_install()/mod_remove(). */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
213 
/* Soft state anchor; one bd_t per attached instance. */
static void *bd_state;
/* Taken as reader by open, close and dump while they use an instance. */
static krwlock_t bd_lock;
216 
217 int
218 _init(void)
219 {
220 	int	rv;
221 
222 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
223 	if (rv != DDI_SUCCESS) {
224 		return (rv);
225 	}
226 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
227 	rv = mod_install(&modlinkage);
228 	if (rv != DDI_SUCCESS) {
229 		rw_destroy(&bd_lock);
230 		ddi_soft_state_fini(&bd_state);
231 	}
232 	return (rv);
233 }
234 
235 int
236 _fini(void)
237 {
238 	int	rv;
239 
240 	rv = mod_remove(&modlinkage);
241 	if (rv == DDI_SUCCESS) {
242 		rw_destroy(&bd_lock);
243 		ddi_soft_state_fini(&bd_state);
244 	}
245 	return (rv);
246 }
247 
248 int
249 _info(struct modinfo *modinfop)
250 {
251 	return (mod_info(&modlinkage, modinfop));
252 }
253 
254 static int
255 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
256 {
257 	bd_t	*bd;
258 	minor_t	inst;
259 
260 	_NOTE(ARGUNUSED(dip));
261 
262 	inst = BDINST((dev_t)arg);
263 
264 	switch (cmd) {
265 	case DDI_INFO_DEVT2DEVINFO:
266 		bd = ddi_get_soft_state(bd_state, inst);
267 		if (bd == NULL) {
268 			return (DDI_FAILURE);
269 		}
270 		*resultp = (void *)bd->d_dip;
271 		break;
272 
273 	case DDI_INFO_DEVT2INSTANCE:
274 		*resultp = (void *)(intptr_t)inst;
275 		break;
276 
277 	default:
278 		return (DDI_FAILURE);
279 	}
280 	return (DDI_SUCCESS);
281 }
282 
283 static int
284 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
285 {
286 	int		inst;
287 	bd_handle_t	hdl;
288 	bd_t		*bd;
289 	bd_drive_t	drive;
290 	int		rv;
291 	char		name[16];
292 	char		kcache[32];
293 
294 	switch (cmd) {
295 	case DDI_ATTACH:
296 		break;
297 	case DDI_RESUME:
298 		/* We don't do anything native for suspend/resume */
299 		return (DDI_SUCCESS);
300 	default:
301 		return (DDI_FAILURE);
302 	}
303 
304 	inst = ddi_get_instance(dip);
305 	hdl = ddi_get_parent_data(dip);
306 
307 	(void) snprintf(name, sizeof (name), "%s%d",
308 	    ddi_driver_name(dip), ddi_get_instance(dip));
309 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
310 
311 	if (hdl == NULL) {
312 		cmn_err(CE_WARN, "%s: missing parent data!", name);
313 		return (DDI_FAILURE);
314 	}
315 
316 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
317 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
318 		return (DDI_FAILURE);
319 	}
320 	bd = ddi_get_soft_state(bd_state, inst);
321 
322 	if (hdl->h_dma) {
323 		bd->d_dma = *(hdl->h_dma);
324 		bd->d_dma.dma_attr_granular =
325 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
326 		bd->d_use_dma = B_TRUE;
327 
328 		if (bd->d_maxxfer &&
329 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
330 			cmn_err(CE_WARN,
331 			    "%s: inconsistent maximum transfer size!",
332 			    name);
333 			/* We force it */
334 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
335 		} else {
336 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
337 		}
338 	} else {
339 		bd->d_use_dma = B_FALSE;
340 		if (bd->d_maxxfer == 0) {
341 			bd->d_maxxfer = 1024 * 1024;
342 		}
343 	}
344 	bd->d_ops = hdl->h_ops;
345 	bd->d_private = hdl->h_private;
346 	bd->d_blkshift = 9;	/* 512 bytes, to start */
347 
348 	if (bd->d_maxxfer % DEV_BSIZE) {
349 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
350 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
351 	}
352 	if (bd->d_maxxfer < DEV_BSIZE) {
353 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
354 		ddi_soft_state_free(bd_state, inst);
355 		return (DDI_FAILURE);
356 	}
357 
358 	bd->d_dip = dip;
359 	bd->d_handle = hdl;
360 	hdl->h_bd = bd;
361 	ddi_set_driver_private(dip, bd);
362 
363 	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
364 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
365 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
366 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
367 
368 	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
369 	    offsetof(struct bd_xfer_impl, i_linkage));
370 	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
371 	    offsetof(struct bd_xfer_impl, i_linkage));
372 
373 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
374 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
375 
376 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
377 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
378 	if (bd->d_ksp != NULL) {
379 		bd->d_ksp->ks_lock = &bd->d_iomutex;
380 		kstat_install(bd->d_ksp);
381 		bd->d_kiop = bd->d_ksp->ks_data;
382 	} else {
383 		/*
384 		 * Even if we cannot create the kstat, we create a
385 		 * scratch kstat.  The reason for this is to ensure
386 		 * that we can update the kstat all of the time,
387 		 * without adding an extra branch instruction.
388 		 */
389 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
390 	}
391 
392 	cmlb_alloc_handle(&bd->d_cmlbh);
393 
394 	bd->d_state = DKIO_NONE;
395 
396 	bzero(&drive, sizeof (drive));
397 	bd->d_ops.o_drive_info(bd->d_private, &drive);
398 	bd->d_qsize = drive.d_qsize;
399 	bd->d_maxxfer = drive.d_maxxfer;
400 	bd->d_removable = drive.d_removable;
401 	bd->d_hotpluggable = drive.d_hotpluggable;
402 
403 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
404 	    bd->d_removable, bd->d_hotpluggable,
405 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
406 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
407 	if (rv != 0) {
408 		cmlb_free_handle(&bd->d_cmlbh);
409 		kmem_cache_destroy(bd->d_cache);
410 		mutex_destroy(&bd->d_iomutex);
411 		mutex_destroy(&bd->d_ocmutex);
412 		mutex_destroy(&bd->d_statemutex);
413 		cv_destroy(&bd->d_statecv);
414 		list_destroy(&bd->d_waitq);
415 		list_destroy(&bd->d_runq);
416 		if (bd->d_ksp != NULL) {
417 			kstat_delete(bd->d_ksp);
418 			bd->d_ksp = NULL;
419 		} else {
420 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
421 		}
422 		ddi_soft_state_free(bd_state, inst);
423 		return (DDI_FAILURE);
424 	}
425 
426 	if (bd->d_ops.o_devid_init != NULL) {
427 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
428 		if (rv == DDI_SUCCESS) {
429 			if (ddi_devid_register(dip, bd->d_devid) !=
430 			    DDI_SUCCESS) {
431 				cmn_err(CE_WARN,
432 				    "%s: unable to register devid", name);
433 			}
434 		}
435 	}
436 
437 	/*
438 	 * Add a zero-length attribute to tell the world we support
439 	 * kernel ioctls (for layered drivers).  Also set up properties
440 	 * used by HAL to identify removable media.
441 	 */
442 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
443 	    DDI_KERNEL_IOCTL, NULL, 0);
444 	if (bd->d_removable) {
445 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
446 		    "removable-media", NULL, 0);
447 	}
448 	if (bd->d_hotpluggable) {
449 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
450 		    "hotpluggable", NULL, 0);
451 	}
452 
453 	ddi_report_dev(dip);
454 
455 	return (DDI_SUCCESS);
456 }
457 
458 static int
459 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
460 {
461 	bd_t	*bd;
462 
463 	bd = ddi_get_driver_private(dip);
464 
465 	switch (cmd) {
466 	case DDI_DETACH:
467 		break;
468 	case DDI_SUSPEND:
469 		/* We don't suspend, but our parent does */
470 		return (DDI_SUCCESS);
471 	default:
472 		return (DDI_FAILURE);
473 	}
474 	if (bd->d_ksp != NULL) {
475 		kstat_delete(bd->d_ksp);
476 		bd->d_ksp = NULL;
477 	} else {
478 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
479 	}
480 	cmlb_detach(bd->d_cmlbh, 0);
481 	cmlb_free_handle(&bd->d_cmlbh);
482 	if (bd->d_devid)
483 		ddi_devid_free(bd->d_devid);
484 	kmem_cache_destroy(bd->d_cache);
485 	mutex_destroy(&bd->d_iomutex);
486 	mutex_destroy(&bd->d_ocmutex);
487 	mutex_destroy(&bd->d_statemutex);
488 	cv_destroy(&bd->d_statecv);
489 	list_destroy(&bd->d_waitq);
490 	list_destroy(&bd->d_runq);
491 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
492 	return (DDI_SUCCESS);
493 }
494 
495 static int
496 bd_xfer_ctor(void *buf, void *arg, int kmflag)
497 {
498 	bd_xfer_impl_t	*xi;
499 	bd_t		*bd = arg;
500 	int		(*dcb)(caddr_t);
501 
502 	if (kmflag == KM_SLEEP) {
503 		dcb = DDI_DMA_SLEEP;
504 	} else {
505 		dcb = DDI_DMA_DONTWAIT;
506 	}
507 
508 	xi = buf;
509 	bzero(xi, sizeof (*xi));
510 	xi->i_bd = bd;
511 
512 	if (bd->d_use_dma) {
513 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
514 		    &xi->i_dmah) != DDI_SUCCESS) {
515 			return (-1);
516 		}
517 	}
518 
519 	return (0);
520 }
521 
522 static void
523 bd_xfer_dtor(void *buf, void *arg)
524 {
525 	bd_xfer_impl_t	*xi = buf;
526 
527 	_NOTE(ARGUNUSED(arg));
528 
529 	if (xi->i_dmah)
530 		ddi_dma_free_handle(&xi->i_dmah);
531 	xi->i_dmah = NULL;
532 }
533 
534 static bd_xfer_impl_t *
535 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
536     int kmflag)
537 {
538 	bd_xfer_impl_t		*xi;
539 	int			rv;
540 	int			status;
541 	unsigned		dir;
542 	int			(*cb)(caddr_t);
543 	size_t			len;
544 	uint32_t		shift;
545 
546 	if (kmflag == KM_SLEEP) {
547 		cb = DDI_DMA_SLEEP;
548 	} else {
549 		cb = DDI_DMA_DONTWAIT;
550 	}
551 
552 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
553 	if (xi == NULL) {
554 		bioerror(bp, ENOMEM);
555 		return (NULL);
556 	}
557 
558 	ASSERT(bp);
559 
560 	xi->i_bp = bp;
561 	xi->i_func = func;
562 	xi->i_blkno = bp->b_lblkno;
563 
564 	if (bp->b_bcount == 0) {
565 		xi->i_len = 0;
566 		xi->i_nblks = 0;
567 		xi->i_kaddr = NULL;
568 		xi->i_resid = 0;
569 		xi->i_num_win = 0;
570 		goto done;
571 	}
572 
573 	if (bp->b_flags & B_READ) {
574 		dir = DDI_DMA_READ;
575 		xi->i_func = bd->d_ops.o_read;
576 	} else {
577 		dir = DDI_DMA_WRITE;
578 		xi->i_func = bd->d_ops.o_write;
579 	}
580 
581 	shift = bd->d_blkshift;
582 	xi->i_blkshift = shift;
583 
584 	if (!bd->d_use_dma) {
585 		bp_mapin(bp);
586 		rv = 0;
587 		xi->i_offset = 0;
588 		xi->i_num_win =
589 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
590 		xi->i_cur_win = 0;
591 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
592 		xi->i_nblks = xi->i_len >> shift;
593 		xi->i_kaddr = bp->b_un.b_addr;
594 		xi->i_resid = bp->b_bcount;
595 	} else {
596 
597 		/*
598 		 * We have to use consistent DMA if the address is misaligned.
599 		 */
600 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
601 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
602 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
603 		} else {
604 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
605 		}
606 
607 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
608 		    NULL, &xi->i_dmac, &xi->i_ndmac);
609 		switch (status) {
610 		case DDI_DMA_MAPPED:
611 			xi->i_num_win = 1;
612 			xi->i_cur_win = 0;
613 			xi->i_offset = 0;
614 			xi->i_len = bp->b_bcount;
615 			xi->i_nblks = xi->i_len >> shift;
616 			xi->i_resid = bp->b_bcount;
617 			rv = 0;
618 			break;
619 		case DDI_DMA_PARTIAL_MAP:
620 			xi->i_cur_win = 0;
621 
622 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
623 			    DDI_SUCCESS) ||
624 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
625 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
626 			    DDI_SUCCESS) ||
627 			    (P2PHASE(len, shift) != 0)) {
628 				(void) ddi_dma_unbind_handle(xi->i_dmah);
629 				rv = EFAULT;
630 				goto done;
631 			}
632 			xi->i_len = len;
633 			xi->i_nblks = xi->i_len >> shift;
634 			xi->i_resid = bp->b_bcount;
635 			rv = 0;
636 			break;
637 		case DDI_DMA_NORESOURCES:
638 			rv = EAGAIN;
639 			goto done;
640 		case DDI_DMA_TOOBIG:
641 			rv = EINVAL;
642 			goto done;
643 		case DDI_DMA_NOMAPPING:
644 		case DDI_DMA_INUSE:
645 		default:
646 			rv = EFAULT;
647 			goto done;
648 		}
649 	}
650 
651 done:
652 	if (rv != 0) {
653 		kmem_cache_free(bd->d_cache, xi);
654 		bioerror(bp, rv);
655 		return (NULL);
656 	}
657 
658 	return (xi);
659 }
660 
661 static void
662 bd_xfer_free(bd_xfer_impl_t *xi)
663 {
664 	if (xi->i_dmah) {
665 		(void) ddi_dma_unbind_handle(xi->i_dmah);
666 	}
667 	kmem_cache_free(xi->i_bd->d_cache, xi);
668 }
669 
670 static int
671 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
672 {
673 	dev_t		dev = *devp;
674 	bd_t		*bd;
675 	minor_t		part;
676 	minor_t		inst;
677 	uint64_t	mask;
678 	boolean_t	ndelay;
679 	int		rv;
680 	diskaddr_t	nblks;
681 	diskaddr_t	lba;
682 
683 	_NOTE(ARGUNUSED(credp));
684 
685 	part = BDPART(dev);
686 	inst = BDINST(dev);
687 
688 	if (otyp >= OTYPCNT)
689 		return (EINVAL);
690 
691 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
692 
693 	/*
694 	 * Block any DR events from changing the set of registered
695 	 * devices while we function.
696 	 */
697 	rw_enter(&bd_lock, RW_READER);
698 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
699 		rw_exit(&bd_lock);
700 		return (ENXIO);
701 	}
702 
703 	mutex_enter(&bd->d_ocmutex);
704 
705 	ASSERT(part < 64);
706 	mask = (1U << part);
707 
708 	bd_update_state(bd);
709 
710 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
711 
712 		/* non-blocking opens are allowed to succeed */
713 		if (!ndelay) {
714 			rv = ENXIO;
715 			goto done;
716 		}
717 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
718 	    NULL, NULL, 0) == 0) {
719 
720 		/*
721 		 * We read the partinfo, verify valid ranges.  If the
722 		 * partition is invalid, and we aren't blocking or
723 		 * doing a raw access, then fail. (Non-blocking and
724 		 * raw accesses can still succeed to allow a disk with
725 		 * bad partition data to opened by format and fdisk.)
726 		 */
727 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
728 			rv = ENXIO;
729 			goto done;
730 		}
731 	} else if (!ndelay) {
732 		/*
733 		 * cmlb_partinfo failed -- invalid partition or no
734 		 * disk label.
735 		 */
736 		rv = ENXIO;
737 		goto done;
738 	}
739 
740 	if ((flag & FWRITE) && bd->d_rdonly) {
741 		rv = EROFS;
742 		goto done;
743 	}
744 
745 	if ((bd->d_open_excl) & (mask)) {
746 		rv = EBUSY;
747 		goto done;
748 	}
749 	if (flag & FEXCL) {
750 		if (bd->d_open_lyr[part]) {
751 			rv = EBUSY;
752 			goto done;
753 		}
754 		for (int i = 0; i < OTYP_LYR; i++) {
755 			if (bd->d_open_reg[i] & mask) {
756 				rv = EBUSY;
757 				goto done;
758 			}
759 		}
760 	}
761 
762 	if (otyp == OTYP_LYR) {
763 		bd->d_open_lyr[part]++;
764 	} else {
765 		bd->d_open_reg[otyp] |= mask;
766 	}
767 	if (flag & FEXCL) {
768 		bd->d_open_excl |= mask;
769 	}
770 
771 	rv = 0;
772 done:
773 	mutex_exit(&bd->d_ocmutex);
774 	rw_exit(&bd_lock);
775 
776 	return (rv);
777 }
778 
779 static int
780 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
781 {
782 	bd_t		*bd;
783 	minor_t		inst;
784 	minor_t		part;
785 	uint64_t	mask;
786 	boolean_t	last = B_TRUE;
787 
788 	_NOTE(ARGUNUSED(flag));
789 	_NOTE(ARGUNUSED(credp));
790 
791 	part = BDPART(dev);
792 	inst = BDINST(dev);
793 
794 	ASSERT(part < 64);
795 	mask = (1U << part);
796 
797 	rw_enter(&bd_lock, RW_READER);
798 
799 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
800 		rw_exit(&bd_lock);
801 		return (ENXIO);
802 	}
803 
804 	mutex_enter(&bd->d_ocmutex);
805 	if (bd->d_open_excl & mask) {
806 		bd->d_open_excl &= ~mask;
807 	}
808 	if (otyp == OTYP_LYR) {
809 		bd->d_open_lyr[part]--;
810 	} else {
811 		bd->d_open_reg[otyp] &= ~mask;
812 	}
813 	for (int i = 0; i < 64; i++) {
814 		if (bd->d_open_lyr[part]) {
815 			last = B_FALSE;
816 		}
817 	}
818 	for (int i = 0; last && (i < OTYP_LYR); i++) {
819 		if (bd->d_open_reg[i]) {
820 			last = B_FALSE;
821 		}
822 	}
823 	mutex_exit(&bd->d_ocmutex);
824 
825 	if (last) {
826 		cmlb_invalidate(bd->d_cmlbh, 0);
827 	}
828 	rw_exit(&bd_lock);
829 
830 	return (0);
831 }
832 
833 static int
834 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
835 {
836 	minor_t		inst;
837 	minor_t		part;
838 	diskaddr_t	pstart;
839 	diskaddr_t	psize;
840 	bd_t		*bd;
841 	bd_xfer_impl_t	*xi;
842 	buf_t		*bp;
843 	int		rv;
844 
845 	rw_enter(&bd_lock, RW_READER);
846 
847 	part = BDPART(dev);
848 	inst = BDINST(dev);
849 
850 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
851 		rw_exit(&bd_lock);
852 		return (ENXIO);
853 	}
854 	/*
855 	 * do cmlb, but do it synchronously unless we already have the
856 	 * partition (which we probably should.)
857 	 */
858 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
859 	    (void *)1)) {
860 		rw_exit(&bd_lock);
861 		return (ENXIO);
862 	}
863 
864 	if ((blkno + nblk) > psize) {
865 		rw_exit(&bd_lock);
866 		return (EINVAL);
867 	}
868 	bp = getrbuf(KM_NOSLEEP);
869 	if (bp == NULL) {
870 		rw_exit(&bd_lock);
871 		return (ENOMEM);
872 	}
873 
874 	bp->b_bcount = nblk << bd->d_blkshift;
875 	bp->b_resid = bp->b_bcount;
876 	bp->b_lblkno = blkno;
877 	bp->b_un.b_addr = caddr;
878 
879 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
880 	if (xi == NULL) {
881 		rw_exit(&bd_lock);
882 		freerbuf(bp);
883 		return (ENOMEM);
884 	}
885 	xi->i_blkno = blkno + pstart;
886 	xi->i_flags = BD_XFER_POLL;
887 	bd_submit(bd, xi);
888 	rw_exit(&bd_lock);
889 
890 	/*
891 	 * Generally, we should have run this entirely synchronously
892 	 * at this point and the biowait call should be a no-op.  If
893 	 * it didn't happen this way, it's a bug in the underlying
894 	 * driver not honoring BD_XFER_POLL.
895 	 */
896 	(void) biowait(bp);
897 	rv = geterror(bp);
898 	freerbuf(bp);
899 	return (rv);
900 }
901 
902 static int
903 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
904 {
905 	_NOTE(ARGUNUSED(credp));
906 	return (physio(bd_strategy, NULL, dev, B_READ, minphys, uio));
907 }
908 
909 static int
910 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
911 {
912 	_NOTE(ARGUNUSED(credp));
913 	return (physio(bd_strategy, NULL, dev, B_WRITE, minphys, uio));
914 }
915 
916 static int
917 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
918 {
919 	_NOTE(ARGUNUSED(credp));
920 	return (aphysio(bd_strategy, anocancel, dev, B_READ, minphys, aio));
921 }
922 
923 static int
924 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
925 {
926 	_NOTE(ARGUNUSED(credp));
927 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, minphys, aio));
928 }
929 
930 static int
931 bd_strategy(struct buf *bp)
932 {
933 	minor_t		inst;
934 	minor_t		part;
935 	bd_t		*bd;
936 	diskaddr_t	p_lba;
937 	diskaddr_t	p_nblks;
938 	diskaddr_t	b_nblks;
939 	bd_xfer_impl_t	*xi;
940 	uint32_t	shift;
941 	int		(*func)(void *, bd_xfer_t *);
942 
943 	part = BDPART(bp->b_edev);
944 	inst = BDINST(bp->b_edev);
945 
946 	ASSERT(bp);
947 
948 	bp->b_resid = bp->b_bcount;
949 
950 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
951 		bioerror(bp, ENXIO);
952 		biodone(bp);
953 		return (0);
954 	}
955 
956 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
957 	    NULL, NULL, 0)) {
958 		bioerror(bp, ENXIO);
959 		biodone(bp);
960 		return (0);
961 	}
962 
963 	shift = bd->d_blkshift;
964 
965 	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
966 	    (bp->b_lblkno > p_nblks)) {
967 		bioerror(bp, ENXIO);
968 		biodone(bp);
969 		return (0);
970 	}
971 	b_nblks = bp->b_bcount >> shift;
972 	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
973 		biodone(bp);
974 		return (0);
975 	}
976 
977 	if ((b_nblks + bp->b_lblkno) > p_nblks) {
978 		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
979 		bp->b_bcount -= bp->b_resid;
980 	} else {
981 		bp->b_resid = 0;
982 	}
983 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
984 
985 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
986 	if (xi == NULL) {
987 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
988 	}
989 	if (xi == NULL) {
990 		/* bd_request_alloc will have done bioerror */
991 		biodone(bp);
992 		return (0);
993 	}
994 	xi->i_blkno = bp->b_lblkno + p_lba;
995 
996 	bd_submit(bd, xi);
997 
998 	return (0);
999 }
1000 
1001 static int
1002 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1003 {
1004 	minor_t		inst;
1005 	uint16_t	part;
1006 	bd_t		*bd;
1007 	void		*ptr = (void *)arg;
1008 	int		rv;
1009 
1010 	part = BDPART(dev);
1011 	inst = BDINST(dev);
1012 
1013 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1014 		return (ENXIO);
1015 	}
1016 
1017 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1018 	if (rv != ENOTTY)
1019 		return (rv);
1020 
1021 	switch (cmd) {
1022 	case DKIOCGMEDIAINFO: {
1023 		struct dk_minfo minfo;
1024 
1025 		/* make sure our state information is current */
1026 		bd_update_state(bd);
1027 		bzero(&minfo, sizeof (minfo));
1028 		minfo.dki_media_type = DK_FIXED_DISK;
1029 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1030 		minfo.dki_capacity = bd->d_numblks;
1031 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag))  {
1032 			return (EFAULT);
1033 		}
1034 		return (0);
1035 	}
1036 	case DKIOCINFO: {
1037 		struct dk_cinfo cinfo;
1038 		bzero(&cinfo, sizeof (cinfo));
1039 		cinfo.dki_ctype = DKC_BLKDEV;
1040 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1041 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1042 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1043 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1044 		    "%s", ddi_driver_name(bd->d_dip));
1045 		cinfo.dki_unit = inst;
1046 		cinfo.dki_flags = DKI_FMTVOL;
1047 		cinfo.dki_partition = part;
1048 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1049 		cinfo.dki_addr = 0;
1050 		cinfo.dki_slave = 0;
1051 		cinfo.dki_space = 0;
1052 		cinfo.dki_prio = 0;
1053 		cinfo.dki_vec = 0;
1054 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag))  {
1055 			return (EFAULT);
1056 		}
1057 		return (0);
1058 	}
1059 	case DKIOCREMOVABLE: {
1060 		int i;
1061 		i = bd->d_removable ? 1 : 0;
1062 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1063 			return (EFAULT);
1064 		}
1065 		return (0);
1066 	}
1067 	case DKIOCHOTPLUGGABLE: {
1068 		int i;
1069 		i = bd->d_hotpluggable ? 1 : 0;
1070 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1071 			return (EFAULT);
1072 		}
1073 		return (0);
1074 	}
1075 	case DKIOCREADONLY: {
1076 		int i;
1077 		i = bd->d_rdonly ? 1 : 0;
1078 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1079 			return (EFAULT);
1080 		}
1081 		return (0);
1082 	}
1083 	case DKIOCSTATE: {
1084 		enum dkio_state	state;
1085 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1086 			return (EFAULT);
1087 		}
1088 		if ((rv = bd_check_state(bd, &state)) != 0) {
1089 			return (rv);
1090 		}
1091 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1092 			return (EFAULT);
1093 		}
1094 		return (0);
1095 	}
1096 	case DKIOCFLUSHWRITECACHE: {
1097 		struct dk_callback *dkc = NULL;
1098 
1099 		if (flag & FKIOCTL)
1100 			dkc = (void *)arg;
1101 
1102 		rv = bd_flush_write_cache(bd, dkc);
1103 		return (rv);
1104 	}
1105 
1106 	default:
1107 		break;
1108 
1109 	}
1110 	return (ENOTTY);
1111 }
1112 
1113 static int
1114 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1115     char *name, caddr_t valuep, int *lengthp)
1116 {
1117 	bd_t	*bd;
1118 
1119 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1120 	if (bd == NULL)
1121 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1122 		    name, valuep, lengthp));
1123 
1124 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1125 	    valuep, lengthp, BDPART(dev), 0));
1126 }
1127 
1128 
1129 static int
1130 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1131     size_t length, void *tg_cookie)
1132 {
1133 	bd_t		*bd;
1134 	buf_t		*bp;
1135 	bd_xfer_impl_t	*xi;
1136 	int		rv;
1137 	int		(*func)(void *, bd_xfer_t *);
1138 	int		kmflag;
1139 
1140 	/*
1141 	 * If we are running in polled mode (such as during dump(9e)
1142 	 * execution), then we cannot sleep for kernel allocations.
1143 	 */
1144 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1145 
1146 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1147 
1148 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1149 		/* We can only transfer whole blocks at a time! */
1150 		return (EINVAL);
1151 	}
1152 
1153 	if ((bp = getrbuf(kmflag)) == NULL) {
1154 		return (ENOMEM);
1155 	}
1156 
1157 	switch (cmd) {
1158 	case TG_READ:
1159 		bp->b_flags = B_READ;
1160 		func = bd->d_ops.o_read;
1161 		break;
1162 	case TG_WRITE:
1163 		bp->b_flags = B_WRITE;
1164 		func = bd->d_ops.o_write;
1165 		break;
1166 	default:
1167 		freerbuf(bp);
1168 		return (EINVAL);
1169 	}
1170 
1171 	bp->b_un.b_addr = bufaddr;
1172 	bp->b_bcount = length;
1173 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1174 	if (xi == NULL) {
1175 		rv = geterror(bp);
1176 		freerbuf(bp);
1177 		return (rv);
1178 	}
1179 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1180 	xi->i_blkno = start;
1181 	bd_submit(bd, xi);
1182 	(void) biowait(bp);
1183 	rv = geterror(bp);
1184 	freerbuf(bp);
1185 
1186 	return (rv);
1187 }
1188 
1189 static int
1190 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1191 {
1192 	bd_t		*bd;
1193 
1194 	_NOTE(ARGUNUSED(tg_cookie));
1195 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1196 
1197 	switch (cmd) {
1198 	case TG_GETPHYGEOM:
1199 	case TG_GETVIRTGEOM:
1200 		/*
1201 		 * We don't have any "geometry" as such, let cmlb
1202 		 * fabricate something.
1203 		 */
1204 		return (ENOTTY);
1205 
1206 	case TG_GETCAPACITY:
1207 		bd_update_state(bd);
1208 		*(diskaddr_t *)arg = bd->d_numblks;
1209 		return (0);
1210 
1211 	case TG_GETBLOCKSIZE:
1212 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1213 		return (0);
1214 
1215 	case TG_GETATTR:
1216 		/*
1217 		 * It turns out that cmlb really doesn't do much for
1218 		 * non-writable media, but lets make the information
1219 		 * available for it in case it does more in the
1220 		 * future.  (The value is currently used for
1221 		 * triggering special behavior for CD-ROMs.)
1222 		 */
1223 		bd_update_state(bd);
1224 		((tg_attribute_t *)arg)->media_is_writable =
1225 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1226 		return (0);
1227 
1228 	default:
1229 		return (EINVAL);
1230 	}
1231 }
1232 
1233 
1234 static void
1235 bd_sched(bd_t *bd)
1236 {
1237 	bd_xfer_impl_t	*xi;
1238 	struct buf	*bp;
1239 	int		rv;
1240 
1241 	mutex_enter(&bd->d_iomutex);
1242 
1243 	while ((bd->d_qactive < bd->d_qsize) &&
1244 	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1245 		bd->d_qactive++;
1246 		kstat_waitq_to_runq(bd->d_kiop);
1247 		list_insert_tail(&bd->d_runq, xi);
1248 
1249 		/*
1250 		 * Submit the job to the driver.  We drop the I/O mutex
1251 		 * so that we can deal with the case where the driver
1252 		 * completion routine calls back into us synchronously.
1253 		 */
1254 
1255 		mutex_exit(&bd->d_iomutex);
1256 
1257 		rv = xi->i_func(bd->d_private, &xi->i_public);
1258 		if (rv != 0) {
1259 			bp = xi->i_bp;
1260 			bd_xfer_free(xi);
1261 			bioerror(bp, rv);
1262 			biodone(bp);
1263 
1264 			mutex_enter(&bd->d_iomutex);
1265 			bd->d_qactive--;
1266 			kstat_runq_exit(bd->d_kiop);
1267 			list_remove(&bd->d_runq, xi);
1268 		} else {
1269 			mutex_enter(&bd->d_iomutex);
1270 		}
1271 	}
1272 
1273 	mutex_exit(&bd->d_iomutex);
1274 }
1275 
1276 static void
1277 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1278 {
1279 	mutex_enter(&bd->d_iomutex);
1280 	list_insert_tail(&bd->d_waitq, xi);
1281 	kstat_waitq_enter(bd->d_kiop);
1282 	mutex_exit(&bd->d_iomutex);
1283 
1284 	bd_sched(bd);
1285 }
1286 
1287 static void
1288 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1289 {
1290 	bd_t	*bd = xi->i_bd;
1291 	buf_t	*bp = xi->i_bp;
1292 
1293 	mutex_enter(&bd->d_iomutex);
1294 	bd->d_qactive--;
1295 	kstat_runq_exit(bd->d_kiop);
1296 	list_remove(&bd->d_runq, xi);
1297 	mutex_exit(&bd->d_iomutex);
1298 
1299 	if (err == 0) {
1300 		if (bp->b_flags & B_READ) {
1301 			bd->d_kiop->reads++;
1302 			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1303 		} else {
1304 			bd->d_kiop->writes++;
1305 			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1306 		}
1307 	}
1308 	bd_sched(bd);
1309 }
1310 
1311 static void
1312 bd_update_state(bd_t *bd)
1313 {
1314 	enum	dkio_state	state;
1315 	bd_media_t		media;
1316 	boolean_t		docmlb = B_FALSE;
1317 
1318 	bzero(&media, sizeof (media));
1319 
1320 	mutex_enter(&bd->d_statemutex);
1321 	if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
1322 		if ((1U << bd->d_blkshift) != media.m_blksize) {
1323 			if ((media.m_blksize < 512) ||
1324 			    (!ISP2(media.m_blksize)) ||
1325 			    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1326 				cmn_err(CE_WARN,
1327 				    "%s%d: Invalid media block size (%d)",
1328 				    ddi_driver_name(bd->d_dip),
1329 				    ddi_get_instance(bd->d_dip),
1330 				    media.m_blksize);
1331 				/*
1332 				 * We can't use the media, treat it as
1333 				 * not present.
1334 				 */
1335 				state = DKIO_EJECTED;
1336 				bd->d_numblks = 0;
1337 			} else {
1338 				bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1339 				bd->d_numblks = media.m_nblks;
1340 				bd->d_rdonly = media.m_readonly;
1341 				state = DKIO_INSERTED;
1342 			}
1343 
1344 			/* Device size changed */
1345 			docmlb = B_TRUE;
1346 
1347 		} else {
1348 			if (bd->d_numblks != media.m_nblks) {
1349 				/* Device size changed */
1350 				docmlb = B_TRUE;
1351 			}
1352 			bd->d_numblks = media.m_nblks;
1353 			bd->d_rdonly = media.m_readonly;
1354 			state = DKIO_INSERTED;
1355 		}
1356 
1357 	} else {
1358 		bd->d_numblks = 0;
1359 		state = DKIO_EJECTED;
1360 	}
1361 	if (state != bd->d_state) {
1362 		bd->d_state = state;
1363 		cv_broadcast(&bd->d_statecv);
1364 		docmlb = B_TRUE;
1365 	}
1366 	mutex_exit(&bd->d_statemutex);
1367 
1368 	if (docmlb) {
1369 		if (state == DKIO_INSERTED) {
1370 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1371 		} else {
1372 			cmlb_invalidate(bd->d_cmlbh, 0);
1373 		}
1374 	}
1375 }
1376 
1377 static int
1378 bd_check_state(bd_t *bd, enum dkio_state *state)
1379 {
1380 	clock_t		when;
1381 
1382 	for (;;) {
1383 
1384 		bd_update_state(bd);
1385 
1386 		mutex_enter(&bd->d_statemutex);
1387 
1388 		if (bd->d_state != *state) {
1389 			*state = bd->d_state;
1390 			mutex_exit(&bd->d_statemutex);
1391 			break;
1392 		}
1393 
1394 		when = drv_usectohz(1000000);
1395 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1396 		    when, TR_CLOCK_TICK) == 0) {
1397 			mutex_exit(&bd->d_statemutex);
1398 			return (EINTR);
1399 		}
1400 
1401 		mutex_exit(&bd->d_statemutex);
1402 	}
1403 
1404 	return (0);
1405 }
1406 
1407 static int
1408 bd_flush_write_cache_done(struct buf *bp)
1409 {
1410 	struct dk_callback *dc = (void *)bp->b_private;
1411 
1412 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1413 	kmem_free(dc, sizeof (*dc));
1414 	freerbuf(bp);
1415 	return (0);
1416 }
1417 
1418 static int
1419 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1420 {
1421 	buf_t			*bp;
1422 	struct dk_callback	*dc;
1423 	bd_xfer_impl_t		*xi;
1424 	int			rv;
1425 
1426 	if (bd->d_ops.o_sync_cache == NULL) {
1427 		return (ENOTSUP);
1428 	}
1429 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1430 		return (ENOMEM);
1431 	}
1432 	bp->b_resid = 0;
1433 	bp->b_bcount = 0;
1434 
1435 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1436 	if (xi == NULL) {
1437 		rv = geterror(bp);
1438 		freerbuf(bp);
1439 		return (rv);
1440 	}
1441 
1442 	/* Make an asynchronous flush, but only if there is a callback */
1443 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1444 		/* Make a private copy of the callback structure */
1445 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1446 		*dc = *dkc;
1447 		bp->b_private = dc;
1448 		bp->b_iodone = bd_flush_write_cache_done;
1449 
1450 		bd_submit(bd, xi);
1451 		return (0);
1452 	}
1453 
1454 	/* In case there is no callback, perform a synchronous flush */
1455 	bd_submit(bd, xi);
1456 	(void) biowait(bp);
1457 	rv = geterror(bp);
1458 	freerbuf(bp);
1459 
1460 	return (rv);
1461 }
1462 
1463 /*
1464  * Nexus support.
1465  */
1466 int
1467 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1468     void *arg, void *result)
1469 {
1470 	bd_handle_t	hdl;
1471 
1472 	switch (ctlop) {
1473 	case DDI_CTLOPS_REPORTDEV:
1474 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1475 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1476 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1477 		return (DDI_SUCCESS);
1478 
1479 	case DDI_CTLOPS_INITCHILD:
1480 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1481 		if (hdl == NULL) {
1482 			return (DDI_NOT_WELL_FORMED);
1483 		}
1484 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1485 		return (DDI_SUCCESS);
1486 
1487 	case DDI_CTLOPS_UNINITCHILD:
1488 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1489 		ndi_prop_remove_all((dev_info_t *)arg);
1490 		return (DDI_SUCCESS);
1491 
1492 	default:
1493 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1494 	}
1495 }
1496 
1497 /*
1498  * Functions for device drivers.
1499  */
1500 bd_handle_t
1501 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1502 {
1503 	bd_handle_t	hdl;
1504 
1505 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1506 	if (hdl != NULL) {
1507 		hdl->h_ops = *ops;
1508 		hdl->h_dma = dma;
1509 		hdl->h_private = private;
1510 	}
1511 
1512 	return (hdl);
1513 }
1514 
1515 void
1516 bd_free_handle(bd_handle_t hdl)
1517 {
1518 	kmem_free(hdl, sizeof (*hdl));
1519 }
1520 
1521 int
1522 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1523 {
1524 	dev_info_t	*child;
1525 	bd_drive_t	drive;
1526 
1527 	/* if drivers don't override this, make it assume none */
1528 	drive.d_lun = -1;
1529 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1530 
1531 	hdl->h_parent = dip;
1532 	hdl->h_name = "blkdev";
1533 
1534 	if (drive.d_lun >= 0) {
1535 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1536 		    drive.d_target, drive.d_lun);
1537 	} else {
1538 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1539 		    drive.d_target);
1540 	}
1541 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1542 	    &child) != NDI_SUCCESS) {
1543 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1544 		    ddi_driver_name(dip), ddi_get_instance(dip),
1545 		    "blkdev", hdl->h_addr);
1546 		return (DDI_FAILURE);
1547 	}
1548 
1549 	ddi_set_parent_data(child, hdl);
1550 	hdl->h_child = child;
1551 
1552 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1553 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1554 		    ddi_driver_name(dip), ddi_get_instance(dip),
1555 		    hdl->h_name, hdl->h_addr);
1556 		(void) ndi_devi_free(child);
1557 		return (DDI_FAILURE);
1558 	}
1559 
1560 	return (DDI_SUCCESS);
1561 }
1562 
1563 int
1564 bd_detach_handle(bd_handle_t hdl)
1565 {
1566 	int	circ;
1567 	int	rv;
1568 	char	*devnm;
1569 
1570 	if (hdl->h_child == NULL) {
1571 		return (DDI_SUCCESS);
1572 	}
1573 	ndi_devi_enter(hdl->h_parent, &circ);
1574 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1575 		rv = ddi_remove_child(hdl->h_child, 0);
1576 	} else {
1577 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1578 		(void) ddi_deviname(hdl->h_child, devnm);
1579 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1580 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1581 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1582 		kmem_free(devnm, MAXNAMELEN + 1);
1583 	}
1584 	if (rv == 0) {
1585 		hdl->h_child = NULL;
1586 	}
1587 
1588 	ndi_devi_exit(hdl->h_parent, circ);
1589 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1590 }
1591 
1592 void
1593 bd_xfer_done(bd_xfer_t *xfer, int err)
1594 {
1595 	bd_xfer_impl_t	*xi = (void *)xfer;
1596 	buf_t		*bp = xi->i_bp;
1597 	int		rv;
1598 	bd_t		*bd = xi->i_bd;
1599 	size_t		len;
1600 
1601 	if (err != 0) {
1602 		bd_runq_exit(xi, err);
1603 
1604 		bp->b_resid += xi->i_resid;
1605 		bd_xfer_free(xi);
1606 		bioerror(bp, err);
1607 		biodone(bp);
1608 		return;
1609 	}
1610 
1611 	xi->i_cur_win++;
1612 	xi->i_resid -= xi->i_len;
1613 
1614 	if (xi->i_resid == 0) {
1615 		/* Job completed succcessfully! */
1616 		bd_runq_exit(xi, 0);
1617 
1618 		bd_xfer_free(xi);
1619 		biodone(bp);
1620 		return;
1621 	}
1622 
1623 	xi->i_blkno += xi->i_nblks;
1624 
1625 	if (bd->d_use_dma) {
1626 		/* More transfer still pending... advance to next DMA window. */
1627 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1628 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1629 	} else {
1630 		/* Advance memory window. */
1631 		xi->i_kaddr += xi->i_len;
1632 		xi->i_offset += xi->i_len;
1633 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1634 	}
1635 
1636 
1637 	if ((rv != DDI_SUCCESS) ||
1638 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1639 		bd_runq_exit(xi, EFAULT);
1640 
1641 		bp->b_resid += xi->i_resid;
1642 		bd_xfer_free(xi);
1643 		bioerror(bp, EFAULT);
1644 		biodone(bp);
1645 		return;
1646 	}
1647 	xi->i_len = len;
1648 	xi->i_nblks = len >> xi->i_blkshift;
1649 
1650 	/* Submit next window to hardware. */
1651 	rv = xi->i_func(bd->d_private, &xi->i_public);
1652 	if (rv != 0) {
1653 		bd_runq_exit(xi, rv);
1654 
1655 		bp->b_resid += xi->i_resid;
1656 		bd_xfer_free(xi);
1657 		bioerror(bp, rv);
1658 		biodone(bp);
1659 	}
1660 }
1661 
1662 void
1663 bd_state_change(bd_handle_t hdl)
1664 {
1665 	bd_t		*bd;
1666 
1667 	if ((bd = hdl->h_bd) != NULL) {
1668 		bd_update_state(bd);
1669 	}
1670 }
1671 
1672 void
1673 bd_mod_init(struct dev_ops *devops)
1674 {
1675 	static struct bus_ops bd_bus_ops = {
1676 		BUSO_REV,		/* busops_rev */
1677 		nullbusmap,		/* bus_map */
1678 		NULL,			/* bus_get_intrspec (OBSOLETE) */
1679 		NULL,			/* bus_add_intrspec (OBSOLETE) */
1680 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
1681 		i_ddi_map_fault,	/* bus_map_fault */
1682 		NULL,			/* bus_dma_map (OBSOLETE) */
1683 		ddi_dma_allochdl,	/* bus_dma_allochdl */
1684 		ddi_dma_freehdl,	/* bus_dma_freehdl */
1685 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
1686 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
1687 		ddi_dma_flush,		/* bus_dma_flush */
1688 		ddi_dma_win,		/* bus_dma_win */
1689 		ddi_dma_mctl,		/* bus_dma_ctl */
1690 		bd_bus_ctl,		/* bus_ctl */
1691 		ddi_bus_prop_op,	/* bus_prop_op */
1692 		NULL,			/* bus_get_eventcookie */
1693 		NULL,			/* bus_add_eventcall */
1694 		NULL,			/* bus_remove_eventcall */
1695 		NULL,			/* bus_post_event */
1696 		NULL,			/* bus_intr_ctl (OBSOLETE) */
1697 		NULL,			/* bus_config */
1698 		NULL,			/* bus_unconfig */
1699 		NULL,			/* bus_fm_init */
1700 		NULL,			/* bus_fm_fini */
1701 		NULL,			/* bus_fm_access_enter */
1702 		NULL,			/* bus_fm_access_exit */
1703 		NULL,			/* bus_power */
1704 		NULL,			/* bus_intr_op */
1705 	};
1706 
1707 	devops->devo_bus_ops = &bd_bus_ops;
1708 
1709 	/*
1710 	 * NB: The device driver is free to supply its own
1711 	 * character entry device support.
1712 	 */
1713 }
1714 
1715 void
1716 bd_mod_fini(struct dev_ops *devops)
1717 {
1718 	devops->devo_bus_ops = NULL;
1719 }
1720