xref: /titanic_50/usr/src/uts/common/io/blkdev/blkdev.c (revision 0c6eaab480b44a0c790ad94e7cb6084792411de9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/ksynch.h>
30 #include <sys/kmem.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/open.h>
34 #include <sys/buf.h>
35 #include <sys/uio.h>
36 #include <sys/aio_req.h>
37 #include <sys/cred.h>
38 #include <sys/modctl.h>
39 #include <sys/cmlb.h>
40 #include <sys/conf.h>
41 #include <sys/devops.h>
42 #include <sys/list.h>
43 #include <sys/sysmacros.h>
44 #include <sys/dkio.h>
45 #include <sys/vtoc.h>
46 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
47 #include <sys/kstat.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/note.h>
52 #include <sys/blkdev.h>
53 
54 #define	BD_MAXPART	64
55 #define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
56 #define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
57 
58 typedef struct bd bd_t;
59 typedef struct bd_xfer_impl bd_xfer_impl_t;
60 
61 struct bd {
62 	void		*d_private;
63 	dev_info_t	*d_dip;
64 	kmutex_t	d_ocmutex;
65 	kmutex_t	d_iomutex;
66 	kmutex_t	d_statemutex;
67 	kcondvar_t	d_statecv;
68 	enum dkio_state	d_state;
69 	cmlb_handle_t	d_cmlbh;
70 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
71 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
72 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
73 
74 	uint32_t	d_qsize;
75 	uint32_t	d_qactive;
76 	uint32_t	d_maxxfer;
77 	uint32_t	d_blkshift;
78 	uint32_t	d_pblkshift;
79 	uint64_t	d_numblks;
80 	ddi_devid_t	d_devid;
81 
82 	kmem_cache_t	*d_cache;
83 	list_t		d_runq;
84 	list_t		d_waitq;
85 	kstat_t		*d_ksp;
86 	kstat_io_t	*d_kiop;
87 
88 	boolean_t	d_rdonly;
89 	boolean_t	d_ssd;
90 	boolean_t	d_removable;
91 	boolean_t	d_hotpluggable;
92 	boolean_t	d_use_dma;
93 
94 	ddi_dma_attr_t	d_dma;
95 	bd_ops_t	d_ops;
96 	bd_handle_t	d_handle;
97 };
98 
99 struct bd_handle {
100 	bd_ops_t	h_ops;
101 	ddi_dma_attr_t	*h_dma;
102 	dev_info_t	*h_parent;
103 	dev_info_t	*h_child;
104 	void		*h_private;
105 	bd_t		*h_bd;
106 	char		*h_name;
107 	char		h_addr[20];	/* enough for %X,%X */
108 };
109 
110 struct bd_xfer_impl {
111 	bd_xfer_t	i_public;
112 	list_node_t	i_linkage;
113 	bd_t		*i_bd;
114 	buf_t		*i_bp;
115 	uint_t		i_num_win;
116 	uint_t		i_cur_win;
117 	off_t		i_offset;
118 	int		(*i_func)(void *, bd_xfer_t *);
119 	uint32_t	i_blkshift;
120 	size_t		i_len;
121 	size_t		i_resid;
122 };
123 
124 #define	i_dmah		i_public.x_dmah
125 #define	i_dmac		i_public.x_dmac
126 #define	i_ndmac		i_public.x_ndmac
127 #define	i_kaddr		i_public.x_kaddr
128 #define	i_nblks		i_public.x_nblks
129 #define	i_blkno		i_public.x_blkno
130 #define	i_flags		i_public.x_flags
131 
132 
133 /*
134  * Private prototypes.
135  */
136 
137 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
138 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
139 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
140 
141 static int bd_open(dev_t *, int, int, cred_t *);
142 static int bd_close(dev_t, int, int, cred_t *);
143 static int bd_strategy(struct buf *);
144 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
145 static int bd_dump(dev_t, caddr_t, daddr_t, int);
146 static int bd_read(dev_t, struct uio *, cred_t *);
147 static int bd_write(dev_t, struct uio *, cred_t *);
148 static int bd_aread(dev_t, struct aio_req *, cred_t *);
149 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
150 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
151     caddr_t, int *);
152 
153 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
154     void *);
155 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
156 static int bd_xfer_ctor(void *, void *, int);
157 static void bd_xfer_dtor(void *, void *);
158 static void bd_sched(bd_t *);
159 static void bd_submit(bd_t *, bd_xfer_impl_t *);
160 static void bd_runq_exit(bd_xfer_impl_t *, int);
161 static void bd_update_state(bd_t *);
162 static int bd_check_state(bd_t *, enum dkio_state *);
163 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
164 
165 struct cmlb_tg_ops bd_tg_ops = {
166 	TG_DK_OPS_VERSION_1,
167 	bd_tg_rdwr,
168 	bd_tg_getinfo,
169 };
170 
171 static struct cb_ops bd_cb_ops = {
172 	bd_open, 		/* open */
173 	bd_close, 		/* close */
174 	bd_strategy, 		/* strategy */
175 	nodev, 			/* print */
176 	bd_dump,		/* dump */
177 	bd_read, 		/* read */
178 	bd_write, 		/* write */
179 	bd_ioctl, 		/* ioctl */
180 	nodev, 			/* devmap */
181 	nodev, 			/* mmap */
182 	nodev, 			/* segmap */
183 	nochpoll, 		/* poll */
184 	bd_prop_op, 		/* cb_prop_op */
185 	0, 			/* streamtab  */
186 	D_64BIT | D_MP,		/* Driver comaptibility flag */
187 	CB_REV,			/* cb_rev */
188 	bd_aread,		/* async read */
189 	bd_awrite		/* async write */
190 };
191 
192 struct dev_ops bd_dev_ops = {
193 	DEVO_REV, 		/* devo_rev, */
194 	0, 			/* refcnt  */
195 	bd_getinfo,		/* getinfo */
196 	nulldev, 		/* identify */
197 	nulldev, 		/* probe */
198 	bd_attach, 		/* attach */
199 	bd_detach,		/* detach */
200 	nodev, 			/* reset */
201 	&bd_cb_ops, 		/* driver operations */
202 	NULL,			/* bus operations */
203 	NULL,			/* power */
204 	ddi_quiesce_not_needed,	/* quiesce */
205 };
206 
207 static struct modldrv modldrv = {
208 	&mod_driverops,
209 	"Generic Block Device",
210 	&bd_dev_ops,
211 };
212 
213 static struct modlinkage modlinkage = {
214 	MODREV_1, { &modldrv, NULL }
215 };
216 
217 static void *bd_state;
218 static krwlock_t bd_lock;
219 
220 int
221 _init(void)
222 {
223 	int	rv;
224 
225 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
226 	if (rv != DDI_SUCCESS) {
227 		return (rv);
228 	}
229 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
230 	rv = mod_install(&modlinkage);
231 	if (rv != DDI_SUCCESS) {
232 		rw_destroy(&bd_lock);
233 		ddi_soft_state_fini(&bd_state);
234 	}
235 	return (rv);
236 }
237 
238 int
239 _fini(void)
240 {
241 	int	rv;
242 
243 	rv = mod_remove(&modlinkage);
244 	if (rv == DDI_SUCCESS) {
245 		rw_destroy(&bd_lock);
246 		ddi_soft_state_fini(&bd_state);
247 	}
248 	return (rv);
249 }
250 
251 int
252 _info(struct modinfo *modinfop)
253 {
254 	return (mod_info(&modlinkage, modinfop));
255 }
256 
257 static int
258 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
259 {
260 	bd_t	*bd;
261 	minor_t	inst;
262 
263 	_NOTE(ARGUNUSED(dip));
264 
265 	inst = BDINST((dev_t)arg);
266 
267 	switch (cmd) {
268 	case DDI_INFO_DEVT2DEVINFO:
269 		bd = ddi_get_soft_state(bd_state, inst);
270 		if (bd == NULL) {
271 			return (DDI_FAILURE);
272 		}
273 		*resultp = (void *)bd->d_dip;
274 		break;
275 
276 	case DDI_INFO_DEVT2INSTANCE:
277 		*resultp = (void *)(intptr_t)inst;
278 		break;
279 
280 	default:
281 		return (DDI_FAILURE);
282 	}
283 	return (DDI_SUCCESS);
284 }
285 
286 static int
287 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
288 {
289 	int		inst;
290 	bd_handle_t	hdl;
291 	bd_t		*bd;
292 	bd_drive_t	drive;
293 	int		rv;
294 	char		name[16];
295 	char		kcache[32];
296 
297 	switch (cmd) {
298 	case DDI_ATTACH:
299 		break;
300 	case DDI_RESUME:
301 		/* We don't do anything native for suspend/resume */
302 		return (DDI_SUCCESS);
303 	default:
304 		return (DDI_FAILURE);
305 	}
306 
307 	inst = ddi_get_instance(dip);
308 	hdl = ddi_get_parent_data(dip);
309 
310 	(void) snprintf(name, sizeof (name), "%s%d",
311 	    ddi_driver_name(dip), ddi_get_instance(dip));
312 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
313 
314 	if (hdl == NULL) {
315 		cmn_err(CE_WARN, "%s: missing parent data!", name);
316 		return (DDI_FAILURE);
317 	}
318 
319 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
320 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
321 		return (DDI_FAILURE);
322 	}
323 	bd = ddi_get_soft_state(bd_state, inst);
324 
325 	if (hdl->h_dma) {
326 		bd->d_dma = *(hdl->h_dma);
327 		bd->d_dma.dma_attr_granular =
328 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
329 		bd->d_use_dma = B_TRUE;
330 
331 		if (bd->d_maxxfer &&
332 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
333 			cmn_err(CE_WARN,
334 			    "%s: inconsistent maximum transfer size!",
335 			    name);
336 			/* We force it */
337 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
338 		} else {
339 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
340 		}
341 	} else {
342 		bd->d_use_dma = B_FALSE;
343 		if (bd->d_maxxfer == 0) {
344 			bd->d_maxxfer = 1024 * 1024;
345 		}
346 	}
347 	bd->d_ops = hdl->h_ops;
348 	bd->d_private = hdl->h_private;
349 	bd->d_blkshift = 9;	/* 512 bytes, to start */
350 
351 	if (bd->d_maxxfer % DEV_BSIZE) {
352 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
353 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
354 	}
355 	if (bd->d_maxxfer < DEV_BSIZE) {
356 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
357 		ddi_soft_state_free(bd_state, inst);
358 		return (DDI_FAILURE);
359 	}
360 
361 	bd->d_dip = dip;
362 	bd->d_handle = hdl;
363 	hdl->h_bd = bd;
364 	ddi_set_driver_private(dip, bd);
365 
366 	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
367 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
368 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
369 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
370 
371 	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
372 	    offsetof(struct bd_xfer_impl, i_linkage));
373 	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
374 	    offsetof(struct bd_xfer_impl, i_linkage));
375 
376 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
377 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
378 
379 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
380 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
381 	if (bd->d_ksp != NULL) {
382 		bd->d_ksp->ks_lock = &bd->d_iomutex;
383 		kstat_install(bd->d_ksp);
384 		bd->d_kiop = bd->d_ksp->ks_data;
385 	} else {
386 		/*
387 		 * Even if we cannot create the kstat, we create a
388 		 * scratch kstat.  The reason for this is to ensure
389 		 * that we can update the kstat all of the time,
390 		 * without adding an extra branch instruction.
391 		 */
392 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
393 	}
394 
395 	cmlb_alloc_handle(&bd->d_cmlbh);
396 
397 	bd->d_state = DKIO_NONE;
398 
399 	bzero(&drive, sizeof (drive));
400 	bd->d_ops.o_drive_info(bd->d_private, &drive);
401 	bd->d_qsize = drive.d_qsize;
402 	bd->d_removable = drive.d_removable;
403 	bd->d_hotpluggable = drive.d_hotpluggable;
404 
405 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
406 		bd->d_maxxfer = drive.d_maxxfer;
407 
408 
409 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
410 	    bd->d_removable, bd->d_hotpluggable,
411 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
412 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
413 	if (rv != 0) {
414 		cmlb_free_handle(&bd->d_cmlbh);
415 		kmem_cache_destroy(bd->d_cache);
416 		mutex_destroy(&bd->d_iomutex);
417 		mutex_destroy(&bd->d_ocmutex);
418 		mutex_destroy(&bd->d_statemutex);
419 		cv_destroy(&bd->d_statecv);
420 		list_destroy(&bd->d_waitq);
421 		list_destroy(&bd->d_runq);
422 		if (bd->d_ksp != NULL) {
423 			kstat_delete(bd->d_ksp);
424 			bd->d_ksp = NULL;
425 		} else {
426 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
427 		}
428 		ddi_soft_state_free(bd_state, inst);
429 		return (DDI_FAILURE);
430 	}
431 
432 	if (bd->d_ops.o_devid_init != NULL) {
433 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
434 		if (rv == DDI_SUCCESS) {
435 			if (ddi_devid_register(dip, bd->d_devid) !=
436 			    DDI_SUCCESS) {
437 				cmn_err(CE_WARN,
438 				    "%s: unable to register devid", name);
439 			}
440 		}
441 	}
442 
443 	/*
444 	 * Add a zero-length attribute to tell the world we support
445 	 * kernel ioctls (for layered drivers).  Also set up properties
446 	 * used by HAL to identify removable media.
447 	 */
448 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
449 	    DDI_KERNEL_IOCTL, NULL, 0);
450 	if (bd->d_removable) {
451 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
452 		    "removable-media", NULL, 0);
453 	}
454 	if (bd->d_hotpluggable) {
455 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
456 		    "hotpluggable", NULL, 0);
457 	}
458 
459 	ddi_report_dev(dip);
460 
461 	return (DDI_SUCCESS);
462 }
463 
464 static int
465 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
466 {
467 	bd_t	*bd;
468 
469 	bd = ddi_get_driver_private(dip);
470 
471 	switch (cmd) {
472 	case DDI_DETACH:
473 		break;
474 	case DDI_SUSPEND:
475 		/* We don't suspend, but our parent does */
476 		return (DDI_SUCCESS);
477 	default:
478 		return (DDI_FAILURE);
479 	}
480 	if (bd->d_ksp != NULL) {
481 		kstat_delete(bd->d_ksp);
482 		bd->d_ksp = NULL;
483 	} else {
484 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
485 	}
486 	cmlb_detach(bd->d_cmlbh, 0);
487 	cmlb_free_handle(&bd->d_cmlbh);
488 	if (bd->d_devid)
489 		ddi_devid_free(bd->d_devid);
490 	kmem_cache_destroy(bd->d_cache);
491 	mutex_destroy(&bd->d_iomutex);
492 	mutex_destroy(&bd->d_ocmutex);
493 	mutex_destroy(&bd->d_statemutex);
494 	cv_destroy(&bd->d_statecv);
495 	list_destroy(&bd->d_waitq);
496 	list_destroy(&bd->d_runq);
497 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
498 	return (DDI_SUCCESS);
499 }
500 
501 static int
502 bd_xfer_ctor(void *buf, void *arg, int kmflag)
503 {
504 	bd_xfer_impl_t	*xi;
505 	bd_t		*bd = arg;
506 	int		(*dcb)(caddr_t);
507 
508 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
509 		dcb = DDI_DMA_SLEEP;
510 	} else {
511 		dcb = DDI_DMA_DONTWAIT;
512 	}
513 
514 	xi = buf;
515 	bzero(xi, sizeof (*xi));
516 	xi->i_bd = bd;
517 
518 	if (bd->d_use_dma) {
519 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
520 		    &xi->i_dmah) != DDI_SUCCESS) {
521 			return (-1);
522 		}
523 	}
524 
525 	return (0);
526 }
527 
528 static void
529 bd_xfer_dtor(void *buf, void *arg)
530 {
531 	bd_xfer_impl_t	*xi = buf;
532 
533 	_NOTE(ARGUNUSED(arg));
534 
535 	if (xi->i_dmah)
536 		ddi_dma_free_handle(&xi->i_dmah);
537 	xi->i_dmah = NULL;
538 }
539 
540 static bd_xfer_impl_t *
541 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
542     int kmflag)
543 {
544 	bd_xfer_impl_t		*xi;
545 	int			rv = 0;
546 	int			status;
547 	unsigned		dir;
548 	int			(*cb)(caddr_t);
549 	size_t			len;
550 	uint32_t		shift;
551 
552 	if (kmflag == KM_SLEEP) {
553 		cb = DDI_DMA_SLEEP;
554 	} else {
555 		cb = DDI_DMA_DONTWAIT;
556 	}
557 
558 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
559 	if (xi == NULL) {
560 		bioerror(bp, ENOMEM);
561 		return (NULL);
562 	}
563 
564 	ASSERT(bp);
565 
566 	xi->i_bp = bp;
567 	xi->i_func = func;
568 	xi->i_blkno = bp->b_lblkno;
569 
570 	if (bp->b_bcount == 0) {
571 		xi->i_len = 0;
572 		xi->i_nblks = 0;
573 		xi->i_kaddr = NULL;
574 		xi->i_resid = 0;
575 		xi->i_num_win = 0;
576 		goto done;
577 	}
578 
579 	if (bp->b_flags & B_READ) {
580 		dir = DDI_DMA_READ;
581 		xi->i_func = bd->d_ops.o_read;
582 	} else {
583 		dir = DDI_DMA_WRITE;
584 		xi->i_func = bd->d_ops.o_write;
585 	}
586 
587 	shift = bd->d_blkshift;
588 	xi->i_blkshift = shift;
589 
590 	if (!bd->d_use_dma) {
591 		bp_mapin(bp);
592 		rv = 0;
593 		xi->i_offset = 0;
594 		xi->i_num_win =
595 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
596 		xi->i_cur_win = 0;
597 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
598 		xi->i_nblks = xi->i_len >> shift;
599 		xi->i_kaddr = bp->b_un.b_addr;
600 		xi->i_resid = bp->b_bcount;
601 	} else {
602 
603 		/*
604 		 * We have to use consistent DMA if the address is misaligned.
605 		 */
606 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
607 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
608 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
609 		} else {
610 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
611 		}
612 
613 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
614 		    NULL, &xi->i_dmac, &xi->i_ndmac);
615 		switch (status) {
616 		case DDI_DMA_MAPPED:
617 			xi->i_num_win = 1;
618 			xi->i_cur_win = 0;
619 			xi->i_offset = 0;
620 			xi->i_len = bp->b_bcount;
621 			xi->i_nblks = xi->i_len >> shift;
622 			xi->i_resid = bp->b_bcount;
623 			rv = 0;
624 			break;
625 		case DDI_DMA_PARTIAL_MAP:
626 			xi->i_cur_win = 0;
627 
628 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
629 			    DDI_SUCCESS) ||
630 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
631 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
632 			    DDI_SUCCESS) ||
633 			    (P2PHASE(len, shift) != 0)) {
634 				(void) ddi_dma_unbind_handle(xi->i_dmah);
635 				rv = EFAULT;
636 				goto done;
637 			}
638 			xi->i_len = len;
639 			xi->i_nblks = xi->i_len >> shift;
640 			xi->i_resid = bp->b_bcount;
641 			rv = 0;
642 			break;
643 		case DDI_DMA_NORESOURCES:
644 			rv = EAGAIN;
645 			goto done;
646 		case DDI_DMA_TOOBIG:
647 			rv = EINVAL;
648 			goto done;
649 		case DDI_DMA_NOMAPPING:
650 		case DDI_DMA_INUSE:
651 		default:
652 			rv = EFAULT;
653 			goto done;
654 		}
655 	}
656 
657 done:
658 	if (rv != 0) {
659 		kmem_cache_free(bd->d_cache, xi);
660 		bioerror(bp, rv);
661 		return (NULL);
662 	}
663 
664 	return (xi);
665 }
666 
667 static void
668 bd_xfer_free(bd_xfer_impl_t *xi)
669 {
670 	if (xi->i_dmah) {
671 		(void) ddi_dma_unbind_handle(xi->i_dmah);
672 	}
673 	kmem_cache_free(xi->i_bd->d_cache, xi);
674 }
675 
676 static int
677 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
678 {
679 	dev_t		dev = *devp;
680 	bd_t		*bd;
681 	minor_t		part;
682 	minor_t		inst;
683 	uint64_t	mask;
684 	boolean_t	ndelay;
685 	int		rv;
686 	diskaddr_t	nblks;
687 	diskaddr_t	lba;
688 
689 	_NOTE(ARGUNUSED(credp));
690 
691 	part = BDPART(dev);
692 	inst = BDINST(dev);
693 
694 	if (otyp >= OTYPCNT)
695 		return (EINVAL);
696 
697 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
698 
699 	/*
700 	 * Block any DR events from changing the set of registered
701 	 * devices while we function.
702 	 */
703 	rw_enter(&bd_lock, RW_READER);
704 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
705 		rw_exit(&bd_lock);
706 		return (ENXIO);
707 	}
708 
709 	mutex_enter(&bd->d_ocmutex);
710 
711 	ASSERT(part < 64);
712 	mask = (1U << part);
713 
714 	bd_update_state(bd);
715 
716 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
717 
718 		/* non-blocking opens are allowed to succeed */
719 		if (!ndelay) {
720 			rv = ENXIO;
721 			goto done;
722 		}
723 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
724 	    NULL, NULL, 0) == 0) {
725 
726 		/*
727 		 * We read the partinfo, verify valid ranges.  If the
728 		 * partition is invalid, and we aren't blocking or
729 		 * doing a raw access, then fail. (Non-blocking and
730 		 * raw accesses can still succeed to allow a disk with
731 		 * bad partition data to opened by format and fdisk.)
732 		 */
733 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
734 			rv = ENXIO;
735 			goto done;
736 		}
737 	} else if (!ndelay) {
738 		/*
739 		 * cmlb_partinfo failed -- invalid partition or no
740 		 * disk label.
741 		 */
742 		rv = ENXIO;
743 		goto done;
744 	}
745 
746 	if ((flag & FWRITE) && bd->d_rdonly) {
747 		rv = EROFS;
748 		goto done;
749 	}
750 
751 	if ((bd->d_open_excl) & (mask)) {
752 		rv = EBUSY;
753 		goto done;
754 	}
755 	if (flag & FEXCL) {
756 		if (bd->d_open_lyr[part]) {
757 			rv = EBUSY;
758 			goto done;
759 		}
760 		for (int i = 0; i < OTYP_LYR; i++) {
761 			if (bd->d_open_reg[i] & mask) {
762 				rv = EBUSY;
763 				goto done;
764 			}
765 		}
766 	}
767 
768 	if (otyp == OTYP_LYR) {
769 		bd->d_open_lyr[part]++;
770 	} else {
771 		bd->d_open_reg[otyp] |= mask;
772 	}
773 	if (flag & FEXCL) {
774 		bd->d_open_excl |= mask;
775 	}
776 
777 	rv = 0;
778 done:
779 	mutex_exit(&bd->d_ocmutex);
780 	rw_exit(&bd_lock);
781 
782 	return (rv);
783 }
784 
785 static int
786 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
787 {
788 	bd_t		*bd;
789 	minor_t		inst;
790 	minor_t		part;
791 	uint64_t	mask;
792 	boolean_t	last = B_TRUE;
793 
794 	_NOTE(ARGUNUSED(flag));
795 	_NOTE(ARGUNUSED(credp));
796 
797 	part = BDPART(dev);
798 	inst = BDINST(dev);
799 
800 	ASSERT(part < 64);
801 	mask = (1U << part);
802 
803 	rw_enter(&bd_lock, RW_READER);
804 
805 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
806 		rw_exit(&bd_lock);
807 		return (ENXIO);
808 	}
809 
810 	mutex_enter(&bd->d_ocmutex);
811 	if (bd->d_open_excl & mask) {
812 		bd->d_open_excl &= ~mask;
813 	}
814 	if (otyp == OTYP_LYR) {
815 		bd->d_open_lyr[part]--;
816 	} else {
817 		bd->d_open_reg[otyp] &= ~mask;
818 	}
819 	for (int i = 0; i < 64; i++) {
820 		if (bd->d_open_lyr[part]) {
821 			last = B_FALSE;
822 		}
823 	}
824 	for (int i = 0; last && (i < OTYP_LYR); i++) {
825 		if (bd->d_open_reg[i]) {
826 			last = B_FALSE;
827 		}
828 	}
829 	mutex_exit(&bd->d_ocmutex);
830 
831 	if (last) {
832 		cmlb_invalidate(bd->d_cmlbh, 0);
833 	}
834 	rw_exit(&bd_lock);
835 
836 	return (0);
837 }
838 
839 static int
840 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
841 {
842 	minor_t		inst;
843 	minor_t		part;
844 	diskaddr_t	pstart;
845 	diskaddr_t	psize;
846 	bd_t		*bd;
847 	bd_xfer_impl_t	*xi;
848 	buf_t		*bp;
849 	int		rv;
850 
851 	rw_enter(&bd_lock, RW_READER);
852 
853 	part = BDPART(dev);
854 	inst = BDINST(dev);
855 
856 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
857 		rw_exit(&bd_lock);
858 		return (ENXIO);
859 	}
860 	/*
861 	 * do cmlb, but do it synchronously unless we already have the
862 	 * partition (which we probably should.)
863 	 */
864 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
865 	    (void *)1)) {
866 		rw_exit(&bd_lock);
867 		return (ENXIO);
868 	}
869 
870 	if ((blkno + nblk) > psize) {
871 		rw_exit(&bd_lock);
872 		return (EINVAL);
873 	}
874 	bp = getrbuf(KM_NOSLEEP);
875 	if (bp == NULL) {
876 		rw_exit(&bd_lock);
877 		return (ENOMEM);
878 	}
879 
880 	bp->b_bcount = nblk << bd->d_blkshift;
881 	bp->b_resid = bp->b_bcount;
882 	bp->b_lblkno = blkno;
883 	bp->b_un.b_addr = caddr;
884 
885 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
886 	if (xi == NULL) {
887 		rw_exit(&bd_lock);
888 		freerbuf(bp);
889 		return (ENOMEM);
890 	}
891 	xi->i_blkno = blkno + pstart;
892 	xi->i_flags = BD_XFER_POLL;
893 	bd_submit(bd, xi);
894 	rw_exit(&bd_lock);
895 
896 	/*
897 	 * Generally, we should have run this entirely synchronously
898 	 * at this point and the biowait call should be a no-op.  If
899 	 * it didn't happen this way, it's a bug in the underlying
900 	 * driver not honoring BD_XFER_POLL.
901 	 */
902 	(void) biowait(bp);
903 	rv = geterror(bp);
904 	freerbuf(bp);
905 	return (rv);
906 }
907 
908 void
909 bd_minphys(struct buf *bp)
910 {
911 	minor_t inst;
912 	bd_t	*bd;
913 	inst = BDINST(bp->b_edev);
914 
915 	bd = ddi_get_soft_state(bd_state, inst);
916 
917 	/*
918 	 * In a non-debug kernel, bd_strategy will catch !bd as
919 	 * well, and will fail nicely.
920 	 */
921 	ASSERT(bd);
922 
923 	if (bp->b_bcount > bd->d_maxxfer)
924 		bp->b_bcount = bd->d_maxxfer;
925 }
926 
927 static int
928 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
929 {
930 	_NOTE(ARGUNUSED(credp));
931 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
932 }
933 
934 static int
935 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
936 {
937 	_NOTE(ARGUNUSED(credp));
938 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
939 }
940 
941 static int
942 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
943 {
944 	_NOTE(ARGUNUSED(credp));
945 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
946 }
947 
948 static int
949 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
950 {
951 	_NOTE(ARGUNUSED(credp));
952 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
953 }
954 
955 static int
956 bd_strategy(struct buf *bp)
957 {
958 	minor_t		inst;
959 	minor_t		part;
960 	bd_t		*bd;
961 	diskaddr_t	p_lba;
962 	diskaddr_t	p_nblks;
963 	diskaddr_t	b_nblks;
964 	bd_xfer_impl_t	*xi;
965 	uint32_t	shift;
966 	int		(*func)(void *, bd_xfer_t *);
967 
968 	part = BDPART(bp->b_edev);
969 	inst = BDINST(bp->b_edev);
970 
971 	ASSERT(bp);
972 
973 	bp->b_resid = bp->b_bcount;
974 
975 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
976 		bioerror(bp, ENXIO);
977 		biodone(bp);
978 		return (0);
979 	}
980 
981 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
982 	    NULL, NULL, 0)) {
983 		bioerror(bp, ENXIO);
984 		biodone(bp);
985 		return (0);
986 	}
987 
988 	shift = bd->d_blkshift;
989 
990 	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
991 	    (bp->b_lblkno > p_nblks)) {
992 		bioerror(bp, ENXIO);
993 		biodone(bp);
994 		return (0);
995 	}
996 	b_nblks = bp->b_bcount >> shift;
997 	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
998 		biodone(bp);
999 		return (0);
1000 	}
1001 
1002 	if ((b_nblks + bp->b_lblkno) > p_nblks) {
1003 		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
1004 		bp->b_bcount -= bp->b_resid;
1005 	} else {
1006 		bp->b_resid = 0;
1007 	}
1008 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1009 
1010 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1011 	if (xi == NULL) {
1012 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1013 	}
1014 	if (xi == NULL) {
1015 		/* bd_request_alloc will have done bioerror */
1016 		biodone(bp);
1017 		return (0);
1018 	}
1019 	xi->i_blkno = bp->b_lblkno + p_lba;
1020 
1021 	bd_submit(bd, xi);
1022 
1023 	return (0);
1024 }
1025 
1026 static int
1027 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1028 {
1029 	minor_t		inst;
1030 	uint16_t	part;
1031 	bd_t		*bd;
1032 	void		*ptr = (void *)arg;
1033 	int		rv;
1034 
1035 	part = BDPART(dev);
1036 	inst = BDINST(dev);
1037 
1038 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1039 		return (ENXIO);
1040 	}
1041 
1042 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1043 	if (rv != ENOTTY)
1044 		return (rv);
1045 
1046 	if (rvalp != NULL) {
1047 		/* the return value of the ioctl is 0 by default */
1048 		*rvalp = 0;
1049 	}
1050 
1051 	switch (cmd) {
1052 	case DKIOCGMEDIAINFO: {
1053 		struct dk_minfo minfo;
1054 
1055 		/* make sure our state information is current */
1056 		bd_update_state(bd);
1057 		bzero(&minfo, sizeof (minfo));
1058 		minfo.dki_media_type = DK_FIXED_DISK;
1059 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1060 		minfo.dki_capacity = bd->d_numblks;
1061 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1062 			return (EFAULT);
1063 		}
1064 		return (0);
1065 	}
1066 	case DKIOCGMEDIAINFOEXT: {
1067 		struct dk_minfo_ext miext;
1068 
1069 		/* make sure our state information is current */
1070 		bd_update_state(bd);
1071 		bzero(&miext, sizeof (miext));
1072 		miext.dki_media_type = DK_FIXED_DISK;
1073 		miext.dki_lbsize = (1U << bd->d_blkshift);
1074 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1075 		miext.dki_capacity = bd->d_numblks;
1076 		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1077 			return (EFAULT);
1078 		}
1079 		return (0);
1080 	}
1081 	case DKIOCINFO: {
1082 		struct dk_cinfo cinfo;
1083 		bzero(&cinfo, sizeof (cinfo));
1084 		cinfo.dki_ctype = DKC_BLKDEV;
1085 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1086 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1087 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1088 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1089 		    "%s", ddi_driver_name(bd->d_dip));
1090 		cinfo.dki_unit = inst;
1091 		cinfo.dki_flags = DKI_FMTVOL;
1092 		cinfo.dki_partition = part;
1093 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1094 		cinfo.dki_addr = 0;
1095 		cinfo.dki_slave = 0;
1096 		cinfo.dki_space = 0;
1097 		cinfo.dki_prio = 0;
1098 		cinfo.dki_vec = 0;
1099 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1100 			return (EFAULT);
1101 		}
1102 		return (0);
1103 	}
1104 	case DKIOCREMOVABLE: {
1105 		int i;
1106 		i = bd->d_removable ? 1 : 0;
1107 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1108 			return (EFAULT);
1109 		}
1110 		return (0);
1111 	}
1112 	case DKIOCHOTPLUGGABLE: {
1113 		int i;
1114 		i = bd->d_hotpluggable ? 1 : 0;
1115 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1116 			return (EFAULT);
1117 		}
1118 		return (0);
1119 	}
1120 	case DKIOCREADONLY: {
1121 		int i;
1122 		i = bd->d_rdonly ? 1 : 0;
1123 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1124 			return (EFAULT);
1125 		}
1126 		return (0);
1127 	}
1128 	case DKIOCSOLIDSTATE: {
1129 		int i;
1130 		i = bd->d_ssd ? 1 : 0;
1131 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1132 			return (EFAULT);
1133 		}
1134 		return (0);
1135 	}
1136 	case DKIOCSTATE: {
1137 		enum dkio_state	state;
1138 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1139 			return (EFAULT);
1140 		}
1141 		if ((rv = bd_check_state(bd, &state)) != 0) {
1142 			return (rv);
1143 		}
1144 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1145 			return (EFAULT);
1146 		}
1147 		return (0);
1148 	}
1149 	case DKIOCFLUSHWRITECACHE: {
1150 		struct dk_callback *dkc = NULL;
1151 
1152 		if (flag & FKIOCTL)
1153 			dkc = (void *)arg;
1154 
1155 		rv = bd_flush_write_cache(bd, dkc);
1156 		return (rv);
1157 	}
1158 
1159 	default:
1160 		break;
1161 
1162 	}
1163 	return (ENOTTY);
1164 }
1165 
1166 static int
1167 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1168     char *name, caddr_t valuep, int *lengthp)
1169 {
1170 	bd_t	*bd;
1171 
1172 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1173 	if (bd == NULL)
1174 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1175 		    name, valuep, lengthp));
1176 
1177 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1178 	    valuep, lengthp, BDPART(dev), 0));
1179 }
1180 
1181 
1182 static int
1183 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1184     size_t length, void *tg_cookie)
1185 {
1186 	bd_t		*bd;
1187 	buf_t		*bp;
1188 	bd_xfer_impl_t	*xi;
1189 	int		rv;
1190 	int		(*func)(void *, bd_xfer_t *);
1191 	int		kmflag;
1192 
1193 	/*
1194 	 * If we are running in polled mode (such as during dump(9e)
1195 	 * execution), then we cannot sleep for kernel allocations.
1196 	 */
1197 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1198 
1199 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1200 
1201 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1202 		/* We can only transfer whole blocks at a time! */
1203 		return (EINVAL);
1204 	}
1205 
1206 	if ((bp = getrbuf(kmflag)) == NULL) {
1207 		return (ENOMEM);
1208 	}
1209 
1210 	switch (cmd) {
1211 	case TG_READ:
1212 		bp->b_flags = B_READ;
1213 		func = bd->d_ops.o_read;
1214 		break;
1215 	case TG_WRITE:
1216 		bp->b_flags = B_WRITE;
1217 		func = bd->d_ops.o_write;
1218 		break;
1219 	default:
1220 		freerbuf(bp);
1221 		return (EINVAL);
1222 	}
1223 
1224 	bp->b_un.b_addr = bufaddr;
1225 	bp->b_bcount = length;
1226 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1227 	if (xi == NULL) {
1228 		rv = geterror(bp);
1229 		freerbuf(bp);
1230 		return (rv);
1231 	}
1232 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1233 	xi->i_blkno = start;
1234 	bd_submit(bd, xi);
1235 	(void) biowait(bp);
1236 	rv = geterror(bp);
1237 	freerbuf(bp);
1238 
1239 	return (rv);
1240 }
1241 
1242 static int
1243 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1244 {
1245 	bd_t		*bd;
1246 
1247 	_NOTE(ARGUNUSED(tg_cookie));
1248 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1249 
1250 	switch (cmd) {
1251 	case TG_GETPHYGEOM:
1252 	case TG_GETVIRTGEOM:
1253 		/*
1254 		 * We don't have any "geometry" as such, let cmlb
1255 		 * fabricate something.
1256 		 */
1257 		return (ENOTTY);
1258 
1259 	case TG_GETCAPACITY:
1260 		bd_update_state(bd);
1261 		*(diskaddr_t *)arg = bd->d_numblks;
1262 		return (0);
1263 
1264 	case TG_GETBLOCKSIZE:
1265 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1266 		return (0);
1267 
1268 	case TG_GETATTR:
1269 		/*
1270 		 * It turns out that cmlb really doesn't do much for
1271 		 * non-writable media, but lets make the information
1272 		 * available for it in case it does more in the
1273 		 * future.  (The value is currently used for
1274 		 * triggering special behavior for CD-ROMs.)
1275 		 */
1276 		bd_update_state(bd);
1277 		((tg_attribute_t *)arg)->media_is_writable =
1278 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1279 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1280 		return (0);
1281 
1282 	default:
1283 		return (EINVAL);
1284 	}
1285 }
1286 
1287 
1288 static void
1289 bd_sched(bd_t *bd)
1290 {
1291 	bd_xfer_impl_t	*xi;
1292 	struct buf	*bp;
1293 	int		rv;
1294 
1295 	mutex_enter(&bd->d_iomutex);
1296 
1297 	while ((bd->d_qactive < bd->d_qsize) &&
1298 	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1299 		bd->d_qactive++;
1300 		kstat_waitq_to_runq(bd->d_kiop);
1301 		list_insert_tail(&bd->d_runq, xi);
1302 
1303 		/*
1304 		 * Submit the job to the driver.  We drop the I/O mutex
1305 		 * so that we can deal with the case where the driver
1306 		 * completion routine calls back into us synchronously.
1307 		 */
1308 
1309 		mutex_exit(&bd->d_iomutex);
1310 
1311 		rv = xi->i_func(bd->d_private, &xi->i_public);
1312 		if (rv != 0) {
1313 			bp = xi->i_bp;
1314 			bioerror(bp, rv);
1315 			biodone(bp);
1316 
1317 			mutex_enter(&bd->d_iomutex);
1318 			bd->d_qactive--;
1319 			kstat_runq_exit(bd->d_kiop);
1320 			list_remove(&bd->d_runq, xi);
1321 			bd_xfer_free(xi);
1322 		} else {
1323 			mutex_enter(&bd->d_iomutex);
1324 		}
1325 	}
1326 
1327 	mutex_exit(&bd->d_iomutex);
1328 }
1329 
1330 static void
1331 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1332 {
1333 	mutex_enter(&bd->d_iomutex);
1334 	list_insert_tail(&bd->d_waitq, xi);
1335 	kstat_waitq_enter(bd->d_kiop);
1336 	mutex_exit(&bd->d_iomutex);
1337 
1338 	bd_sched(bd);
1339 }
1340 
1341 static void
1342 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1343 {
1344 	bd_t	*bd = xi->i_bd;
1345 	buf_t	*bp = xi->i_bp;
1346 
1347 	mutex_enter(&bd->d_iomutex);
1348 	bd->d_qactive--;
1349 	kstat_runq_exit(bd->d_kiop);
1350 	list_remove(&bd->d_runq, xi);
1351 	mutex_exit(&bd->d_iomutex);
1352 
1353 	if (err == 0) {
1354 		if (bp->b_flags & B_READ) {
1355 			bd->d_kiop->reads++;
1356 			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1357 		} else {
1358 			bd->d_kiop->writes++;
1359 			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1360 		}
1361 	}
1362 	bd_sched(bd);
1363 }
1364 
1365 static void
1366 bd_update_state(bd_t *bd)
1367 {
1368 	enum	dkio_state	state = DKIO_INSERTED;
1369 	boolean_t		docmlb = B_FALSE;
1370 	bd_media_t		media;
1371 
1372 	bzero(&media, sizeof (media));
1373 
1374 	mutex_enter(&bd->d_statemutex);
1375 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1376 		bd->d_numblks = 0;
1377 		state = DKIO_EJECTED;
1378 		goto done;
1379 	}
1380 
1381 	if ((media.m_blksize < 512) ||
1382 	    (!ISP2(media.m_blksize)) ||
1383 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1384 		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1385 		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1386 		    media.m_blksize);
1387 		/*
1388 		 * We can't use the media, treat it as not present.
1389 		 */
1390 		state = DKIO_EJECTED;
1391 		bd->d_numblks = 0;
1392 		goto done;
1393 	}
1394 
1395 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
1396 	    (bd->d_numblks != media.m_nblks)) {
1397 		/* Device size changed */
1398 		docmlb = B_TRUE;
1399 	}
1400 
1401 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1402 	bd->d_pblkshift = bd->d_blkshift;
1403 	bd->d_numblks = media.m_nblks;
1404 	bd->d_rdonly = media.m_readonly;
1405 	bd->d_ssd = media.m_solidstate;
1406 
1407 	/*
1408 	 * Only use the supplied physical block size if it is non-zero,
1409 	 * greater or equal to the block size, and a power of 2. Ignore it
1410 	 * if not, it's just informational and we can still use the media.
1411 	 */
1412 	if ((media.m_pblksize != 0) &&
1413 	    (media.m_pblksize >= media.m_blksize) &&
1414 	    (ISP2(media.m_pblksize)))
1415 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1416 
1417 done:
1418 	if (state != bd->d_state) {
1419 		bd->d_state = state;
1420 		cv_broadcast(&bd->d_statecv);
1421 		docmlb = B_TRUE;
1422 	}
1423 	mutex_exit(&bd->d_statemutex);
1424 
1425 	if (docmlb) {
1426 		if (state == DKIO_INSERTED) {
1427 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1428 		} else {
1429 			cmlb_invalidate(bd->d_cmlbh, 0);
1430 		}
1431 	}
1432 }
1433 
1434 static int
1435 bd_check_state(bd_t *bd, enum dkio_state *state)
1436 {
1437 	clock_t		when;
1438 
1439 	for (;;) {
1440 
1441 		bd_update_state(bd);
1442 
1443 		mutex_enter(&bd->d_statemutex);
1444 
1445 		if (bd->d_state != *state) {
1446 			*state = bd->d_state;
1447 			mutex_exit(&bd->d_statemutex);
1448 			break;
1449 		}
1450 
1451 		when = drv_usectohz(1000000);
1452 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1453 		    when, TR_CLOCK_TICK) == 0) {
1454 			mutex_exit(&bd->d_statemutex);
1455 			return (EINTR);
1456 		}
1457 
1458 		mutex_exit(&bd->d_statemutex);
1459 	}
1460 
1461 	return (0);
1462 }
1463 
1464 static int
1465 bd_flush_write_cache_done(struct buf *bp)
1466 {
1467 	struct dk_callback *dc = (void *)bp->b_private;
1468 
1469 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1470 	kmem_free(dc, sizeof (*dc));
1471 	freerbuf(bp);
1472 	return (0);
1473 }
1474 
1475 static int
1476 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1477 {
1478 	buf_t			*bp;
1479 	struct dk_callback	*dc;
1480 	bd_xfer_impl_t		*xi;
1481 	int			rv;
1482 
1483 	if (bd->d_ops.o_sync_cache == NULL) {
1484 		return (ENOTSUP);
1485 	}
1486 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1487 		return (ENOMEM);
1488 	}
1489 	bp->b_resid = 0;
1490 	bp->b_bcount = 0;
1491 
1492 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1493 	if (xi == NULL) {
1494 		rv = geterror(bp);
1495 		freerbuf(bp);
1496 		return (rv);
1497 	}
1498 
1499 	/* Make an asynchronous flush, but only if there is a callback */
1500 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1501 		/* Make a private copy of the callback structure */
1502 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1503 		*dc = *dkc;
1504 		bp->b_private = dc;
1505 		bp->b_iodone = bd_flush_write_cache_done;
1506 
1507 		bd_submit(bd, xi);
1508 		return (0);
1509 	}
1510 
1511 	/* In case there is no callback, perform a synchronous flush */
1512 	bd_submit(bd, xi);
1513 	(void) biowait(bp);
1514 	rv = geterror(bp);
1515 	freerbuf(bp);
1516 
1517 	return (rv);
1518 }
1519 
1520 /*
1521  * Nexus support.
1522  */
1523 int
1524 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1525     void *arg, void *result)
1526 {
1527 	bd_handle_t	hdl;
1528 
1529 	switch (ctlop) {
1530 	case DDI_CTLOPS_REPORTDEV:
1531 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1532 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1533 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1534 		return (DDI_SUCCESS);
1535 
1536 	case DDI_CTLOPS_INITCHILD:
1537 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1538 		if (hdl == NULL) {
1539 			return (DDI_NOT_WELL_FORMED);
1540 		}
1541 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1542 		return (DDI_SUCCESS);
1543 
1544 	case DDI_CTLOPS_UNINITCHILD:
1545 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1546 		ndi_prop_remove_all((dev_info_t *)arg);
1547 		return (DDI_SUCCESS);
1548 
1549 	default:
1550 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1551 	}
1552 }
1553 
1554 /*
1555  * Functions for device drivers.
1556  */
1557 bd_handle_t
1558 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1559 {
1560 	bd_handle_t	hdl;
1561 
1562 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1563 	if (hdl != NULL) {
1564 		hdl->h_ops = *ops;
1565 		hdl->h_dma = dma;
1566 		hdl->h_private = private;
1567 	}
1568 
1569 	return (hdl);
1570 }
1571 
1572 void
1573 bd_free_handle(bd_handle_t hdl)
1574 {
1575 	kmem_free(hdl, sizeof (*hdl));
1576 }
1577 
1578 int
1579 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1580 {
1581 	dev_info_t	*child;
1582 	bd_drive_t	drive;
1583 
1584 	/* if drivers don't override this, make it assume none */
1585 	drive.d_lun = -1;
1586 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1587 
1588 	hdl->h_parent = dip;
1589 	hdl->h_name = "blkdev";
1590 
1591 	if (drive.d_lun >= 0) {
1592 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1593 		    drive.d_target, drive.d_lun);
1594 	} else {
1595 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1596 		    drive.d_target);
1597 	}
1598 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1599 	    &child) != NDI_SUCCESS) {
1600 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1601 		    ddi_driver_name(dip), ddi_get_instance(dip),
1602 		    "blkdev", hdl->h_addr);
1603 		return (DDI_FAILURE);
1604 	}
1605 
1606 	ddi_set_parent_data(child, hdl);
1607 	hdl->h_child = child;
1608 
1609 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1610 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1611 		    ddi_driver_name(dip), ddi_get_instance(dip),
1612 		    hdl->h_name, hdl->h_addr);
1613 		(void) ndi_devi_free(child);
1614 		return (DDI_FAILURE);
1615 	}
1616 
1617 	return (DDI_SUCCESS);
1618 }
1619 
1620 int
1621 bd_detach_handle(bd_handle_t hdl)
1622 {
1623 	int	circ;
1624 	int	rv;
1625 	char	*devnm;
1626 
1627 	if (hdl->h_child == NULL) {
1628 		return (DDI_SUCCESS);
1629 	}
1630 	ndi_devi_enter(hdl->h_parent, &circ);
1631 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1632 		rv = ddi_remove_child(hdl->h_child, 0);
1633 	} else {
1634 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1635 		(void) ddi_deviname(hdl->h_child, devnm);
1636 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1637 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1638 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1639 		kmem_free(devnm, MAXNAMELEN + 1);
1640 	}
1641 	if (rv == 0) {
1642 		hdl->h_child = NULL;
1643 	}
1644 
1645 	ndi_devi_exit(hdl->h_parent, circ);
1646 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1647 }
1648 
1649 void
1650 bd_xfer_done(bd_xfer_t *xfer, int err)
1651 {
1652 	bd_xfer_impl_t	*xi = (void *)xfer;
1653 	buf_t		*bp = xi->i_bp;
1654 	int		rv = DDI_SUCCESS;
1655 	bd_t		*bd = xi->i_bd;
1656 	size_t		len;
1657 
1658 	if (err != 0) {
1659 		bd_runq_exit(xi, err);
1660 
1661 		bp->b_resid += xi->i_resid;
1662 		bd_xfer_free(xi);
1663 		bioerror(bp, err);
1664 		biodone(bp);
1665 		return;
1666 	}
1667 
1668 	xi->i_cur_win++;
1669 	xi->i_resid -= xi->i_len;
1670 
1671 	if (xi->i_resid == 0) {
1672 		/* Job completed succcessfully! */
1673 		bd_runq_exit(xi, 0);
1674 
1675 		bd_xfer_free(xi);
1676 		biodone(bp);
1677 		return;
1678 	}
1679 
1680 	xi->i_blkno += xi->i_nblks;
1681 
1682 	if (bd->d_use_dma) {
1683 		/* More transfer still pending... advance to next DMA window. */
1684 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1685 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1686 	} else {
1687 		/* Advance memory window. */
1688 		xi->i_kaddr += xi->i_len;
1689 		xi->i_offset += xi->i_len;
1690 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1691 	}
1692 
1693 
1694 	if ((rv != DDI_SUCCESS) ||
1695 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1696 		bd_runq_exit(xi, EFAULT);
1697 
1698 		bp->b_resid += xi->i_resid;
1699 		bd_xfer_free(xi);
1700 		bioerror(bp, EFAULT);
1701 		biodone(bp);
1702 		return;
1703 	}
1704 	xi->i_len = len;
1705 	xi->i_nblks = len >> xi->i_blkshift;
1706 
1707 	/* Submit next window to hardware. */
1708 	rv = xi->i_func(bd->d_private, &xi->i_public);
1709 	if (rv != 0) {
1710 		bd_runq_exit(xi, rv);
1711 
1712 		bp->b_resid += xi->i_resid;
1713 		bd_xfer_free(xi);
1714 		bioerror(bp, rv);
1715 		biodone(bp);
1716 	}
1717 }
1718 
1719 void
1720 bd_state_change(bd_handle_t hdl)
1721 {
1722 	bd_t		*bd;
1723 
1724 	if ((bd = hdl->h_bd) != NULL) {
1725 		bd_update_state(bd);
1726 	}
1727 }
1728 
1729 void
1730 bd_mod_init(struct dev_ops *devops)
1731 {
1732 	static struct bus_ops bd_bus_ops = {
1733 		BUSO_REV,		/* busops_rev */
1734 		nullbusmap,		/* bus_map */
1735 		NULL,			/* bus_get_intrspec (OBSOLETE) */
1736 		NULL,			/* bus_add_intrspec (OBSOLETE) */
1737 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
1738 		i_ddi_map_fault,	/* bus_map_fault */
1739 		NULL,			/* bus_dma_map (OBSOLETE) */
1740 		ddi_dma_allochdl,	/* bus_dma_allochdl */
1741 		ddi_dma_freehdl,	/* bus_dma_freehdl */
1742 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
1743 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
1744 		ddi_dma_flush,		/* bus_dma_flush */
1745 		ddi_dma_win,		/* bus_dma_win */
1746 		ddi_dma_mctl,		/* bus_dma_ctl */
1747 		bd_bus_ctl,		/* bus_ctl */
1748 		ddi_bus_prop_op,	/* bus_prop_op */
1749 		NULL,			/* bus_get_eventcookie */
1750 		NULL,			/* bus_add_eventcall */
1751 		NULL,			/* bus_remove_eventcall */
1752 		NULL,			/* bus_post_event */
1753 		NULL,			/* bus_intr_ctl (OBSOLETE) */
1754 		NULL,			/* bus_config */
1755 		NULL,			/* bus_unconfig */
1756 		NULL,			/* bus_fm_init */
1757 		NULL,			/* bus_fm_fini */
1758 		NULL,			/* bus_fm_access_enter */
1759 		NULL,			/* bus_fm_access_exit */
1760 		NULL,			/* bus_power */
1761 		NULL,			/* bus_intr_op */
1762 	};
1763 
1764 	devops->devo_bus_ops = &bd_bus_ops;
1765 
1766 	/*
1767 	 * NB: The device driver is free to supply its own
1768 	 * character entry device support.
1769 	 */
1770 }
1771 
1772 void
1773 bd_mod_fini(struct dev_ops *devops)
1774 {
1775 	devops->devo_bus_ops = NULL;
1776 }
1777