xref: /titanic_50/usr/src/uts/common/io/blkdev/blkdev.c (revision 494f7e12a62129ef191a15f9dfde6b7abe3bf510)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/ksynch.h>
30 #include <sys/kmem.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/open.h>
34 #include <sys/buf.h>
35 #include <sys/uio.h>
36 #include <sys/aio_req.h>
37 #include <sys/cred.h>
38 #include <sys/modctl.h>
39 #include <sys/cmlb.h>
40 #include <sys/conf.h>
41 #include <sys/devops.h>
42 #include <sys/list.h>
43 #include <sys/sysmacros.h>
44 #include <sys/dkio.h>
45 #include <sys/vtoc.h>
46 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
47 #include <sys/kstat.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/note.h>
52 #include <sys/blkdev.h>
53 
54 #define	BD_MAXPART	64
55 #define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
56 #define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
57 
58 typedef struct bd bd_t;
59 typedef struct bd_xfer_impl bd_xfer_impl_t;
60 
/*
 * Per-instance soft state for a blkdev child device.  Allocated in
 * bd_attach() and freed in bd_detach().
 */
struct bd {
	void		*d_private;	/* parent driver's private cookie */
	dev_info_t	*d_dip;		/* our devinfo node */
	kmutex_t	d_ocmutex;	/* protects open/close bookkeeping */
	kmutex_t	d_iomutex;	/* protects waitq/runq and kstats */
	kmutex_t	d_statemutex;	/* protects d_state and d_statecv */
	kcondvar_t	d_statecv;	/* signalled on media state change */
	enum dkio_state	d_state;	/* last observed DKIO media state */
	cmlb_handle_t	d_cmlbh;	/* common label (cmlb) handle */
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* max commands outstanding at once */
	uint32_t	d_qactive;	/* commands currently on the runq */
	uint32_t	d_maxxfer;	/* maximum bytes per transfer */
	uint32_t	d_blkshift;	/* log2 of the logical block size */
	uint64_t	d_numblks;	/* capacity in blocks */
	ddi_devid_t	d_devid;	/* registered device id, if any */

	kmem_cache_t	*d_cache;	/* cache of bd_xfer_impl_t contexts */
	list_t		d_runq;		/* xfers handed to the parent driver */
	list_t		d_waitq;	/* xfers waiting for queue space */
	kstat_t		*d_ksp;		/* I/O kstat (may be NULL) */
	kstat_io_t	*d_kiop;	/* kstat data (real or scratch) */

	boolean_t	d_rdonly;	/* media is read-only */
	boolean_t	d_removable;	/* media is removable */
	boolean_t	d_hotpluggable;	/* device is hot-pluggable */
	boolean_t	d_use_dma;	/* parent supplied DMA attributes */

	ddi_dma_attr_t	d_dma;		/* copy of parent's DMA attributes */
	bd_ops_t	d_ops;		/* parent driver entry points */
	bd_handle_t	d_handle;	/* back pointer to our handle */
};
96 
/*
 * Handle linking a parent driver to its blkdev child; passed to us via
 * ddi_get_parent_data() in bd_attach().
 */
struct bd_handle {
	bd_ops_t	h_ops;		/* parent driver entry points */
	ddi_dma_attr_t	*h_dma;		/* DMA attributes; NULL if no DMA */
	dev_info_t	*h_parent;	/* presumably parent devinfo; not used in this file section */
	dev_info_t	*h_child;	/* presumably child devinfo; not used in this file section */
	void		*h_private;	/* parent's private cookie */
	bd_t		*h_bd;		/* soft state, set once attached */
	char		*h_name;	/* NOTE(review): node name? set elsewhere */
	char		h_addr[20];	/* enough for %X,%X */
};
107 
/*
 * Internal transfer context; wraps the public bd_xfer_t handed to the
 * parent driver.  Allocated from the per-device kmem cache (d_cache)
 * by bd_xfer_alloc().
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* public portion seen by the driver */
	list_node_t	i_linkage;	/* linkage on d_waitq / d_runq */
	bd_t		*i_bd;		/* owning device */
	buf_t		*i_bp;		/* originating buf */
	uint_t		i_num_win;	/* number of DMA windows */
	uint_t		i_cur_win;	/* index of the current DMA window */
	off_t		i_offset;	/* offset of the current window */
	int		(*i_func)(void *, bd_xfer_t *);	/* driver read/write */
	uint32_t	i_blkshift;	/* block shift at time of allocation */
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* bytes not yet completed */
};
121 
122 #define	i_dmah		i_public.x_dmah
123 #define	i_dmac		i_public.x_dmac
124 #define	i_ndmac		i_public.x_ndmac
125 #define	i_kaddr		i_public.x_kaddr
126 #define	i_nblks		i_public.x_nblks
127 #define	i_blkno		i_public.x_blkno
128 #define	i_flags		i_public.x_flags
129 
130 
131 /*
132  * Private prototypes.
133  */
134 
135 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
136 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
137 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
138 
139 static int bd_open(dev_t *, int, int, cred_t *);
140 static int bd_close(dev_t, int, int, cred_t *);
141 static int bd_strategy(struct buf *);
142 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
143 static int bd_dump(dev_t, caddr_t, daddr_t, int);
144 static int bd_read(dev_t, struct uio *, cred_t *);
145 static int bd_write(dev_t, struct uio *, cred_t *);
146 static int bd_aread(dev_t, struct aio_req *, cred_t *);
147 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
148 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
149     caddr_t, int *);
150 
151 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
152     void *);
153 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
154 static int bd_xfer_ctor(void *, void *, int);
155 static void bd_xfer_dtor(void *, void *);
156 static void bd_sched(bd_t *);
157 static void bd_submit(bd_t *, bd_xfer_impl_t *);
158 static void bd_runq_exit(bd_xfer_impl_t *, int);
159 static void bd_update_state(bd_t *);
160 static int bd_check_state(bd_t *, enum dkio_state *);
161 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
162 
163 struct cmlb_tg_ops bd_tg_ops = {
164 	TG_DK_OPS_VERSION_1,
165 	bd_tg_rdwr,
166 	bd_tg_getinfo,
167 };
168 
/*
 * Character/block entry points (cb_ops(9S)).
 */
static struct cb_ops bd_cb_ops = {
	bd_open, 		/* open */
	bd_close, 		/* close */
	bd_strategy, 		/* strategy */
	nodev, 			/* print */
	bd_dump,		/* dump */
	bd_read, 		/* read */
	bd_write, 		/* write */
	bd_ioctl, 		/* ioctl */
	nodev, 			/* devmap */
	nodev, 			/* mmap */
	nodev, 			/* segmap */
	nochpoll, 		/* poll */
	bd_prop_op, 		/* cb_prop_op */
	0, 			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
189 
190 struct dev_ops bd_dev_ops = {
191 	DEVO_REV, 		/* devo_rev, */
192 	0, 			/* refcnt  */
193 	bd_getinfo,		/* getinfo */
194 	nulldev, 		/* identify */
195 	nulldev, 		/* probe */
196 	bd_attach, 		/* attach */
197 	bd_detach,		/* detach */
198 	nodev, 			/* reset */
199 	&bd_cb_ops, 		/* driver operations */
200 	NULL,			/* bus operations */
201 	NULL,			/* power */
202 	ddi_quiesce_not_needed,	/* quiesce */
203 };
204 
205 static struct modldrv modldrv = {
206 	&mod_driverops,
207 	"Generic Block Device",
208 	&bd_dev_ops,
209 };
210 
211 static struct modlinkage modlinkage = {
212 	MODREV_1, { &modldrv, NULL }
213 };
214 
215 static void *bd_state;
216 static krwlock_t bd_lock;
217 
218 int
219 _init(void)
220 {
221 	int	rv;
222 
223 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
224 	if (rv != DDI_SUCCESS) {
225 		return (rv);
226 	}
227 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
228 	rv = mod_install(&modlinkage);
229 	if (rv != DDI_SUCCESS) {
230 		rw_destroy(&bd_lock);
231 		ddi_soft_state_fini(&bd_state);
232 	}
233 	return (rv);
234 }
235 
236 int
237 _fini(void)
238 {
239 	int	rv;
240 
241 	rv = mod_remove(&modlinkage);
242 	if (rv == DDI_SUCCESS) {
243 		rw_destroy(&bd_lock);
244 		ddi_soft_state_fini(&bd_state);
245 	}
246 	return (rv);
247 }
248 
249 int
250 _info(struct modinfo *modinfop)
251 {
252 	return (mod_info(&modlinkage, modinfop));
253 }
254 
255 static int
256 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
257 {
258 	bd_t	*bd;
259 	minor_t	inst;
260 
261 	_NOTE(ARGUNUSED(dip));
262 
263 	inst = BDINST((dev_t)arg);
264 
265 	switch (cmd) {
266 	case DDI_INFO_DEVT2DEVINFO:
267 		bd = ddi_get_soft_state(bd_state, inst);
268 		if (bd == NULL) {
269 			return (DDI_FAILURE);
270 		}
271 		*resultp = (void *)bd->d_dip;
272 		break;
273 
274 	case DDI_INFO_DEVT2INSTANCE:
275 		*resultp = (void *)(intptr_t)inst;
276 		break;
277 
278 	default:
279 		return (DDI_FAILURE);
280 	}
281 	return (DDI_SUCCESS);
282 }
283 
/*
 * attach(9E) entry point.  Builds the soft state for a new blkdev
 * child: copies the parent's DMA attributes and entry points, creates
 * the transfer-context cache, kstats, and cmlb label handle, queries
 * the parent for drive characteristics, and publishes the standard
 * properties (DDI_KERNEL_IOCTL, removable-media, hotpluggable).
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	int		rv;
	char		name[16];
	char		kcache[32];

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	hdl = ddi_get_parent_data(dip);

	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	/* The parent driver must have left its handle on our node. */
	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		/*
		 * Copy the parent's DMA attributes; granularity must be
		 * at least one disk block (DEV_BSIZE).
		 */
		bd->d_dma = *(hdl->h_dma);
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		/* No DMA: default the maximum transfer to 1 MB. */
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = 9;	/* 512 bytes, to start */

	/* The maximum transfer must be a whole multiple of a block. */
	if (bd->d_maxxfer % DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	bd->d_dip = dip;
	bd->d_handle = hdl;
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));
	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));

	/*
	 * Per-instance cache of transfer contexts; the constructor
	 * pre-allocates DMA handles so the I/O path stays cheap.
	 */
	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_iomutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat.  The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	/* Ask the parent driver for the drive's characteristics. */
	bzero(&drive, sizeof (drive));
	bd->d_ops.o_drive_info(bd->d_private, &drive);
	bd->d_qsize = drive.d_qsize;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	/* The drive may further restrict the maximum transfer size. */
	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;


	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		/* Unwind everything allocated above, in reverse order. */
		cmlb_free_handle(&bd->d_cmlbh);
		kmem_cache_destroy(bd->d_cache);
		mutex_destroy(&bd->d_iomutex);
		mutex_destroy(&bd->d_ocmutex);
		mutex_destroy(&bd->d_statemutex);
		cv_destroy(&bd->d_statecv);
		list_destroy(&bd->d_waitq);
		list_destroy(&bd->d_runq);
		if (bd->d_ksp != NULL) {
			kstat_delete(bd->d_ksp);
			bd->d_ksp = NULL;
		} else {
			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
		}
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	/* Register a devid if the parent driver can provide one. */
	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers).  Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	ddi_report_dev(dip);

	return (DDI_SUCCESS);
}
461 
/*
 * detach(9E) entry point.  Tears down everything bd_attach() built:
 * kstats, the cmlb handle, the registered devid (if any), the transfer
 * cache, locks, queues, and finally the soft state itself.
 */
static int
bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	bd_t	*bd;

	bd = ddi_get_driver_private(dip);

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
		/* We don't suspend, but our parent does */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
	/* A NULL d_ksp means d_kiop is the scratch kstat we allocated. */
	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}
	cmlb_detach(bd->d_cmlbh, 0);
	cmlb_free_handle(&bd->d_cmlbh);
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_iomutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	list_destroy(&bd->d_waitq);
	list_destroy(&bd->d_runq);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}
498 
499 static int
500 bd_xfer_ctor(void *buf, void *arg, int kmflag)
501 {
502 	bd_xfer_impl_t	*xi;
503 	bd_t		*bd = arg;
504 	int		(*dcb)(caddr_t);
505 
506 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
507 		dcb = DDI_DMA_SLEEP;
508 	} else {
509 		dcb = DDI_DMA_DONTWAIT;
510 	}
511 
512 	xi = buf;
513 	bzero(xi, sizeof (*xi));
514 	xi->i_bd = bd;
515 
516 	if (bd->d_use_dma) {
517 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
518 		    &xi->i_dmah) != DDI_SUCCESS) {
519 			return (-1);
520 		}
521 	}
522 
523 	return (0);
524 }
525 
526 static void
527 bd_xfer_dtor(void *buf, void *arg)
528 {
529 	bd_xfer_impl_t	*xi = buf;
530 
531 	_NOTE(ARGUNUSED(arg));
532 
533 	if (xi->i_dmah)
534 		ddi_dma_free_handle(&xi->i_dmah);
535 	xi->i_dmah = NULL;
536 }
537 
538 static bd_xfer_impl_t *
539 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
540     int kmflag)
541 {
542 	bd_xfer_impl_t		*xi;
543 	int			rv;
544 	int			status;
545 	unsigned		dir;
546 	int			(*cb)(caddr_t);
547 	size_t			len;
548 	uint32_t		shift;
549 
550 	if (kmflag == KM_SLEEP) {
551 		cb = DDI_DMA_SLEEP;
552 	} else {
553 		cb = DDI_DMA_DONTWAIT;
554 	}
555 
556 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
557 	if (xi == NULL) {
558 		bioerror(bp, ENOMEM);
559 		return (NULL);
560 	}
561 
562 	ASSERT(bp);
563 
564 	xi->i_bp = bp;
565 	xi->i_func = func;
566 	xi->i_blkno = bp->b_lblkno;
567 
568 	if (bp->b_bcount == 0) {
569 		xi->i_len = 0;
570 		xi->i_nblks = 0;
571 		xi->i_kaddr = NULL;
572 		xi->i_resid = 0;
573 		xi->i_num_win = 0;
574 		goto done;
575 	}
576 
577 	if (bp->b_flags & B_READ) {
578 		dir = DDI_DMA_READ;
579 		xi->i_func = bd->d_ops.o_read;
580 	} else {
581 		dir = DDI_DMA_WRITE;
582 		xi->i_func = bd->d_ops.o_write;
583 	}
584 
585 	shift = bd->d_blkshift;
586 	xi->i_blkshift = shift;
587 
588 	if (!bd->d_use_dma) {
589 		bp_mapin(bp);
590 		rv = 0;
591 		xi->i_offset = 0;
592 		xi->i_num_win =
593 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
594 		xi->i_cur_win = 0;
595 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
596 		xi->i_nblks = xi->i_len >> shift;
597 		xi->i_kaddr = bp->b_un.b_addr;
598 		xi->i_resid = bp->b_bcount;
599 	} else {
600 
601 		/*
602 		 * We have to use consistent DMA if the address is misaligned.
603 		 */
604 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
605 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
606 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
607 		} else {
608 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
609 		}
610 
611 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
612 		    NULL, &xi->i_dmac, &xi->i_ndmac);
613 		switch (status) {
614 		case DDI_DMA_MAPPED:
615 			xi->i_num_win = 1;
616 			xi->i_cur_win = 0;
617 			xi->i_offset = 0;
618 			xi->i_len = bp->b_bcount;
619 			xi->i_nblks = xi->i_len >> shift;
620 			xi->i_resid = bp->b_bcount;
621 			rv = 0;
622 			break;
623 		case DDI_DMA_PARTIAL_MAP:
624 			xi->i_cur_win = 0;
625 
626 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
627 			    DDI_SUCCESS) ||
628 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
629 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
630 			    DDI_SUCCESS) ||
631 			    (P2PHASE(len, shift) != 0)) {
632 				(void) ddi_dma_unbind_handle(xi->i_dmah);
633 				rv = EFAULT;
634 				goto done;
635 			}
636 			xi->i_len = len;
637 			xi->i_nblks = xi->i_len >> shift;
638 			xi->i_resid = bp->b_bcount;
639 			rv = 0;
640 			break;
641 		case DDI_DMA_NORESOURCES:
642 			rv = EAGAIN;
643 			goto done;
644 		case DDI_DMA_TOOBIG:
645 			rv = EINVAL;
646 			goto done;
647 		case DDI_DMA_NOMAPPING:
648 		case DDI_DMA_INUSE:
649 		default:
650 			rv = EFAULT;
651 			goto done;
652 		}
653 	}
654 
655 done:
656 	if (rv != 0) {
657 		kmem_cache_free(bd->d_cache, xi);
658 		bioerror(bp, rv);
659 		return (NULL);
660 	}
661 
662 	return (xi);
663 }
664 
665 static void
666 bd_xfer_free(bd_xfer_impl_t *xi)
667 {
668 	if (xi->i_dmah) {
669 		(void) ddi_dma_unbind_handle(xi->i_dmah);
670 	}
671 	kmem_cache_free(xi->i_bd->d_cache, xi);
672 }
673 
674 static int
675 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
676 {
677 	dev_t		dev = *devp;
678 	bd_t		*bd;
679 	minor_t		part;
680 	minor_t		inst;
681 	uint64_t	mask;
682 	boolean_t	ndelay;
683 	int		rv;
684 	diskaddr_t	nblks;
685 	diskaddr_t	lba;
686 
687 	_NOTE(ARGUNUSED(credp));
688 
689 	part = BDPART(dev);
690 	inst = BDINST(dev);
691 
692 	if (otyp >= OTYPCNT)
693 		return (EINVAL);
694 
695 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
696 
697 	/*
698 	 * Block any DR events from changing the set of registered
699 	 * devices while we function.
700 	 */
701 	rw_enter(&bd_lock, RW_READER);
702 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
703 		rw_exit(&bd_lock);
704 		return (ENXIO);
705 	}
706 
707 	mutex_enter(&bd->d_ocmutex);
708 
709 	ASSERT(part < 64);
710 	mask = (1U << part);
711 
712 	bd_update_state(bd);
713 
714 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
715 
716 		/* non-blocking opens are allowed to succeed */
717 		if (!ndelay) {
718 			rv = ENXIO;
719 			goto done;
720 		}
721 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
722 	    NULL, NULL, 0) == 0) {
723 
724 		/*
725 		 * We read the partinfo, verify valid ranges.  If the
726 		 * partition is invalid, and we aren't blocking or
727 		 * doing a raw access, then fail. (Non-blocking and
728 		 * raw accesses can still succeed to allow a disk with
729 		 * bad partition data to opened by format and fdisk.)
730 		 */
731 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
732 			rv = ENXIO;
733 			goto done;
734 		}
735 	} else if (!ndelay) {
736 		/*
737 		 * cmlb_partinfo failed -- invalid partition or no
738 		 * disk label.
739 		 */
740 		rv = ENXIO;
741 		goto done;
742 	}
743 
744 	if ((flag & FWRITE) && bd->d_rdonly) {
745 		rv = EROFS;
746 		goto done;
747 	}
748 
749 	if ((bd->d_open_excl) & (mask)) {
750 		rv = EBUSY;
751 		goto done;
752 	}
753 	if (flag & FEXCL) {
754 		if (bd->d_open_lyr[part]) {
755 			rv = EBUSY;
756 			goto done;
757 		}
758 		for (int i = 0; i < OTYP_LYR; i++) {
759 			if (bd->d_open_reg[i] & mask) {
760 				rv = EBUSY;
761 				goto done;
762 			}
763 		}
764 	}
765 
766 	if (otyp == OTYP_LYR) {
767 		bd->d_open_lyr[part]++;
768 	} else {
769 		bd->d_open_reg[otyp] |= mask;
770 	}
771 	if (flag & FEXCL) {
772 		bd->d_open_excl |= mask;
773 	}
774 
775 	rv = 0;
776 done:
777 	mutex_exit(&bd->d_ocmutex);
778 	rw_exit(&bd_lock);
779 
780 	return (rv);
781 }
782 
783 static int
784 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
785 {
786 	bd_t		*bd;
787 	minor_t		inst;
788 	minor_t		part;
789 	uint64_t	mask;
790 	boolean_t	last = B_TRUE;
791 
792 	_NOTE(ARGUNUSED(flag));
793 	_NOTE(ARGUNUSED(credp));
794 
795 	part = BDPART(dev);
796 	inst = BDINST(dev);
797 
798 	ASSERT(part < 64);
799 	mask = (1U << part);
800 
801 	rw_enter(&bd_lock, RW_READER);
802 
803 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
804 		rw_exit(&bd_lock);
805 		return (ENXIO);
806 	}
807 
808 	mutex_enter(&bd->d_ocmutex);
809 	if (bd->d_open_excl & mask) {
810 		bd->d_open_excl &= ~mask;
811 	}
812 	if (otyp == OTYP_LYR) {
813 		bd->d_open_lyr[part]--;
814 	} else {
815 		bd->d_open_reg[otyp] &= ~mask;
816 	}
817 	for (int i = 0; i < 64; i++) {
818 		if (bd->d_open_lyr[part]) {
819 			last = B_FALSE;
820 		}
821 	}
822 	for (int i = 0; last && (i < OTYP_LYR); i++) {
823 		if (bd->d_open_reg[i]) {
824 			last = B_FALSE;
825 		}
826 	}
827 	mutex_exit(&bd->d_ocmutex);
828 
829 	if (last) {
830 		cmlb_invalidate(bd->d_cmlbh, 0);
831 	}
832 	rw_exit(&bd_lock);
833 
834 	return (0);
835 }
836 
/*
 * dump(9E): write crash dump data to the device in polled mode.
 * Validates the request against the partition bounds, builds a
 * synchronous (BD_XFER_POLL) write transfer, and waits for it.
 */
static int
bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
{
	minor_t		inst;
	minor_t		part;
	diskaddr_t	pstart;
	diskaddr_t	psize;
	bd_t		*bd;
	bd_xfer_impl_t	*xi;
	buf_t		*bp;
	int		rv;

	rw_enter(&bd_lock, RW_READER);

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}
	/*
	 * do cmlb, but do it synchronously unless we already have the
	 * partition (which we probably should.)
	 */
	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
	    (void *)1)) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	/* The dump must fit entirely within the partition. */
	if ((blkno + nblk) > psize) {
		rw_exit(&bd_lock);
		return (EINVAL);
	}
	/* Cannot sleep: dump runs in a restricted context. */
	bp = getrbuf(KM_NOSLEEP);
	if (bp == NULL) {
		rw_exit(&bd_lock);
		return (ENOMEM);
	}

	bp->b_bcount = nblk << bd->d_blkshift;
	bp->b_resid = bp->b_bcount;
	bp->b_lblkno = blkno;
	bp->b_un.b_addr = caddr;

	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
	if (xi == NULL) {
		rw_exit(&bd_lock);
		freerbuf(bp);
		return (ENOMEM);
	}
	/* Translate the partition-relative block to an absolute one. */
	xi->i_blkno = blkno + pstart;
	xi->i_flags = BD_XFER_POLL;
	bd_submit(bd, xi);
	rw_exit(&bd_lock);

	/*
	 * Generally, we should have run this entirely synchronously
	 * at this point and the biowait call should be a no-op.  If
	 * it didn't happen this way, it's a bug in the underlying
	 * driver not honoring BD_XFER_POLL.
	 */
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}
905 
906 void
907 bd_minphys(struct buf *bp)
908 {
909 	minor_t inst;
910 	bd_t	*bd;
911 	inst = BDINST(bp->b_edev);
912 
913 	bd = ddi_get_soft_state(bd_state, inst);
914 
915 	/*
916 	 * In a non-debug kernel, bd_strategy will catch !bd as
917 	 * well, and will fail nicely.
918 	 */
919 	ASSERT(bd);
920 
921 	if (bp->b_bcount > bd->d_maxxfer)
922 		bp->b_bcount = bd->d_maxxfer;
923 }
924 
925 static int
926 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
927 {
928 	_NOTE(ARGUNUSED(credp));
929 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
930 }
931 
932 static int
933 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
934 {
935 	_NOTE(ARGUNUSED(credp));
936 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
937 }
938 
939 static int
940 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
941 {
942 	_NOTE(ARGUNUSED(credp));
943 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
944 }
945 
946 static int
947 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
948 {
949 	_NOTE(ARGUNUSED(credp));
950 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
951 }
952 
/*
 * strategy(9E): validate the buf against the partition, clip transfers
 * that run past the end of the partition, build a transfer context,
 * and submit it to the parent driver.  Always returns 0; errors are
 * reported through bioerror()/biodone().
 */
static int
bd_strategy(struct buf *bp)
{
	minor_t		inst;
	minor_t		part;
	bd_t		*bd;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	diskaddr_t	b_nblks;
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);

	ASSERT(bp);

	bp->b_resid = bp->b_bcount;

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
	    NULL, NULL, 0)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	shift = bd->d_blkshift;

	/* Reject non-block-multiple counts and starts past the end. */
	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
	    (bp->b_lblkno > p_nblks)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	b_nblks = bp->b_bcount >> shift;
	/* Start exactly at the end, or zero length: trivially done. */
	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
		biodone(bp);
		return (0);
	}

	/* Clip a transfer that runs off the end of the partition. */
	if ((b_nblks + bp->b_lblkno) > p_nblks) {
		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
		bp->b_bcount -= bp->b_resid;
	} else {
		bp->b_resid = 0;
	}
	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;

	/* Try without sleeping first, then fall back to pushpage. */
	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
	if (xi == NULL) {
		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
	}
	if (xi == NULL) {
		/* bd_request_alloc will have done bioerror */
		biodone(bp);
		return (0);
	}
	/* Translate the partition-relative block to an absolute one. */
	xi->i_blkno = bp->b_lblkno + p_lba;

	bd_submit(bd, xi);

	return (0);
}
1023 
/*
 * ioctl(9E): hand the request to cmlb first (it handles the label and
 * partition ioctls); anything cmlb does not recognize (ENOTTY) is
 * handled here — media info, controller info, removable/read-only
 * flags, media state waiting, and write cache flushing.
 */
static int
bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
{
	minor_t		inst;
	uint16_t	part;
	bd_t		*bd;
	void		*ptr = (void *)arg;
	int		rv;

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		return (ENXIO);
	}

	/* Give cmlb the first shot at label/partition ioctls. */
	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
	if (rv != ENOTTY)
		return (rv);

	switch (cmd) {
	case DKIOCGMEDIAINFO: {
		struct dk_minfo minfo;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&minfo, sizeof (minfo));
		minfo.dki_media_type = DK_FIXED_DISK;
		minfo.dki_lbsize = (1U << bd->d_blkshift);
		minfo.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext miext;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&miext, sizeof (miext));
		miext.dki_media_type = DK_FIXED_DISK;
		miext.dki_lbsize = (1U << bd->d_blkshift);
		/* physical block size reported equal to the logical size */
		miext.dki_pbsize = miext.dki_lbsize;
		miext.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo cinfo;
		bzero(&cinfo, sizeof (cinfo));
		cinfo.dki_ctype = DKC_BLKDEV;
		/* Controller identity comes from our parent nexus. */
		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
		    "%s", ddi_driver_name(bd->d_dip));
		cinfo.dki_unit = inst;
		cinfo.dki_flags = DKI_FMTVOL;
		cinfo.dki_partition = part;
		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
		cinfo.dki_addr = 0;
		cinfo.dki_slave = 0;
		cinfo.dki_space = 0;
		cinfo.dki_prio = 0;
		cinfo.dki_vec = 0;
		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i;
		i = bd->d_removable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCHOTPLUGGABLE: {
		int i;
		i = bd->d_hotpluggable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREADONLY: {
		int i;
		i = bd->d_rdonly ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSTATE: {
		/* Wait for the media state to differ from the caller's. */
		enum dkio_state	state;
		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
			return (EFAULT);
		}
		if ((rv = bd_check_state(bd, &state)) != 0) {
			return (rv);
		}
		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCFLUSHWRITECACHE: {
		struct dk_callback *dkc = NULL;

		/* Only kernel callers may pass a completion callback. */
		if (flag & FKIOCTL)
			dkc = (void *)arg;

		rv = bd_flush_write_cache(bd, dkc);
		return (rv);
	}

	default:
		break;

	}
	return (ENOTTY);
}
1150 
1151 static int
1152 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1153     char *name, caddr_t valuep, int *lengthp)
1154 {
1155 	bd_t	*bd;
1156 
1157 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1158 	if (bd == NULL)
1159 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1160 		    name, valuep, lengthp));
1161 
1162 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1163 	    valuep, lengthp, BDPART(dev), 0));
1164 }
1165 
1166 
/*
 * cmlb target-read/write callback: perform a synchronous transfer of
 * whole blocks on behalf of the label code.  A non-NULL tg_cookie
 * indicates polled (e.g. dump) context, where sleeping allocations
 * are forbidden and the transfer must be polled (BD_XFER_POLL).
 */
static int
bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	bd_t		*bd;
	buf_t		*bp;
	bd_xfer_impl_t	*xi;
	int		rv;
	int		(*func)(void *, bd_xfer_t *);
	int		kmflag;

	/*
	 * If we are running in polled mode (such as during dump(9e)
	 * execution), then we cannot sleep for kernel allocations.
	 */
	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;

	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	if ((bp = getrbuf(kmflag)) == NULL) {
		return (ENOMEM);
	}

	switch (cmd) {
	case TG_READ:
		bp->b_flags = B_READ;
		func = bd->d_ops.o_read;
		break;
	case TG_WRITE:
		bp->b_flags = B_WRITE;
		func = bd->d_ops.o_write;
		break;
	default:
		freerbuf(bp);
		return (EINVAL);
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	xi = bd_xfer_alloc(bd, bp, func, kmflag);
	if (xi == NULL) {
		/* bd_xfer_alloc set the error on the buf */
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}
	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
	xi->i_blkno = start;
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1226 
1227 static int
1228 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1229 {
1230 	bd_t		*bd;
1231 
1232 	_NOTE(ARGUNUSED(tg_cookie));
1233 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1234 
1235 	switch (cmd) {
1236 	case TG_GETPHYGEOM:
1237 	case TG_GETVIRTGEOM:
1238 		/*
1239 		 * We don't have any "geometry" as such, let cmlb
1240 		 * fabricate something.
1241 		 */
1242 		return (ENOTTY);
1243 
1244 	case TG_GETCAPACITY:
1245 		bd_update_state(bd);
1246 		*(diskaddr_t *)arg = bd->d_numblks;
1247 		return (0);
1248 
1249 	case TG_GETBLOCKSIZE:
1250 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1251 		return (0);
1252 
1253 	case TG_GETATTR:
1254 		/*
1255 		 * It turns out that cmlb really doesn't do much for
1256 		 * non-writable media, but lets make the information
1257 		 * available for it in case it does more in the
1258 		 * future.  (The value is currently used for
1259 		 * triggering special behavior for CD-ROMs.)
1260 		 */
1261 		bd_update_state(bd);
1262 		((tg_attribute_t *)arg)->media_is_writable =
1263 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1264 		return (0);
1265 
1266 	default:
1267 		return (EINVAL);
1268 	}
1269 }
1270 
1271 
1272 static void
1273 bd_sched(bd_t *bd)
1274 {
1275 	bd_xfer_impl_t	*xi;
1276 	struct buf	*bp;
1277 	int		rv;
1278 
1279 	mutex_enter(&bd->d_iomutex);
1280 
1281 	while ((bd->d_qactive < bd->d_qsize) &&
1282 	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1283 		bd->d_qactive++;
1284 		kstat_waitq_to_runq(bd->d_kiop);
1285 		list_insert_tail(&bd->d_runq, xi);
1286 
1287 		/*
1288 		 * Submit the job to the driver.  We drop the I/O mutex
1289 		 * so that we can deal with the case where the driver
1290 		 * completion routine calls back into us synchronously.
1291 		 */
1292 
1293 		mutex_exit(&bd->d_iomutex);
1294 
1295 		rv = xi->i_func(bd->d_private, &xi->i_public);
1296 		if (rv != 0) {
1297 			bp = xi->i_bp;
1298 			bd_xfer_free(xi);
1299 			bioerror(bp, rv);
1300 			biodone(bp);
1301 
1302 			mutex_enter(&bd->d_iomutex);
1303 			bd->d_qactive--;
1304 			kstat_runq_exit(bd->d_kiop);
1305 			list_remove(&bd->d_runq, xi);
1306 		} else {
1307 			mutex_enter(&bd->d_iomutex);
1308 		}
1309 	}
1310 
1311 	mutex_exit(&bd->d_iomutex);
1312 }
1313 
/*
 * Queue a transfer on the wait queue (updating the I/O kstat wait
 * counters) and kick the scheduler to dispatch it if a queue slot
 * is free.
 */
static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	mutex_enter(&bd->d_iomutex);
	list_insert_tail(&bd->d_waitq, xi);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_iomutex);

	/* bd_sched() takes d_iomutex itself; must not hold it here. */
	bd_sched(bd);
}
1324 
/*
 * Remove a transfer from the run queue on completion, credit the I/O
 * kstat operation/byte counters on success, and reschedule so any
 * waiting transfer can take the freed queue slot.
 */
static void
bd_runq_exit(bd_xfer_impl_t *xi, int err)
{
	bd_t	*bd = xi->i_bd;
	buf_t	*bp = xi->i_bp;

	mutex_enter(&bd->d_iomutex);
	bd->d_qactive--;
	kstat_runq_exit(bd->d_kiop);
	list_remove(&bd->d_runq, xi);
	mutex_exit(&bd->d_iomutex);

	if (err == 0) {
		/* Count only the bytes actually transferred. */
		if (bp->b_flags & B_READ) {
			bd->d_kiop->reads++;
			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
		} else {
			bd->d_kiop->writes++;
			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
		}
	}
	bd_sched(bd);
}
1348 
/*
 * Query the driver's o_media_info() entry point and refresh the
 * cached media state: block size (d_blkshift), capacity (d_numblks)
 * and the read-only flag.  Any state transition broadcasts d_statecv
 * (waking bd_check_state() waiters); when the media or its geometry
 * changed, the cmlb label is revalidated or invalidated accordingly.
 */
static void
bd_update_state(bd_t *bd)
{
	enum	dkio_state	state;
	bd_media_t		media;
	boolean_t		docmlb = B_FALSE;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
		if ((1U << bd->d_blkshift) != media.m_blksize) {
			/*
			 * Block size changed.  It must be at least 512,
			 * a power of two, and must evenly divide the
			 * maximum transfer size to be usable.
			 */
			if ((media.m_blksize < 512) ||
			    (!ISP2(media.m_blksize)) ||
			    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
				cmn_err(CE_WARN,
				    "%s%d: Invalid media block size (%d)",
				    ddi_driver_name(bd->d_dip),
				    ddi_get_instance(bd->d_dip),
				    media.m_blksize);
				/*
				 * We can't use the media, treat it as
				 * not present.
				 */
				state = DKIO_EJECTED;
				bd->d_numblks = 0;
			} else {
				/* ddi_ffs() gives log2 for a power of 2 */
				bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
				bd->d_numblks = media.m_nblks;
				bd->d_rdonly = media.m_readonly;
				state = DKIO_INSERTED;
			}

			/* Device size changed */
			docmlb = B_TRUE;

		} else {
			if (bd->d_numblks != media.m_nblks) {
				/* Device size changed */
				docmlb = B_TRUE;
			}
			bd->d_numblks = media.m_nblks;
			bd->d_rdonly = media.m_readonly;
			state = DKIO_INSERTED;
		}

	} else {
		/* No media info available: treat the media as ejected. */
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
	}
	if (state != bd->d_state) {
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	/* Label (re)validation is done without holding d_statemutex. */
	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}
1414 
/*
 * Block until the media state differs from *state, then return the
 * new state through *state.  The driver is re-polled roughly once
 * per second and whenever d_statecv is broadcast.  Returns 0 on a
 * state change, or EINTR if the wait was interrupted by a signal.
 * (This backs the DKIOCSTATE ioctl.)
 */
static int
bd_check_state(bd_t *bd, enum dkio_state *state)
{
	clock_t		when;

	for (;;) {

		bd_update_state(bd);

		mutex_enter(&bd->d_statemutex);

		if (bd->d_state != *state) {
			*state = bd->d_state;
			mutex_exit(&bd->d_statemutex);
			break;
		}

		/* Wake at least once per second to re-poll the driver. */
		when = drv_usectohz(1000000);
		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
		    when, TR_CLOCK_TICK) == 0) {
			/* Zero return means interrupted by a signal. */
			mutex_exit(&bd->d_statemutex);
			return (EINTR);
		}

		mutex_exit(&bd->d_statemutex);
	}

	return (0);
}
1444 
/*
 * b_iodone handler for an asynchronous cache flush: report the flush
 * status through the caller's dk_callback, then free the private
 * callback copy (allocated in bd_flush_write_cache()) and the buf.
 */
static int
bd_flush_write_cache_done(struct buf *bp)
{
	struct dk_callback *dc = (void *)bp->b_private;

	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
	kmem_free(dc, sizeof (*dc));
	freerbuf(bp);
	return (0);
}
1455 
/*
 * Issue a cache flush to the underlying driver.  When the caller
 * supplied a dk_callback with a callback function (only possible via
 * FKIOCTL on DKIOCFLUSHWRITECACHE -- see the ioctl handler), the
 * flush is asynchronous and completion is delivered through
 * bd_flush_write_cache_done(); otherwise we wait for it.  Returns
 * ENOTSUP when the driver has no o_sync_cache entry point.
 */
static int
bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
{
	buf_t			*bp;
	struct dk_callback	*dc;
	bd_xfer_impl_t		*xi;
	int			rv;

	if (bd->d_ops.o_sync_cache == NULL) {
		return (ENOTSUP);
	}
	/* NB: getrbuf() cannot fail with KM_SLEEP; check kept for safety. */
	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
		return (ENOMEM);
	}
	bp->b_resid = 0;
	bp->b_bcount = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}

	/* Make an asynchronous flush, but only if there is a callback */
	if (dkc != NULL && dkc->dkc_callback != NULL) {
		/* Make a private copy of the callback structure */
		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
		*dc = *dkc;
		/* dc and bp are freed by bd_flush_write_cache_done() */
		bp->b_private = dc;
		bp->b_iodone = bd_flush_write_cache_done;

		bd_submit(bd, xi);
		return (0);
	}

	/* In case there is no callback, perform a synchronous flush */
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1500 
1501 /*
1502  * Nexus support.
1503  */
1504 int
1505 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1506     void *arg, void *result)
1507 {
1508 	bd_handle_t	hdl;
1509 
1510 	switch (ctlop) {
1511 	case DDI_CTLOPS_REPORTDEV:
1512 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1513 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1514 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1515 		return (DDI_SUCCESS);
1516 
1517 	case DDI_CTLOPS_INITCHILD:
1518 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1519 		if (hdl == NULL) {
1520 			return (DDI_NOT_WELL_FORMED);
1521 		}
1522 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1523 		return (DDI_SUCCESS);
1524 
1525 	case DDI_CTLOPS_UNINITCHILD:
1526 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1527 		ndi_prop_remove_all((dev_info_t *)arg);
1528 		return (DDI_SUCCESS);
1529 
1530 	default:
1531 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1532 	}
1533 }
1534 
1535 /*
1536  * Functions for device drivers.
1537  */
1538 bd_handle_t
1539 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1540 {
1541 	bd_handle_t	hdl;
1542 
1543 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1544 	if (hdl != NULL) {
1545 		hdl->h_ops = *ops;
1546 		hdl->h_dma = dma;
1547 		hdl->h_private = private;
1548 	}
1549 
1550 	return (hdl);
1551 }
1552 
/*
 * Release a handle allocated by bd_alloc_handle().
 */
void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
1558 
/*
 * Create and online the "blkdev" child node for a drive.  The unit
 * address is the target (and LUN, when the driver reports one) in
 * hex; the handle is hung off the child as parent-private data so
 * bd_bus_ctl() can name the node at DDI_CTLOPS_INITCHILD time.
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
int
bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
{
	dev_info_t	*child;
	bd_drive_t	drive;

	/* if drivers don't override this, make it assume none */
	drive.d_lun = -1;
	hdl->h_ops.o_drive_info(hdl->h_private, &drive);

	hdl->h_parent = dip;
	hdl->h_name = "blkdev";

	if (drive.d_lun >= 0) {
		/* Unit address is "target,lun" when a LUN was reported. */
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
		    drive.d_target, drive.d_lun);
	} else {
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
		    drive.d_target);
	}
	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
	    &child) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    "blkdev", hdl->h_addr);
		return (DDI_FAILURE);
	}

	/* Stash the handle for bd_bus_ctl()'s INITCHILD handling. */
	ddi_set_parent_data(child, hdl);
	hdl->h_child = child;

	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    hdl->h_name, hdl->h_addr);
		(void) ndi_devi_free(child);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
1600 
1601 int
1602 bd_detach_handle(bd_handle_t hdl)
1603 {
1604 	int	circ;
1605 	int	rv;
1606 	char	*devnm;
1607 
1608 	if (hdl->h_child == NULL) {
1609 		return (DDI_SUCCESS);
1610 	}
1611 	ndi_devi_enter(hdl->h_parent, &circ);
1612 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1613 		rv = ddi_remove_child(hdl->h_child, 0);
1614 	} else {
1615 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1616 		(void) ddi_deviname(hdl->h_child, devnm);
1617 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1618 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1619 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1620 		kmem_free(devnm, MAXNAMELEN + 1);
1621 	}
1622 	if (rv == 0) {
1623 		hdl->h_child = NULL;
1624 	}
1625 
1626 	ndi_devi_exit(hdl->h_parent, circ);
1627 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1628 }
1629 
1630 void
1631 bd_xfer_done(bd_xfer_t *xfer, int err)
1632 {
1633 	bd_xfer_impl_t	*xi = (void *)xfer;
1634 	buf_t		*bp = xi->i_bp;
1635 	int		rv = DDI_SUCCESS;
1636 	bd_t		*bd = xi->i_bd;
1637 	size_t		len;
1638 
1639 	if (err != 0) {
1640 		bd_runq_exit(xi, err);
1641 
1642 		bp->b_resid += xi->i_resid;
1643 		bd_xfer_free(xi);
1644 		bioerror(bp, err);
1645 		biodone(bp);
1646 		return;
1647 	}
1648 
1649 	xi->i_cur_win++;
1650 	xi->i_resid -= xi->i_len;
1651 
1652 	if (xi->i_resid == 0) {
1653 		/* Job completed succcessfully! */
1654 		bd_runq_exit(xi, 0);
1655 
1656 		bd_xfer_free(xi);
1657 		biodone(bp);
1658 		return;
1659 	}
1660 
1661 	xi->i_blkno += xi->i_nblks;
1662 
1663 	if (bd->d_use_dma) {
1664 		/* More transfer still pending... advance to next DMA window. */
1665 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1666 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1667 	} else {
1668 		/* Advance memory window. */
1669 		xi->i_kaddr += xi->i_len;
1670 		xi->i_offset += xi->i_len;
1671 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1672 	}
1673 
1674 
1675 	if ((rv != DDI_SUCCESS) ||
1676 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1677 		bd_runq_exit(xi, EFAULT);
1678 
1679 		bp->b_resid += xi->i_resid;
1680 		bd_xfer_free(xi);
1681 		bioerror(bp, EFAULT);
1682 		biodone(bp);
1683 		return;
1684 	}
1685 	xi->i_len = len;
1686 	xi->i_nblks = len >> xi->i_blkshift;
1687 
1688 	/* Submit next window to hardware. */
1689 	rv = xi->i_func(bd->d_private, &xi->i_public);
1690 	if (rv != 0) {
1691 		bd_runq_exit(xi, rv);
1692 
1693 		bp->b_resid += xi->i_resid;
1694 		bd_xfer_free(xi);
1695 		bioerror(bp, rv);
1696 		biodone(bp);
1697 	}
1698 }
1699 
1700 void
1701 bd_state_change(bd_handle_t hdl)
1702 {
1703 	bd_t		*bd;
1704 
1705 	if ((bd = hdl->h_bd) != NULL) {
1706 		bd_update_state(bd);
1707 	}
1708 }
1709 
/*
 * Install the blkdev nexus bus_ops vector into a parent driver's
 * dev_ops so its children are handled by bd_bus_ctl().  Intended to
 * be called from the driver's _init(9E).
 */
void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
1752 
/*
 * Undo bd_mod_init(); intended to be called from the parent driver's
 * _fini(9E).
 */
void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}
1758