xref: /freebsd/sys/geom/geom_ccd.c (revision 77a0943ded95b9e6438f7db70c4a28e4d93946d4)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include "ccd.h"
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/kernel.h>
95 #include <sys/module.h>
96 #include <sys/proc.h>
97 #include <sys/bio.h>
98 #include <sys/malloc.h>
99 #include <sys/namei.h>
100 #include <sys/conf.h>
101 #include <sys/stat.h>
102 #include <sys/sysctl.h>
103 #include <sys/disklabel.h>
104 #include <ufs/ffs/fs.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
108 
109 #include <sys/ccdvar.h>
110 
/*
 * CCDDEBUG is the driver-local way of asking for DEBUG: defining it
 * turns DEBUG on for this file if it is not already on.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/* Bits tested against ccddebug to select which trace output is printed. */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* All trace classes enabled by default; exported as sysctl debug.ccddebug. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * NOTE(review): DEBUG is undefined again right here, so the later
 * "#ifdef DEBUG" sections in this file are compiled out unless something
 * re-defines DEBUG in between -- confirm that this is intentional.
 */
#undef DEBUG
#endif
126 
127 #define	ccdunit(x)	dkunit(x)
128 #define ccdpart(x)	dkpart(x)
129 
130 /*
131    This is how mirroring works (only writes are special):
132 
133    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
134    linked together by the cb_mirror field.  "cb_pflags &
135    CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 
137    When a component returns to ccdiodone(), it checks if "cb_pflags &
138    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
139    flag and returns.  If it is, it means its partner has already
140    returned, so it will go to the regular cleanup.
141 
142  */
143 
/*
 * Per-component I/O descriptor.  One of these is built by ccdbuffer() for
 * every piece of an original request that is sent to a component.
 *
 * cb_buf must remain the first member: ccdiodone() receives a pointer to
 * it and casts that pointer straight back to a struct ccdbuf pointer.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
153 
/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/*
 * Map any device node of a ccd unit to the node for that unit's raw
 * partition (same major, same unit, partition RAW_PART).
 */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
159 
160 static d_open_t ccdopen;
161 static d_close_t ccdclose;
162 static d_strategy_t ccdstrategy;
163 static d_ioctl_t ccdioctl;
164 static d_dump_t ccddump;
165 static d_psize_t ccdsize;
166 
167 #define NCCDFREEHIWAT	16
168 
169 #define CDEV_MAJOR 74
170 #define BDEV_MAJOR 21
171 
/*
 * Device switch table for the ccd driver.  The initializer is positional;
 * the comment in front of each entry names the cdevsw slot it fills.
 * Read and write use the generic physread/physwrite entry points.
 */
static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	ccddump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
	/* bmaj */	BDEV_MAJOR
};
188 
189 /* called during module initialization */
190 static	void ccdattach __P((void));
191 static	int ccd_modevent __P((module_t, int, void *));
192 
193 /* called by biodone() at interrupt time */
194 static	void ccdiodone __P((struct bio *bp));
195 
196 static	void ccdstart __P((struct ccd_softc *, struct bio *));
197 static	void ccdinterleave __P((struct ccd_softc *, int));
198 static	void ccdintr __P((struct ccd_softc *, struct bio *));
199 static	int ccdinit __P((struct ccddevice *, char **, struct proc *));
200 static	int ccdlookup __P((char *, struct proc *p, struct vnode **));
201 static	void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
202 		struct bio *, daddr_t, caddr_t, long));
203 static	void ccdgetdisklabel __P((dev_t));
204 static	void ccdmakedisklabel __P((struct ccd_softc *));
205 static	int ccdlock __P((struct ccd_softc *));
206 static	void ccdunlock __P((struct ccd_softc *));
207 
208 #ifdef DEBUG
209 static	void printiinfo __P((struct ccdiinfo *));
210 #endif
211 
/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;	/* per-unit state, allocated in ccdattach() */
struct	ccddevice *ccddevs;	/* per-unit config, allocated in ccdattach() */
struct	ccdbuf *ccdfreebufs;	/* free list head, linked via cb_freenext */
static	int numccdfreebufs;	/* number of buffers on ccdfreebufs */
static	int numccd = 0;		/* unit count; stays 0 until ccdattach() succeeds */
218 
219 /*
220  * getccdbuf() -	Allocate and zero a ccd buffer.
221  *
222  *	This routine is called at splbio().
223  */
224 
225 static __inline
226 struct ccdbuf *
227 getccdbuf(struct ccdbuf *cpy)
228 {
229 	struct ccdbuf *cbp;
230 
231 	/*
232 	 * Allocate from freelist or malloc as necessary
233 	 */
234 	if ((cbp = ccdfreebufs) != NULL) {
235 		ccdfreebufs = cbp->cb_freenext;
236 		--numccdfreebufs;
237 	} else {
238 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
239 	}
240 
241 	/*
242 	 * Used by mirroring code
243 	 */
244 	if (cpy)
245 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
246 	else
247 		bzero(cbp, sizeof(struct ccdbuf));
248 
249 	/*
250 	 * independant struct bio initialization
251 	 */
252 
253 	return(cbp);
254 }
255 
256 /*
257  * putccdbuf() -	Free a ccd buffer.
258  *
259  *	This routine is called at splbio().
260  */
261 
262 static __inline
263 void
264 putccdbuf(struct ccdbuf *cbp)
265 {
266 
267 	if (numccdfreebufs < NCCDFREEHIWAT) {
268 		cbp->cb_freenext = ccdfreebufs;
269 		ccdfreebufs = cbp;
270 		++numccdfreebufs;
271 	} else {
272 		free((caddr_t)cbp, M_DEVBUF);
273 	}
274 }
275 
276 
277 /*
 * Number of blocks to leave untouched in front of a component partition.
279  * This is to avoid violating its disklabel area when it starts at the
280  * beginning of the slice.
281  */
282 #if !defined(CCD_OFFSET)
283 #define CCD_OFFSET 16
284 #endif
285 
286 static void
287 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
288 {
289 	int i, u;
290 	char *s;
291 
292 	if (*dev != NODEV)
293 		return;
294 	i = dev_stdclone(name, &s, "ccd", &u);
295 	if (i != 2)
296 		return;
297 	if (u >= numccd)
298 		return;
299 	if (*s <= 'a' || *s >= 'h')
300 		return;
301 	if (s[1] != '\0')
302 		return;
303 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
304 		UID_ROOT, GID_OPERATOR, 0640, name);
305 }
306 
307 /*
308  * Called by main() during pseudo-device attachment.  All we need
309  * to do is allocate enough space for devices to be configured later, and
310  * add devsw entries.
311  */
312 static void
313 ccdattach()
314 {
315 	int i;
316 	int num = NCCD;
317 
318 	if (num > 1)
319 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
320 	else
321 		printf("ccd0: Concatenated disk driver\n");
322 
323 	ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
324 	    M_DEVBUF, M_NOWAIT);
325 	ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
326 	    M_DEVBUF, M_NOWAIT);
327 	if ((ccd_softc == NULL) || (ccddevs == NULL)) {
328 		printf("WARNING: no memory for concatenated disks\n");
329 		if (ccd_softc != NULL)
330 			free(ccd_softc, M_DEVBUF);
331 		if (ccddevs != NULL)
332 			free(ccddevs, M_DEVBUF);
333 		return;
334 	}
335 	numccd = num;
336 	bzero(ccd_softc, num * sizeof(struct ccd_softc));
337 	bzero(ccddevs, num * sizeof(struct ccddevice));
338 
339 	cdevsw_add(&ccd_cdevsw);
340 	/* XXX: is this necessary? */
341 	for (i = 0; i < numccd; ++i)
342 		ccddevs[i].ccd_dk = -1;
343 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
344 }
345 
346 static int
347 ccd_modevent(mod, type, data)
348 	module_t mod;
349 	int type;
350 	void *data;
351 {
352 	int error = 0;
353 
354 	switch (type) {
355 	case MOD_LOAD:
356 		ccdattach();
357 		break;
358 
359 	case MOD_UNLOAD:
360 		printf("ccd0: Unload not supported!\n");
361 		error = EOPNOTSUPP;
362 		break;
363 
364 	default:	/* MOD_SHUTDOWN etc */
365 		break;
366 	}
367 	return (error);
368 }
369 
370 DEV_MODULE(ccd, ccd_modevent, NULL);
371 
372 static int
373 ccdinit(ccd, cpaths, p)
374 	struct ccddevice *ccd;
375 	char **cpaths;
376 	struct proc *p;
377 {
378 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
379 	struct ccdcinfo *ci = NULL;	/* XXX */
380 	size_t size;
381 	int ix;
382 	struct vnode *vp;
383 	size_t minsize;
384 	int maxsecsize;
385 	struct partinfo dpart;
386 	struct ccdgeom *ccg = &cs->sc_geom;
387 	char tmppath[MAXPATHLEN];
388 	int error = 0;
389 
390 #ifdef DEBUG
391 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
392 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
393 #endif
394 
395 	cs->sc_size = 0;
396 	cs->sc_ileave = ccd->ccd_interleave;
397 	cs->sc_nccdisks = ccd->ccd_ndev;
398 
399 	/* Allocate space for the component info. */
400 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
401 	    M_DEVBUF, M_WAITOK);
402 
403 	/*
404 	 * Verify that each component piece exists and record
405 	 * relevant information about it.
406 	 */
407 	maxsecsize = 0;
408 	minsize = 0;
409 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
410 		vp = ccd->ccd_vpp[ix];
411 		ci = &cs->sc_cinfo[ix];
412 		ci->ci_vp = vp;
413 
414 		/*
415 		 * Copy in the pathname of the component.
416 		 */
417 		bzero(tmppath, sizeof(tmppath));	/* sanity */
418 		if ((error = copyinstr(cpaths[ix], tmppath,
419 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
420 #ifdef DEBUG
421 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
422 				printf("ccd%d: can't copy path, error = %d\n",
423 				    ccd->ccd_unit, error);
424 #endif
425 			goto fail;
426 		}
427 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
428 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
429 
430 		ci->ci_dev = vn_todev(vp);
431 
432 		/*
433 		 * Get partition information for the component.
434 		 */
435 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
436 		    FREAD, p->p_ucred, p)) != 0) {
437 #ifdef DEBUG
438 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
439 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
440 				     ccd->ccd_unit, ci->ci_path, error);
441 #endif
442 			goto fail;
443 		}
444 		if (dpart.part->p_fstype == FS_BSDFFS) {
445 			maxsecsize =
446 			    ((dpart.disklab->d_secsize > maxsecsize) ?
447 			    dpart.disklab->d_secsize : maxsecsize);
448 			size = dpart.part->p_size - CCD_OFFSET;
449 		} else {
450 #ifdef DEBUG
451 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
452 				printf("ccd%d: %s: incorrect partition type\n",
453 				    ccd->ccd_unit, ci->ci_path);
454 #endif
455 			error = EFTYPE;
456 			goto fail;
457 		}
458 
459 		/*
460 		 * Calculate the size, truncating to an interleave
461 		 * boundary if necessary.
462 		 */
463 
464 		if (cs->sc_ileave > 1)
465 			size -= size % cs->sc_ileave;
466 
467 		if (size == 0) {
468 #ifdef DEBUG
469 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
470 				printf("ccd%d: %s: size == 0\n",
471 				    ccd->ccd_unit, ci->ci_path);
472 #endif
473 			error = ENODEV;
474 			goto fail;
475 		}
476 
477 		if (minsize == 0 || size < minsize)
478 			minsize = size;
479 		ci->ci_size = size;
480 		cs->sc_size += size;
481 	}
482 
483 	/*
484 	 * Don't allow the interleave to be smaller than
485 	 * the biggest component sector.
486 	 */
487 	if ((cs->sc_ileave > 0) &&
488 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
489 #ifdef DEBUG
490 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
491 			printf("ccd%d: interleave must be at least %d\n",
492 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
493 #endif
494 		error = EINVAL;
495 		goto fail;
496 	}
497 
498 	/*
499 	 * If uniform interleave is desired set all sizes to that of
500 	 * the smallest component.  This will guarentee that a single
501 	 * interleave table is generated.
502 	 *
503 	 * Lost space must be taken into account when calculating the
504 	 * overall size.  Half the space is lost when CCDF_MIRROR is
505 	 * specified.  One disk is lost when CCDF_PARITY is specified.
506 	 */
507 	if (ccd->ccd_flags & CCDF_UNIFORM) {
508 		for (ci = cs->sc_cinfo;
509 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
510 			ci->ci_size = minsize;
511 		}
512 		if (ccd->ccd_flags & CCDF_MIRROR) {
513 			/*
514 			 * Check to see if an even number of components
515 			 * have been specified.  The interleave must also
516 			 * be non-zero in order for us to be able to
517 			 * guarentee the topology.
518 			 */
519 			if (cs->sc_nccdisks % 2) {
520 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
521 				error = EINVAL;
522 				goto fail;
523 			}
524 			if (cs->sc_ileave == 0) {
525 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
526 				error = EINVAL;
527 				goto fail;
528 			}
529 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
530 		} else if (ccd->ccd_flags & CCDF_PARITY) {
531 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
532 		} else {
533 			if (cs->sc_ileave == 0) {
534 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
535 				error = EINVAL;
536 				goto fail;
537 			}
538 			cs->sc_size = cs->sc_nccdisks * minsize;
539 		}
540 	}
541 
542 	/*
543 	 * Construct the interleave table.
544 	 */
545 	ccdinterleave(cs, ccd->ccd_unit);
546 
547 	/*
548 	 * Create pseudo-geometry based on 1MB cylinders.  It's
549 	 * pretty close.
550 	 */
551 	ccg->ccg_secsize = maxsecsize;
552 	ccg->ccg_ntracks = 1;
553 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
554 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
555 
556 	/*
557 	 * Add an devstat entry for this device.
558 	 */
559 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
560 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
561 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
562 			  DEVSTAT_PRIORITY_ARRAY);
563 
564 	cs->sc_flags |= CCDF_INITED;
565 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
566 	cs->sc_unit = ccd->ccd_unit;
567 	return (0);
568 fail:
569 	while (ci > cs->sc_cinfo) {
570 		ci--;
571 		free(ci->ci_path, M_DEVBUF);
572 	}
573 	free(cs->sc_cinfo, M_DEVBUF);
574 	return (error);
575 }
576 
/*
 * ccdinterleave() - Build the interleave table cs->sc_itable.
 *
 *	cs	unit whose sc_cinfo[], sc_nccdisks and sc_ileave describe
 *		the components; sc_itable is allocated and filled in here
 *	unit	unit number (not referenced by the code below)
 *
 * The table is an array of struct ccdiinfo terminated by an entry whose
 * ii_ndisk is 0.  Each entry carries its own malloc()ed ii_index array
 * listing the component indices that take part in that entry.
 */
static void
ccdinterleave(cs, unit)
	struct ccd_softc *cs;
	int unit;
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;

#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
#endif

	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.  One extra entry is allocated for the terminator.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
	bzero((caddr_t)cs->sc_itable, size);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror or parity setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
			ii->ii_ndisk = 1;
			/* bn is the running sum of the preceding sizes. */
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* Terminating entry. */
		ii->ii_ndisk = 0;
#ifdef DEBUG
		if (ccddebug & CCDB_INIT)
			printiinfo(cs->sc_itable);
#endif
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 *
	 * "size" is the component size already covered by earlier entries;
	 * every pass handles the next larger distinct component size.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_DEVBUF, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		/* Advance by the blocks this entry contributes on ix disks. */
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printiinfo(cs->sc_itable);
#endif
}
699 
700 /* ARGSUSED */
701 static int
702 ccdopen(dev, flags, fmt, p)
703 	dev_t dev;
704 	int flags, fmt;
705 	struct proc *p;
706 {
707 	int unit = ccdunit(dev);
708 	struct ccd_softc *cs;
709 	struct disklabel *lp;
710 	int error = 0, part, pmask;
711 
712 #ifdef DEBUG
713 	if (ccddebug & CCDB_FOLLOW)
714 		printf("ccdopen(%x, %x)\n", dev, flags);
715 #endif
716 	if (unit >= numccd)
717 		return (ENXIO);
718 	cs = &ccd_softc[unit];
719 
720 	if ((error = ccdlock(cs)) != 0)
721 		return (error);
722 
723 	lp = &cs->sc_label;
724 
725 	part = ccdpart(dev);
726 	pmask = (1 << part);
727 
728 	/*
729 	 * If we're initialized, check to see if there are any other
730 	 * open partitions.  If not, then it's safe to update
731 	 * the in-core disklabel.
732 	 */
733 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
734 		ccdgetdisklabel(dev);
735 
736 	/* Check that the partition exists. */
737 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
738 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
739 		error = ENXIO;
740 		goto done;
741 	}
742 
743 	cs->sc_openmask |= pmask;
744  done:
745 	ccdunlock(cs);
746 	return (0);
747 }
748 
749 /* ARGSUSED */
750 static int
751 ccdclose(dev, flags, fmt, p)
752 	dev_t dev;
753 	int flags, fmt;
754 	struct proc *p;
755 {
756 	int unit = ccdunit(dev);
757 	struct ccd_softc *cs;
758 	int error = 0, part;
759 
760 #ifdef DEBUG
761 	if (ccddebug & CCDB_FOLLOW)
762 		printf("ccdclose(%x, %x)\n", dev, flags);
763 #endif
764 
765 	if (unit >= numccd)
766 		return (ENXIO);
767 	cs = &ccd_softc[unit];
768 
769 	if ((error = ccdlock(cs)) != 0)
770 		return (error);
771 
772 	part = ccdpart(dev);
773 
774 	/* ...that much closer to allowing unconfiguration... */
775 	cs->sc_openmask &= ~(1 << part);
776 	ccdunlock(cs);
777 	return (0);
778 }
779 
780 static void
781 ccdstrategy(bp)
782 	struct bio *bp;
783 {
784 	int unit = ccdunit(bp->bio_dev);
785 	struct ccd_softc *cs = &ccd_softc[unit];
786 	int s;
787 	int wlabel;
788 	struct disklabel *lp;
789 
790 #ifdef DEBUG
791 	if (ccddebug & CCDB_FOLLOW)
792 		printf("ccdstrategy(%x): unit %d\n", bp, unit);
793 #endif
794 	if ((cs->sc_flags & CCDF_INITED) == 0) {
795 		bp->bio_error = ENXIO;
796 		bp->bio_flags |= BIO_ERROR;
797 		goto done;
798 	}
799 
800 	/* If it's a nil transfer, wake up the top half now. */
801 	if (bp->bio_bcount == 0)
802 		goto done;
803 
804 	lp = &cs->sc_label;
805 
806 	/*
807 	 * Do bounds checking and adjust transfer.  If there's an
808 	 * error, the bounds check will flag that for us.
809 	 */
810 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
811 	if (ccdpart(bp->bio_dev) != RAW_PART) {
812 		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
813 			goto done;
814 	} else {
815 		int pbn;        /* in sc_secsize chunks */
816 		long sz;        /* in sc_secsize chunks */
817 
818 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
819 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
820 
821 		/*
822 		 * If out of bounds return an error. If at the EOF point,
823 		 * simply read or write less.
824 		 */
825 
826 		if (pbn < 0 || pbn >= cs->sc_size) {
827 			bp->bio_resid = bp->bio_bcount;
828 			if (pbn != cs->sc_size) {
829 				bp->bio_error = EINVAL;
830 				bp->bio_flags |= BIO_ERROR;
831 			}
832 			goto done;
833 		}
834 
835 		/*
836 		 * If the request crosses EOF, truncate the request.
837 		 */
838 		if (pbn + sz > cs->sc_size) {
839 			bp->bio_bcount = (cs->sc_size - pbn) *
840 			    cs->sc_geom.ccg_secsize;
841 		}
842 	}
843 
844 	bp->bio_resid = bp->bio_bcount;
845 
846 	/*
847 	 * "Start" the unit.
848 	 */
849 	s = splbio();
850 	ccdstart(cs, bp);
851 	splx(s);
852 	return;
853 done:
854 	biodone(bp);
855 }
856 
857 static void
858 ccdstart(cs, bp)
859 	struct ccd_softc *cs;
860 	struct bio *bp;
861 {
862 	long bcount, rcount;
863 	struct ccdbuf *cbp[4];
864 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
865 	caddr_t addr;
866 	daddr_t bn;
867 	struct partition *pp;
868 
869 #ifdef DEBUG
870 	if (ccddebug & CCDB_FOLLOW)
871 		printf("ccdstart(%x, %x)\n", cs, bp);
872 #endif
873 
874 	/* Record the transaction start  */
875 	devstat_start_transaction(&cs->device_stats);
876 
877 	/*
878 	 * Translate the partition-relative block number to an absolute.
879 	 */
880 	bn = bp->bio_blkno;
881 	if (ccdpart(bp->bio_dev) != RAW_PART) {
882 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
883 		bn += pp->p_offset;
884 	}
885 
886 	/*
887 	 * Allocate component buffers and fire off the requests
888 	 */
889 	addr = bp->bio_data;
890 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
891 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
892 		rcount = cbp[0]->cb_buf.bio_bcount;
893 
894 		if (cs->sc_cflags & CCDF_MIRROR) {
895 			/*
896 			 * Mirroring.  Writes go to both disks, reads are
897 			 * taken from whichever disk seems most appropriate.
898 			 *
899 			 * We attempt to localize reads to the disk whos arm
900 			 * is nearest the read request.  We ignore seeks due
901 			 * to writes when making this determination and we
902 			 * also try to avoid hogging.
903 			 */
904 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
905 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
906 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
907 			} else {
908 				int pick = cs->sc_pick;
909 				daddr_t range = cs->sc_size / 16;
910 
911 				if (bn < cs->sc_blk[pick] - range ||
912 				    bn > cs->sc_blk[pick] + range
913 				) {
914 					cs->sc_pick = pick = 1 - pick;
915 				}
916 				cs->sc_blk[pick] = bn + btodb(rcount);
917 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
918 			}
919 		} else {
920 			/*
921 			 * Not mirroring
922 			 */
923 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
924 		}
925 		bn += btodb(rcount);
926 		addr += rcount;
927 	}
928 }
929 
/*
 * Build a component buffer header.
 *
 *	cb	out: cb[0] receives the component buffer; when the unit is
 *		mirrored cb[1] receives a second buffer for the mirror
 *	cs	unit the request is for
 *	bp	original request
 *	bn	starting block of this piece, in DEV_BSIZE units
 *	addr	data address for this piece
 *	bcount	bytes still to be transferred
 *
 * The resulting buffer covers at most bcount bytes and never extends past
 * the end of the component (no interleave) or the end of the current
 * interleave chunk (interleaved), so the caller may need several calls to
 * cover one request.
 */
static void
ccdbuffer(cb, cs, bp, bn, addr, bcount)
	struct ccdbuf **cb;
	struct ccd_softc *cs;
	struct bio *bp;
	daddr_t bn;
	caddr_t addr;
	long bcount;
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
		       cs, bp, bn, addr, bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		/* Skip whole components until bn falls inside one. */
		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use: the last entry
		 * whose ii_startblk is not beyond cbn.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.  The block number is shifted
	 * by CCD_OFFSET, the blocks skipped at the front of each component.
	 */
	cbp = getccdbuf(NULL);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	/* cbc: bytes left in this component / in this interleave chunk. */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_unit = cs - ccd_softc;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
		       ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.bio_blkno,
		       cbp->cb_buf.bio_data, cbp->cb_buf.bio_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O as a copy aimed at the partner */
		cbp = getccdbuf(cb[0]);
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1105 
1106 static void
1107 ccdintr(cs, bp)
1108 	struct ccd_softc *cs;
1109 	struct bio *bp;
1110 {
1111 #ifdef DEBUG
1112 	if (ccddebug & CCDB_FOLLOW)
1113 		printf("ccdintr(%x, %x)\n", cs, bp);
1114 #endif
1115 	/*
1116 	 * Request is done for better or worse, wakeup the top half.
1117 	 */
1118 	if (bp->bio_flags & BIO_ERROR)
1119 		bp->bio_resid = bp->bio_bcount;
1120 	devstat_end_transaction_bio(&cs->device_stats, bp);
1121 	biodone(bp);
1122 }
1123 
1124 /*
1125  * Called at interrupt time.
1126  * Mark the component as done and if all components are done,
1127  * take a ccd interrupt.
1128  */
1129 static void
1130 ccdiodone(ibp)
1131 	struct bio *ibp;
1132 {
1133 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1134 	struct bio *bp = cbp->cb_obp;
1135 	int unit = cbp->cb_unit;
1136 	int count, s;
1137 
1138 	s = splbio();
1139 #ifdef DEBUG
1140 	if (ccddebug & CCDB_FOLLOW)
1141 		printf("ccdiodone(%x)\n", cbp);
1142 	if (ccddebug & CCDB_IO) {
1143 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1144 		       bp, bp->bio_bcount, bp->bio_resid);
1145 		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1146 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1147 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1148 		       cbp->cb_buf.bio_bcount);
1149 	}
1150 #endif
1151 	/*
1152 	 * If an error occured, report it.  If this is a mirrored
1153 	 * configuration and the first of two possible reads, do not
1154 	 * set the error in the bp yet because the second read may
1155 	 * succeed.
1156 	 */
1157 
1158 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1159 		const char *msg = "";
1160 
1161 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1162 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1163 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1164 			/*
1165 			 * We will try our read on the other disk down
1166 			 * below, also reverse the default pick so if we
1167 			 * are doing a scan we do not keep hitting the
1168 			 * bad disk first.
1169 			 */
1170 			struct ccd_softc *cs = &ccd_softc[unit];
1171 
1172 			msg = ", trying other disk";
1173 			cs->sc_pick = 1 - cs->sc_pick;
1174 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1175 		} else {
1176 			bp->bio_flags |= BIO_ERROR;
1177 			bp->bio_error = cbp->cb_buf.bio_error ?
1178 			    cbp->cb_buf.bio_error : EIO;
1179 		}
1180 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1181 		       unit, bp->bio_error, cbp->cb_comp,
1182 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1183 	}
1184 
1185 	/*
1186 	 * Process mirror.  If we are writing, I/O has been initiated on both
1187 	 * buffers and we fall through only after both are finished.
1188 	 *
1189 	 * If we are reading only one I/O is initiated at a time.  If an
1190 	 * error occurs we initiate the second I/O and return, otherwise
1191 	 * we free the second I/O without initiating it.
1192 	 */
1193 
1194 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1195 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1196 			/*
1197 			 * When writing, handshake with the second buffer
1198 			 * to determine when both are done.  If both are not
1199 			 * done, return here.
1200 			 */
1201 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1203 				putccdbuf(cbp);
1204 				splx(s);
1205 				return;
1206 			}
1207 		} else {
1208 			/*
1209 			 * When reading, either dispose of the second buffer
1210 			 * or initiate I/O on the second buffer if an error
1211 			 * occured with this one.
1212 			 */
1213 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1214 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1215 					cbp->cb_mirror->cb_pflags |=
1216 					    CCDPF_MIRROR_DONE;
1217 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1218 					putccdbuf(cbp);
1219 					splx(s);
1220 					return;
1221 				} else {
1222 					putccdbuf(cbp->cb_mirror);
1223 					/* fall through */
1224 				}
1225 			}
1226 		}
1227 	}
1228 
1229 	/*
1230 	 * use bio_caller1 to determine how big the original request was rather
1231 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1232 	 *
1233 	 * XXX We check for an error, but we do not test the resid for an
1234 	 * aligned EOF condition.  This may result in character & block
1235 	 * device access not recognizing EOF properly when read or written
1236 	 * sequentially, but will not effect filesystems.
1237 	 */
1238 	count = (long)cbp->cb_buf.bio_caller1;
1239 	putccdbuf(cbp);
1240 
1241 	/*
1242 	 * If all done, "interrupt".
1243 	 */
1244 	bp->bio_resid -= count;
1245 	if (bp->bio_resid < 0)
1246 		panic("ccdiodone: count");
1247 	if (bp->bio_resid == 0)
1248 		ccdintr(&ccd_softc[unit], bp);
1249 	splx(s);
1250 }
1251 
1252 static int
1253 ccdioctl(dev, cmd, data, flag, p)
1254 	dev_t dev;
1255 	u_long cmd;
1256 	caddr_t data;
1257 	int flag;
1258 	struct proc *p;
1259 {
1260 	int unit = ccdunit(dev);
1261 	int i, j, lookedup = 0, error = 0;
1262 	int part, pmask, s;
1263 	struct ccd_softc *cs;
1264 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1265 	struct ccddevice ccd;
1266 	char **cpp;
1267 	struct vnode **vpp;
1268 
1269 	if (unit >= numccd)
1270 		return (ENXIO);
1271 	cs = &ccd_softc[unit];
1272 
1273 	bzero(&ccd, sizeof(ccd));
1274 
1275 	switch (cmd) {
1276 	case CCDIOCSET:
1277 		if (cs->sc_flags & CCDF_INITED)
1278 			return (EBUSY);
1279 
1280 		if ((flag & FWRITE) == 0)
1281 			return (EBADF);
1282 
1283 		if ((error = ccdlock(cs)) != 0)
1284 			return (error);
1285 
1286 		/* Fill in some important bits. */
1287 		ccd.ccd_unit = unit;
1288 		ccd.ccd_interleave = ccio->ccio_ileave;
1289 		if (ccd.ccd_interleave == 0 &&
1290 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1291 		     (ccio->ccio_flags & CCDF_PARITY))) {
1292 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1293 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1294 		}
1295 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1296 		    (ccio->ccio_flags & CCDF_PARITY)) {
1297 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1298 			ccio->ccio_flags &= ~CCDF_PARITY;
1299 		}
1300 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1301 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1302 			printf("ccd%d: mirror/parity forces uniform flag\n",
1303 			       unit);
1304 			ccio->ccio_flags |= CCDF_UNIFORM;
1305 		}
1306 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1307 
1308 		/*
1309 		 * Allocate space for and copy in the array of
1310 		 * componet pathnames and device numbers.
1311 		 */
1312 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1313 		    M_DEVBUF, M_WAITOK);
1314 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1315 		    M_DEVBUF, M_WAITOK);
1316 
1317 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1318 		    ccio->ccio_ndisks * sizeof(char **));
1319 		if (error) {
1320 			free(vpp, M_DEVBUF);
1321 			free(cpp, M_DEVBUF);
1322 			ccdunlock(cs);
1323 			return (error);
1324 		}
1325 
1326 #ifdef DEBUG
1327 		if (ccddebug & CCDB_INIT)
1328 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1329 				printf("ccdioctl: component %d: 0x%x\n",
1330 				    i, cpp[i]);
1331 #endif
1332 
1333 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1334 #ifdef DEBUG
1335 			if (ccddebug & CCDB_INIT)
1336 				printf("ccdioctl: lookedup = %d\n", lookedup);
1337 #endif
1338 			if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1339 				for (j = 0; j < lookedup; ++j)
1340 					(void)vn_close(vpp[j], FREAD|FWRITE,
1341 					    p->p_ucred, p);
1342 				free(vpp, M_DEVBUF);
1343 				free(cpp, M_DEVBUF);
1344 				ccdunlock(cs);
1345 				return (error);
1346 			}
1347 			++lookedup;
1348 		}
1349 		ccd.ccd_cpp = cpp;
1350 		ccd.ccd_vpp = vpp;
1351 		ccd.ccd_ndev = ccio->ccio_ndisks;
1352 
1353 		/*
1354 		 * Initialize the ccd.  Fills in the softc for us.
1355 		 */
1356 		if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1357 			for (j = 0; j < lookedup; ++j)
1358 				(void)vn_close(vpp[j], FREAD|FWRITE,
1359 				    p->p_ucred, p);
1360 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1361 			free(vpp, M_DEVBUF);
1362 			free(cpp, M_DEVBUF);
1363 			ccdunlock(cs);
1364 			return (error);
1365 		}
1366 
1367 		/*
1368 		 * The ccd has been successfully initialized, so
1369 		 * we can place it into the array and read the disklabel.
1370 		 */
1371 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1372 		ccio->ccio_unit = unit;
1373 		ccio->ccio_size = cs->sc_size;
1374 		ccdgetdisklabel(dev);
1375 
1376 		ccdunlock(cs);
1377 
1378 		break;
1379 
1380 	case CCDIOCCLR:
1381 		if ((cs->sc_flags & CCDF_INITED) == 0)
1382 			return (ENXIO);
1383 
1384 		if ((flag & FWRITE) == 0)
1385 			return (EBADF);
1386 
1387 		if ((error = ccdlock(cs)) != 0)
1388 			return (error);
1389 
1390 		/* Don't unconfigure if any other partitions are open */
1391 		part = ccdpart(dev);
1392 		pmask = (1 << part);
1393 		if ((cs->sc_openmask & ~pmask)) {
1394 			ccdunlock(cs);
1395 			return (EBUSY);
1396 		}
1397 
1398 		/*
1399 		 * Free ccd_softc information and clear entry.
1400 		 */
1401 
1402 		/* Close the components and free their pathnames. */
1403 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1404 			/*
1405 			 * XXX: this close could potentially fail and
1406 			 * cause Bad Things.  Maybe we need to force
1407 			 * the close to happen?
1408 			 */
1409 #ifdef DEBUG
1410 			if (ccddebug & CCDB_VNODE)
1411 				vprint("CCDIOCCLR: vnode info",
1412 				    cs->sc_cinfo[i].ci_vp);
1413 #endif
1414 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1415 			    p->p_ucred, p);
1416 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1417 		}
1418 
1419 		/* Free interleave index. */
1420 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1421 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1422 
1423 		/* Free component info and interleave table. */
1424 		free(cs->sc_cinfo, M_DEVBUF);
1425 		free(cs->sc_itable, M_DEVBUF);
1426 		cs->sc_flags &= ~CCDF_INITED;
1427 
1428 		/*
1429 		 * Free ccddevice information and clear entry.
1430 		 */
1431 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1432 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1433 		ccd.ccd_dk = -1;
1434 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1435 
1436 		/*
1437 		 * And remove the devstat entry.
1438 		 */
1439 		devstat_remove_entry(&cs->device_stats);
1440 
1441 		/* This must be atomic. */
1442 		s = splhigh();
1443 		ccdunlock(cs);
1444 		bzero(cs, sizeof(struct ccd_softc));
1445 		splx(s);
1446 
1447 		break;
1448 
1449 	case DIOCGDINFO:
1450 		if ((cs->sc_flags & CCDF_INITED) == 0)
1451 			return (ENXIO);
1452 
1453 		*(struct disklabel *)data = cs->sc_label;
1454 		break;
1455 
1456 	case DIOCGPART:
1457 		if ((cs->sc_flags & CCDF_INITED) == 0)
1458 			return (ENXIO);
1459 
1460 		((struct partinfo *)data)->disklab = &cs->sc_label;
1461 		((struct partinfo *)data)->part =
1462 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1463 		break;
1464 
1465 	case DIOCWDINFO:
1466 	case DIOCSDINFO:
1467 		if ((cs->sc_flags & CCDF_INITED) == 0)
1468 			return (ENXIO);
1469 
1470 		if ((flag & FWRITE) == 0)
1471 			return (EBADF);
1472 
1473 		if ((error = ccdlock(cs)) != 0)
1474 			return (error);
1475 
1476 		cs->sc_flags |= CCDF_LABELLING;
1477 
1478 		error = setdisklabel(&cs->sc_label,
1479 		    (struct disklabel *)data, 0);
1480 		if (error == 0) {
1481 			if (cmd == DIOCWDINFO)
1482 				error = writedisklabel(CCDLABELDEV(dev),
1483 				    &cs->sc_label);
1484 		}
1485 
1486 		cs->sc_flags &= ~CCDF_LABELLING;
1487 
1488 		ccdunlock(cs);
1489 
1490 		if (error)
1491 			return (error);
1492 		break;
1493 
1494 	case DIOCWLABEL:
1495 		if ((cs->sc_flags & CCDF_INITED) == 0)
1496 			return (ENXIO);
1497 
1498 		if ((flag & FWRITE) == 0)
1499 			return (EBADF);
1500 		if (*(int *)data != 0)
1501 			cs->sc_flags |= CCDF_WLABEL;
1502 		else
1503 			cs->sc_flags &= ~CCDF_WLABEL;
1504 		break;
1505 
1506 	default:
1507 		return (ENOTTY);
1508 	}
1509 
1510 	return (0);
1511 }
1512 
1513 static int
1514 ccdsize(dev)
1515 	dev_t dev;
1516 {
1517 	struct ccd_softc *cs;
1518 	int part, size;
1519 
1520 	if (ccdopen(dev, 0, S_IFCHR, curproc))
1521 		return (-1);
1522 
1523 	cs = &ccd_softc[ccdunit(dev)];
1524 	part = ccdpart(dev);
1525 
1526 	if ((cs->sc_flags & CCDF_INITED) == 0)
1527 		return (-1);
1528 
1529 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1530 		size = -1;
1531 	else
1532 		size = cs->sc_label.d_partitions[part].p_size;
1533 
1534 	if (ccdclose(dev, 0, S_IFCHR, curproc))
1535 		return (-1);
1536 
1537 	return (size);
1538 }
1539 
/*
 * Crash dump entry point.  Dumping to a ccd is not supported, so every
 * call fails with ENXIO regardless of the device passed in.
 */
static int
ccddump(dev)
	dev_t dev;
{
	int rv = ENXIO;		/* Not implemented. */

	return (rv);
}
1548 
1549 /*
1550  * Lookup the provided name in the filesystem.  If the file exists,
1551  * is a valid block device, and isn't being used by anyone else,
1552  * set *vpp to the file's vnode.
1553  */
1554 static int
1555 ccdlookup(path, p, vpp)
1556 	char *path;
1557 	struct proc *p;
1558 	struct vnode **vpp;	/* result */
1559 {
1560 	struct nameidata nd;
1561 	struct vnode *vp;
1562 	int error, flags;
1563 
1564 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1565 	flags = FREAD | FWRITE;
1566 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1567 #ifdef DEBUG
1568 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1569 			printf("ccdlookup: vn_open error = %d\n", error);
1570 #endif
1571 		return (error);
1572 	}
1573 	vp = nd.ni_vp;
1574 
1575 	if (vp->v_usecount > 1) {
1576 		error = EBUSY;
1577 		goto bad;
1578 	}
1579 
1580 	if (!vn_isdisk(vp, &error))
1581 		goto bad;
1582 
1583 #ifdef DEBUG
1584 	if (ccddebug & CCDB_VNODE)
1585 		vprint("ccdlookup: vnode info", vp);
1586 #endif
1587 
1588 	VOP_UNLOCK(vp, 0, p);
1589 	NDFREE(&nd, NDF_ONLY_PNBUF);
1590 	*vpp = vp;
1591 	return (0);
1592 bad:
1593 	VOP_UNLOCK(vp, 0, p);
1594 	NDFREE(&nd, NDF_ONLY_PNBUF);
1595 	/* vn_close does vrele() for vp */
1596 	(void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1597 	return (error);
1598 }
1599 
1600 /*
1601  * Read the disklabel from the ccd.  If one is not present, fake one
1602  * up.
1603  */
1604 static void
1605 ccdgetdisklabel(dev)
1606 	dev_t dev;
1607 {
1608 	int unit = ccdunit(dev);
1609 	struct ccd_softc *cs = &ccd_softc[unit];
1610 	char *errstring;
1611 	struct disklabel *lp = &cs->sc_label;
1612 	struct ccdgeom *ccg = &cs->sc_geom;
1613 
1614 	bzero(lp, sizeof(*lp));
1615 
1616 	lp->d_secperunit = cs->sc_size;
1617 	lp->d_secsize = ccg->ccg_secsize;
1618 	lp->d_nsectors = ccg->ccg_nsectors;
1619 	lp->d_ntracks = ccg->ccg_ntracks;
1620 	lp->d_ncylinders = ccg->ccg_ncylinders;
1621 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1622 
1623 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1624 	lp->d_type = DTYPE_CCD;
1625 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1626 	lp->d_rpm = 3600;
1627 	lp->d_interleave = 1;
1628 	lp->d_flags = 0;
1629 
1630 	lp->d_partitions[RAW_PART].p_offset = 0;
1631 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1632 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1633 	lp->d_npartitions = RAW_PART + 1;
1634 
1635 	lp->d_bbsize = BBSIZE;				/* XXX */
1636 	lp->d_sbsize = SBSIZE;				/* XXX */
1637 
1638 	lp->d_magic = DISKMAGIC;
1639 	lp->d_magic2 = DISKMAGIC;
1640 	lp->d_checksum = dkcksum(&cs->sc_label);
1641 
1642 	/*
1643 	 * Call the generic disklabel extraction routine.
1644 	 */
1645 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1646 	if (errstring != NULL)
1647 		ccdmakedisklabel(cs);
1648 
1649 #ifdef DEBUG
1650 	/* It's actually extremely common to have unlabeled ccds. */
1651 	if (ccddebug & CCDB_LABEL)
1652 		if (errstring != NULL)
1653 			printf("ccd%d: %s\n", unit, errstring);
1654 #endif
1655 }
1656 
1657 /*
1658  * Take care of things one might want to take care of in the event
1659  * that a disklabel isn't present.
1660  */
1661 static void
1662 ccdmakedisklabel(cs)
1663 	struct ccd_softc *cs;
1664 {
1665 	struct disklabel *lp = &cs->sc_label;
1666 
1667 	/*
1668 	 * For historical reasons, if there's no disklabel present
1669 	 * the raw partition must be marked FS_BSDFFS.
1670 	 */
1671 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1672 
1673 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1674 }
1675 
1676 /*
1677  * Wait interruptibly for an exclusive lock.
1678  *
1679  * XXX
1680  * Several drivers do this; it should be abstracted and made MP-safe.
1681  */
1682 static int
1683 ccdlock(cs)
1684 	struct ccd_softc *cs;
1685 {
1686 	int error;
1687 
1688 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689 		cs->sc_flags |= CCDF_WANTED;
1690 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1691 			return (error);
1692 	}
1693 	cs->sc_flags |= CCDF_LOCKED;
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Unlock and wake up any waiters.
1699  */
1700 static void
1701 ccdunlock(cs)
1702 	struct ccd_softc *cs;
1703 {
1704 
1705 	cs->sc_flags &= ~CCDF_LOCKED;
1706 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1707 		cs->sc_flags &= ~CCDF_WANTED;
1708 		wakeup(cs);
1709 	}
1710 }
1711 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table, one line per
 * entry, followed by that entry's component index list.  The table is
 * walked until an entry with ii_ndisk == 0 is reached.
 */
static void
printiinfo(ii)
	struct ccdiinfo *ii;
{
	int entry, slot;

	entry = 0;
	while (ii->ii_ndisk) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		slot = 0;
		while (slot < ii->ii_ndisk) {
			printf(" %d", ii->ii_index[slot]);
			slot++;
		}
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1728