xref: /freebsd/sys/geom/geom_ccd.c (revision 0fddbf874719b9bd50cf66ac26d1140bb3f2be69)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
106 
107 #include <sys/ccdvar.h>
108 
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
110 
111 #if defined(CCDDEBUG) && !defined(DEBUG)
112 #define DEBUG
113 #endif
114 
115 #ifdef DEBUG
116 #define CCDB_FOLLOW	0x01
117 #define CCDB_INIT	0x02
118 #define CCDB_IO		0x04
119 #define CCDB_LABEL	0x08
120 #define CCDB_VNODE	0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
122     CCDB_VNODE;
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
124 #endif
125 
126 #define	ccdunit(x)	dkunit(x)
127 #define ccdpart(x)	dkpart(x)
128 
129 /*
130    This is how mirroring works (only writes are special):
131 
132    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133    linked together by the cb_mirror field.  "cb_pflags &
134    CCDPF_MIRROR_DONE" is set to 0 on both of them.
135 
136    When a component returns to ccdiodone(), it checks if "cb_pflags &
137    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
138    flag and returns.  If it is, it means its partner has already
139    returned, so it will go to the regular cleanup.
140 
141  */
142 
143 struct ccdbuf {
144 	struct bio	cb_buf;		/* component I/O request; kept first so ccdiodone() can cast the bio pointer back to a ccdbuf */
145 	struct bio	*cb_obp;	/* ptr. to the original I/O request this one was split from */
146 	struct ccdbuf	*cb_freenext;	/* link used while on the ccdfreebufs free list */
147 	int		cb_unit;	/* ccd unit the request belongs to */
148 	int		cb_comp;	/* index of the target component in sc_cinfo */
149 	int		cb_pflags;	/* mirror/parity status flags (CCDPF_*) */
150 	struct ccdbuf	*cb_mirror;	/* partner buffer when mirroring */
151 };
152 
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
155 
156 #define CCDLABELDEV(dev)	\
157 	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
158 
159 /* convenient macros for often-used statements */
160 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
161 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
162 
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
169 
170 #define NCCDFREEHIWAT	16
171 
172 #define CDEV_MAJOR 74
173 
174 static struct cdevsw ccd_cdevsw = {
175 	/* open */	ccdopen,
176 	/* close */	ccdclose,
177 	/* read */	physread,
178 	/* write */	physwrite,
179 	/* ioctl */	ccdioctl,
180 	/* poll */	nopoll,
181 	/* mmap */	nommap,
182 	/* strategy */	ccdstrategy,
183 	/* name */	"ccd",
184 	/* maj */	CDEV_MAJOR,
185 	/* dump */	ccddump,
186 	/* psize */	ccdsize,
187 	/* flags */	D_DISK,
188 };
189 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 
191 static struct ccd_s *ccdfind(int);
192 static struct ccd_s *ccdnew(int);
193 static int ccddestroy(struct ccd_s *, struct proc *);
194 
195 /* called during module initialization */
196 static void ccdattach(void);
197 static int ccd_modevent(module_t, int, void *);
198 
199 /* called by biodone() at interrupt time */
200 static void ccdiodone(struct bio *bp);
201 
202 static void ccdstart(struct ccd_s *, struct bio *);
203 static void ccdinterleave(struct ccd_s *, int);
204 static void ccdintr(struct ccd_s *, struct bio *);
205 static int ccdinit(struct ccd_s *, char **, struct proc *);
206 static int ccdlookup(char *, struct proc *p, struct vnode **);
207 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
208 		      struct bio *, daddr_t, caddr_t, long);
209 static void ccdgetdisklabel(dev_t);
210 static void ccdmakedisklabel(struct ccd_s *);
211 static int ccdlock(struct ccd_s *);
212 static void ccdunlock(struct ccd_s *);
213 
214 #ifdef DEBUG
215 static void printiinfo(struct ccdiinfo *);
216 #endif
217 
218 /* Non-private for the benefit of libkvm. */
219 struct ccdbuf *ccdfreebufs;
220 static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
280 /*
281  * Number of blocks to leave untouched in front of a component partition.
282  * This is to avoid violating its disklabel area when it starts at the
283  * beginning of the slice.
284  */
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
287 #endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	default:	/* MOD_SHUTDOWN etc */
376 		break;
377 	}
378 	return (error);
379 }
380 
381 DEV_MODULE(ccd, ccd_modevent, NULL);
382 
383 static int
384 ccdinit(struct ccd_s *cs, char **cpaths, struct proc *p)
385 {
386 	struct ccdcinfo *ci = NULL;	/* XXX */
387 	size_t size;
388 	int ix;
389 	struct vnode *vp;
390 	size_t minsize;
391 	int maxsecsize;
392 	struct partinfo dpart;
393 	struct ccdgeom *ccg = &cs->sc_geom;
394 	char tmppath[MAXPATHLEN];
395 	int error = 0;
396 
397 #ifdef DEBUG
398 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
399 		printf("ccdinit: unit %d\n", cs->sc_unit);
400 #endif
401 
402 	cs->sc_size = 0;
403 
404 	/* Allocate space for the component info. */
405 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
406 	    M_DEVBUF, M_WAITOK);
407 
408 	/*
409 	 * Verify that each component piece exists and record
410 	 * relevant information about it.
411 	 */
412 	maxsecsize = 0;
413 	minsize = 0;
414 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
415 		vp = cs->sc_vpp[ix];
416 		ci = &cs->sc_cinfo[ix];
417 		ci->ci_vp = vp;
418 
419 		/*
420 		 * Copy in the pathname of the component.
421 		 */
422 		bzero(tmppath, sizeof(tmppath));	/* sanity */
423 		if ((error = copyinstr(cpaths[ix], tmppath,
424 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
425 #ifdef DEBUG
426 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
427 				printf("ccd%d: can't copy path, error = %d\n",
428 				    cs->sc_unit, error);
429 #endif
430 			goto fail;
431 		}
432 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
433 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
434 
435 		ci->ci_dev = vn_todev(vp);
436 
437 		/*
438 		 * Get partition information for the component.
439 		 */
440 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
441 		    FREAD, p->p_ucred, p)) != 0) {
442 #ifdef DEBUG
443 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
444 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
445 				     cs->sc_unit, ci->ci_path, error);
446 #endif
447 			goto fail;
448 		}
449 		if (dpart.part->p_fstype == FS_BSDFFS) {
450 			maxsecsize =
451 			    ((dpart.disklab->d_secsize > maxsecsize) ?
452 			    dpart.disklab->d_secsize : maxsecsize);
453 			size = dpart.part->p_size - CCD_OFFSET;
454 		} else {
455 #ifdef DEBUG
456 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
457 				printf("ccd%d: %s: incorrect partition type\n",
458 				    cs->sc_unit, ci->ci_path);
459 #endif
460 			error = EFTYPE;
461 			goto fail;
462 		}
463 
464 		/*
465 		 * Calculate the size, truncating to an interleave
466 		 * boundary if necessary.
467 		 */
468 
469 		if (cs->sc_ileave > 1)
470 			size -= size % cs->sc_ileave;
471 
472 		if (size == 0) {
473 #ifdef DEBUG
474 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
475 				printf("ccd%d: %s: size == 0\n",
476 				    cs->sc_unit, ci->ci_path);
477 #endif
478 			error = ENODEV;
479 			goto fail;
480 		}
481 
482 		if (minsize == 0 || size < minsize)
483 			minsize = size;
484 		ci->ci_size = size;
485 		cs->sc_size += size;
486 	}
487 
488 	/*
489 	 * Don't allow the interleave to be smaller than
490 	 * the biggest component sector.
491 	 */
492 	if ((cs->sc_ileave > 0) &&
493 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
494 #ifdef DEBUG
495 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
496 			printf("ccd%d: interleave must be at least %d\n",
497 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
498 #endif
499 		error = EINVAL;
500 		goto fail;
501 	}
502 
503 	/*
504 	 * If uniform interleave is desired set all sizes to that of
505 	 * the smallest component.  This will guarentee that a single
506 	 * interleave table is generated.
507 	 *
508 	 * Lost space must be taken into account when calculating the
509 	 * overall size.  Half the space is lost when CCDF_MIRROR is
510 	 * specified.  One disk is lost when CCDF_PARITY is specified.
511 	 */
512 	if (cs->sc_flags & CCDF_UNIFORM) {
513 		for (ci = cs->sc_cinfo;
514 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
515 			ci->ci_size = minsize;
516 		}
517 		if (cs->sc_flags & CCDF_MIRROR) {
518 			/*
519 			 * Check to see if an even number of components
520 			 * have been specified.  The interleave must also
521 			 * be non-zero in order for us to be able to
522 			 * guarentee the topology.
523 			 */
524 			if (cs->sc_nccdisks % 2) {
525 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
526 				error = EINVAL;
527 				goto fail;
528 			}
529 			if (cs->sc_ileave == 0) {
530 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
531 				error = EINVAL;
532 				goto fail;
533 			}
534 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
535 		} else if (cs->sc_flags & CCDF_PARITY) {
536 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
537 		} else {
538 			if (cs->sc_ileave == 0) {
539 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
540 				error = EINVAL;
541 				goto fail;
542 			}
543 			cs->sc_size = cs->sc_nccdisks * minsize;
544 		}
545 	}
546 
547 	/*
548 	 * Construct the interleave table.
549 	 */
550 	ccdinterleave(cs, cs->sc_unit);
551 
552 	/*
553 	 * Create pseudo-geometry based on 1MB cylinders.  It's
554 	 * pretty close.
555 	 */
556 	ccg->ccg_secsize = maxsecsize;
557 	ccg->ccg_ntracks = 1;
558 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
559 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
560 
561 	/*
562 	 * Add an devstat entry for this device.
563 	 */
564 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
565 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
566 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
567 			  DEVSTAT_PRIORITY_ARRAY);
568 
569 	cs->sc_flags |= CCDF_INITED;
570 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
571 	return (0);
572 fail:
573 	while (ci > cs->sc_cinfo) {
574 		ci--;
575 		free(ci->ci_path, M_DEVBUF);
576 	}
577 	free(cs->sc_cinfo, M_DEVBUF);
578 	return (error);
579 }
580 
581 static void
582 ccdinterleave(struct ccd_s *cs, int unit)
583 {
584 	struct ccdcinfo *ci, *smallci;
585 	struct ccdiinfo *ii;
586 	daddr_t bn, lbn;
587 	int ix;
588 	u_long size;
589 
590 #ifdef DEBUG
591 	if (ccddebug & CCDB_INIT)
592 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
593 #endif
594 
595 	/*
596 	 * Allocate an interleave table.  The worst case occurs when each
597 	 * of N disks is of a different size, resulting in N interleave
598 	 * tables.
599 	 *
600 	 * Chances are this is too big, but we don't care.
601 	 */
602 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
603 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
604 	    M_WAITOK | M_ZERO);
605 
606 	/*
607 	 * Trivial case: no interleave (actually interleave of disk size).
608 	 * Each table entry represents a single component in its entirety.
609 	 *
610 	 * An interleave of 0 may not be used with a mirror or parity setup.
611 	 */
612 	if (cs->sc_ileave == 0) {
613 		bn = 0;
614 		ii = cs->sc_itable;
615 
616 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
617 			/* Allocate space for ii_index. */
618 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
619 			ii->ii_ndisk = 1;
620 			ii->ii_startblk = bn;
621 			ii->ii_startoff = 0;
622 			ii->ii_index[0] = ix;
623 			bn += cs->sc_cinfo[ix].ci_size;
624 			ii++;
625 		}
626 		ii->ii_ndisk = 0;
627 #ifdef DEBUG
628 		if (ccddebug & CCDB_INIT)
629 			printiinfo(cs->sc_itable);
630 #endif
631 		return;
632 	}
633 
634 	/*
635 	 * The following isn't fast or pretty; it doesn't have to be.
636 	 */
637 	size = 0;
638 	bn = lbn = 0;
639 	for (ii = cs->sc_itable; ; ii++) {
640 		/*
641 		 * Allocate space for ii_index.  We might allocate more then
642 		 * we use.
643 		 */
644 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
645 		    M_DEVBUF, M_WAITOK);
646 
647 		/*
648 		 * Locate the smallest of the remaining components
649 		 */
650 		smallci = NULL;
651 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
652 		    ci++) {
653 			if (ci->ci_size > size &&
654 			    (smallci == NULL ||
655 			     ci->ci_size < smallci->ci_size)) {
656 				smallci = ci;
657 			}
658 		}
659 
660 		/*
661 		 * Nobody left, all done
662 		 */
663 		if (smallci == NULL) {
664 			ii->ii_ndisk = 0;
665 			break;
666 		}
667 
668 		/*
669 		 * Record starting logical block using an sc_ileave blocksize.
670 		 */
671 		ii->ii_startblk = bn / cs->sc_ileave;
672 
673 		/*
674 		 * Record starting comopnent block using an sc_ileave
675 		 * blocksize.  This value is relative to the beginning of
676 		 * a component disk.
677 		 */
678 		ii->ii_startoff = lbn;
679 
680 		/*
681 		 * Determine how many disks take part in this interleave
682 		 * and record their indices.
683 		 */
684 		ix = 0;
685 		for (ci = cs->sc_cinfo;
686 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
687 			if (ci->ci_size >= smallci->ci_size) {
688 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
689 			}
690 		}
691 		ii->ii_ndisk = ix;
692 		bn += ix * (smallci->ci_size - size);
693 		lbn = smallci->ci_size / cs->sc_ileave;
694 		size = smallci->ci_size;
695 	}
696 #ifdef DEBUG
697 	if (ccddebug & CCDB_INIT)
698 		printiinfo(cs->sc_itable);
699 #endif
700 }
701 
702 /* ARGSUSED */
703 static int
704 ccdopen(dev_t dev, int flags, int fmt, struct proc *p)
705 {
706 	int unit = ccdunit(dev);
707 	struct ccd_s *cs;
708 	struct disklabel *lp;
709 	int error = 0, part, pmask;
710 
711 #ifdef DEBUG
712 	if (ccddebug & CCDB_FOLLOW)
713 		printf("ccdopen(%p, %x)\n", dev, flags);
714 #endif
715 
716 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
717 
718 	if ((error = ccdlock(cs)) != 0)
719 		return (error);
720 
721 	lp = &cs->sc_label;
722 
723 	part = ccdpart(dev);
724 	pmask = (1 << part);
725 
726 	/*
727 	 * If we're initialized, check to see if there are any other
728 	 * open partitions.  If not, then it's safe to update
729 	 * the in-core disklabel.
730 	 */
731 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
732 		ccdgetdisklabel(dev);
733 
734 	/* Check that the partition exists. */
735 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
736 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
737 		error = ENXIO;
738 		goto done;
739 	}
740 
741 	cs->sc_openmask |= pmask;
742  done:
743 	ccdunlock(cs);
744 	return (0);
745 }
746 
747 /* ARGSUSED */
748 static int
749 ccdclose(dev_t dev, int flags, int fmt, struct proc *p)
750 {
751 	int unit = ccdunit(dev);
752 	struct ccd_s *cs;
753 	int error = 0, part;
754 
755 #ifdef DEBUG
756 	if (ccddebug & CCDB_FOLLOW)
757 		printf("ccdclose(%p, %x)\n", dev, flags);
758 #endif
759 
760 	if (!IS_ALLOCATED(unit))
761 		return (ENXIO);
762 	cs = ccdfind(unit);
763 
764 	if ((error = ccdlock(cs)) != 0)
765 		return (error);
766 
767 	part = ccdpart(dev);
768 
769 	/* ...that much closer to allowing unconfiguration... */
770 	cs->sc_openmask &= ~(1 << part);
771 	/* collect "garbage" if possible */
772 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
773 		ccddestroy(cs, p);
774 	else
775 		ccdunlock(cs);
776 	return (0);
777 }
778 
779 static void
780 ccdstrategy(struct bio *bp)
781 {
782 	int unit = ccdunit(bp->bio_dev);
783 	struct ccd_s *cs = ccdfind(unit);
784 	int s;
785 	int wlabel;
786 	struct disklabel *lp;
787 
788 #ifdef DEBUG
789 	if (ccddebug & CCDB_FOLLOW)
790 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
791 #endif
792 	if (!IS_INITED(cs)) {
793 		biofinish(bp, NULL, ENXIO);
794 		return;
795 	}
796 
797 	/* If it's a nil transfer, wake up the top half now. */
798 	if (bp->bio_bcount == 0) {
799 		biodone(bp);
800 		return;
801 	}
802 
803 	lp = &cs->sc_label;
804 
805 	/*
806 	 * Do bounds checking and adjust transfer.  If there's an
807 	 * error, the bounds check will flag that for us.
808 	 */
809 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
810 	if (ccdpart(bp->bio_dev) != RAW_PART) {
811 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
812 			biodone(bp);
813 			return;
814 		}
815 	} else {
816 		int pbn;        /* in sc_secsize chunks */
817 		long sz;        /* in sc_secsize chunks */
818 
819 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
820 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
821 
822 		/*
823 		 * If out of bounds return an error. If at the EOF point,
824 		 * simply read or write less.
825 		 */
826 
827 		if (pbn < 0 || pbn >= cs->sc_size) {
828 			bp->bio_resid = bp->bio_bcount;
829 			if (pbn != cs->sc_size)
830 				biofinish(bp, NULL, EINVAL);
831 			else
832 				biodone(bp);
833 			return;
834 		}
835 
836 		/*
837 		 * If the request crosses EOF, truncate the request.
838 		 */
839 		if (pbn + sz > cs->sc_size) {
840 			bp->bio_bcount = (cs->sc_size - pbn) *
841 			    cs->sc_geom.ccg_secsize;
842 		}
843 	}
844 
845 	bp->bio_resid = bp->bio_bcount;
846 
847 	/*
848 	 * "Start" the unit.
849 	 */
850 	s = splbio();
851 	ccdstart(cs, bp);
852 	splx(s);
853 	return;
854 }
855 
856 static void
857 ccdstart(struct ccd_s *cs, struct bio *bp)
858 {
859 	long bcount, rcount;
860 	struct ccdbuf *cbp[4];
861 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
862 	caddr_t addr;
863 	daddr_t bn;
864 	struct partition *pp;
865 
866 #ifdef DEBUG
867 	if (ccddebug & CCDB_FOLLOW)
868 		printf("ccdstart(%p, %p)\n", cs, bp);
869 #endif
870 
871 	/* Record the transaction start  */
872 	devstat_start_transaction(&cs->device_stats);
873 
874 	/*
875 	 * Translate the partition-relative block number to an absolute.
876 	 */
877 	bn = bp->bio_blkno;
878 	if (ccdpart(bp->bio_dev) != RAW_PART) {
879 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
880 		bn += pp->p_offset;
881 	}
882 
883 	/*
884 	 * Allocate component buffers and fire off the requests
885 	 */
886 	addr = bp->bio_data;
887 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
888 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
889 		rcount = cbp[0]->cb_buf.bio_bcount;
890 
891 		if (cs->sc_cflags & CCDF_MIRROR) {
892 			/*
893 			 * Mirroring.  Writes go to both disks, reads are
894 			 * taken from whichever disk seems most appropriate.
895 			 *
896 			 * We attempt to localize reads to the disk whos arm
897 			 * is nearest the read request.  We ignore seeks due
898 			 * to writes when making this determination and we
899 			 * also try to avoid hogging.
900 			 */
901 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
902 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
903 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
904 			} else {
905 				int pick = cs->sc_pick;
906 				daddr_t range = cs->sc_size / 16;
907 
908 				if (bn < cs->sc_blk[pick] - range ||
909 				    bn > cs->sc_blk[pick] + range
910 				) {
911 					cs->sc_pick = pick = 1 - pick;
912 				}
913 				cs->sc_blk[pick] = bn + btodb(rcount);
914 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
915 			}
916 		} else {
917 			/*
918 			 * Not mirroring
919 			 */
920 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
921 		}
922 		bn += btodb(rcount);
923 		addr += rcount;
924 	}
925 }
926 
927 /*
928  * Build a component buffer header.
929  */
930 static void
931 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
932 {
933 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
934 	struct ccdbuf *cbp;
935 	daddr_t cbn, cboff;
936 	off_t cbc;
937 
938 #ifdef DEBUG
939 	if (ccddebug & CCDB_IO)
940 		printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
941 		       cs, bp, bn, addr, bcount);
942 #endif
943 	/*
944 	 * Determine which component bn falls in.
945 	 */
946 	cbn = bn;
947 	cboff = 0;
948 
949 	if (cs->sc_ileave == 0) {
950 		/*
951 		 * Serially concatenated and neither a mirror nor a parity
952 		 * config.  This is a special case.
953 		 */
954 		daddr_t sblk;
955 
956 		sblk = 0;
957 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
958 			sblk += ci->ci_size;
959 		cbn -= sblk;
960 	} else {
961 		struct ccdiinfo *ii;
962 		int ccdisk, off;
963 
964 		/*
965 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
966 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
967 		 * to cbn.
968 		 */
969 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
970 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
971 
972 		/*
973 		 * Figure out which interleave table to use.
974 		 */
975 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
976 			if (ii->ii_startblk > cbn)
977 				break;
978 		}
979 		ii--;
980 
981 		/*
982 		 * off is the logical superblock relative to the beginning
983 		 * of this interleave block.
984 		 */
985 		off = cbn - ii->ii_startblk;
986 
987 		/*
988 		 * We must calculate which disk component to use (ccdisk),
989 		 * and recalculate cbn to be the superblock relative to
990 		 * the beginning of the component.  This is typically done by
991 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
992 		 * must typically be divided by the number of components in
993 		 * this interleave array to be properly convert it from a
994 		 * CCD-relative logical superblock number to a
995 		 * component-relative superblock number.
996 		 */
997 		if (ii->ii_ndisk == 1) {
998 			/*
999 			 * When we have just one disk, it can't be a mirror
1000 			 * or a parity config.
1001 			 */
1002 			ccdisk = ii->ii_index[0];
1003 			cbn = ii->ii_startoff + off;
1004 		} else {
1005 			if (cs->sc_cflags & CCDF_MIRROR) {
1006 				/*
1007 				 * We have forced a uniform mapping, resulting
1008 				 * in a single interleave array.  We double
1009 				 * up on the first half of the available
1010 				 * components and our mirror is in the second
1011 				 * half.  This only works with a single
1012 				 * interleave array because doubling up
1013 				 * doubles the number of sectors, so there
1014 				 * cannot be another interleave array because
1015 				 * the next interleave array's calculations
1016 				 * would be off.
1017 				 */
1018 				int ndisk2 = ii->ii_ndisk / 2;
1019 				ccdisk = ii->ii_index[off % ndisk2];
1020 				cbn = ii->ii_startoff + off / ndisk2;
1021 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1022 			} else if (cs->sc_cflags & CCDF_PARITY) {
1023 				/*
1024 				 * XXX not implemented yet
1025 				 */
1026 				int ndisk2 = ii->ii_ndisk - 1;
1027 				ccdisk = ii->ii_index[off % ndisk2];
1028 				cbn = ii->ii_startoff + off / ndisk2;
1029 				if (cbn % ii->ii_ndisk <= ccdisk)
1030 					ccdisk++;
1031 			} else {
1032 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1033 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1034 			}
1035 		}
1036 
1037 		ci = &cs->sc_cinfo[ccdisk];
1038 
1039 		/*
1040 		 * Convert cbn from a superblock to a normal block so it
1041 		 * can be used to calculate (along with cboff) the normal
1042 		 * block index into this particular disk.
1043 		 */
1044 		cbn *= cs->sc_ileave;
1045 	}
1046 
1047 	/*
1048 	 * Fill in the component buf structure.
1049 	 */
1050 	cbp = getccdbuf(NULL);
1051 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
1052 	cbp->cb_buf.bio_done = ccdiodone;
1053 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
1054 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1055 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1056 	cbp->cb_buf.bio_data = addr;
1057 	if (cs->sc_ileave == 0)
1058               cbc = dbtob((off_t)(ci->ci_size - cbn));
1059 	else
1060               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1061 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1062  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1063 
1064 	/*
1065 	 * context for ccdiodone
1066 	 */
1067 	cbp->cb_obp = bp;
1068 	cbp->cb_unit = cs->sc_unit;
1069 	cbp->cb_comp = ci - cs->sc_cinfo;
1070 
1071 #ifdef DEBUG
1072 	if (ccddebug & CCDB_IO)
1073 		printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1074 		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1075 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1076 		       cbp->cb_buf.bio_bcount);
1077 #endif
1078 	cb[0] = cbp;
1079 
1080 	/*
1081 	 * Note: both I/O's setup when reading from mirror, but only one
1082 	 * will be executed.
1083 	 */
1084 	if (cs->sc_cflags & CCDF_MIRROR) {
1085 		/* mirror, setup second I/O */
1086 		cbp = getccdbuf(cb[0]);
1087 		cbp->cb_buf.bio_dev = ci2->ci_dev;
1088 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1089 		cb[1] = cbp;
1090 		/* link together the ccdbuf's and clear "mirror done" flag */
1091 		cb[0]->cb_mirror = cb[1];
1092 		cb[1]->cb_mirror = cb[0];
1093 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1094 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1095 	}
1096 }
1097 
1098 static void
1099 ccdintr(struct ccd_s *cs, struct bio *bp)
1100 {
1101 #ifdef DEBUG
1102 	if (ccddebug & CCDB_FOLLOW)
1103 		printf("ccdintr(%p, %p)\n", cs, bp);
1104 #endif
1105 	/*
1106 	 * Request is done for better or worse, wakeup the top half.
1107 	 */
1108 	if (bp->bio_flags & BIO_ERROR)
1109 		bp->bio_resid = bp->bio_bcount;
1110 	biofinish(bp, &cs->device_stats, 0);
1111 }
1112 
1113 /*
1114  * Called at interrupt time.
1115  * Mark the component as done and if all components are done,
1116  * take a ccd interrupt.
1117  */
1118 static void
1119 ccdiodone(struct bio *ibp)
1120 {
1121 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1122 	struct bio *bp = cbp->cb_obp;
1123 	int unit = cbp->cb_unit;
1124 	int count, s;
1125 
1126 	s = splbio();
1127 #ifdef DEBUG
1128 	if (ccddebug & CCDB_FOLLOW)
1129 		printf("ccdiodone(%p)\n", cbp);
1130 	if (ccddebug & CCDB_IO) {
1131 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1132 		       bp, bp->bio_bcount, bp->bio_resid);
1133 		printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1134 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1135 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1136 		       cbp->cb_buf.bio_bcount);
1137 	}
1138 #endif
1139 	/*
1140 	 * If an error occured, report it.  If this is a mirrored
1141 	 * configuration and the first of two possible reads, do not
1142 	 * set the error in the bp yet because the second read may
1143 	 * succeed.
1144 	 */
1145 
1146 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1147 		const char *msg = "";
1148 
1149 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1150 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1151 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1152 			/*
1153 			 * We will try our read on the other disk down
1154 			 * below, also reverse the default pick so if we
1155 			 * are doing a scan we do not keep hitting the
1156 			 * bad disk first.
1157 			 */
1158 			struct ccd_s *cs = ccdfind(unit);
1159 
1160 			msg = ", trying other disk";
1161 			cs->sc_pick = 1 - cs->sc_pick;
1162 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1163 		} else {
1164 			bp->bio_flags |= BIO_ERROR;
1165 			bp->bio_error = cbp->cb_buf.bio_error ?
1166 			    cbp->cb_buf.bio_error : EIO;
1167 		}
1168 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1169 		       unit, bp->bio_error, cbp->cb_comp,
1170 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1171 	}
1172 
1173 	/*
1174 	 * Process mirror.  If we are writing, I/O has been initiated on both
1175 	 * buffers and we fall through only after both are finished.
1176 	 *
1177 	 * If we are reading only one I/O is initiated at a time.  If an
1178 	 * error occurs we initiate the second I/O and return, otherwise
1179 	 * we free the second I/O without initiating it.
1180 	 */
1181 
1182 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1183 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1184 			/*
1185 			 * When writing, handshake with the second buffer
1186 			 * to determine when both are done.  If both are not
1187 			 * done, return here.
1188 			 */
1189 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1190 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1191 				putccdbuf(cbp);
1192 				splx(s);
1193 				return;
1194 			}
1195 		} else {
1196 			/*
1197 			 * When reading, either dispose of the second buffer
1198 			 * or initiate I/O on the second buffer if an error
1199 			 * occured with this one.
1200 			 */
1201 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1203 					cbp->cb_mirror->cb_pflags |=
1204 					    CCDPF_MIRROR_DONE;
1205 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1206 					putccdbuf(cbp);
1207 					splx(s);
1208 					return;
1209 				} else {
1210 					putccdbuf(cbp->cb_mirror);
1211 					/* fall through */
1212 				}
1213 			}
1214 		}
1215 	}
1216 
1217 	/*
1218 	 * use bio_caller1 to determine how big the original request was rather
1219 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1220 	 *
1221 	 * XXX We check for an error, but we do not test the resid for an
1222 	 * aligned EOF condition.  This may result in character & block
1223 	 * device access not recognizing EOF properly when read or written
1224 	 * sequentially, but will not effect filesystems.
1225 	 */
1226 	count = (long)cbp->cb_buf.bio_caller1;
1227 	putccdbuf(cbp);
1228 
1229 	/*
1230 	 * If all done, "interrupt".
1231 	 */
1232 	bp->bio_resid -= count;
1233 	if (bp->bio_resid < 0)
1234 		panic("ccdiodone: count");
1235 	if (bp->bio_resid == 0)
1236 		ccdintr(ccdfind(unit), bp);
1237 	splx(s);
1238 }
1239 
1240 static int
1241 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
1242 {
1243 	int unit = ccdunit(dev);
1244 	int i, j, lookedup = 0, error = 0;
1245 	int part, pmask, s;
1246 	struct ccd_s *cs;
1247 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1248 	char **cpp;
1249 	struct vnode **vpp;
1250 
1251 	if (!IS_ALLOCATED(unit))
1252 		return (ENXIO);
1253 	cs = ccdfind(unit);
1254 
1255 	switch (cmd) {
1256 	case CCDIOCSET:
1257 		if (IS_INITED(cs))
1258 			return (EBUSY);
1259 
1260 		if ((flag & FWRITE) == 0)
1261 			return (EBADF);
1262 
1263 		if ((error = ccdlock(cs)) != 0)
1264 			return (error);
1265 
1266 		/* Fill in some important bits. */
1267 		cs->sc_ileave = ccio->ccio_ileave;
1268 		if (cs->sc_ileave == 0 &&
1269 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1270 		     (ccio->ccio_flags & CCDF_PARITY))) {
1271 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1272 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1273 		}
1274 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1275 		    (ccio->ccio_flags & CCDF_PARITY)) {
1276 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1277 			ccio->ccio_flags &= ~CCDF_PARITY;
1278 		}
1279 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1280 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1281 			printf("ccd%d: mirror/parity forces uniform flag\n",
1282 			       unit);
1283 			ccio->ccio_flags |= CCDF_UNIFORM;
1284 		}
1285 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1286 
1287 		/*
1288 		 * Allocate space for and copy in the array of
1289 		 * componet pathnames and device numbers.
1290 		 */
1291 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1292 		    M_DEVBUF, M_WAITOK);
1293 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1294 		    M_DEVBUF, M_WAITOK);
1295 
1296 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1297 		    ccio->ccio_ndisks * sizeof(char **));
1298 		if (error) {
1299 			free(vpp, M_DEVBUF);
1300 			free(cpp, M_DEVBUF);
1301 			ccdunlock(cs);
1302 			return (error);
1303 		}
1304 
1305 #ifdef DEBUG
1306 		if (ccddebug & CCDB_INIT)
1307 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1308 				printf("ccdioctl: component %d: %p\n",
1309 				    i, cpp[i]);
1310 #endif
1311 
1312 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1313 #ifdef DEBUG
1314 			if (ccddebug & CCDB_INIT)
1315 				printf("ccdioctl: lookedup = %d\n", lookedup);
1316 #endif
1317 			if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1318 				for (j = 0; j < lookedup; ++j)
1319 					(void)vn_close(vpp[j], FREAD|FWRITE,
1320 					    p->p_ucred, p);
1321 				free(vpp, M_DEVBUF);
1322 				free(cpp, M_DEVBUF);
1323 				ccdunlock(cs);
1324 				return (error);
1325 			}
1326 			++lookedup;
1327 		}
1328 		cs->sc_vpp = vpp;
1329 		cs->sc_nccdisks = ccio->ccio_ndisks;
1330 
1331 		/*
1332 		 * Initialize the ccd.  Fills in the softc for us.
1333 		 */
1334 		if ((error = ccdinit(cs, cpp, p)) != 0) {
1335 			for (j = 0; j < lookedup; ++j)
1336 				(void)vn_close(vpp[j], FREAD|FWRITE,
1337 				    p->p_ucred, p);
1338 			/*
1339 			 * We can't ccddestroy() cs just yet, because nothing
1340 			 * prevents user-level app to do another ioctl()
1341 			 * without closing the device first, therefore
1342 			 * declare unit null and void and let ccdclose()
1343 			 * destroy it when it is safe to do so.
1344 			 */
1345 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1346 			free(vpp, M_DEVBUF);
1347 			free(cpp, M_DEVBUF);
1348 			ccdunlock(cs);
1349 			return (error);
1350 		}
1351 
1352 		/*
1353 		 * The ccd has been successfully initialized, so
1354 		 * we can place it into the array and read the disklabel.
1355 		 */
1356 		ccio->ccio_unit = unit;
1357 		ccio->ccio_size = cs->sc_size;
1358 		ccdgetdisklabel(dev);
1359 
1360 		ccdunlock(cs);
1361 
1362 		break;
1363 
1364 	case CCDIOCCLR:
1365 		if (!IS_INITED(cs))
1366 			return (ENXIO);
1367 
1368 		if ((flag & FWRITE) == 0)
1369 			return (EBADF);
1370 
1371 		if ((error = ccdlock(cs)) != 0)
1372 			return (error);
1373 
1374 		/* Don't unconfigure if any other partitions are open */
1375 		part = ccdpart(dev);
1376 		pmask = (1 << part);
1377 		if ((cs->sc_openmask & ~pmask)) {
1378 			ccdunlock(cs);
1379 			return (EBUSY);
1380 		}
1381 
1382 		/* Declare unit null and void (reset all flags) */
1383 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1384 
1385 		/* Close the components and free their pathnames. */
1386 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1387 			/*
1388 			 * XXX: this close could potentially fail and
1389 			 * cause Bad Things.  Maybe we need to force
1390 			 * the close to happen?
1391 			 */
1392 #ifdef DEBUG
1393 			if (ccddebug & CCDB_VNODE)
1394 				vprint("CCDIOCCLR: vnode info",
1395 				    cs->sc_cinfo[i].ci_vp);
1396 #endif
1397 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1398 			    p->p_ucred, p);
1399 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1400 		}
1401 
1402 		/* Free interleave index. */
1403 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1404 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1405 
1406 		/* Free component info and interleave table. */
1407 		free(cs->sc_cinfo, M_DEVBUF);
1408 		free(cs->sc_itable, M_DEVBUF);
1409 		free(cs->sc_vpp, M_DEVBUF);
1410 
1411 		/* And remove the devstat entry. */
1412 		devstat_remove_entry(&cs->device_stats);
1413 
1414 		/* This must be atomic. */
1415 		s = splhigh();
1416 		ccdunlock(cs);
1417 		splx(s);
1418 
1419 		break;
1420 
1421 	case CCDCONFINFO:
1422 		{
1423 			int ninit = 0;
1424 			struct ccdconf *conf = (struct ccdconf *)data;
1425 			struct ccd_s *tmpcs;
1426 			struct ccd_s *ubuf = conf->buffer;
1427 
1428 			/* XXX: LOCK(unique unit numbers) */
1429 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1430 				if (IS_INITED(tmpcs))
1431 					ninit++;
1432 
1433 			if (conf->size == 0) {
1434 				conf->size = sizeof(struct ccd_s) * ninit;
1435 				break;
1436 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1437 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1438 				/* XXX: UNLOCK(unique unit numbers) */
1439 				return (EINVAL);
1440 			}
1441 
1442 			ubuf += ninit;
1443 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1444 				if (!IS_INITED(tmpcs))
1445 					continue;
1446 				error = copyout(tmpcs, --ubuf,
1447 				    sizeof(struct ccd_s));
1448 				if (error != 0)
1449 					/* XXX: UNLOCK(unique unit numbers) */
1450 					return (error);
1451 			}
1452 			/* XXX: UNLOCK(unique unit numbers) */
1453 		}
1454 		break;
1455 
1456 	case CCDCPPINFO:
1457 		if (!IS_INITED(cs))
1458 			return (ENXIO);
1459 
1460 		{
1461 			int len = 0;
1462 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1463 			char *ubuf = cpps->buffer;
1464 
1465 
1466 			for (i = 0; i < cs->sc_nccdisks; ++i)
1467 				len += cs->sc_cinfo[i].ci_pathlen;
1468 
1469 			if (cpps->size == 0) {
1470 				cpps->size = len;
1471 				break;
1472 			} else if (cpps->size != len) {
1473 				return (EINVAL);
1474 			}
1475 
1476 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1477 				len = cs->sc_cinfo[i].ci_pathlen;
1478 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1479 				    len);
1480 				if (error != 0)
1481 					return (error);
1482 				ubuf += len;
1483 			}
1484 		}
1485 		break;
1486 
1487 	case DIOCGDINFO:
1488 		if (!IS_INITED(cs))
1489 			return (ENXIO);
1490 
1491 		*(struct disklabel *)data = cs->sc_label;
1492 		break;
1493 
1494 	case DIOCGPART:
1495 		if (!IS_INITED(cs))
1496 			return (ENXIO);
1497 
1498 		((struct partinfo *)data)->disklab = &cs->sc_label;
1499 		((struct partinfo *)data)->part =
1500 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1501 		break;
1502 
1503 	case DIOCWDINFO:
1504 	case DIOCSDINFO:
1505 		if (!IS_INITED(cs))
1506 			return (ENXIO);
1507 
1508 		if ((flag & FWRITE) == 0)
1509 			return (EBADF);
1510 
1511 		if ((error = ccdlock(cs)) != 0)
1512 			return (error);
1513 
1514 		cs->sc_flags |= CCDF_LABELLING;
1515 
1516 		error = setdisklabel(&cs->sc_label,
1517 		    (struct disklabel *)data, 0);
1518 		if (error == 0) {
1519 			if (cmd == DIOCWDINFO)
1520 				error = writedisklabel(CCDLABELDEV(dev),
1521 				    &cs->sc_label);
1522 		}
1523 
1524 		cs->sc_flags &= ~CCDF_LABELLING;
1525 
1526 		ccdunlock(cs);
1527 
1528 		if (error)
1529 			return (error);
1530 		break;
1531 
1532 	case DIOCWLABEL:
1533 		if (!IS_INITED(cs))
1534 			return (ENXIO);
1535 
1536 		if ((flag & FWRITE) == 0)
1537 			return (EBADF);
1538 		if (*(int *)data != 0)
1539 			cs->sc_flags |= CCDF_WLABEL;
1540 		else
1541 			cs->sc_flags &= ~CCDF_WLABEL;
1542 		break;
1543 
1544 	default:
1545 		return (ENOTTY);
1546 	}
1547 
1548 	return (0);
1549 }
1550 
1551 static int
1552 ccdsize(dev_t dev)
1553 {
1554 	struct ccd_s *cs;
1555 	int part, size;
1556 
1557 	if (ccdopen(dev, 0, S_IFCHR, curproc))
1558 		return (-1);
1559 
1560 	cs = ccdfind(ccdunit(dev));
1561 	part = ccdpart(dev);
1562 
1563 	if (!IS_INITED(cs))
1564 		return (-1);
1565 
1566 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1567 		size = -1;
1568 	else
1569 		size = cs->sc_label.d_partitions[part].p_size;
1570 
1571 	if (ccdclose(dev, 0, S_IFCHR, curproc))
1572 		return (-1);
1573 
1574 	return (size);
1575 }
1576 
/*
 * Dump entry point.  Dumping to a ccd is not implemented, so every
 * call is refused with ENXIO; dev is ignored.
 */
static int
ccddump(dev_t dev)
{
	const int unsupported = ENXIO;

	return (unsupported);
}
1584 
1585 /*
1586  * Lookup the provided name in the filesystem.  If the file exists,
1587  * is a valid block device, and isn't being used by anyone else,
1588  * set *vpp to the file's vnode.
1589  */
1590 static int
1591 ccdlookup(char *path, struct proc *p, struct vnode **vpp)
1592 {
1593 	struct nameidata nd;
1594 	struct vnode *vp;
1595 	int error, flags;
1596 
1597 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1598 	flags = FREAD | FWRITE;
1599 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1600 #ifdef DEBUG
1601 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1602 			printf("ccdlookup: vn_open error = %d\n", error);
1603 #endif
1604 		return (error);
1605 	}
1606 	vp = nd.ni_vp;
1607 
1608 	if (vp->v_usecount > 1) {
1609 		error = EBUSY;
1610 		goto bad;
1611 	}
1612 
1613 	if (!vn_isdisk(vp, &error))
1614 		goto bad;
1615 
1616 #ifdef DEBUG
1617 	if (ccddebug & CCDB_VNODE)
1618 		vprint("ccdlookup: vnode info", vp);
1619 #endif
1620 
1621 	VOP_UNLOCK(vp, 0, p);
1622 	NDFREE(&nd, NDF_ONLY_PNBUF);
1623 	*vpp = vp;
1624 	return (0);
1625 bad:
1626 	VOP_UNLOCK(vp, 0, p);
1627 	NDFREE(&nd, NDF_ONLY_PNBUF);
1628 	/* vn_close does vrele() for vp */
1629 	(void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1630 	return (error);
1631 }
1632 
1633 /*
1634  * Read the disklabel from the ccd.  If one is not present, fake one
1635  * up.
1636  */
1637 static void
1638 ccdgetdisklabel(dev_t dev)
1639 {
1640 	int unit = ccdunit(dev);
1641 	struct ccd_s *cs = ccdfind(unit);
1642 	char *errstring;
1643 	struct disklabel *lp = &cs->sc_label;
1644 	struct ccdgeom *ccg = &cs->sc_geom;
1645 
1646 	bzero(lp, sizeof(*lp));
1647 
1648 	lp->d_secperunit = cs->sc_size;
1649 	lp->d_secsize = ccg->ccg_secsize;
1650 	lp->d_nsectors = ccg->ccg_nsectors;
1651 	lp->d_ntracks = ccg->ccg_ntracks;
1652 	lp->d_ncylinders = ccg->ccg_ncylinders;
1653 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1654 
1655 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1656 	lp->d_type = DTYPE_CCD;
1657 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1658 	lp->d_rpm = 3600;
1659 	lp->d_interleave = 1;
1660 	lp->d_flags = 0;
1661 
1662 	lp->d_partitions[RAW_PART].p_offset = 0;
1663 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1664 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1665 	lp->d_npartitions = RAW_PART + 1;
1666 
1667 	lp->d_bbsize = BBSIZE;				/* XXX */
1668 	lp->d_sbsize = SBSIZE;				/* XXX */
1669 
1670 	lp->d_magic = DISKMAGIC;
1671 	lp->d_magic2 = DISKMAGIC;
1672 	lp->d_checksum = dkcksum(&cs->sc_label);
1673 
1674 	/*
1675 	 * Call the generic disklabel extraction routine.
1676 	 */
1677 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1678 	if (errstring != NULL)
1679 		ccdmakedisklabel(cs);
1680 
1681 #ifdef DEBUG
1682 	/* It's actually extremely common to have unlabeled ccds. */
1683 	if (ccddebug & CCDB_LABEL)
1684 		if (errstring != NULL)
1685 			printf("ccd%d: %s\n", unit, errstring);
1686 #endif
1687 }
1688 
1689 /*
1690  * Take care of things one might want to take care of in the event
1691  * that a disklabel isn't present.
1692  */
1693 static void
1694 ccdmakedisklabel(struct ccd_s *cs)
1695 {
1696 	struct disklabel *lp = &cs->sc_label;
1697 
1698 	/*
1699 	 * For historical reasons, if there's no disklabel present
1700 	 * the raw partition must be marked FS_BSDFFS.
1701 	 */
1702 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1703 
1704 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1705 }
1706 
1707 /*
1708  * Wait interruptibly for an exclusive lock.
1709  *
1710  * XXX
1711  * Several drivers do this; it should be abstracted and made MP-safe.
1712  */
1713 static int
1714 ccdlock(struct ccd_s *cs)
1715 {
1716 	int error;
1717 
1718 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1719 		cs->sc_flags |= CCDF_WANTED;
1720 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1721 			return (error);
1722 	}
1723 	cs->sc_flags |= CCDF_LOCKED;
1724 	return (0);
1725 }
1726 
1727 /*
1728  * Unlock and wake up any waiters.
1729  */
1730 static void
1731 ccdunlock(struct ccd_s *cs)
1732 {
1733 
1734 	cs->sc_flags &= ~CCDF_LOCKED;
1735 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1736 		cs->sc_flags &= ~CCDF_WANTED;
1737 		wakeup(cs);
1738 	}
1739 }
1740 
#ifdef DEBUG
/*
 * Debug helper: print every entry of the interleave table starting at
 * ii, one line per entry, stopping at the first entry whose ii_ndisk
 * is zero.  Each line shows the entry number, disk count, start block,
 * start offset and the entry's ii_index values.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;
	int slot;

	while (ii->ii_ndisk != 0) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		for (slot = 0; slot < ii->ii_ndisk; slot++)
			printf(" %d", ii->ii_index[slot]);
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1756