xref: /freebsd/sys/geom/geom_ccd.c (revision 41466b50c1d5bfd1cf6adaae547a579a75d7c04e)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
106 
107 #include <sys/ccdvar.h>
108 
/* Malloc type used for the per-unit softc (see ccdnew()/ccddestroy()). */
MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/* Building with CCDDEBUG implies the DEBUG code paths in this file. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/* Bit flags tested against ccddebug to select which debug printf()s fire. */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* All debug categories are enabled by default; adjustable via sysctl. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#endif

/* Unit and partition of a ccd device, via the generic disk helpers. */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
128 
129 /*
130    This is how mirroring works (only writes are special):
131 
132    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133    linked together by the cb_mirror field.  "cb_pflags &
134    CCDPF_MIRROR_DONE" is set to 0 on both of them.
135 
136    When a component returns to ccdiodone(), it checks if "cb_pflags &
137    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
138    flag and returns.  If it is, it means its partner has already
139    returned, so it will go to the regular cleanup.
140 
141  */
142 
/*
 * Per-component I/O request.
 *
 * cb_buf must remain the first member: ccdiodone() receives a pointer to
 * the embedded struct bio and casts it back to a struct ccdbuf pointer.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
152 
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
155 
/* Device number of RAW_PART on the same major and ccd unit as dev. */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))

/* convenient macros for often-used statements */
/* True when a softc for this unit is on ccd_softc_list. */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
/* True when CCDF_INITED is set in the softc flags; cs must be non-NULL. */
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
162 
/* Device switch entry points implemented by this driver. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;
static d_psize_t ccdsize;

/* putccdbuf() stops caching freed ccdbufs once this many are on the list. */
#define NCCDFREEHIWAT	16

/* Major number placed in ccd_cdevsw. */
#define CDEV_MAJOR 74

/* Character device switch for ccd devices. */
static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	ccddump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
};
/* List of all allocated softcs, searched by unit number in ccdfind(). */
static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 
/* softc list management, keyed by unit number */
static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *, struct proc *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

/* I/O path, configuration and label helpers */
static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static void ccdintr(struct ccd_s *, struct bio *);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		      struct bio *, daddr_t, caddr_t, long);
static void ccdgetdisklabel(dev_t);
static void ccdmakedisklabel(struct ccd_s *);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);

#ifdef DEBUG
static void printiinfo(struct ccdiinfo *);
#endif

/*
 * Head of the free list of cached ccdbufs (linked through cb_freenext).
 * Non-private for the benefit of libkvm.
 */
struct ccdbuf *ccdfreebufs;
/* Number of ccdbufs currently on the ccdfreebufs list. */
static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This avoids overwriting the component's disklabel area
 * when the partition starts at the beginning of the slice.
 */
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
287 #endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	default:	/* MOD_SHUTDOWN etc */
376 		break;
377 	}
378 	return (error);
379 }
380 
381 DEV_MODULE(ccd, ccd_modevent, NULL);
382 
383 static int
384 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
385 {
386 	struct ccdcinfo *ci = NULL;	/* XXX */
387 	size_t size;
388 	int ix;
389 	struct vnode *vp;
390 	size_t minsize;
391 	int maxsecsize;
392 	struct partinfo dpart;
393 	struct ccdgeom *ccg = &cs->sc_geom;
394 	char tmppath[MAXPATHLEN];
395 	int error = 0;
396 
397 #ifdef DEBUG
398 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
399 		printf("ccdinit: unit %d\n", cs->sc_unit);
400 #endif
401 
402 	cs->sc_size = 0;
403 
404 	/* Allocate space for the component info. */
405 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
406 	    M_DEVBUF, M_WAITOK);
407 
408 	/*
409 	 * Verify that each component piece exists and record
410 	 * relevant information about it.
411 	 */
412 	maxsecsize = 0;
413 	minsize = 0;
414 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
415 		vp = cs->sc_vpp[ix];
416 		ci = &cs->sc_cinfo[ix];
417 		ci->ci_vp = vp;
418 
419 		/*
420 		 * Copy in the pathname of the component.
421 		 */
422 		bzero(tmppath, sizeof(tmppath));	/* sanity */
423 		if ((error = copyinstr(cpaths[ix], tmppath,
424 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
425 #ifdef DEBUG
426 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
427 				printf("ccd%d: can't copy path, error = %d\n",
428 				    cs->sc_unit, error);
429 #endif
430 			goto fail;
431 		}
432 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
433 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
434 
435 		ci->ci_dev = vn_todev(vp);
436 
437 		/*
438 		 * Get partition information for the component.
439 		 */
440 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
441 		    FREAD, td->td_proc->p_ucred, td)) != 0) {
442 #ifdef DEBUG
443 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
444 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
445 				     cs->sc_unit, ci->ci_path, error);
446 #endif
447 			goto fail;
448 		}
449 		if (dpart.part->p_fstype == FS_BSDFFS) {
450 			maxsecsize =
451 			    ((dpart.disklab->d_secsize > maxsecsize) ?
452 			    dpart.disklab->d_secsize : maxsecsize);
453 			size = dpart.part->p_size - CCD_OFFSET;
454 		} else {
455 #ifdef DEBUG
456 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
457 				printf("ccd%d: %s: incorrect partition type\n",
458 				    cs->sc_unit, ci->ci_path);
459 #endif
460 			error = EFTYPE;
461 			goto fail;
462 		}
463 
464 		/*
465 		 * Calculate the size, truncating to an interleave
466 		 * boundary if necessary.
467 		 */
468 
469 		if (cs->sc_ileave > 1)
470 			size -= size % cs->sc_ileave;
471 
472 		if (size == 0) {
473 #ifdef DEBUG
474 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
475 				printf("ccd%d: %s: size == 0\n",
476 				    cs->sc_unit, ci->ci_path);
477 #endif
478 			error = ENODEV;
479 			goto fail;
480 		}
481 
482 		if (minsize == 0 || size < minsize)
483 			minsize = size;
484 		ci->ci_size = size;
485 		cs->sc_size += size;
486 	}
487 
488 	/*
489 	 * Don't allow the interleave to be smaller than
490 	 * the biggest component sector.
491 	 */
492 	if ((cs->sc_ileave > 0) &&
493 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
494 #ifdef DEBUG
495 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
496 			printf("ccd%d: interleave must be at least %d\n",
497 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
498 #endif
499 		error = EINVAL;
500 		goto fail;
501 	}
502 
503 	/*
504 	 * If uniform interleave is desired set all sizes to that of
505 	 * the smallest component.  This will guarentee that a single
506 	 * interleave table is generated.
507 	 *
508 	 * Lost space must be taken into account when calculating the
509 	 * overall size.  Half the space is lost when CCDF_MIRROR is
510 	 * specified.  One disk is lost when CCDF_PARITY is specified.
511 	 */
512 	if (cs->sc_flags & CCDF_UNIFORM) {
513 		for (ci = cs->sc_cinfo;
514 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
515 			ci->ci_size = minsize;
516 		}
517 		if (cs->sc_flags & CCDF_MIRROR) {
518 			/*
519 			 * Check to see if an even number of components
520 			 * have been specified.  The interleave must also
521 			 * be non-zero in order for us to be able to
522 			 * guarentee the topology.
523 			 */
524 			if (cs->sc_nccdisks % 2) {
525 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
526 				error = EINVAL;
527 				goto fail;
528 			}
529 			if (cs->sc_ileave == 0) {
530 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
531 				error = EINVAL;
532 				goto fail;
533 			}
534 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
535 		} else if (cs->sc_flags & CCDF_PARITY) {
536 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
537 		} else {
538 			if (cs->sc_ileave == 0) {
539 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
540 				error = EINVAL;
541 				goto fail;
542 			}
543 			cs->sc_size = cs->sc_nccdisks * minsize;
544 		}
545 	}
546 
547 	/*
548 	 * Construct the interleave table.
549 	 */
550 	ccdinterleave(cs, cs->sc_unit);
551 
552 	/*
553 	 * Create pseudo-geometry based on 1MB cylinders.  It's
554 	 * pretty close.
555 	 */
556 	ccg->ccg_secsize = maxsecsize;
557 	ccg->ccg_ntracks = 1;
558 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
559 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
560 
561 	/*
562 	 * Add an devstat entry for this device.
563 	 */
564 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
565 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
566 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
567 			  DEVSTAT_PRIORITY_ARRAY);
568 
569 	cs->sc_flags |= CCDF_INITED;
570 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
571 	return (0);
572 fail:
573 	while (ci > cs->sc_cinfo) {
574 		ci--;
575 		free(ci->ci_path, M_DEVBUF);
576 	}
577 	free(cs->sc_cinfo, M_DEVBUF);
578 	return (error);
579 }
580 
/*
 * Build the interleave table cs->sc_itable from the component sizes in
 * cs->sc_cinfo.  The table ends with an entry whose ii_ndisk is 0.
 * The 'unit' argument is not used.
 */
static void
ccdinterleave(struct ccd_s *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;

#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
#endif

	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror or parity setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* Terminate the table. */
		ii->ii_ndisk = 0;
#ifdef DEBUG
		if (ccddebug & CCDB_INIT)
			printiinfo(cs->sc_itable);
#endif
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 *
	 * Across iterations 'size' holds the component size already
	 * covered by earlier table entries, 'bn' the running ccd block
	 * count and 'lbn' the component-relative start in interleave units.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 *
		 * NOTE(review): this happens before we know whether this
		 * entry is the terminator, so the terminating entry also
		 * owns an allocation -- confirm the teardown path frees it.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_DEVBUF, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printiinfo(cs->sc_itable);
#endif
}
701 
702 /* ARGSUSED */
703 static int
704 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
705 {
706 	int unit = ccdunit(dev);
707 	struct ccd_s *cs;
708 	struct disklabel *lp;
709 	int error = 0, part, pmask;
710 
711 #ifdef DEBUG
712 	if (ccddebug & CCDB_FOLLOW)
713 		printf("ccdopen(%p, %x)\n", dev, flags);
714 #endif
715 
716 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
717 
718 	if ((error = ccdlock(cs)) != 0)
719 		return (error);
720 
721 	lp = &cs->sc_label;
722 
723 	part = ccdpart(dev);
724 	pmask = (1 << part);
725 
726 	/*
727 	 * If we're initialized, check to see if there are any other
728 	 * open partitions.  If not, then it's safe to update
729 	 * the in-core disklabel.
730 	 */
731 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
732 		ccdgetdisklabel(dev);
733 
734 	/* Check that the partition exists. */
735 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
736 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
737 		error = ENXIO;
738 		goto done;
739 	}
740 
741 	cs->sc_openmask |= pmask;
742  done:
743 	ccdunlock(cs);
744 	return (0);
745 }
746 
747 /* ARGSUSED */
748 static int
749 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
750 {
751 	int unit = ccdunit(dev);
752 	struct ccd_s *cs;
753 	int error = 0, part;
754 
755 #ifdef DEBUG
756 	if (ccddebug & CCDB_FOLLOW)
757 		printf("ccdclose(%p, %x)\n", dev, flags);
758 #endif
759 
760 	if (!IS_ALLOCATED(unit))
761 		return (ENXIO);
762 	cs = ccdfind(unit);
763 
764 	if ((error = ccdlock(cs)) != 0)
765 		return (error);
766 
767 	part = ccdpart(dev);
768 
769 	/* ...that much closer to allowing unconfiguration... */
770 	cs->sc_openmask &= ~(1 << part);
771 	/* collect "garbage" if possible */
772 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
773 		ccddestroy(cs, td->td_proc);
774 	else
775 		ccdunlock(cs);
776 	return (0);
777 }
778 
779 static void
780 ccdstrategy(struct bio *bp)
781 {
782 	int unit = ccdunit(bp->bio_dev);
783 	struct ccd_s *cs = ccdfind(unit);
784 	int s;
785 	int wlabel;
786 	struct disklabel *lp;
787 
788 #ifdef DEBUG
789 	if (ccddebug & CCDB_FOLLOW)
790 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
791 #endif
792 	if (!IS_INITED(cs)) {
793 		biofinish(bp, NULL, ENXIO);
794 		return;
795 	}
796 
797 	/* If it's a nil transfer, wake up the top half now. */
798 	if (bp->bio_bcount == 0) {
799 		biodone(bp);
800 		return;
801 	}
802 
803 	lp = &cs->sc_label;
804 
805 	/*
806 	 * Do bounds checking and adjust transfer.  If there's an
807 	 * error, the bounds check will flag that for us.
808 	 */
809 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
810 	if (ccdpart(bp->bio_dev) != RAW_PART) {
811 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
812 			biodone(bp);
813 			return;
814 		}
815 	} else {
816 		int pbn;        /* in sc_secsize chunks */
817 		long sz;        /* in sc_secsize chunks */
818 
819 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
820 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
821 
822 		/*
823 		 * If out of bounds return an error. If at the EOF point,
824 		 * simply read or write less.
825 		 */
826 
827 		if (pbn < 0 || pbn >= cs->sc_size) {
828 			bp->bio_resid = bp->bio_bcount;
829 			if (pbn != cs->sc_size)
830 				biofinish(bp, NULL, EINVAL);
831 			else
832 				biodone(bp);
833 			return;
834 		}
835 
836 		/*
837 		 * If the request crosses EOF, truncate the request.
838 		 */
839 		if (pbn + sz > cs->sc_size) {
840 			bp->bio_bcount = (cs->sc_size - pbn) *
841 			    cs->sc_geom.ccg_secsize;
842 		}
843 	}
844 
845 	bp->bio_resid = bp->bio_bcount;
846 
847 	/*
848 	 * "Start" the unit.
849 	 */
850 	s = splbio();
851 	ccdstart(cs, bp);
852 	splx(s);
853 	return;
854 }
855 
856 static void
857 ccdstart(struct ccd_s *cs, struct bio *bp)
858 {
859 	long bcount, rcount;
860 	struct ccdbuf *cbp[4];
861 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
862 	caddr_t addr;
863 	daddr_t bn;
864 	struct partition *pp;
865 
866 #ifdef DEBUG
867 	if (ccddebug & CCDB_FOLLOW)
868 		printf("ccdstart(%p, %p)\n", cs, bp);
869 #endif
870 
871 	/* Record the transaction start  */
872 	devstat_start_transaction(&cs->device_stats);
873 
874 	/*
875 	 * Translate the partition-relative block number to an absolute.
876 	 */
877 	bn = bp->bio_blkno;
878 	if (ccdpart(bp->bio_dev) != RAW_PART) {
879 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
880 		bn += pp->p_offset;
881 	}
882 
883 	/*
884 	 * Allocate component buffers and fire off the requests
885 	 */
886 	addr = bp->bio_data;
887 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
888 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
889 		rcount = cbp[0]->cb_buf.bio_bcount;
890 
891 		if (cs->sc_cflags & CCDF_MIRROR) {
892 			/*
893 			 * Mirroring.  Writes go to both disks, reads are
894 			 * taken from whichever disk seems most appropriate.
895 			 *
896 			 * We attempt to localize reads to the disk whos arm
897 			 * is nearest the read request.  We ignore seeks due
898 			 * to writes when making this determination and we
899 			 * also try to avoid hogging.
900 			 */
901 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
902 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
903 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
904 			} else {
905 				int pick = cs->sc_pick;
906 				daddr_t range = cs->sc_size / 16;
907 
908 				if (bn < cs->sc_blk[pick] - range ||
909 				    bn > cs->sc_blk[pick] + range
910 				) {
911 					cs->sc_pick = pick = 1 - pick;
912 				}
913 				cs->sc_blk[pick] = bn + btodb(rcount);
914 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
915 			}
916 		} else {
917 			/*
918 			 * Not mirroring
919 			 */
920 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
921 		}
922 		bn += btodb(rcount);
923 		addr += rcount;
924 	}
925 }
926 
/*
 * Build a component buffer header.
 *
 * Maps ccd block 'bn' to a component and a block within it, then fills in
 * a ccdbuf covering as much of the 'bcount'-byte transfer at 'addr' as
 * fits: up to the end of the component when sc_ileave is 0, otherwise up
 * to the end of the current interleave chunk.  The buffer is returned in
 * cb[0].  For CCDF_MIRROR configurations a copy aimed at the mirror
 * component is returned in cb[1] and the two are linked via cb_mirror.
 */
static void
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
		       cs, bp, bn, addr, bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		/* Skip whole components until bn falls inside one. */
		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use: the last entry
		 * whose ii_startblk does not exceed cbn.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.  The component block is
	 * shifted by CCD_OFFSET to skip the reserved area at the front of
	 * each component.
	 */
	cbp = getccdbuf(NULL);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	/* cbc: bytes left in this component, or in this interleave chunk. */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* Stash the byte count of this piece in bio_caller1. */
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_unit = cs->sc_unit;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
		       cbp->cb_buf.bio_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O as a copy of the first */
		cbp = getccdbuf(cb[0]);
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1097 
1098 static void
1099 ccdintr(struct ccd_s *cs, struct bio *bp)
1100 {
1101 #ifdef DEBUG
1102 	if (ccddebug & CCDB_FOLLOW)
1103 		printf("ccdintr(%p, %p)\n", cs, bp);
1104 #endif
1105 	/*
1106 	 * Request is done for better or worse, wakeup the top half.
1107 	 */
1108 	if (bp->bio_flags & BIO_ERROR)
1109 		bp->bio_resid = bp->bio_bcount;
1110 	biofinish(bp, &cs->device_stats, 0);
1111 }
1112 
1113 /*
1114  * Called at interrupt time.
1115  * Mark the component as done and if all components are done,
1116  * take a ccd interrupt.
1117  */
1118 static void
1119 ccdiodone(struct bio *ibp)
1120 {
1121 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1122 	struct bio *bp = cbp->cb_obp;
1123 	int unit = cbp->cb_unit;
1124 	int count, s;
1125 
1126 	s = splbio();
1127 #ifdef DEBUG
1128 	if (ccddebug & CCDB_FOLLOW)
1129 		printf("ccdiodone(%p)\n", cbp);
1130 	if (ccddebug & CCDB_IO) {
1131 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1132 		       bp, bp->bio_bcount, bp->bio_resid);
1133 		printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1134 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1135 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1136 		       cbp->cb_buf.bio_bcount);
1137 	}
1138 #endif
1139 	/*
1140 	 * If an error occured, report it.  If this is a mirrored
1141 	 * configuration and the first of two possible reads, do not
1142 	 * set the error in the bp yet because the second read may
1143 	 * succeed.
1144 	 */
1145 
1146 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1147 		const char *msg = "";
1148 
1149 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1150 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1151 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1152 			/*
1153 			 * We will try our read on the other disk down
1154 			 * below, also reverse the default pick so if we
1155 			 * are doing a scan we do not keep hitting the
1156 			 * bad disk first.
1157 			 */
1158 			struct ccd_s *cs = ccdfind(unit);
1159 
1160 			msg = ", trying other disk";
1161 			cs->sc_pick = 1 - cs->sc_pick;
1162 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1163 		} else {
1164 			bp->bio_flags |= BIO_ERROR;
1165 			bp->bio_error = cbp->cb_buf.bio_error ?
1166 			    cbp->cb_buf.bio_error : EIO;
1167 		}
1168 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1169 		       unit, bp->bio_error, cbp->cb_comp,
1170 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1171 	}
1172 
1173 	/*
1174 	 * Process mirror.  If we are writing, I/O has been initiated on both
1175 	 * buffers and we fall through only after both are finished.
1176 	 *
1177 	 * If we are reading only one I/O is initiated at a time.  If an
1178 	 * error occurs we initiate the second I/O and return, otherwise
1179 	 * we free the second I/O without initiating it.
1180 	 */
1181 
1182 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1183 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1184 			/*
1185 			 * When writing, handshake with the second buffer
1186 			 * to determine when both are done.  If both are not
1187 			 * done, return here.
1188 			 */
1189 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1190 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1191 				putccdbuf(cbp);
1192 				splx(s);
1193 				return;
1194 			}
1195 		} else {
1196 			/*
1197 			 * When reading, either dispose of the second buffer
1198 			 * or initiate I/O on the second buffer if an error
1199 			 * occured with this one.
1200 			 */
1201 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1203 					cbp->cb_mirror->cb_pflags |=
1204 					    CCDPF_MIRROR_DONE;
1205 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1206 					putccdbuf(cbp);
1207 					splx(s);
1208 					return;
1209 				} else {
1210 					putccdbuf(cbp->cb_mirror);
1211 					/* fall through */
1212 				}
1213 			}
1214 		}
1215 	}
1216 
1217 	/*
1218 	 * use bio_caller1 to determine how big the original request was rather
1219 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1220 	 *
1221 	 * XXX We check for an error, but we do not test the resid for an
1222 	 * aligned EOF condition.  This may result in character & block
1223 	 * device access not recognizing EOF properly when read or written
1224 	 * sequentially, but will not effect filesystems.
1225 	 */
1226 	count = (long)cbp->cb_buf.bio_caller1;
1227 	putccdbuf(cbp);
1228 
1229 	/*
1230 	 * If all done, "interrupt".
1231 	 */
1232 	bp->bio_resid -= count;
1233 	if (bp->bio_resid < 0)
1234 		panic("ccdiodone: count");
1235 	if (bp->bio_resid == 0)
1236 		ccdintr(ccdfind(unit), bp);
1237 	splx(s);
1238 }
1239 
1240 static int
1241 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1242 {
1243 	int unit = ccdunit(dev);
1244 	int i, j, lookedup = 0, error = 0;
1245 	int part, pmask, s;
1246 	struct ccd_s *cs;
1247 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1248 	char **cpp;
1249 	struct vnode **vpp;
1250 
1251 	if (!IS_ALLOCATED(unit))
1252 		return (ENXIO);
1253 	cs = ccdfind(unit);
1254 
1255 	switch (cmd) {
1256 	case CCDIOCSET:
1257 		if (IS_INITED(cs))
1258 			return (EBUSY);
1259 
1260 		if ((flag & FWRITE) == 0)
1261 			return (EBADF);
1262 
1263 		if ((error = ccdlock(cs)) != 0)
1264 			return (error);
1265 
1266 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1267 			return (EINVAL);
1268 
1269 		/* Fill in some important bits. */
1270 		cs->sc_ileave = ccio->ccio_ileave;
1271 		if (cs->sc_ileave == 0 &&
1272 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1273 		     (ccio->ccio_flags & CCDF_PARITY))) {
1274 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1275 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1276 		}
1277 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1278 		    (ccio->ccio_flags & CCDF_PARITY)) {
1279 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1280 			ccio->ccio_flags &= ~CCDF_PARITY;
1281 		}
1282 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1283 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1284 			printf("ccd%d: mirror/parity forces uniform flag\n",
1285 			       unit);
1286 			ccio->ccio_flags |= CCDF_UNIFORM;
1287 		}
1288 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1289 
1290 		/*
1291 		 * Allocate space for and copy in the array of
1292 		 * componet pathnames and device numbers.
1293 		 */
1294 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1295 		    M_DEVBUF, M_WAITOK);
1296 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1297 		    M_DEVBUF, M_WAITOK);
1298 
1299 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1300 		    ccio->ccio_ndisks * sizeof(char **));
1301 		if (error) {
1302 			free(vpp, M_DEVBUF);
1303 			free(cpp, M_DEVBUF);
1304 			ccdunlock(cs);
1305 			return (error);
1306 		}
1307 
1308 #ifdef DEBUG
1309 		if (ccddebug & CCDB_INIT)
1310 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1311 				printf("ccdioctl: component %d: %p\n",
1312 				    i, cpp[i]);
1313 #endif
1314 
1315 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1316 #ifdef DEBUG
1317 			if (ccddebug & CCDB_INIT)
1318 				printf("ccdioctl: lookedup = %d\n", lookedup);
1319 #endif
1320 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1321 				for (j = 0; j < lookedup; ++j)
1322 					(void)vn_close(vpp[j], FREAD|FWRITE,
1323 					    td->td_proc->p_ucred, td);
1324 				free(vpp, M_DEVBUF);
1325 				free(cpp, M_DEVBUF);
1326 				ccdunlock(cs);
1327 				return (error);
1328 			}
1329 			++lookedup;
1330 		}
1331 		cs->sc_vpp = vpp;
1332 		cs->sc_nccdisks = ccio->ccio_ndisks;
1333 
1334 		/*
1335 		 * Initialize the ccd.  Fills in the softc for us.
1336 		 */
1337 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1338 			for (j = 0; j < lookedup; ++j)
1339 				(void)vn_close(vpp[j], FREAD|FWRITE,
1340 				    td->td_proc->p_ucred, td);
1341 			/*
1342 			 * We can't ccddestroy() cs just yet, because nothing
1343 			 * prevents user-level app to do another ioctl()
1344 			 * without closing the device first, therefore
1345 			 * declare unit null and void and let ccdclose()
1346 			 * destroy it when it is safe to do so.
1347 			 */
1348 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1349 			free(vpp, M_DEVBUF);
1350 			free(cpp, M_DEVBUF);
1351 			ccdunlock(cs);
1352 			return (error);
1353 		}
1354 
1355 		/*
1356 		 * The ccd has been successfully initialized, so
1357 		 * we can place it into the array and read the disklabel.
1358 		 */
1359 		ccio->ccio_unit = unit;
1360 		ccio->ccio_size = cs->sc_size;
1361 		ccdgetdisklabel(dev);
1362 
1363 		ccdunlock(cs);
1364 
1365 		break;
1366 
1367 	case CCDIOCCLR:
1368 		if (!IS_INITED(cs))
1369 			return (ENXIO);
1370 
1371 		if ((flag & FWRITE) == 0)
1372 			return (EBADF);
1373 
1374 		if ((error = ccdlock(cs)) != 0)
1375 			return (error);
1376 
1377 		/* Don't unconfigure if any other partitions are open */
1378 		part = ccdpart(dev);
1379 		pmask = (1 << part);
1380 		if ((cs->sc_openmask & ~pmask)) {
1381 			ccdunlock(cs);
1382 			return (EBUSY);
1383 		}
1384 
1385 		/* Declare unit null and void (reset all flags) */
1386 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1387 
1388 		/* Close the components and free their pathnames. */
1389 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1390 			/*
1391 			 * XXX: this close could potentially fail and
1392 			 * cause Bad Things.  Maybe we need to force
1393 			 * the close to happen?
1394 			 */
1395 #ifdef DEBUG
1396 			if (ccddebug & CCDB_VNODE)
1397 				vprint("CCDIOCCLR: vnode info",
1398 				    cs->sc_cinfo[i].ci_vp);
1399 #endif
1400 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1401 			    td->td_proc->p_ucred, td);
1402 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1403 		}
1404 
1405 		/* Free interleave index. */
1406 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1407 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1408 
1409 		/* Free component info and interleave table. */
1410 		free(cs->sc_cinfo, M_DEVBUF);
1411 		free(cs->sc_itable, M_DEVBUF);
1412 		free(cs->sc_vpp, M_DEVBUF);
1413 
1414 		/* And remove the devstat entry. */
1415 		devstat_remove_entry(&cs->device_stats);
1416 
1417 		/* This must be atomic. */
1418 		s = splhigh();
1419 		ccdunlock(cs);
1420 		splx(s);
1421 
1422 		break;
1423 
1424 	case CCDCONFINFO:
1425 		{
1426 			int ninit = 0;
1427 			struct ccdconf *conf = (struct ccdconf *)data;
1428 			struct ccd_s *tmpcs;
1429 			struct ccd_s *ubuf = conf->buffer;
1430 
1431 			/* XXX: LOCK(unique unit numbers) */
1432 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1433 				if (IS_INITED(tmpcs))
1434 					ninit++;
1435 
1436 			if (conf->size == 0) {
1437 				conf->size = sizeof(struct ccd_s) * ninit;
1438 				break;
1439 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1440 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1441 				/* XXX: UNLOCK(unique unit numbers) */
1442 				return (EINVAL);
1443 			}
1444 
1445 			ubuf += ninit;
1446 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1447 				if (!IS_INITED(tmpcs))
1448 					continue;
1449 				error = copyout(tmpcs, --ubuf,
1450 				    sizeof(struct ccd_s));
1451 				if (error != 0)
1452 					/* XXX: UNLOCK(unique unit numbers) */
1453 					return (error);
1454 			}
1455 			/* XXX: UNLOCK(unique unit numbers) */
1456 		}
1457 		break;
1458 
1459 	case CCDCPPINFO:
1460 		if (!IS_INITED(cs))
1461 			return (ENXIO);
1462 
1463 		{
1464 			int len = 0;
1465 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1466 			char *ubuf = cpps->buffer;
1467 
1468 
1469 			for (i = 0; i < cs->sc_nccdisks; ++i)
1470 				len += cs->sc_cinfo[i].ci_pathlen;
1471 
1472 			if (cpps->size == 0) {
1473 				cpps->size = len;
1474 				break;
1475 			} else if (cpps->size != len) {
1476 				return (EINVAL);
1477 			}
1478 
1479 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1480 				len = cs->sc_cinfo[i].ci_pathlen;
1481 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1482 				    len);
1483 				if (error != 0)
1484 					return (error);
1485 				ubuf += len;
1486 			}
1487 		}
1488 		break;
1489 
1490 	case DIOCGDINFO:
1491 		if (!IS_INITED(cs))
1492 			return (ENXIO);
1493 
1494 		*(struct disklabel *)data = cs->sc_label;
1495 		break;
1496 
1497 	case DIOCGPART:
1498 		if (!IS_INITED(cs))
1499 			return (ENXIO);
1500 
1501 		((struct partinfo *)data)->disklab = &cs->sc_label;
1502 		((struct partinfo *)data)->part =
1503 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1504 		break;
1505 
1506 	case DIOCWDINFO:
1507 	case DIOCSDINFO:
1508 		if (!IS_INITED(cs))
1509 			return (ENXIO);
1510 
1511 		if ((flag & FWRITE) == 0)
1512 			return (EBADF);
1513 
1514 		if ((error = ccdlock(cs)) != 0)
1515 			return (error);
1516 
1517 		cs->sc_flags |= CCDF_LABELLING;
1518 
1519 		error = setdisklabel(&cs->sc_label,
1520 		    (struct disklabel *)data, 0);
1521 		if (error == 0) {
1522 			if (cmd == DIOCWDINFO)
1523 				error = writedisklabel(CCDLABELDEV(dev),
1524 				    &cs->sc_label);
1525 		}
1526 
1527 		cs->sc_flags &= ~CCDF_LABELLING;
1528 
1529 		ccdunlock(cs);
1530 
1531 		if (error)
1532 			return (error);
1533 		break;
1534 
1535 	case DIOCWLABEL:
1536 		if (!IS_INITED(cs))
1537 			return (ENXIO);
1538 
1539 		if ((flag & FWRITE) == 0)
1540 			return (EBADF);
1541 		if (*(int *)data != 0)
1542 			cs->sc_flags |= CCDF_WLABEL;
1543 		else
1544 			cs->sc_flags &= ~CCDF_WLABEL;
1545 		break;
1546 
1547 	default:
1548 		return (ENOTTY);
1549 	}
1550 
1551 	return (0);
1552 }
1553 
1554 static int
1555 ccdsize(dev_t dev)
1556 {
1557 	struct ccd_s *cs;
1558 	int part, size;
1559 
1560 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1561 		return (-1);
1562 
1563 	cs = ccdfind(ccdunit(dev));
1564 	part = ccdpart(dev);
1565 
1566 	if (!IS_INITED(cs))
1567 		return (-1);
1568 
1569 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1570 		size = -1;
1571 	else
1572 		size = cs->sc_label.d_partitions[part].p_size;
1573 
1574 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1575 		return (-1);
1576 
1577 	return (size);
1578 }
1579 
/*
 * Dump routine for the ccd device.  Dumping is not implemented, so the
 * call fails with ENXIO for every dev.
 */
static int
ccddump(dev_t dev)
{
	/* Not implemented. */
	return (ENXIO);
}
1587 
1588 /*
1589  * Lookup the provided name in the filesystem.  If the file exists,
1590  * is a valid block device, and isn't being used by anyone else,
1591  * set *vpp to the file's vnode.
1592  */
1593 static int
1594 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1595 {
1596 	struct nameidata nd;
1597 	struct vnode *vp;
1598 	int error, flags;
1599 
1600 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1601 	flags = FREAD | FWRITE;
1602 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1603 #ifdef DEBUG
1604 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1605 			printf("ccdlookup: vn_open error = %d\n", error);
1606 #endif
1607 		return (error);
1608 	}
1609 	vp = nd.ni_vp;
1610 
1611 	if (vp->v_usecount > 1) {
1612 		error = EBUSY;
1613 		goto bad;
1614 	}
1615 
1616 	if (!vn_isdisk(vp, &error))
1617 		goto bad;
1618 
1619 #ifdef DEBUG
1620 	if (ccddebug & CCDB_VNODE)
1621 		vprint("ccdlookup: vnode info", vp);
1622 #endif
1623 
1624 	VOP_UNLOCK(vp, 0, td);
1625 	NDFREE(&nd, NDF_ONLY_PNBUF);
1626 	*vpp = vp;
1627 	return (0);
1628 bad:
1629 	VOP_UNLOCK(vp, 0, td);
1630 	NDFREE(&nd, NDF_ONLY_PNBUF);
1631 	/* vn_close does vrele() for vp */
1632 	(void)vn_close(vp, FREAD|FWRITE, td->td_proc->p_ucred, td);
1633 	return (error);
1634 }
1635 
1636 /*
1637  * Read the disklabel from the ccd.  If one is not present, fake one
1638  * up.
1639  */
1640 static void
1641 ccdgetdisklabel(dev_t dev)
1642 {
1643 	int unit = ccdunit(dev);
1644 	struct ccd_s *cs = ccdfind(unit);
1645 	char *errstring;
1646 	struct disklabel *lp = &cs->sc_label;
1647 	struct ccdgeom *ccg = &cs->sc_geom;
1648 
1649 	bzero(lp, sizeof(*lp));
1650 
1651 	lp->d_secperunit = cs->sc_size;
1652 	lp->d_secsize = ccg->ccg_secsize;
1653 	lp->d_nsectors = ccg->ccg_nsectors;
1654 	lp->d_ntracks = ccg->ccg_ntracks;
1655 	lp->d_ncylinders = ccg->ccg_ncylinders;
1656 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1657 
1658 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1659 	lp->d_type = DTYPE_CCD;
1660 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1661 	lp->d_rpm = 3600;
1662 	lp->d_interleave = 1;
1663 	lp->d_flags = 0;
1664 
1665 	lp->d_partitions[RAW_PART].p_offset = 0;
1666 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1667 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1668 	lp->d_npartitions = RAW_PART + 1;
1669 
1670 	lp->d_bbsize = BBSIZE;				/* XXX */
1671 	lp->d_sbsize = SBSIZE;				/* XXX */
1672 
1673 	lp->d_magic = DISKMAGIC;
1674 	lp->d_magic2 = DISKMAGIC;
1675 	lp->d_checksum = dkcksum(&cs->sc_label);
1676 
1677 	/*
1678 	 * Call the generic disklabel extraction routine.
1679 	 */
1680 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1681 	if (errstring != NULL)
1682 		ccdmakedisklabel(cs);
1683 
1684 #ifdef DEBUG
1685 	/* It's actually extremely common to have unlabeled ccds. */
1686 	if (ccddebug & CCDB_LABEL)
1687 		if (errstring != NULL)
1688 			printf("ccd%d: %s\n", unit, errstring);
1689 #endif
1690 }
1691 
1692 /*
1693  * Take care of things one might want to take care of in the event
1694  * that a disklabel isn't present.
1695  */
1696 static void
1697 ccdmakedisklabel(struct ccd_s *cs)
1698 {
1699 	struct disklabel *lp = &cs->sc_label;
1700 
1701 	/*
1702 	 * For historical reasons, if there's no disklabel present
1703 	 * the raw partition must be marked FS_BSDFFS.
1704 	 */
1705 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1706 
1707 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1708 }
1709 
1710 /*
1711  * Wait interruptibly for an exclusive lock.
1712  *
1713  * XXX
1714  * Several drivers do this; it should be abstracted and made MP-safe.
1715  */
1716 static int
1717 ccdlock(struct ccd_s *cs)
1718 {
1719 	int error;
1720 
1721 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1722 		cs->sc_flags |= CCDF_WANTED;
1723 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1724 			return (error);
1725 	}
1726 	cs->sc_flags |= CCDF_LOCKED;
1727 	return (0);
1728 }
1729 
1730 /*
1731  * Unlock and wake up any waiters.
1732  */
1733 static void
1734 ccdunlock(struct ccd_s *cs)
1735 {
1736 
1737 	cs->sc_flags &= ~CCDF_LOCKED;
1738 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1739 		cs->sc_flags &= ~CCDF_WANTED;
1740 		wakeup(cs);
1741 	}
1742 }
1743 
#ifdef DEBUG
/*
 * Debug helper: print each entry of the interleave table starting at
 * ii.  The walk stops at the first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry, slot;

	entry = 0;
	while (ii->ii_ndisk) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		/* Component indices belonging to this entry. */
		for (slot = 0; slot < ii->ii_ndisk; slot++)
			printf(" %d", ii->ii_index[slot]);
		printf("\n");
		entry++;
		ii++;
	}
}
#endif
1759