xref: /freebsd/sys/geom/geom_ccd.c (revision 8fa113e5fc65fe6abc757f0089f477a87ee4d185)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
106 
107 #include <sys/ccdvar.h>
108 
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
110 
111 #if defined(CCDDEBUG) && !defined(DEBUG)
112 #define DEBUG
113 #endif
114 
115 #ifdef DEBUG
116 #define CCDB_FOLLOW	0x01
117 #define CCDB_INIT	0x02
118 #define CCDB_IO		0x04
119 #define CCDB_LABEL	0x08
120 #define CCDB_VNODE	0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
122     CCDB_VNODE;
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
124 #endif
125 
126 #define	ccdunit(x)	dkunit(x)
127 #define ccdpart(x)	dkpart(x)
128 
129 /*
130    This is how mirroring works (only writes are special):
131 
132    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133    linked together by the cb_mirror field.  "cb_pflags &
134    CCDPF_MIRROR_DONE" is set to 0 on both of them.
135 
136    When a component returns to ccdiodone(), it checks if "cb_pflags &
137    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
138    flag and returns.  If it is, it means its partner has already
139    returned, so it will go to the regular cleanup.
140 
141  */
142 
143 struct ccdbuf {
144 	struct bio	cb_buf;		/* new I/O buf */
145 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
146 	struct ccdbuf	*cb_freenext;	/* free list link */
147 	int		cb_unit;	/* target unit */
148 	int		cb_comp;	/* target component */
149 	int		cb_pflags;	/* mirror/parity status flag */
150 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
151 };
152 
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
155 
156 #define CCDLABELDEV(dev)	\
157 	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
158 
159 /* convenient macros for often-used statements */
160 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
161 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
162 
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
169 
170 #define NCCDFREEHIWAT	16
171 
172 #define CDEV_MAJOR 74
173 
174 static struct cdevsw ccd_cdevsw = {
175 	/* open */	ccdopen,
176 	/* close */	ccdclose,
177 	/* read */	physread,
178 	/* write */	physwrite,
179 	/* ioctl */	ccdioctl,
180 	/* poll */	nopoll,
181 	/* mmap */	nommap,
182 	/* strategy */	ccdstrategy,
183 	/* name */	"ccd",
184 	/* maj */	CDEV_MAJOR,
185 	/* dump */	ccddump,
186 	/* psize */	ccdsize,
187 	/* flags */	D_DISK,
188 };
189 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 
191 static struct ccd_s *ccdfind(int);
192 static struct ccd_s *ccdnew(int);
193 static int ccddestroy(struct ccd_s *, struct proc *);
194 
195 /* called during module initialization */
196 static void ccdattach(void);
197 static int ccd_modevent(module_t, int, void *);
198 
199 /* called by biodone() at interrupt time */
200 static void ccdiodone(struct bio *bp);
201 
202 static void ccdstart(struct ccd_s *, struct bio *);
203 static void ccdinterleave(struct ccd_s *, int);
204 static void ccdintr(struct ccd_s *, struct bio *);
205 static int ccdinit(struct ccd_s *, char **, struct thread *);
206 static int ccdlookup(char *, struct thread *p, struct vnode **);
207 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
208 		      struct bio *, daddr_t, caddr_t, long);
209 static void ccdgetdisklabel(dev_t);
210 static void ccdmakedisklabel(struct ccd_s *);
211 static int ccdlock(struct ccd_s *);
212 static void ccdunlock(struct ccd_s *);
213 
214 #ifdef DEBUG
215 static void printiinfo(struct ccdiinfo *);
216 #endif
217 
218 /* Non-private for the benefit of libkvm. */
219 struct ccdbuf *ccdfreebufs;
220 static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
280 /*
281  * Number of blocks to leave untouched in front of a component partition.
282  * This is to avoid violating its disklabel area when it starts at the
283  * beginning of the slice.
284  */
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
287 #endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	case MOD_SHUTDOWN:
376 		break;
377 
378 	default:
379 		error = EOPNOTSUPP;
380 	}
381 	return (error);
382 }
383 
384 DEV_MODULE(ccd, ccd_modevent, NULL);
385 
386 static int
387 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
388 {
389 	struct ccdcinfo *ci = NULL;	/* XXX */
390 	size_t size;
391 	int ix;
392 	struct vnode *vp;
393 	size_t minsize;
394 	int maxsecsize;
395 	struct partinfo dpart;
396 	struct ccdgeom *ccg = &cs->sc_geom;
397 	char tmppath[MAXPATHLEN];
398 	int error = 0;
399 
400 #ifdef DEBUG
401 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
402 		printf("ccdinit: unit %d\n", cs->sc_unit);
403 #endif
404 
405 	cs->sc_size = 0;
406 
407 	/* Allocate space for the component info. */
408 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
409 	    M_DEVBUF, M_WAITOK);
410 
411 	/*
412 	 * Verify that each component piece exists and record
413 	 * relevant information about it.
414 	 */
415 	maxsecsize = 0;
416 	minsize = 0;
417 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
418 		vp = cs->sc_vpp[ix];
419 		ci = &cs->sc_cinfo[ix];
420 		ci->ci_vp = vp;
421 
422 		/*
423 		 * Copy in the pathname of the component.
424 		 */
425 		bzero(tmppath, sizeof(tmppath));	/* sanity */
426 		if ((error = copyinstr(cpaths[ix], tmppath,
427 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
428 #ifdef DEBUG
429 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
430 				printf("ccd%d: can't copy path, error = %d\n",
431 				    cs->sc_unit, error);
432 #endif
433 			goto fail;
434 		}
435 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
436 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
437 
438 		ci->ci_dev = vn_todev(vp);
439 
440 		/*
441 		 * Get partition information for the component.
442 		 */
443 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
444 		    FREAD, td->td_proc->p_ucred, td)) != 0) {
445 #ifdef DEBUG
446 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
447 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
448 				     cs->sc_unit, ci->ci_path, error);
449 #endif
450 			goto fail;
451 		}
452 		if (dpart.part->p_fstype == FS_BSDFFS) {
453 			maxsecsize =
454 			    ((dpart.disklab->d_secsize > maxsecsize) ?
455 			    dpart.disklab->d_secsize : maxsecsize);
456 			size = dpart.part->p_size - CCD_OFFSET;
457 		} else {
458 #ifdef DEBUG
459 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
460 				printf("ccd%d: %s: incorrect partition type\n",
461 				    cs->sc_unit, ci->ci_path);
462 #endif
463 			error = EFTYPE;
464 			goto fail;
465 		}
466 
467 		/*
468 		 * Calculate the size, truncating to an interleave
469 		 * boundary if necessary.
470 		 */
471 
472 		if (cs->sc_ileave > 1)
473 			size -= size % cs->sc_ileave;
474 
475 		if (size == 0) {
476 #ifdef DEBUG
477 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
478 				printf("ccd%d: %s: size == 0\n",
479 				    cs->sc_unit, ci->ci_path);
480 #endif
481 			error = ENODEV;
482 			goto fail;
483 		}
484 
485 		if (minsize == 0 || size < minsize)
486 			minsize = size;
487 		ci->ci_size = size;
488 		cs->sc_size += size;
489 	}
490 
491 	/*
492 	 * Don't allow the interleave to be smaller than
493 	 * the biggest component sector.
494 	 */
495 	if ((cs->sc_ileave > 0) &&
496 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
497 #ifdef DEBUG
498 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
499 			printf("ccd%d: interleave must be at least %d\n",
500 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
501 #endif
502 		error = EINVAL;
503 		goto fail;
504 	}
505 
506 	/*
507 	 * If uniform interleave is desired set all sizes to that of
508 	 * the smallest component.  This will guarentee that a single
509 	 * interleave table is generated.
510 	 *
511 	 * Lost space must be taken into account when calculating the
512 	 * overall size.  Half the space is lost when CCDF_MIRROR is
513 	 * specified.  One disk is lost when CCDF_PARITY is specified.
514 	 */
515 	if (cs->sc_flags & CCDF_UNIFORM) {
516 		for (ci = cs->sc_cinfo;
517 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
518 			ci->ci_size = minsize;
519 		}
520 		if (cs->sc_flags & CCDF_MIRROR) {
521 			/*
522 			 * Check to see if an even number of components
523 			 * have been specified.  The interleave must also
524 			 * be non-zero in order for us to be able to
525 			 * guarentee the topology.
526 			 */
527 			if (cs->sc_nccdisks % 2) {
528 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
529 				error = EINVAL;
530 				goto fail;
531 			}
532 			if (cs->sc_ileave == 0) {
533 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
534 				error = EINVAL;
535 				goto fail;
536 			}
537 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
538 		} else if (cs->sc_flags & CCDF_PARITY) {
539 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
540 		} else {
541 			if (cs->sc_ileave == 0) {
542 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
543 				error = EINVAL;
544 				goto fail;
545 			}
546 			cs->sc_size = cs->sc_nccdisks * minsize;
547 		}
548 	}
549 
550 	/*
551 	 * Construct the interleave table.
552 	 */
553 	ccdinterleave(cs, cs->sc_unit);
554 
555 	/*
556 	 * Create pseudo-geometry based on 1MB cylinders.  It's
557 	 * pretty close.
558 	 */
559 	ccg->ccg_secsize = maxsecsize;
560 	ccg->ccg_ntracks = 1;
561 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
562 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
563 
564 	/*
565 	 * Add an devstat entry for this device.
566 	 */
567 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
568 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
569 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
570 			  DEVSTAT_PRIORITY_ARRAY);
571 
572 	cs->sc_flags |= CCDF_INITED;
573 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
574 	return (0);
575 fail:
576 	while (ci > cs->sc_cinfo) {
577 		ci--;
578 		free(ci->ci_path, M_DEVBUF);
579 	}
580 	free(cs->sc_cinfo, M_DEVBUF);
581 	return (error);
582 }
583 
584 static void
585 ccdinterleave(struct ccd_s *cs, int unit)
586 {
587 	struct ccdcinfo *ci, *smallci;
588 	struct ccdiinfo *ii;
589 	daddr_t bn, lbn;
590 	int ix;
591 	u_long size;
592 
593 #ifdef DEBUG
594 	if (ccddebug & CCDB_INIT)
595 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
596 #endif
597 
598 	/*
599 	 * Allocate an interleave table.  The worst case occurs when each
600 	 * of N disks is of a different size, resulting in N interleave
601 	 * tables.
602 	 *
603 	 * Chances are this is too big, but we don't care.
604 	 */
605 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
606 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
607 	    M_WAITOK | M_ZERO);
608 
609 	/*
610 	 * Trivial case: no interleave (actually interleave of disk size).
611 	 * Each table entry represents a single component in its entirety.
612 	 *
613 	 * An interleave of 0 may not be used with a mirror or parity setup.
614 	 */
615 	if (cs->sc_ileave == 0) {
616 		bn = 0;
617 		ii = cs->sc_itable;
618 
619 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
620 			/* Allocate space for ii_index. */
621 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
622 			ii->ii_ndisk = 1;
623 			ii->ii_startblk = bn;
624 			ii->ii_startoff = 0;
625 			ii->ii_index[0] = ix;
626 			bn += cs->sc_cinfo[ix].ci_size;
627 			ii++;
628 		}
629 		ii->ii_ndisk = 0;
630 #ifdef DEBUG
631 		if (ccddebug & CCDB_INIT)
632 			printiinfo(cs->sc_itable);
633 #endif
634 		return;
635 	}
636 
637 	/*
638 	 * The following isn't fast or pretty; it doesn't have to be.
639 	 */
640 	size = 0;
641 	bn = lbn = 0;
642 	for (ii = cs->sc_itable; ; ii++) {
643 		/*
644 		 * Allocate space for ii_index.  We might allocate more then
645 		 * we use.
646 		 */
647 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
648 		    M_DEVBUF, M_WAITOK);
649 
650 		/*
651 		 * Locate the smallest of the remaining components
652 		 */
653 		smallci = NULL;
654 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
655 		    ci++) {
656 			if (ci->ci_size > size &&
657 			    (smallci == NULL ||
658 			     ci->ci_size < smallci->ci_size)) {
659 				smallci = ci;
660 			}
661 		}
662 
663 		/*
664 		 * Nobody left, all done
665 		 */
666 		if (smallci == NULL) {
667 			ii->ii_ndisk = 0;
668 			break;
669 		}
670 
671 		/*
672 		 * Record starting logical block using an sc_ileave blocksize.
673 		 */
674 		ii->ii_startblk = bn / cs->sc_ileave;
675 
676 		/*
677 		 * Record starting comopnent block using an sc_ileave
678 		 * blocksize.  This value is relative to the beginning of
679 		 * a component disk.
680 		 */
681 		ii->ii_startoff = lbn;
682 
683 		/*
684 		 * Determine how many disks take part in this interleave
685 		 * and record their indices.
686 		 */
687 		ix = 0;
688 		for (ci = cs->sc_cinfo;
689 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
690 			if (ci->ci_size >= smallci->ci_size) {
691 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
692 			}
693 		}
694 		ii->ii_ndisk = ix;
695 		bn += ix * (smallci->ci_size - size);
696 		lbn = smallci->ci_size / cs->sc_ileave;
697 		size = smallci->ci_size;
698 	}
699 #ifdef DEBUG
700 	if (ccddebug & CCDB_INIT)
701 		printiinfo(cs->sc_itable);
702 #endif
703 }
704 
705 /* ARGSUSED */
706 static int
707 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
708 {
709 	int unit = ccdunit(dev);
710 	struct ccd_s *cs;
711 	struct disklabel *lp;
712 	int error = 0, part, pmask;
713 
714 #ifdef DEBUG
715 	if (ccddebug & CCDB_FOLLOW)
716 		printf("ccdopen(%p, %x)\n", dev, flags);
717 #endif
718 
719 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
720 
721 	if ((error = ccdlock(cs)) != 0)
722 		return (error);
723 
724 	lp = &cs->sc_label;
725 
726 	part = ccdpart(dev);
727 	pmask = (1 << part);
728 
729 	/*
730 	 * If we're initialized, check to see if there are any other
731 	 * open partitions.  If not, then it's safe to update
732 	 * the in-core disklabel.
733 	 */
734 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
735 		ccdgetdisklabel(dev);
736 
737 	/* Check that the partition exists. */
738 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
739 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
740 		error = ENXIO;
741 		goto done;
742 	}
743 
744 	cs->sc_openmask |= pmask;
745  done:
746 	ccdunlock(cs);
747 	return (0);
748 }
749 
750 /* ARGSUSED */
751 static int
752 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
753 {
754 	int unit = ccdunit(dev);
755 	struct ccd_s *cs;
756 	int error = 0, part;
757 
758 #ifdef DEBUG
759 	if (ccddebug & CCDB_FOLLOW)
760 		printf("ccdclose(%p, %x)\n", dev, flags);
761 #endif
762 
763 	if (!IS_ALLOCATED(unit))
764 		return (ENXIO);
765 	cs = ccdfind(unit);
766 
767 	if ((error = ccdlock(cs)) != 0)
768 		return (error);
769 
770 	part = ccdpart(dev);
771 
772 	/* ...that much closer to allowing unconfiguration... */
773 	cs->sc_openmask &= ~(1 << part);
774 	/* collect "garbage" if possible */
775 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
776 		ccddestroy(cs, td->td_proc);
777 	else
778 		ccdunlock(cs);
779 	return (0);
780 }
781 
782 static void
783 ccdstrategy(struct bio *bp)
784 {
785 	int unit = ccdunit(bp->bio_dev);
786 	struct ccd_s *cs = ccdfind(unit);
787 	int s;
788 	int wlabel;
789 	struct disklabel *lp;
790 
791 #ifdef DEBUG
792 	if (ccddebug & CCDB_FOLLOW)
793 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
794 #endif
795 	if (!IS_INITED(cs)) {
796 		biofinish(bp, NULL, ENXIO);
797 		return;
798 	}
799 
800 	/* If it's a nil transfer, wake up the top half now. */
801 	if (bp->bio_bcount == 0) {
802 		biodone(bp);
803 		return;
804 	}
805 
806 	lp = &cs->sc_label;
807 
808 	/*
809 	 * Do bounds checking and adjust transfer.  If there's an
810 	 * error, the bounds check will flag that for us.
811 	 */
812 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
813 	if (ccdpart(bp->bio_dev) != RAW_PART) {
814 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
815 			biodone(bp);
816 			return;
817 		}
818 	} else {
819 		int pbn;        /* in sc_secsize chunks */
820 		long sz;        /* in sc_secsize chunks */
821 
822 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
823 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
824 
825 		/*
826 		 * If out of bounds return an error. If at the EOF point,
827 		 * simply read or write less.
828 		 */
829 
830 		if (pbn < 0 || pbn >= cs->sc_size) {
831 			bp->bio_resid = bp->bio_bcount;
832 			if (pbn != cs->sc_size)
833 				biofinish(bp, NULL, EINVAL);
834 			else
835 				biodone(bp);
836 			return;
837 		}
838 
839 		/*
840 		 * If the request crosses EOF, truncate the request.
841 		 */
842 		if (pbn + sz > cs->sc_size) {
843 			bp->bio_bcount = (cs->sc_size - pbn) *
844 			    cs->sc_geom.ccg_secsize;
845 		}
846 	}
847 
848 	bp->bio_resid = bp->bio_bcount;
849 
850 	/*
851 	 * "Start" the unit.
852 	 */
853 	s = splbio();
854 	ccdstart(cs, bp);
855 	splx(s);
856 	return;
857 }
858 
859 static void
860 ccdstart(struct ccd_s *cs, struct bio *bp)
861 {
862 	long bcount, rcount;
863 	struct ccdbuf *cbp[4];
864 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
865 	caddr_t addr;
866 	daddr_t bn;
867 	struct partition *pp;
868 
869 #ifdef DEBUG
870 	if (ccddebug & CCDB_FOLLOW)
871 		printf("ccdstart(%p, %p)\n", cs, bp);
872 #endif
873 
874 	/* Record the transaction start  */
875 	devstat_start_transaction(&cs->device_stats);
876 
877 	/*
878 	 * Translate the partition-relative block number to an absolute.
879 	 */
880 	bn = bp->bio_blkno;
881 	if (ccdpart(bp->bio_dev) != RAW_PART) {
882 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
883 		bn += pp->p_offset;
884 	}
885 
886 	/*
887 	 * Allocate component buffers and fire off the requests
888 	 */
889 	addr = bp->bio_data;
890 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
891 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
892 		rcount = cbp[0]->cb_buf.bio_bcount;
893 
894 		if (cs->sc_cflags & CCDF_MIRROR) {
895 			/*
896 			 * Mirroring.  Writes go to both disks, reads are
897 			 * taken from whichever disk seems most appropriate.
898 			 *
899 			 * We attempt to localize reads to the disk whos arm
900 			 * is nearest the read request.  We ignore seeks due
901 			 * to writes when making this determination and we
902 			 * also try to avoid hogging.
903 			 */
904 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
905 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
906 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
907 			} else {
908 				int pick = cs->sc_pick;
909 				daddr_t range = cs->sc_size / 16;
910 
911 				if (bn < cs->sc_blk[pick] - range ||
912 				    bn > cs->sc_blk[pick] + range
913 				) {
914 					cs->sc_pick = pick = 1 - pick;
915 				}
916 				cs->sc_blk[pick] = bn + btodb(rcount);
917 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
918 			}
919 		} else {
920 			/*
921 			 * Not mirroring
922 			 */
923 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
924 		}
925 		bn += btodb(rcount);
926 		addr += rcount;
927 	}
928 }
929 
930 /*
931  * Build a component buffer header.
932  */
933 static void
934 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
935 {
936 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
937 	struct ccdbuf *cbp;
938 	daddr_t cbn, cboff;
939 	off_t cbc;
940 
941 #ifdef DEBUG
942 	if (ccddebug & CCDB_IO)
943 		printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
944 		       cs, bp, bn, addr, bcount);
945 #endif
946 	/*
947 	 * Determine which component bn falls in.
948 	 */
949 	cbn = bn;
950 	cboff = 0;
951 
952 	if (cs->sc_ileave == 0) {
953 		/*
954 		 * Serially concatenated and neither a mirror nor a parity
955 		 * config.  This is a special case.
956 		 */
957 		daddr_t sblk;
958 
959 		sblk = 0;
960 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
961 			sblk += ci->ci_size;
962 		cbn -= sblk;
963 	} else {
964 		struct ccdiinfo *ii;
965 		int ccdisk, off;
966 
967 		/*
968 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
969 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
970 		 * to cbn.
971 		 */
972 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
973 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
974 
975 		/*
976 		 * Figure out which interleave table to use.
977 		 */
978 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
979 			if (ii->ii_startblk > cbn)
980 				break;
981 		}
982 		ii--;
983 
984 		/*
985 		 * off is the logical superblock relative to the beginning
986 		 * of this interleave block.
987 		 */
988 		off = cbn - ii->ii_startblk;
989 
990 		/*
991 		 * We must calculate which disk component to use (ccdisk),
992 		 * and recalculate cbn to be the superblock relative to
993 		 * the beginning of the component.  This is typically done by
994 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
995 		 * must typically be divided by the number of components in
996 		 * this interleave array to be properly convert it from a
997 		 * CCD-relative logical superblock number to a
998 		 * component-relative superblock number.
999 		 */
1000 		if (ii->ii_ndisk == 1) {
1001 			/*
1002 			 * When we have just one disk, it can't be a mirror
1003 			 * or a parity config.
1004 			 */
1005 			ccdisk = ii->ii_index[0];
1006 			cbn = ii->ii_startoff + off;
1007 		} else {
1008 			if (cs->sc_cflags & CCDF_MIRROR) {
1009 				/*
1010 				 * We have forced a uniform mapping, resulting
1011 				 * in a single interleave array.  We double
1012 				 * up on the first half of the available
1013 				 * components and our mirror is in the second
1014 				 * half.  This only works with a single
1015 				 * interleave array because doubling up
1016 				 * doubles the number of sectors, so there
1017 				 * cannot be another interleave array because
1018 				 * the next interleave array's calculations
1019 				 * would be off.
1020 				 */
1021 				int ndisk2 = ii->ii_ndisk / 2;
1022 				ccdisk = ii->ii_index[off % ndisk2];
1023 				cbn = ii->ii_startoff + off / ndisk2;
1024 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1025 			} else if (cs->sc_cflags & CCDF_PARITY) {
1026 				/*
1027 				 * XXX not implemented yet
1028 				 */
1029 				int ndisk2 = ii->ii_ndisk - 1;
1030 				ccdisk = ii->ii_index[off % ndisk2];
1031 				cbn = ii->ii_startoff + off / ndisk2;
1032 				if (cbn % ii->ii_ndisk <= ccdisk)
1033 					ccdisk++;
1034 			} else {
1035 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1036 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1037 			}
1038 		}
1039 
1040 		ci = &cs->sc_cinfo[ccdisk];
1041 
1042 		/*
1043 		 * Convert cbn from a superblock to a normal block so it
1044 		 * can be used to calculate (along with cboff) the normal
1045 		 * block index into this particular disk.
1046 		 */
1047 		cbn *= cs->sc_ileave;
1048 	}
1049 
1050 	/*
1051 	 * Fill in the component buf structure.
1052 	 */
1053 	cbp = getccdbuf(NULL);
1054 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
1055 	cbp->cb_buf.bio_done = ccdiodone;
1056 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
1057 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1058 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1059 	cbp->cb_buf.bio_data = addr;
1060 	if (cs->sc_ileave == 0)
1061               cbc = dbtob((off_t)(ci->ci_size - cbn));
1062 	else
1063               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1064 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1065  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1066 
1067 	/*
1068 	 * context for ccdiodone
1069 	 */
1070 	cbp->cb_obp = bp;
1071 	cbp->cb_unit = cs->sc_unit;
1072 	cbp->cb_comp = ci - cs->sc_cinfo;
1073 
1074 #ifdef DEBUG
1075 	if (ccddebug & CCDB_IO)
1076 		printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1077 		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1078 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1079 		       cbp->cb_buf.bio_bcount);
1080 #endif
1081 	cb[0] = cbp;
1082 
1083 	/*
1084 	 * Note: both I/O's setup when reading from mirror, but only one
1085 	 * will be executed.
1086 	 */
1087 	if (cs->sc_cflags & CCDF_MIRROR) {
1088 		/* mirror, setup second I/O */
1089 		cbp = getccdbuf(cb[0]);
1090 		cbp->cb_buf.bio_dev = ci2->ci_dev;
1091 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1092 		cb[1] = cbp;
1093 		/* link together the ccdbuf's and clear "mirror done" flag */
1094 		cb[0]->cb_mirror = cb[1];
1095 		cb[1]->cb_mirror = cb[0];
1096 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1097 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1098 	}
1099 }
1100 
1101 static void
1102 ccdintr(struct ccd_s *cs, struct bio *bp)
1103 {
1104 #ifdef DEBUG
1105 	if (ccddebug & CCDB_FOLLOW)
1106 		printf("ccdintr(%p, %p)\n", cs, bp);
1107 #endif
1108 	/*
1109 	 * Request is done for better or worse, wakeup the top half.
1110 	 */
1111 	if (bp->bio_flags & BIO_ERROR)
1112 		bp->bio_resid = bp->bio_bcount;
1113 	biofinish(bp, &cs->device_stats, 0);
1114 }
1115 
1116 /*
1117  * Called at interrupt time.
1118  * Mark the component as done and if all components are done,
1119  * take a ccd interrupt.
1120  */
1121 static void
1122 ccdiodone(struct bio *ibp)
1123 {
1124 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1125 	struct bio *bp = cbp->cb_obp;
1126 	int unit = cbp->cb_unit;
1127 	int count, s;
1128 
1129 	s = splbio();
1130 #ifdef DEBUG
1131 	if (ccddebug & CCDB_FOLLOW)
1132 		printf("ccdiodone(%p)\n", cbp);
1133 	if (ccddebug & CCDB_IO) {
1134 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1135 		       bp, bp->bio_bcount, bp->bio_resid);
1136 		printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1137 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1138 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1139 		       cbp->cb_buf.bio_bcount);
1140 	}
1141 #endif
1142 	/*
1143 	 * If an error occured, report it.  If this is a mirrored
1144 	 * configuration and the first of two possible reads, do not
1145 	 * set the error in the bp yet because the second read may
1146 	 * succeed.
1147 	 */
1148 
1149 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1150 		const char *msg = "";
1151 
1152 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1153 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1154 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1155 			/*
1156 			 * We will try our read on the other disk down
1157 			 * below, also reverse the default pick so if we
1158 			 * are doing a scan we do not keep hitting the
1159 			 * bad disk first.
1160 			 */
1161 			struct ccd_s *cs = ccdfind(unit);
1162 
1163 			msg = ", trying other disk";
1164 			cs->sc_pick = 1 - cs->sc_pick;
1165 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1166 		} else {
1167 			bp->bio_flags |= BIO_ERROR;
1168 			bp->bio_error = cbp->cb_buf.bio_error ?
1169 			    cbp->cb_buf.bio_error : EIO;
1170 		}
1171 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1172 		       unit, bp->bio_error, cbp->cb_comp,
1173 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1174 	}
1175 
1176 	/*
1177 	 * Process mirror.  If we are writing, I/O has been initiated on both
1178 	 * buffers and we fall through only after both are finished.
1179 	 *
1180 	 * If we are reading only one I/O is initiated at a time.  If an
1181 	 * error occurs we initiate the second I/O and return, otherwise
1182 	 * we free the second I/O without initiating it.
1183 	 */
1184 
1185 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1186 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1187 			/*
1188 			 * When writing, handshake with the second buffer
1189 			 * to determine when both are done.  If both are not
1190 			 * done, return here.
1191 			 */
1192 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1193 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1194 				putccdbuf(cbp);
1195 				splx(s);
1196 				return;
1197 			}
1198 		} else {
1199 			/*
1200 			 * When reading, either dispose of the second buffer
1201 			 * or initiate I/O on the second buffer if an error
1202 			 * occured with this one.
1203 			 */
1204 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1205 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1206 					cbp->cb_mirror->cb_pflags |=
1207 					    CCDPF_MIRROR_DONE;
1208 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1209 					putccdbuf(cbp);
1210 					splx(s);
1211 					return;
1212 				} else {
1213 					putccdbuf(cbp->cb_mirror);
1214 					/* fall through */
1215 				}
1216 			}
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * use bio_caller1 to determine how big the original request was rather
1222 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1223 	 *
1224 	 * XXX We check for an error, but we do not test the resid for an
1225 	 * aligned EOF condition.  This may result in character & block
1226 	 * device access not recognizing EOF properly when read or written
1227 	 * sequentially, but will not effect filesystems.
1228 	 */
1229 	count = (long)cbp->cb_buf.bio_caller1;
1230 	putccdbuf(cbp);
1231 
1232 	/*
1233 	 * If all done, "interrupt".
1234 	 */
1235 	bp->bio_resid -= count;
1236 	if (bp->bio_resid < 0)
1237 		panic("ccdiodone: count");
1238 	if (bp->bio_resid == 0)
1239 		ccdintr(ccdfind(unit), bp);
1240 	splx(s);
1241 }
1242 
1243 static int
1244 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1245 {
1246 	int unit = ccdunit(dev);
1247 	int i, j, lookedup = 0, error = 0;
1248 	int part, pmask, s;
1249 	struct ccd_s *cs;
1250 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1251 	char **cpp;
1252 	struct vnode **vpp;
1253 
1254 	if (!IS_ALLOCATED(unit))
1255 		return (ENXIO);
1256 	cs = ccdfind(unit);
1257 
1258 	switch (cmd) {
1259 	case CCDIOCSET:
1260 		if (IS_INITED(cs))
1261 			return (EBUSY);
1262 
1263 		if ((flag & FWRITE) == 0)
1264 			return (EBADF);
1265 
1266 		if ((error = ccdlock(cs)) != 0)
1267 			return (error);
1268 
1269 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1270 			return (EINVAL);
1271 
1272 		/* Fill in some important bits. */
1273 		cs->sc_ileave = ccio->ccio_ileave;
1274 		if (cs->sc_ileave == 0 &&
1275 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1276 		     (ccio->ccio_flags & CCDF_PARITY))) {
1277 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1278 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1279 		}
1280 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1281 		    (ccio->ccio_flags & CCDF_PARITY)) {
1282 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1283 			ccio->ccio_flags &= ~CCDF_PARITY;
1284 		}
1285 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1286 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1287 			printf("ccd%d: mirror/parity forces uniform flag\n",
1288 			       unit);
1289 			ccio->ccio_flags |= CCDF_UNIFORM;
1290 		}
1291 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1292 
1293 		/*
1294 		 * Allocate space for and copy in the array of
1295 		 * componet pathnames and device numbers.
1296 		 */
1297 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1298 		    M_DEVBUF, M_WAITOK);
1299 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1300 		    M_DEVBUF, M_WAITOK);
1301 
1302 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1303 		    ccio->ccio_ndisks * sizeof(char **));
1304 		if (error) {
1305 			free(vpp, M_DEVBUF);
1306 			free(cpp, M_DEVBUF);
1307 			ccdunlock(cs);
1308 			return (error);
1309 		}
1310 
1311 #ifdef DEBUG
1312 		if (ccddebug & CCDB_INIT)
1313 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1314 				printf("ccdioctl: component %d: %p\n",
1315 				    i, cpp[i]);
1316 #endif
1317 
1318 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1319 #ifdef DEBUG
1320 			if (ccddebug & CCDB_INIT)
1321 				printf("ccdioctl: lookedup = %d\n", lookedup);
1322 #endif
1323 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1324 				for (j = 0; j < lookedup; ++j)
1325 					(void)vn_close(vpp[j], FREAD|FWRITE,
1326 					    td->td_proc->p_ucred, td);
1327 				free(vpp, M_DEVBUF);
1328 				free(cpp, M_DEVBUF);
1329 				ccdunlock(cs);
1330 				return (error);
1331 			}
1332 			++lookedup;
1333 		}
1334 		cs->sc_vpp = vpp;
1335 		cs->sc_nccdisks = ccio->ccio_ndisks;
1336 
1337 		/*
1338 		 * Initialize the ccd.  Fills in the softc for us.
1339 		 */
1340 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1341 			for (j = 0; j < lookedup; ++j)
1342 				(void)vn_close(vpp[j], FREAD|FWRITE,
1343 				    td->td_proc->p_ucred, td);
1344 			/*
1345 			 * We can't ccddestroy() cs just yet, because nothing
1346 			 * prevents user-level app to do another ioctl()
1347 			 * without closing the device first, therefore
1348 			 * declare unit null and void and let ccdclose()
1349 			 * destroy it when it is safe to do so.
1350 			 */
1351 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1352 			free(vpp, M_DEVBUF);
1353 			free(cpp, M_DEVBUF);
1354 			ccdunlock(cs);
1355 			return (error);
1356 		}
1357 
1358 		/*
1359 		 * The ccd has been successfully initialized, so
1360 		 * we can place it into the array and read the disklabel.
1361 		 */
1362 		ccio->ccio_unit = unit;
1363 		ccio->ccio_size = cs->sc_size;
1364 		ccdgetdisklabel(dev);
1365 
1366 		ccdunlock(cs);
1367 
1368 		break;
1369 
1370 	case CCDIOCCLR:
1371 		if (!IS_INITED(cs))
1372 			return (ENXIO);
1373 
1374 		if ((flag & FWRITE) == 0)
1375 			return (EBADF);
1376 
1377 		if ((error = ccdlock(cs)) != 0)
1378 			return (error);
1379 
1380 		/* Don't unconfigure if any other partitions are open */
1381 		part = ccdpart(dev);
1382 		pmask = (1 << part);
1383 		if ((cs->sc_openmask & ~pmask)) {
1384 			ccdunlock(cs);
1385 			return (EBUSY);
1386 		}
1387 
1388 		/* Declare unit null and void (reset all flags) */
1389 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1390 
1391 		/* Close the components and free their pathnames. */
1392 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1393 			/*
1394 			 * XXX: this close could potentially fail and
1395 			 * cause Bad Things.  Maybe we need to force
1396 			 * the close to happen?
1397 			 */
1398 #ifdef DEBUG
1399 			if (ccddebug & CCDB_VNODE)
1400 				vprint("CCDIOCCLR: vnode info",
1401 				    cs->sc_cinfo[i].ci_vp);
1402 #endif
1403 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1404 			    td->td_proc->p_ucred, td);
1405 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1406 		}
1407 
1408 		/* Free interleave index. */
1409 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1410 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1411 
1412 		/* Free component info and interleave table. */
1413 		free(cs->sc_cinfo, M_DEVBUF);
1414 		free(cs->sc_itable, M_DEVBUF);
1415 		free(cs->sc_vpp, M_DEVBUF);
1416 
1417 		/* And remove the devstat entry. */
1418 		devstat_remove_entry(&cs->device_stats);
1419 
1420 		/* This must be atomic. */
1421 		s = splhigh();
1422 		ccdunlock(cs);
1423 		splx(s);
1424 
1425 		break;
1426 
1427 	case CCDCONFINFO:
1428 		{
1429 			int ninit = 0;
1430 			struct ccdconf *conf = (struct ccdconf *)data;
1431 			struct ccd_s *tmpcs;
1432 			struct ccd_s *ubuf = conf->buffer;
1433 
1434 			/* XXX: LOCK(unique unit numbers) */
1435 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1436 				if (IS_INITED(tmpcs))
1437 					ninit++;
1438 
1439 			if (conf->size == 0) {
1440 				conf->size = sizeof(struct ccd_s) * ninit;
1441 				break;
1442 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1443 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1444 				/* XXX: UNLOCK(unique unit numbers) */
1445 				return (EINVAL);
1446 			}
1447 
1448 			ubuf += ninit;
1449 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1450 				if (!IS_INITED(tmpcs))
1451 					continue;
1452 				error = copyout(tmpcs, --ubuf,
1453 				    sizeof(struct ccd_s));
1454 				if (error != 0)
1455 					/* XXX: UNLOCK(unique unit numbers) */
1456 					return (error);
1457 			}
1458 			/* XXX: UNLOCK(unique unit numbers) */
1459 		}
1460 		break;
1461 
1462 	case CCDCPPINFO:
1463 		if (!IS_INITED(cs))
1464 			return (ENXIO);
1465 
1466 		{
1467 			int len = 0;
1468 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1469 			char *ubuf = cpps->buffer;
1470 
1471 
1472 			for (i = 0; i < cs->sc_nccdisks; ++i)
1473 				len += cs->sc_cinfo[i].ci_pathlen;
1474 
1475 			if (cpps->size == 0) {
1476 				cpps->size = len;
1477 				break;
1478 			} else if (cpps->size != len) {
1479 				return (EINVAL);
1480 			}
1481 
1482 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1483 				len = cs->sc_cinfo[i].ci_pathlen;
1484 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1485 				    len);
1486 				if (error != 0)
1487 					return (error);
1488 				ubuf += len;
1489 			}
1490 		}
1491 		break;
1492 
1493 	case DIOCGDINFO:
1494 		if (!IS_INITED(cs))
1495 			return (ENXIO);
1496 
1497 		*(struct disklabel *)data = cs->sc_label;
1498 		break;
1499 
1500 	case DIOCGPART:
1501 		if (!IS_INITED(cs))
1502 			return (ENXIO);
1503 
1504 		((struct partinfo *)data)->disklab = &cs->sc_label;
1505 		((struct partinfo *)data)->part =
1506 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1507 		break;
1508 
1509 	case DIOCWDINFO:
1510 	case DIOCSDINFO:
1511 		if (!IS_INITED(cs))
1512 			return (ENXIO);
1513 
1514 		if ((flag & FWRITE) == 0)
1515 			return (EBADF);
1516 
1517 		if ((error = ccdlock(cs)) != 0)
1518 			return (error);
1519 
1520 		cs->sc_flags |= CCDF_LABELLING;
1521 
1522 		error = setdisklabel(&cs->sc_label,
1523 		    (struct disklabel *)data, 0);
1524 		if (error == 0) {
1525 			if (cmd == DIOCWDINFO)
1526 				error = writedisklabel(CCDLABELDEV(dev),
1527 				    &cs->sc_label);
1528 		}
1529 
1530 		cs->sc_flags &= ~CCDF_LABELLING;
1531 
1532 		ccdunlock(cs);
1533 
1534 		if (error)
1535 			return (error);
1536 		break;
1537 
1538 	case DIOCWLABEL:
1539 		if (!IS_INITED(cs))
1540 			return (ENXIO);
1541 
1542 		if ((flag & FWRITE) == 0)
1543 			return (EBADF);
1544 		if (*(int *)data != 0)
1545 			cs->sc_flags |= CCDF_WLABEL;
1546 		else
1547 			cs->sc_flags &= ~CCDF_WLABEL;
1548 		break;
1549 
1550 	default:
1551 		return (ENOTTY);
1552 	}
1553 
1554 	return (0);
1555 }
1556 
1557 static int
1558 ccdsize(dev_t dev)
1559 {
1560 	struct ccd_s *cs;
1561 	int part, size;
1562 
1563 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1564 		return (-1);
1565 
1566 	cs = ccdfind(ccdunit(dev));
1567 	part = ccdpart(dev);
1568 
1569 	if (!IS_INITED(cs))
1570 		return (-1);
1571 
1572 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1573 		size = -1;
1574 	else
1575 		size = cs->sc_label.d_partitions[part].p_size;
1576 
1577 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1578 		return (-1);
1579 
1580 	return (size);
1581 }
1582 
/*
 * Crash-dump entry point.  Dumping to a ccd is not implemented, so this
 * always fails with ENXIO.
 */
static int
ccddump(dev_t dev)
{
	(void)dev;	/* unused: there is nothing to dump to */

	return (ENXIO);
}
1590 
1591 /*
1592  * Lookup the provided name in the filesystem.  If the file exists,
1593  * is a valid block device, and isn't being used by anyone else,
1594  * set *vpp to the file's vnode.
1595  */
1596 static int
1597 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1598 {
1599 	struct nameidata nd;
1600 	struct vnode *vp;
1601 	int error, flags;
1602 
1603 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1604 	flags = FREAD | FWRITE;
1605 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1606 #ifdef DEBUG
1607 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1608 			printf("ccdlookup: vn_open error = %d\n", error);
1609 #endif
1610 		return (error);
1611 	}
1612 	vp = nd.ni_vp;
1613 
1614 	if (vp->v_usecount > 1) {
1615 		error = EBUSY;
1616 		goto bad;
1617 	}
1618 
1619 	if (!vn_isdisk(vp, &error))
1620 		goto bad;
1621 
1622 #ifdef DEBUG
1623 	if (ccddebug & CCDB_VNODE)
1624 		vprint("ccdlookup: vnode info", vp);
1625 #endif
1626 
1627 	VOP_UNLOCK(vp, 0, td);
1628 	NDFREE(&nd, NDF_ONLY_PNBUF);
1629 	*vpp = vp;
1630 	return (0);
1631 bad:
1632 	VOP_UNLOCK(vp, 0, td);
1633 	NDFREE(&nd, NDF_ONLY_PNBUF);
1634 	/* vn_close does vrele() for vp */
1635 	(void)vn_close(vp, FREAD|FWRITE, td->td_proc->p_ucred, td);
1636 	return (error);
1637 }
1638 
1639 /*
1640  * Read the disklabel from the ccd.  If one is not present, fake one
1641  * up.
1642  */
1643 static void
1644 ccdgetdisklabel(dev_t dev)
1645 {
1646 	int unit = ccdunit(dev);
1647 	struct ccd_s *cs = ccdfind(unit);
1648 	char *errstring;
1649 	struct disklabel *lp = &cs->sc_label;
1650 	struct ccdgeom *ccg = &cs->sc_geom;
1651 
1652 	bzero(lp, sizeof(*lp));
1653 
1654 	lp->d_secperunit = cs->sc_size;
1655 	lp->d_secsize = ccg->ccg_secsize;
1656 	lp->d_nsectors = ccg->ccg_nsectors;
1657 	lp->d_ntracks = ccg->ccg_ntracks;
1658 	lp->d_ncylinders = ccg->ccg_ncylinders;
1659 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1660 
1661 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1662 	lp->d_type = DTYPE_CCD;
1663 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1664 	lp->d_rpm = 3600;
1665 	lp->d_interleave = 1;
1666 	lp->d_flags = 0;
1667 
1668 	lp->d_partitions[RAW_PART].p_offset = 0;
1669 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1670 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1671 	lp->d_npartitions = RAW_PART + 1;
1672 
1673 	lp->d_bbsize = BBSIZE;				/* XXX */
1674 	lp->d_sbsize = SBSIZE;				/* XXX */
1675 
1676 	lp->d_magic = DISKMAGIC;
1677 	lp->d_magic2 = DISKMAGIC;
1678 	lp->d_checksum = dkcksum(&cs->sc_label);
1679 
1680 	/*
1681 	 * Call the generic disklabel extraction routine.
1682 	 */
1683 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1684 	if (errstring != NULL)
1685 		ccdmakedisklabel(cs);
1686 
1687 #ifdef DEBUG
1688 	/* It's actually extremely common to have unlabeled ccds. */
1689 	if (ccddebug & CCDB_LABEL)
1690 		if (errstring != NULL)
1691 			printf("ccd%d: %s\n", unit, errstring);
1692 #endif
1693 }
1694 
1695 /*
1696  * Take care of things one might want to take care of in the event
1697  * that a disklabel isn't present.
1698  */
1699 static void
1700 ccdmakedisklabel(struct ccd_s *cs)
1701 {
1702 	struct disklabel *lp = &cs->sc_label;
1703 
1704 	/*
1705 	 * For historical reasons, if there's no disklabel present
1706 	 * the raw partition must be marked FS_BSDFFS.
1707 	 */
1708 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1709 
1710 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1711 }
1712 
1713 /*
1714  * Wait interruptibly for an exclusive lock.
1715  *
1716  * XXX
1717  * Several drivers do this; it should be abstracted and made MP-safe.
1718  */
1719 static int
1720 ccdlock(struct ccd_s *cs)
1721 {
1722 	int error;
1723 
1724 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1725 		cs->sc_flags |= CCDF_WANTED;
1726 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1727 			return (error);
1728 	}
1729 	cs->sc_flags |= CCDF_LOCKED;
1730 	return (0);
1731 }
1732 
1733 /*
1734  * Unlock and wake up any waiters.
1735  */
1736 static void
1737 ccdunlock(struct ccd_s *cs)
1738 {
1739 
1740 	cs->sc_flags &= ~CCDF_LOCKED;
1741 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1742 		cs->sc_flags &= ~CCDF_WANTED;
1743 		wakeup(cs);
1744 	}
1745 }
1746 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table.  The table is
 * walked until an entry whose ii_ndisk is zero is reached.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;
	int k;

	while (ii->ii_ndisk != 0) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		for (k = 0; k < ii->ii_ndisk; k++)
			printf(" %d", ii->ii_index[k]);
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1762