xref: /freebsd/sys/geom/geom_ccd.c (revision 69c9999d0ca45b210e75706ab4952ad5a33ce6ec)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/stdint.h>
101 #include <sys/sysctl.h>
102 #include <sys/disk.h>
103 #include <sys/disklabel.h>
104 #include <sys/devicestat.h>
105 #include <sys/fcntl.h>
106 #include <sys/vnode.h>
107 
108 #include <sys/ccdvar.h>
109 
/*
 * Malloc type for the driver.  In the code visible here it is used for the
 * per-unit softc (see ccdnew()/ccddestroy()); the component and interleave
 * tables are allocated from M_DEVBUF instead.
 */
MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/* Building with CCDDEBUG implies the driver-local DEBUG code paths. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/*
 * Bits in ccddebug selecting which diagnostic printf()s are emitted.
 * CCDB_FOLLOW traces function entry, CCDB_INIT covers configuration and
 * CCDB_IO covers per-buffer I/O.  CCDB_LABEL and CCDB_VNODE are presumably
 * for the disklabel and vnode lookup code (not referenced in this part of
 * the file -- confirm against the rest of the driver).
 */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* All categories are enabled by default; tunable at run time via sysctl. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#endif

/* Extract the ccd unit number / partition number from a dev_t. */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
129 
130 /*
131    This is how mirroring works (only writes are special):
132 
133    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
134    linked together by the cb_mirror field.  "cb_pflags &
135    CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 
137    When a component returns to ccdiodone(), it checks if "cb_pflags &
138    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
139    flag and returns.  If it is, it means its partner has already
140    returned, so it will go to the regular cleanup.
141 
142  */
143 
/*
 * One request issued to a single component on behalf of an original bio.
 *
 * cb_buf must stay the first member: ccdbuffer() hands &cb_buf to the
 * component with bio_done set to ccdiodone(), and ccdiodone() casts the
 * bio pointer it receives straight back to a struct ccdbuf pointer.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
153 
/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* dev_t of the raw partition (RAW_PART) of the unit that dev belongs to. */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))

/* convenient macros for often-used statements */
/* IS_ALLOCATED: a softc exists for the unit; IS_INITED: it is configured. */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* Device entry points referenced from ccd_cdevsw. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_psize_t ccdsize;

/* Maximum number of ccdbufs kept on the free list (see putccdbuf()). */
#define NCCDFREEHIWAT	16

/* Major device number used in ccd_cdevsw. */
#define CDEV_MAJOR 74
173 
/*
 * Device switch for ccd devices.  Entry points not implemented by this
 * driver are filled with the generic phys*/no* handlers.
 */
static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
};
/* All allocated softc structures; searched by unit number in ccdfind(). */
static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 
/* softc list management, keyed by unit number */
static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *, struct proc *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

/* configuration and I/O path helpers */
static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static void ccdintr(struct ccd_s *, struct bio *);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		      struct bio *, daddr_t, caddr_t, long);
static void ccdgetdisklabel(dev_t);
static void ccdmakedisklabel(struct ccd_s *);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);

#ifdef DEBUG
static void printiinfo(struct ccdiinfo *);
#endif

/*
 * Free list of ccdbufs (linked through cb_freenext) and its current
 * length; maintained by getccdbuf()/putccdbuf().
 */
/* Non-private for the benefit of libkvm. */
struct ccdbuf *ccdfreebufs;
static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This avoids overwriting the component's disklabel area when
 * the partition starts at the beginning of the slice.  May be overridden
 * at build time by defining CCD_OFFSET.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	case MOD_SHUTDOWN:
376 		break;
377 
378 	default:
379 		error = EOPNOTSUPP;
380 	}
381 	return (error);
382 }
383 
384 DEV_MODULE(ccd, ccd_modevent, NULL);
385 
386 static int
387 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
388 {
389 	struct ccdcinfo *ci = NULL;	/* XXX */
390 	size_t size;
391 	int ix;
392 	struct vnode *vp;
393 	size_t minsize;
394 	int maxsecsize;
395 	struct ccdgeom *ccg = &cs->sc_geom;
396 	char *tmppath = NULL;
397 	int error = 0;
398 	off_t mediasize;
399 	u_int sectorsize;
400 
401 #ifdef DEBUG
402 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
403 		printf("ccdinit: unit %d\n", cs->sc_unit);
404 #endif
405 
406 	cs->sc_size = 0;
407 
408 	/* Allocate space for the component info. */
409 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
410 	    M_DEVBUF, M_WAITOK);
411 
412 	/*
413 	 * Verify that each component piece exists and record
414 	 * relevant information about it.
415 	 */
416 	maxsecsize = 0;
417 	minsize = 0;
418 	tmppath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
419 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
420 		vp = cs->sc_vpp[ix];
421 		ci = &cs->sc_cinfo[ix];
422 		ci->ci_vp = vp;
423 
424 		/*
425 		 * Copy in the pathname of the component.
426 		 */
427 		if ((error = copyinstr(cpaths[ix], tmppath,
428 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
429 #ifdef DEBUG
430 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 				printf("ccd%d: can't copy path, error = %d\n",
432 				    cs->sc_unit, error);
433 #endif
434 			goto fail;
435 		}
436 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
437 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
438 
439 		ci->ci_dev = vn_todev(vp);
440 
441 		/*
442 		 * Get partition information for the component.
443 		 */
444 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
445 		    FREAD, td->td_ucred, td);
446 		if (error != 0) {
447 #ifdef DEBUG
448 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
449 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
450 				     cs->sc_unit, ci->ci_path, error);
451 #endif
452 			goto fail;
453 		}
454 		/*
455 		 * Get partition information for the component.
456 		 */
457 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
458 		    FREAD, td->td_ucred, td);
459 		if (error != 0) {
460 #ifdef DEBUG
461 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
462 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
463 				     cs->sc_unit, ci->ci_path, error);
464 #endif
465 			goto fail;
466 		}
467 		if (sectorsize > maxsecsize)
468 			maxsecsize = sectorsize;
469 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
470 
471 		/*
472 		 * Calculate the size, truncating to an interleave
473 		 * boundary if necessary.
474 		 */
475 
476 		if (cs->sc_ileave > 1)
477 			size -= size % cs->sc_ileave;
478 
479 		if (size == 0) {
480 #ifdef DEBUG
481 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
482 				printf("ccd%d: %s: size == 0\n",
483 				    cs->sc_unit, ci->ci_path);
484 #endif
485 			error = ENODEV;
486 			goto fail;
487 		}
488 
489 		if (minsize == 0 || size < minsize)
490 			minsize = size;
491 		ci->ci_size = size;
492 		cs->sc_size += size;
493 	}
494 
495 	free(tmppath, M_DEVBUF);
496 	tmppath = NULL;
497 
498 	/*
499 	 * Don't allow the interleave to be smaller than
500 	 * the biggest component sector.
501 	 */
502 	if ((cs->sc_ileave > 0) &&
503 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
504 #ifdef DEBUG
505 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
506 			printf("ccd%d: interleave must be at least %d\n",
507 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
508 #endif
509 		error = EINVAL;
510 		goto fail;
511 	}
512 
513 	/*
514 	 * If uniform interleave is desired set all sizes to that of
515 	 * the smallest component.  This will guarentee that a single
516 	 * interleave table is generated.
517 	 *
518 	 * Lost space must be taken into account when calculating the
519 	 * overall size.  Half the space is lost when CCDF_MIRROR is
520 	 * specified.  One disk is lost when CCDF_PARITY is specified.
521 	 */
522 	if (cs->sc_flags & CCDF_UNIFORM) {
523 		for (ci = cs->sc_cinfo;
524 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
525 			ci->ci_size = minsize;
526 		}
527 		if (cs->sc_flags & CCDF_MIRROR) {
528 			/*
529 			 * Check to see if an even number of components
530 			 * have been specified.  The interleave must also
531 			 * be non-zero in order for us to be able to
532 			 * guarentee the topology.
533 			 */
534 			if (cs->sc_nccdisks % 2) {
535 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
536 				error = EINVAL;
537 				goto fail;
538 			}
539 			if (cs->sc_ileave == 0) {
540 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
541 				error = EINVAL;
542 				goto fail;
543 			}
544 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
545 		} else {
546 			if (cs->sc_ileave == 0) {
547 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
548 				error = EINVAL;
549 				goto fail;
550 			}
551 			cs->sc_size = cs->sc_nccdisks * minsize;
552 		}
553 	}
554 
555 	/*
556 	 * Construct the interleave table.
557 	 */
558 	ccdinterleave(cs, cs->sc_unit);
559 
560 	/*
561 	 * Create pseudo-geometry based on 1MB cylinders.  It's
562 	 * pretty close.
563 	 */
564 	ccg->ccg_secsize = maxsecsize;
565 	ccg->ccg_ntracks = 1;
566 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
567 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
568 
569 	/*
570 	 * Add a devstat entry for this device.
571 	 */
572 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
573 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
574 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
575 			  DEVSTAT_PRIORITY_ARRAY);
576 
577 	cs->sc_flags |= CCDF_INITED;
578 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
579 	return (0);
580 fail:
581 	while (ci > cs->sc_cinfo) {
582 		ci--;
583 		free(ci->ci_path, M_DEVBUF);
584 	}
585 	if (tmppath != NULL)
586 		free(tmppath, M_DEVBUF);
587 	free(cs->sc_cinfo, M_DEVBUF);
588 	return (error);
589 }
590 
591 static void
592 ccdinterleave(struct ccd_s *cs, int unit)
593 {
594 	struct ccdcinfo *ci, *smallci;
595 	struct ccdiinfo *ii;
596 	daddr_t bn, lbn;
597 	int ix;
598 	u_long size;
599 
600 #ifdef DEBUG
601 	if (ccddebug & CCDB_INIT)
602 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
603 #endif
604 
605 	/*
606 	 * Allocate an interleave table.  The worst case occurs when each
607 	 * of N disks is of a different size, resulting in N interleave
608 	 * tables.
609 	 *
610 	 * Chances are this is too big, but we don't care.
611 	 */
612 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
613 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
614 	    M_WAITOK | M_ZERO);
615 
616 	/*
617 	 * Trivial case: no interleave (actually interleave of disk size).
618 	 * Each table entry represents a single component in its entirety.
619 	 *
620 	 * An interleave of 0 may not be used with a mirror or parity setup.
621 	 */
622 	if (cs->sc_ileave == 0) {
623 		bn = 0;
624 		ii = cs->sc_itable;
625 
626 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
627 			/* Allocate space for ii_index. */
628 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
629 			ii->ii_ndisk = 1;
630 			ii->ii_startblk = bn;
631 			ii->ii_startoff = 0;
632 			ii->ii_index[0] = ix;
633 			bn += cs->sc_cinfo[ix].ci_size;
634 			ii++;
635 		}
636 		ii->ii_ndisk = 0;
637 #ifdef DEBUG
638 		if (ccddebug & CCDB_INIT)
639 			printiinfo(cs->sc_itable);
640 #endif
641 		return;
642 	}
643 
644 	/*
645 	 * The following isn't fast or pretty; it doesn't have to be.
646 	 */
647 	size = 0;
648 	bn = lbn = 0;
649 	for (ii = cs->sc_itable; ; ii++) {
650 		/*
651 		 * Allocate space for ii_index.  We might allocate more then
652 		 * we use.
653 		 */
654 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
655 		    M_DEVBUF, M_WAITOK);
656 
657 		/*
658 		 * Locate the smallest of the remaining components
659 		 */
660 		smallci = NULL;
661 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
662 		    ci++) {
663 			if (ci->ci_size > size &&
664 			    (smallci == NULL ||
665 			     ci->ci_size < smallci->ci_size)) {
666 				smallci = ci;
667 			}
668 		}
669 
670 		/*
671 		 * Nobody left, all done
672 		 */
673 		if (smallci == NULL) {
674 			ii->ii_ndisk = 0;
675 			break;
676 		}
677 
678 		/*
679 		 * Record starting logical block using an sc_ileave blocksize.
680 		 */
681 		ii->ii_startblk = bn / cs->sc_ileave;
682 
683 		/*
684 		 * Record starting comopnent block using an sc_ileave
685 		 * blocksize.  This value is relative to the beginning of
686 		 * a component disk.
687 		 */
688 		ii->ii_startoff = lbn;
689 
690 		/*
691 		 * Determine how many disks take part in this interleave
692 		 * and record their indices.
693 		 */
694 		ix = 0;
695 		for (ci = cs->sc_cinfo;
696 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
697 			if (ci->ci_size >= smallci->ci_size) {
698 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
699 			}
700 		}
701 		ii->ii_ndisk = ix;
702 		bn += ix * (smallci->ci_size - size);
703 		lbn = smallci->ci_size / cs->sc_ileave;
704 		size = smallci->ci_size;
705 	}
706 #ifdef DEBUG
707 	if (ccddebug & CCDB_INIT)
708 		printiinfo(cs->sc_itable);
709 #endif
710 }
711 
712 /* ARGSUSED */
713 static int
714 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
715 {
716 	int unit = ccdunit(dev);
717 	struct ccd_s *cs;
718 	struct disklabel *lp;
719 	int error = 0, part, pmask;
720 
721 #ifdef DEBUG
722 	if (ccddebug & CCDB_FOLLOW)
723 		printf("ccdopen(%p, %x)\n", dev, flags);
724 #endif
725 
726 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
727 
728 	if ((error = ccdlock(cs)) != 0)
729 		return (error);
730 
731 	lp = &cs->sc_label;
732 
733 	part = ccdpart(dev);
734 	pmask = (1 << part);
735 
736 	/*
737 	 * If we're initialized, check to see if there are any other
738 	 * open partitions.  If not, then it's safe to update
739 	 * the in-core disklabel.
740 	 */
741 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
742 		ccdgetdisklabel(dev);
743 
744 	/* Check that the partition exists. */
745 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
746 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
747 		error = ENXIO;
748 		goto done;
749 	}
750 
751 	cs->sc_openmask |= pmask;
752  done:
753 	ccdunlock(cs);
754 	return (0);
755 }
756 
757 /* ARGSUSED */
758 static int
759 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
760 {
761 	int unit = ccdunit(dev);
762 	struct ccd_s *cs;
763 	int error = 0, part;
764 
765 #ifdef DEBUG
766 	if (ccddebug & CCDB_FOLLOW)
767 		printf("ccdclose(%p, %x)\n", dev, flags);
768 #endif
769 
770 	if (!IS_ALLOCATED(unit))
771 		return (ENXIO);
772 	cs = ccdfind(unit);
773 
774 	if ((error = ccdlock(cs)) != 0)
775 		return (error);
776 
777 	part = ccdpart(dev);
778 
779 	/* ...that much closer to allowing unconfiguration... */
780 	cs->sc_openmask &= ~(1 << part);
781 	/* collect "garbage" if possible */
782 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
783 		ccddestroy(cs, td->td_proc);
784 	else
785 		ccdunlock(cs);
786 	return (0);
787 }
788 
789 static void
790 ccdstrategy(struct bio *bp)
791 {
792 	int unit = ccdunit(bp->bio_dev);
793 	struct ccd_s *cs = ccdfind(unit);
794 	int s;
795 	int wlabel;
796 	struct disklabel *lp;
797 
798 #ifdef DEBUG
799 	if (ccddebug & CCDB_FOLLOW)
800 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
801 #endif
802 	if (!IS_INITED(cs)) {
803 		biofinish(bp, NULL, ENXIO);
804 		return;
805 	}
806 
807 	/* If it's a nil transfer, wake up the top half now. */
808 	if (bp->bio_bcount == 0) {
809 		biodone(bp);
810 		return;
811 	}
812 
813 	lp = &cs->sc_label;
814 
815 	/*
816 	 * Do bounds checking and adjust transfer.  If there's an
817 	 * error, the bounds check will flag that for us.
818 	 */
819 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
820 	if (ccdpart(bp->bio_dev) != RAW_PART) {
821 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
822 			biodone(bp);
823 			return;
824 		}
825 	} else {
826 		int pbn;        /* in sc_secsize chunks */
827 		long sz;        /* in sc_secsize chunks */
828 
829 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
830 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
831 
832 		/*
833 		 * If out of bounds return an error. If at the EOF point,
834 		 * simply read or write less.
835 		 */
836 
837 		if (pbn < 0 || pbn >= cs->sc_size) {
838 			bp->bio_resid = bp->bio_bcount;
839 			if (pbn != cs->sc_size)
840 				biofinish(bp, NULL, EINVAL);
841 			else
842 				biodone(bp);
843 			return;
844 		}
845 
846 		/*
847 		 * If the request crosses EOF, truncate the request.
848 		 */
849 		if (pbn + sz > cs->sc_size) {
850 			bp->bio_bcount = (cs->sc_size - pbn) *
851 			    cs->sc_geom.ccg_secsize;
852 		}
853 	}
854 
855 	bp->bio_resid = bp->bio_bcount;
856 
857 	/*
858 	 * "Start" the unit.
859 	 */
860 	s = splbio();
861 	ccdstart(cs, bp);
862 	splx(s);
863 	return;
864 }
865 
866 static void
867 ccdstart(struct ccd_s *cs, struct bio *bp)
868 {
869 	long bcount, rcount;
870 	struct ccdbuf *cbp[4];
871 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
872 	caddr_t addr;
873 	daddr_t bn;
874 	struct partition *pp;
875 
876 #ifdef DEBUG
877 	if (ccddebug & CCDB_FOLLOW)
878 		printf("ccdstart(%p, %p)\n", cs, bp);
879 #endif
880 
881 	/* Record the transaction start  */
882 	devstat_start_transaction(&cs->device_stats);
883 
884 	/*
885 	 * Translate the partition-relative block number to an absolute.
886 	 */
887 	bn = bp->bio_blkno;
888 	if (ccdpart(bp->bio_dev) != RAW_PART) {
889 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
890 		bn += pp->p_offset;
891 	}
892 
893 	/*
894 	 * Allocate component buffers and fire off the requests
895 	 */
896 	addr = bp->bio_data;
897 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
898 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
899 		rcount = cbp[0]->cb_buf.bio_bcount;
900 
901 		if (cs->sc_cflags & CCDF_MIRROR) {
902 			/*
903 			 * Mirroring.  Writes go to both disks, reads are
904 			 * taken from whichever disk seems most appropriate.
905 			 *
906 			 * We attempt to localize reads to the disk whos arm
907 			 * is nearest the read request.  We ignore seeks due
908 			 * to writes when making this determination and we
909 			 * also try to avoid hogging.
910 			 */
911 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
912 				BIO_STRATEGY(&cbp[0]->cb_buf);
913 				BIO_STRATEGY(&cbp[1]->cb_buf);
914 			} else {
915 				int pick = cs->sc_pick;
916 				daddr_t range = cs->sc_size / 16;
917 
918 				if (bn < cs->sc_blk[pick] - range ||
919 				    bn > cs->sc_blk[pick] + range
920 				) {
921 					cs->sc_pick = pick = 1 - pick;
922 				}
923 				cs->sc_blk[pick] = bn + btodb(rcount);
924 				BIO_STRATEGY(&cbp[pick]->cb_buf);
925 			}
926 		} else {
927 			/*
928 			 * Not mirroring
929 			 */
930 			BIO_STRATEGY(&cbp[0]->cb_buf);
931 		}
932 		bn += btodb(rcount);
933 		addr += rcount;
934 	}
935 }
936 
937 /*
938  * Build a component buffer header.
939  */
940 static void
941 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
942 {
943 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
944 	struct ccdbuf *cbp;
945 	daddr_t cbn, cboff;
946 	off_t cbc;
947 
948 #ifdef DEBUG
949 	if (ccddebug & CCDB_IO)
950 		printf("ccdbuffer(%p, %p, %lld, %p, %ld)\n",
951 		    (void *)cs, (void *)bp, (long long)bn, (void *)addr,
952 		    bcount);
953 #endif
954 	/*
955 	 * Determine which component bn falls in.
956 	 */
957 	cbn = bn;
958 	cboff = 0;
959 
960 	if (cs->sc_ileave == 0) {
961 		/*
962 		 * Serially concatenated and neither a mirror nor a parity
963 		 * config.  This is a special case.
964 		 */
965 		daddr_t sblk;
966 
967 		sblk = 0;
968 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
969 			sblk += ci->ci_size;
970 		cbn -= sblk;
971 	} else {
972 		struct ccdiinfo *ii;
973 		int ccdisk, off;
974 
975 		/*
976 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
977 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
978 		 * to cbn.
979 		 */
980 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
981 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
982 
983 		/*
984 		 * Figure out which interleave table to use.
985 		 */
986 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
987 			if (ii->ii_startblk > cbn)
988 				break;
989 		}
990 		ii--;
991 
992 		/*
993 		 * off is the logical superblock relative to the beginning
994 		 * of this interleave block.
995 		 */
996 		off = cbn - ii->ii_startblk;
997 
998 		/*
999 		 * We must calculate which disk component to use (ccdisk),
1000 		 * and recalculate cbn to be the superblock relative to
1001 		 * the beginning of the component.  This is typically done by
1002 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1003 		 * must typically be divided by the number of components in
1004 		 * this interleave array to be properly convert it from a
1005 		 * CCD-relative logical superblock number to a
1006 		 * component-relative superblock number.
1007 		 */
1008 		if (ii->ii_ndisk == 1) {
1009 			/*
1010 			 * When we have just one disk, it can't be a mirror
1011 			 * or a parity config.
1012 			 */
1013 			ccdisk = ii->ii_index[0];
1014 			cbn = ii->ii_startoff + off;
1015 		} else {
1016 			if (cs->sc_cflags & CCDF_MIRROR) {
1017 				/*
1018 				 * We have forced a uniform mapping, resulting
1019 				 * in a single interleave array.  We double
1020 				 * up on the first half of the available
1021 				 * components and our mirror is in the second
1022 				 * half.  This only works with a single
1023 				 * interleave array because doubling up
1024 				 * doubles the number of sectors, so there
1025 				 * cannot be another interleave array because
1026 				 * the next interleave array's calculations
1027 				 * would be off.
1028 				 */
1029 				int ndisk2 = ii->ii_ndisk / 2;
1030 				ccdisk = ii->ii_index[off % ndisk2];
1031 				cbn = ii->ii_startoff + off / ndisk2;
1032 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1033 			} else {
1034 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1035 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1036 			}
1037 		}
1038 
1039 		ci = &cs->sc_cinfo[ccdisk];
1040 
1041 		/*
1042 		 * Convert cbn from a superblock to a normal block so it
1043 		 * can be used to calculate (along with cboff) the normal
1044 		 * block index into this particular disk.
1045 		 */
1046 		cbn *= cs->sc_ileave;
1047 	}
1048 
1049 	/*
1050 	 * Fill in the component buf structure.
1051 	 */
1052 	cbp = getccdbuf(NULL);
1053 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
1054 	cbp->cb_buf.bio_done = ccdiodone;
1055 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
1056 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1057 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1058 	cbp->cb_buf.bio_data = addr;
1059 	if (cs->sc_ileave == 0)
1060               cbc = dbtob((off_t)(ci->ci_size - cbn));
1061 	else
1062               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1063 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1064  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1065 
1066 	/*
1067 	 * context for ccdiodone
1068 	 */
1069 	cbp->cb_obp = bp;
1070 	cbp->cb_unit = cs->sc_unit;
1071 	cbp->cb_comp = ci - cs->sc_cinfo;
1072 
1073 #ifdef DEBUG
1074 	if (ccddebug & CCDB_IO)
1075 		printf(" dev %p(u%ld): cbp %p bn %jd addr %p bcnt %ld\n",
1076 		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1077 		       (intmax_t)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1078 		       cbp->cb_buf.bio_bcount);
1079 #endif
1080 	cb[0] = cbp;
1081 
1082 	/*
1083 	 * Note: both I/O's setup when reading from mirror, but only one
1084 	 * will be executed.
1085 	 */
1086 	if (cs->sc_cflags & CCDF_MIRROR) {
1087 		/* mirror, setup second I/O */
1088 		cbp = getccdbuf(cb[0]);
1089 		cbp->cb_buf.bio_dev = ci2->ci_dev;
1090 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1091 		cb[1] = cbp;
1092 		/* link together the ccdbuf's and clear "mirror done" flag */
1093 		cb[0]->cb_mirror = cb[1];
1094 		cb[1]->cb_mirror = cb[0];
1095 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1096 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1097 	}
1098 }
1099 
1100 static void
1101 ccdintr(struct ccd_s *cs, struct bio *bp)
1102 {
1103 #ifdef DEBUG
1104 	if (ccddebug & CCDB_FOLLOW)
1105 		printf("ccdintr(%p, %p)\n", cs, bp);
1106 #endif
1107 	/*
1108 	 * Request is done for better or worse, wakeup the top half.
1109 	 */
1110 	if (bp->bio_flags & BIO_ERROR)
1111 		bp->bio_resid = bp->bio_bcount;
1112 	biofinish(bp, &cs->device_stats, 0);
1113 }
1114 
1115 /*
1116  * Called at interrupt time.
1117  * Mark the component as done and if all components are done,
1118  * take a ccd interrupt.
1119  */
1120 static void
1121 ccdiodone(struct bio *ibp)
1122 {
1123 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1124 	struct bio *bp = cbp->cb_obp;
1125 	int unit = cbp->cb_unit;
1126 	int count, s;
1127 
1128 	s = splbio();
1129 #ifdef DEBUG
1130 	if (ccddebug & CCDB_FOLLOW)
1131 		printf("ccdiodone(%p)\n", cbp);
1132 	if (ccddebug & CCDB_IO) {
1133 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1134 		       bp, bp->bio_bcount, bp->bio_resid);
1135 		printf(" dev %p(u%d), cbp %p bn %jd addr %p bcnt %ld\n",
1136 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1137 		       (intmax_t)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1138 		       cbp->cb_buf.bio_bcount);
1139 	}
1140 #endif
1141 	/*
1142 	 * If an error occured, report it.  If this is a mirrored
1143 	 * configuration and the first of two possible reads, do not
1144 	 * set the error in the bp yet because the second read may
1145 	 * succeed.
1146 	 */
1147 
1148 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1149 		const char *msg = "";
1150 
1151 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1152 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1153 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1154 			/*
1155 			 * We will try our read on the other disk down
1156 			 * below, also reverse the default pick so if we
1157 			 * are doing a scan we do not keep hitting the
1158 			 * bad disk first.
1159 			 */
1160 			struct ccd_s *cs = ccdfind(unit);
1161 
1162 			msg = ", trying other disk";
1163 			cs->sc_pick = 1 - cs->sc_pick;
1164 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1165 		} else {
1166 			bp->bio_flags |= BIO_ERROR;
1167 			bp->bio_error = cbp->cb_buf.bio_error ?
1168 			    cbp->cb_buf.bio_error : EIO;
1169 		}
1170 		printf("ccd%d: error %d on component %d block %jd "
1171 		    "(ccd block %jd)%s\n", unit, bp->bio_error, cbp->cb_comp,
1172 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
1173 		    msg);
1174 	}
1175 
1176 	/*
1177 	 * Process mirror.  If we are writing, I/O has been initiated on both
1178 	 * buffers and we fall through only after both are finished.
1179 	 *
1180 	 * If we are reading only one I/O is initiated at a time.  If an
1181 	 * error occurs we initiate the second I/O and return, otherwise
1182 	 * we free the second I/O without initiating it.
1183 	 */
1184 
1185 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1186 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1187 			/*
1188 			 * When writing, handshake with the second buffer
1189 			 * to determine when both are done.  If both are not
1190 			 * done, return here.
1191 			 */
1192 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1193 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1194 				putccdbuf(cbp);
1195 				splx(s);
1196 				return;
1197 			}
1198 		} else {
1199 			/*
1200 			 * When reading, either dispose of the second buffer
1201 			 * or initiate I/O on the second buffer if an error
1202 			 * occured with this one.
1203 			 */
1204 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1205 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1206 					cbp->cb_mirror->cb_pflags |=
1207 					    CCDPF_MIRROR_DONE;
1208 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
1209 					putccdbuf(cbp);
1210 					splx(s);
1211 					return;
1212 				} else {
1213 					putccdbuf(cbp->cb_mirror);
1214 					/* fall through */
1215 				}
1216 			}
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * use bio_caller1 to determine how big the original request was rather
1222 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1223 	 *
1224 	 * XXX We check for an error, but we do not test the resid for an
1225 	 * aligned EOF condition.  This may result in character & block
1226 	 * device access not recognizing EOF properly when read or written
1227 	 * sequentially, but will not effect filesystems.
1228 	 */
1229 	count = (long)cbp->cb_buf.bio_caller1;
1230 	putccdbuf(cbp);
1231 
1232 	/*
1233 	 * If all done, "interrupt".
1234 	 */
1235 	bp->bio_resid -= count;
1236 	if (bp->bio_resid < 0)
1237 		panic("ccdiodone: count");
1238 	if (bp->bio_resid == 0)
1239 		ccdintr(ccdfind(unit), bp);
1240 	splx(s);
1241 }
1242 
1243 static int
1244 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1245 {
1246 	int unit = ccdunit(dev);
1247 	int i, j, lookedup = 0, error = 0;
1248 	int part, pmask, s;
1249 	struct ccd_s *cs;
1250 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1251 	char **cpp;
1252 	struct vnode **vpp;
1253 
1254 	if (!IS_ALLOCATED(unit))
1255 		return (ENXIO);
1256 	cs = ccdfind(unit);
1257 
1258 	switch (cmd) {
1259 	case CCDIOCSET:
1260 		if (IS_INITED(cs))
1261 			return (EBUSY);
1262 
1263 		if ((flag & FWRITE) == 0)
1264 			return (EBADF);
1265 
1266 		if ((error = ccdlock(cs)) != 0)
1267 			return (error);
1268 
1269 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1270 			return (EINVAL);
1271 
1272 		/* Fill in some important bits. */
1273 		cs->sc_ileave = ccio->ccio_ileave;
1274 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1275 			printf("ccd%d: disabling mirror, interleave is 0\n",
1276 			    unit);
1277 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1278 		}
1279 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1280 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1281 			printf("ccd%d: mirror/parity forces uniform flag\n",
1282 			       unit);
1283 			ccio->ccio_flags |= CCDF_UNIFORM;
1284 		}
1285 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1286 
1287 		/*
1288 		 * Allocate space for and copy in the array of
1289 		 * componet pathnames and device numbers.
1290 		 */
1291 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1292 		    M_DEVBUF, M_WAITOK);
1293 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1294 		    M_DEVBUF, M_WAITOK);
1295 
1296 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1297 		    ccio->ccio_ndisks * sizeof(char **));
1298 		if (error) {
1299 			free(vpp, M_DEVBUF);
1300 			free(cpp, M_DEVBUF);
1301 			ccdunlock(cs);
1302 			return (error);
1303 		}
1304 
1305 #ifdef DEBUG
1306 		if (ccddebug & CCDB_INIT)
1307 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1308 				printf("ccdioctl: component %d: %p\n",
1309 				    i, cpp[i]);
1310 #endif
1311 
1312 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1313 #ifdef DEBUG
1314 			if (ccddebug & CCDB_INIT)
1315 				printf("ccdioctl: lookedup = %d\n", lookedup);
1316 #endif
1317 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1318 				for (j = 0; j < lookedup; ++j)
1319 					(void)vn_close(vpp[j], FREAD|FWRITE,
1320 					    td->td_ucred, td);
1321 				free(vpp, M_DEVBUF);
1322 				free(cpp, M_DEVBUF);
1323 				ccdunlock(cs);
1324 				return (error);
1325 			}
1326 			++lookedup;
1327 		}
1328 		cs->sc_vpp = vpp;
1329 		cs->sc_nccdisks = ccio->ccio_ndisks;
1330 
1331 		/*
1332 		 * Initialize the ccd.  Fills in the softc for us.
1333 		 */
1334 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1335 			for (j = 0; j < lookedup; ++j)
1336 				(void)vn_close(vpp[j], FREAD|FWRITE,
1337 				    td->td_ucred, td);
1338 			/*
1339 			 * We can't ccddestroy() cs just yet, because nothing
1340 			 * prevents user-level app to do another ioctl()
1341 			 * without closing the device first, therefore
1342 			 * declare unit null and void and let ccdclose()
1343 			 * destroy it when it is safe to do so.
1344 			 */
1345 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1346 			free(vpp, M_DEVBUF);
1347 			free(cpp, M_DEVBUF);
1348 			ccdunlock(cs);
1349 			return (error);
1350 		}
1351 
1352 		/*
1353 		 * The ccd has been successfully initialized, so
1354 		 * we can place it into the array and read the disklabel.
1355 		 */
1356 		ccio->ccio_unit = unit;
1357 		ccio->ccio_size = cs->sc_size;
1358 		ccdgetdisklabel(dev);
1359 
1360 		ccdunlock(cs);
1361 
1362 		break;
1363 
1364 	case CCDIOCCLR:
1365 		if (!IS_INITED(cs))
1366 			return (ENXIO);
1367 
1368 		if ((flag & FWRITE) == 0)
1369 			return (EBADF);
1370 
1371 		if ((error = ccdlock(cs)) != 0)
1372 			return (error);
1373 
1374 		/* Don't unconfigure if any other partitions are open */
1375 		part = ccdpart(dev);
1376 		pmask = (1 << part);
1377 		if ((cs->sc_openmask & ~pmask)) {
1378 			ccdunlock(cs);
1379 			return (EBUSY);
1380 		}
1381 
1382 		/* Declare unit null and void (reset all flags) */
1383 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1384 
1385 		/* Close the components and free their pathnames. */
1386 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1387 			/*
1388 			 * XXX: this close could potentially fail and
1389 			 * cause Bad Things.  Maybe we need to force
1390 			 * the close to happen?
1391 			 */
1392 #ifdef DEBUG
1393 			if (ccddebug & CCDB_VNODE)
1394 				vprint("CCDIOCCLR: vnode info",
1395 				    cs->sc_cinfo[i].ci_vp);
1396 #endif
1397 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1398 			    td->td_ucred, td);
1399 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1400 		}
1401 
1402 		/* Free interleave index. */
1403 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1404 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1405 
1406 		/* Free component info and interleave table. */
1407 		free(cs->sc_cinfo, M_DEVBUF);
1408 		free(cs->sc_itable, M_DEVBUF);
1409 		free(cs->sc_vpp, M_DEVBUF);
1410 
1411 		/* And remove the devstat entry. */
1412 		devstat_remove_entry(&cs->device_stats);
1413 
1414 		/* This must be atomic. */
1415 		s = splhigh();
1416 		ccdunlock(cs);
1417 		splx(s);
1418 
1419 		break;
1420 
1421 	case CCDCONFINFO:
1422 		{
1423 			int ninit = 0;
1424 			struct ccdconf *conf = (struct ccdconf *)data;
1425 			struct ccd_s *tmpcs;
1426 			struct ccd_s *ubuf = conf->buffer;
1427 
1428 			/* XXX: LOCK(unique unit numbers) */
1429 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1430 				if (IS_INITED(tmpcs))
1431 					ninit++;
1432 
1433 			if (conf->size == 0) {
1434 				conf->size = sizeof(struct ccd_s) * ninit;
1435 				break;
1436 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1437 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1438 				/* XXX: UNLOCK(unique unit numbers) */
1439 				return (EINVAL);
1440 			}
1441 
1442 			ubuf += ninit;
1443 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1444 				if (!IS_INITED(tmpcs))
1445 					continue;
1446 				error = copyout(tmpcs, --ubuf,
1447 				    sizeof(struct ccd_s));
1448 				if (error != 0)
1449 					/* XXX: UNLOCK(unique unit numbers) */
1450 					return (error);
1451 			}
1452 			/* XXX: UNLOCK(unique unit numbers) */
1453 		}
1454 		break;
1455 
1456 	case CCDCPPINFO:
1457 		if (!IS_INITED(cs))
1458 			return (ENXIO);
1459 
1460 		{
1461 			int len = 0;
1462 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1463 			char *ubuf = cpps->buffer;
1464 
1465 
1466 			for (i = 0; i < cs->sc_nccdisks; ++i)
1467 				len += cs->sc_cinfo[i].ci_pathlen;
1468 
1469 			if (cpps->size == 0) {
1470 				cpps->size = len;
1471 				break;
1472 			} else if (cpps->size != len) {
1473 				return (EINVAL);
1474 			}
1475 
1476 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1477 				len = cs->sc_cinfo[i].ci_pathlen;
1478 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1479 				    len);
1480 				if (error != 0)
1481 					return (error);
1482 				ubuf += len;
1483 			}
1484 		}
1485 		break;
1486 
1487 	case DIOCGDINFO:
1488 		if (!IS_INITED(cs))
1489 			return (ENXIO);
1490 
1491 		*(struct disklabel *)data = cs->sc_label;
1492 		break;
1493 
1494 	case DIOCWDINFO:
1495 	case DIOCSDINFO:
1496 		if (!IS_INITED(cs))
1497 			return (ENXIO);
1498 
1499 		if ((flag & FWRITE) == 0)
1500 			return (EBADF);
1501 
1502 		if ((error = ccdlock(cs)) != 0)
1503 			return (error);
1504 
1505 		cs->sc_flags |= CCDF_LABELLING;
1506 
1507 		error = setdisklabel(&cs->sc_label,
1508 		    (struct disklabel *)data, 0);
1509 		if (error == 0) {
1510 			if (cmd == DIOCWDINFO)
1511 				error = writedisklabel(CCDLABELDEV(dev),
1512 				    &cs->sc_label);
1513 		}
1514 
1515 		cs->sc_flags &= ~CCDF_LABELLING;
1516 
1517 		ccdunlock(cs);
1518 
1519 		if (error)
1520 			return (error);
1521 		break;
1522 
1523 	case DIOCWLABEL:
1524 		if (!IS_INITED(cs))
1525 			return (ENXIO);
1526 
1527 		if ((flag & FWRITE) == 0)
1528 			return (EBADF);
1529 		if (*(int *)data != 0)
1530 			cs->sc_flags |= CCDF_WLABEL;
1531 		else
1532 			cs->sc_flags &= ~CCDF_WLABEL;
1533 		break;
1534 
1535 	default:
1536 		return (ENOTTY);
1537 	}
1538 
1539 	return (0);
1540 }
1541 
1542 static int
1543 ccdsize(dev_t dev)
1544 {
1545 	struct ccd_s *cs;
1546 	int part, size;
1547 
1548 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1549 		return (-1);
1550 
1551 	cs = ccdfind(ccdunit(dev));
1552 	part = ccdpart(dev);
1553 
1554 	if (!IS_INITED(cs))
1555 		return (-1);
1556 
1557 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1558 		size = -1;
1559 	else
1560 		size = cs->sc_label.d_partitions[part].p_size;
1561 
1562 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1563 		return (-1);
1564 
1565 	return (size);
1566 }
1567 
1568 /*
1569  * Lookup the provided name in the filesystem.  If the file exists,
1570  * is a valid block device, and isn't being used by anyone else,
1571  * set *vpp to the file's vnode.
1572  */
1573 static int
1574 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1575 {
1576 	struct nameidata nd;
1577 	struct vnode *vp;
1578 	int error, flags;
1579 
1580 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1581 	flags = FREAD | FWRITE;
1582 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1583 #ifdef DEBUG
1584 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1585 			printf("ccdlookup: vn_open error = %d\n", error);
1586 #endif
1587 		return (error);
1588 	}
1589 	vp = nd.ni_vp;
1590 
1591 	if (vrefcnt(vp) > 1) {
1592 		error = EBUSY;
1593 		goto bad;
1594 	}
1595 
1596 	if (!vn_isdisk(vp, &error))
1597 		goto bad;
1598 
1599 #ifdef DEBUG
1600 	if (ccddebug & CCDB_VNODE)
1601 		vprint("ccdlookup: vnode info", vp);
1602 #endif
1603 
1604 	VOP_UNLOCK(vp, 0, td);
1605 	NDFREE(&nd, NDF_ONLY_PNBUF);
1606 	*vpp = vp;
1607 	return (0);
1608 bad:
1609 	VOP_UNLOCK(vp, 0, td);
1610 	NDFREE(&nd, NDF_ONLY_PNBUF);
1611 	/* vn_close does vrele() for vp */
1612 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1613 	return (error);
1614 }
1615 
1616 /*
1617  * Read the disklabel from the ccd.  If one is not present, fake one
1618  * up.
1619  */
1620 static void
1621 ccdgetdisklabel(dev_t dev)
1622 {
1623 	int unit = ccdunit(dev);
1624 	struct ccd_s *cs = ccdfind(unit);
1625 	char *errstring;
1626 	struct disklabel *lp = &cs->sc_label;
1627 	struct ccdgeom *ccg = &cs->sc_geom;
1628 
1629 	bzero(lp, sizeof(*lp));
1630 
1631 	lp->d_secperunit = cs->sc_size;
1632 	lp->d_secsize = ccg->ccg_secsize;
1633 	lp->d_nsectors = ccg->ccg_nsectors;
1634 	lp->d_ntracks = ccg->ccg_ntracks;
1635 	lp->d_ncylinders = ccg->ccg_ncylinders;
1636 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1637 
1638 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1639 	lp->d_type = DTYPE_CCD;
1640 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1641 	lp->d_rpm = 3600;
1642 	lp->d_interleave = 1;
1643 	lp->d_flags = 0;
1644 
1645 	lp->d_partitions[RAW_PART].p_offset = 0;
1646 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1647 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1648 	lp->d_npartitions = RAW_PART + 1;
1649 
1650 	lp->d_bbsize = BBSIZE;				/* XXX */
1651 	lp->d_sbsize = 0;
1652 
1653 	lp->d_magic = DISKMAGIC;
1654 	lp->d_magic2 = DISKMAGIC;
1655 	lp->d_checksum = dkcksum(&cs->sc_label);
1656 
1657 	/*
1658 	 * Call the generic disklabel extraction routine.
1659 	 */
1660 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1661 	if (errstring != NULL)
1662 		ccdmakedisklabel(cs);
1663 
1664 #ifdef DEBUG
1665 	/* It's actually extremely common to have unlabeled ccds. */
1666 	if (ccddebug & CCDB_LABEL)
1667 		if (errstring != NULL)
1668 			printf("ccd%d: %s\n", unit, errstring);
1669 #endif
1670 }
1671 
1672 /*
1673  * Take care of things one might want to take care of in the event
1674  * that a disklabel isn't present.
1675  */
1676 static void
1677 ccdmakedisklabel(struct ccd_s *cs)
1678 {
1679 	struct disklabel *lp = &cs->sc_label;
1680 
1681 	/*
1682 	 * For historical reasons, if there's no disklabel present
1683 	 * the raw partition must be marked FS_BSDFFS.
1684 	 */
1685 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1686 
1687 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1688 }
1689 
1690 /*
1691  * Wait interruptibly for an exclusive lock.
1692  *
1693  * XXX
1694  * Several drivers do this; it should be abstracted and made MP-safe.
1695  */
1696 static int
1697 ccdlock(struct ccd_s *cs)
1698 {
1699 	int error;
1700 
1701 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1702 		cs->sc_flags |= CCDF_WANTED;
1703 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1704 			return (error);
1705 	}
1706 	cs->sc_flags |= CCDF_LOCKED;
1707 	return (0);
1708 }
1709 
1710 /*
1711  * Unlock and wake up any waiters.
1712  */
1713 static void
1714 ccdunlock(struct ccd_s *cs)
1715 {
1716 
1717 	cs->sc_flags &= ~CCDF_LOCKED;
1718 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1719 		cs->sc_flags &= ~CCDF_WANTED;
1720 		wakeup(cs);
1721 	}
1722 }
1723 
1724 #ifdef DEBUG
1725 static void
1726 printiinfo(struct ccdiinfo *ii)
1727 {
1728 	int ix, i;
1729 
1730 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1731 		printf(" itab[%d]: #dk %d sblk %lld soff %lld",
1732 		    ix, ii->ii_ndisk, (long long)ii->ii_startblk,
1733 		    (long long)ii->ii_startoff);
1734 		for (i = 0; i < ii->ii_ndisk; i++)
1735 			printf(" %d", ii->ii_index[i]);
1736 		printf("\n");
1737 	}
1738 }
1739 #endif
1740