xref: /freebsd/sys/geom/geom_ccd.c (revision a3e8fd0b7f663db7eafff527d5c3ca3bcfa8a537)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/stdint.h>
101 #include <sys/sysctl.h>
102 #include <sys/disk.h>
103 #include <sys/disklabel.h>
104 #include <sys/devicestat.h>
105 #include <sys/fcntl.h>
106 #include <sys/vnode.h>
107 
108 #include <sys/ccdvar.h>
109 
/* Malloc type used in this file for the per-unit softc (struct ccd_s). */
MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/* Building with CCDDEBUG switches on the DEBUG-guarded code in this file. */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/*
 * Bits of ccddebug; each printf in the DEBUG sections is gated on one or
 * more of them.  All bits are enabled by default and the mask can be
 * changed at run time through the debug.ccddebug sysctl.
 */
#define CCDB_FOLLOW	0x01	/* function entry tracing */
#define CCDB_INIT	0x02	/* unit initialization / interleave setup */
#define CCDB_IO		0x04	/* per-request component I/O details */
#define CCDB_LABEL	0x08	/* presumably disklabel handling -- confirm */
#define CCDB_VNODE	0x10	/* presumably component vnode handling -- confirm */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#endif

/* Unit and partition of a ccd dev_t, via the generic disk helpers. */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
129 
130 /*
131    This is how mirroring works (only writes are special):
132 
133    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
134    linked together by the cb_mirror field.  "cb_pflags &
135    CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 
137    When a component returns to ccdiodone(), it checks if "cb_pflags &
138    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
139    flag and returns.  If it is, it means its partner has already
140    returned, so it will go to the regular cleanup.
141 
142  */
143 
/*
 * Per-component I/O request.  cb_buf must remain the first member:
 * ccdiodone() receives a pointer to the embedded struct bio and casts it
 * back to the enclosing struct ccdbuf.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* dev_t naming the raw partition (RAW_PART) of the unit that dev belongs to */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
163 
/* Device switch entry points implemented in this file. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_psize_t ccdsize;

/* Upper bound on the number of ccdbufs kept on the free list. */
#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
};
/* All allocated units; searched by ccdfind(), extended by ccdnew(). */
static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);

/* softc lookup, creation and destruction */
static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *, struct proc *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static void ccdintr(struct ccd_s *, struct bio *);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		      struct bio *, daddr_t, caddr_t, long);
static void ccdgetdisklabel(dev_t);
static void ccdmakedisklabel(struct ccd_s *);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);

#ifdef DEBUG
static void printiinfo(struct ccdiinfo *);
#endif

/*
 * Free list of ccdbufs (linked through cb_freenext) and its length.
 */
/* Non-private for the benefit of libkvm. */
struct ccdbuf *ccdfreebufs;
static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This is to avoid violating its disklabel area when it
 * starts at the beginning of the slice.  May be overridden by defining
 * CCD_OFFSET before this point.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	case MOD_SHUTDOWN:
376 		break;
377 
378 	default:
379 		error = EOPNOTSUPP;
380 	}
381 	return (error);
382 }
383 
384 DEV_MODULE(ccd, ccd_modevent, NULL);
385 
386 static int
387 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
388 {
389 	struct ccdcinfo *ci = NULL;	/* XXX */
390 	size_t size;
391 	int ix;
392 	struct vnode *vp;
393 	size_t minsize;
394 	int maxsecsize;
395 	struct ccdgeom *ccg = &cs->sc_geom;
396 	char *tmppath = NULL;
397 	int error = 0;
398 	off_t mediasize;
399 	u_int sectorsize;
400 
401 #ifdef DEBUG
402 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
403 		printf("ccdinit: unit %d\n", cs->sc_unit);
404 #endif
405 
406 	cs->sc_size = 0;
407 
408 	/* Allocate space for the component info. */
409 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
410 	    M_DEVBUF, M_WAITOK);
411 
412 	/*
413 	 * Verify that each component piece exists and record
414 	 * relevant information about it.
415 	 */
416 	maxsecsize = 0;
417 	minsize = 0;
418 	tmppath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
419 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
420 		vp = cs->sc_vpp[ix];
421 		ci = &cs->sc_cinfo[ix];
422 		ci->ci_vp = vp;
423 
424 		/*
425 		 * Copy in the pathname of the component.
426 		 */
427 		if ((error = copyinstr(cpaths[ix], tmppath,
428 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
429 #ifdef DEBUG
430 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 				printf("ccd%d: can't copy path, error = %d\n",
432 				    cs->sc_unit, error);
433 #endif
434 			goto fail;
435 		}
436 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
437 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
438 
439 		ci->ci_dev = vn_todev(vp);
440 
441 		/*
442 		 * Get partition information for the component.
443 		 */
444 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
445 		    FREAD, td->td_ucred, td);
446 		if (error != 0) {
447 #ifdef DEBUG
448 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
449 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
450 				     cs->sc_unit, ci->ci_path, error);
451 #endif
452 			goto fail;
453 		}
454 		/*
455 		 * Get partition information for the component.
456 		 */
457 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
458 		    FREAD, td->td_ucred, td);
459 		if (error != 0) {
460 #ifdef DEBUG
461 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
462 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
463 				     cs->sc_unit, ci->ci_path, error);
464 #endif
465 			goto fail;
466 		}
467 		if (sectorsize > maxsecsize)
468 			maxsecsize = sectorsize;
469 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
470 
471 		/*
472 		 * Calculate the size, truncating to an interleave
473 		 * boundary if necessary.
474 		 */
475 
476 		if (cs->sc_ileave > 1)
477 			size -= size % cs->sc_ileave;
478 
479 		if (size == 0) {
480 #ifdef DEBUG
481 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
482 				printf("ccd%d: %s: size == 0\n",
483 				    cs->sc_unit, ci->ci_path);
484 #endif
485 			error = ENODEV;
486 			goto fail;
487 		}
488 
489 		if (minsize == 0 || size < minsize)
490 			minsize = size;
491 		ci->ci_size = size;
492 		cs->sc_size += size;
493 	}
494 
495 	free(tmppath, M_DEVBUF);
496 	tmppath = NULL;
497 
498 	/*
499 	 * Don't allow the interleave to be smaller than
500 	 * the biggest component sector.
501 	 */
502 	if ((cs->sc_ileave > 0) &&
503 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
504 #ifdef DEBUG
505 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
506 			printf("ccd%d: interleave must be at least %d\n",
507 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
508 #endif
509 		error = EINVAL;
510 		goto fail;
511 	}
512 
513 	/*
514 	 * If uniform interleave is desired set all sizes to that of
515 	 * the smallest component.  This will guarentee that a single
516 	 * interleave table is generated.
517 	 *
518 	 * Lost space must be taken into account when calculating the
519 	 * overall size.  Half the space is lost when CCDF_MIRROR is
520 	 * specified.  One disk is lost when CCDF_PARITY is specified.
521 	 */
522 	if (cs->sc_flags & CCDF_UNIFORM) {
523 		for (ci = cs->sc_cinfo;
524 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
525 			ci->ci_size = minsize;
526 		}
527 		if (cs->sc_flags & CCDF_MIRROR) {
528 			/*
529 			 * Check to see if an even number of components
530 			 * have been specified.  The interleave must also
531 			 * be non-zero in order for us to be able to
532 			 * guarentee the topology.
533 			 */
534 			if (cs->sc_nccdisks % 2) {
535 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
536 				error = EINVAL;
537 				goto fail;
538 			}
539 			if (cs->sc_ileave == 0) {
540 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
541 				error = EINVAL;
542 				goto fail;
543 			}
544 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
545 		} else if (cs->sc_flags & CCDF_PARITY) {
546 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
547 		} else {
548 			if (cs->sc_ileave == 0) {
549 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
550 				error = EINVAL;
551 				goto fail;
552 			}
553 			cs->sc_size = cs->sc_nccdisks * minsize;
554 		}
555 	}
556 
557 	/*
558 	 * Construct the interleave table.
559 	 */
560 	ccdinterleave(cs, cs->sc_unit);
561 
562 	/*
563 	 * Create pseudo-geometry based on 1MB cylinders.  It's
564 	 * pretty close.
565 	 */
566 	ccg->ccg_secsize = maxsecsize;
567 	ccg->ccg_ntracks = 1;
568 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
569 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
570 
571 	/*
572 	 * Add an devstat entry for this device.
573 	 */
574 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
575 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
576 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
577 			  DEVSTAT_PRIORITY_ARRAY);
578 
579 	cs->sc_flags |= CCDF_INITED;
580 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
581 	return (0);
582 fail:
583 	while (ci > cs->sc_cinfo) {
584 		ci--;
585 		free(ci->ci_path, M_DEVBUF);
586 	}
587 	if (tmppath != NULL)
588 		free(tmppath, M_DEVBUF);
589 	free(cs->sc_cinfo, M_DEVBUF);
590 	return (error);
591 }
592 
/*
 * ccdinterleave() -	Build the interleave table (cs->sc_itable).
 *
 *	The table is an array of struct ccdiinfo terminated by an entry
 *	whose ii_ndisk is 0.  Each entry describes one region of the ccd:
 *	where it starts (ii_startblk), where it starts within each
 *	participating component (ii_startoff), and which components take
 *	part (ii_index[0 .. ii_ndisk-1]).
 *
 *	The unit argument is not used by this function.
 */
static void
ccdinterleave(struct ccd_s *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;

#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
#endif

	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	/* One extra, zero-filled entry serves as the terminator. */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror or parity setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			/* The next component begins where this one ends. */
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		ii->ii_ndisk = 0;
#ifdef DEBUG
		if (ccddebug & CCDB_INIT)
			printiinfo(cs->sc_itable);
#endif
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 */
	/*
	 * From here on 'size' is reused: it holds the component size
	 * already covered by the previous table entries.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_DEVBUF, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		/*
		 * This region covers (smallci->ci_size - size) blocks on
		 * each of the ix participating components.
		 */
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printiinfo(cs->sc_itable);
#endif
}
713 
714 /* ARGSUSED */
715 static int
716 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
717 {
718 	int unit = ccdunit(dev);
719 	struct ccd_s *cs;
720 	struct disklabel *lp;
721 	int error = 0, part, pmask;
722 
723 #ifdef DEBUG
724 	if (ccddebug & CCDB_FOLLOW)
725 		printf("ccdopen(%p, %x)\n", dev, flags);
726 #endif
727 
728 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
729 
730 	if ((error = ccdlock(cs)) != 0)
731 		return (error);
732 
733 	lp = &cs->sc_label;
734 
735 	part = ccdpart(dev);
736 	pmask = (1 << part);
737 
738 	/*
739 	 * If we're initialized, check to see if there are any other
740 	 * open partitions.  If not, then it's safe to update
741 	 * the in-core disklabel.
742 	 */
743 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
744 		ccdgetdisklabel(dev);
745 
746 	/* Check that the partition exists. */
747 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
748 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
749 		error = ENXIO;
750 		goto done;
751 	}
752 
753 	cs->sc_openmask |= pmask;
754  done:
755 	ccdunlock(cs);
756 	return (0);
757 }
758 
759 /* ARGSUSED */
760 static int
761 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
762 {
763 	int unit = ccdunit(dev);
764 	struct ccd_s *cs;
765 	int error = 0, part;
766 
767 #ifdef DEBUG
768 	if (ccddebug & CCDB_FOLLOW)
769 		printf("ccdclose(%p, %x)\n", dev, flags);
770 #endif
771 
772 	if (!IS_ALLOCATED(unit))
773 		return (ENXIO);
774 	cs = ccdfind(unit);
775 
776 	if ((error = ccdlock(cs)) != 0)
777 		return (error);
778 
779 	part = ccdpart(dev);
780 
781 	/* ...that much closer to allowing unconfiguration... */
782 	cs->sc_openmask &= ~(1 << part);
783 	/* collect "garbage" if possible */
784 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
785 		ccddestroy(cs, td->td_proc);
786 	else
787 		ccdunlock(cs);
788 	return (0);
789 }
790 
791 static void
792 ccdstrategy(struct bio *bp)
793 {
794 	int unit = ccdunit(bp->bio_dev);
795 	struct ccd_s *cs = ccdfind(unit);
796 	int s;
797 	int wlabel;
798 	struct disklabel *lp;
799 
800 #ifdef DEBUG
801 	if (ccddebug & CCDB_FOLLOW)
802 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
803 #endif
804 	if (!IS_INITED(cs)) {
805 		biofinish(bp, NULL, ENXIO);
806 		return;
807 	}
808 
809 	/* If it's a nil transfer, wake up the top half now. */
810 	if (bp->bio_bcount == 0) {
811 		biodone(bp);
812 		return;
813 	}
814 
815 	lp = &cs->sc_label;
816 
817 	/*
818 	 * Do bounds checking and adjust transfer.  If there's an
819 	 * error, the bounds check will flag that for us.
820 	 */
821 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
822 	if (ccdpart(bp->bio_dev) != RAW_PART) {
823 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
824 			biodone(bp);
825 			return;
826 		}
827 	} else {
828 		int pbn;        /* in sc_secsize chunks */
829 		long sz;        /* in sc_secsize chunks */
830 
831 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
832 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
833 
834 		/*
835 		 * If out of bounds return an error. If at the EOF point,
836 		 * simply read or write less.
837 		 */
838 
839 		if (pbn < 0 || pbn >= cs->sc_size) {
840 			bp->bio_resid = bp->bio_bcount;
841 			if (pbn != cs->sc_size)
842 				biofinish(bp, NULL, EINVAL);
843 			else
844 				biodone(bp);
845 			return;
846 		}
847 
848 		/*
849 		 * If the request crosses EOF, truncate the request.
850 		 */
851 		if (pbn + sz > cs->sc_size) {
852 			bp->bio_bcount = (cs->sc_size - pbn) *
853 			    cs->sc_geom.ccg_secsize;
854 		}
855 	}
856 
857 	bp->bio_resid = bp->bio_bcount;
858 
859 	/*
860 	 * "Start" the unit.
861 	 */
862 	s = splbio();
863 	ccdstart(cs, bp);
864 	splx(s);
865 	return;
866 }
867 
868 static void
869 ccdstart(struct ccd_s *cs, struct bio *bp)
870 {
871 	long bcount, rcount;
872 	struct ccdbuf *cbp[4];
873 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
874 	caddr_t addr;
875 	daddr_t bn;
876 	struct partition *pp;
877 
878 #ifdef DEBUG
879 	if (ccddebug & CCDB_FOLLOW)
880 		printf("ccdstart(%p, %p)\n", cs, bp);
881 #endif
882 
883 	/* Record the transaction start  */
884 	devstat_start_transaction(&cs->device_stats);
885 
886 	/*
887 	 * Translate the partition-relative block number to an absolute.
888 	 */
889 	bn = bp->bio_blkno;
890 	if (ccdpart(bp->bio_dev) != RAW_PART) {
891 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
892 		bn += pp->p_offset;
893 	}
894 
895 	/*
896 	 * Allocate component buffers and fire off the requests
897 	 */
898 	addr = bp->bio_data;
899 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
900 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
901 		rcount = cbp[0]->cb_buf.bio_bcount;
902 
903 		if (cs->sc_cflags & CCDF_MIRROR) {
904 			/*
905 			 * Mirroring.  Writes go to both disks, reads are
906 			 * taken from whichever disk seems most appropriate.
907 			 *
908 			 * We attempt to localize reads to the disk whos arm
909 			 * is nearest the read request.  We ignore seeks due
910 			 * to writes when making this determination and we
911 			 * also try to avoid hogging.
912 			 */
913 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
914 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
915 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
916 			} else {
917 				int pick = cs->sc_pick;
918 				daddr_t range = cs->sc_size / 16;
919 
920 				if (bn < cs->sc_blk[pick] - range ||
921 				    bn > cs->sc_blk[pick] + range
922 				) {
923 					cs->sc_pick = pick = 1 - pick;
924 				}
925 				cs->sc_blk[pick] = bn + btodb(rcount);
926 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
927 			}
928 		} else {
929 			/*
930 			 * Not mirroring
931 			 */
932 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
933 		}
934 		bn += btodb(rcount);
935 		addr += rcount;
936 	}
937 }
938 
/*
 * Build a component buffer header.
 *
 *	cb	output array; cb[0] receives the component buffer and, when
 *		CCDF_MIRROR is set in sc_cflags, cb[1] its mirror twin.
 *	cs	softc of the ccd unit.
 *	bp	original request; its bio_cmd is copied and it is recorded
 *		in cb_obp for ccdiodone().
 *	bn	block number within the ccd, in DEV_BSIZE units.
 *	addr	data address for this piece of the request.
 *	bcount	bytes still to be transferred.
 *
 *	The buffer covers at most the rest of the current component
 *	(no interleave) or the rest of the current interleave stripe, so
 *	its bio_bcount may be smaller than bcount; the caller advances by
 *	that amount and calls again.
 */
static void
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%p, %p, %lld, %p, %ld)\n",
		    (void *)cs, (void *)bp, (long long)bn, (void *)addr,
		    bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		/*
		 * Walk the components, accumulating their sizes, until the
		 * one containing cbn is reached.  The loop itself does not
		 * bound ci; it relies on bn lying within the ccd.
		 */
		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		/*
		 * Stop at the first entry that starts beyond cbn (or at the
		 * terminator), then step back to the entry containing cbn.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = getccdbuf(NULL);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	/* Skip the CCD_OFFSET blocks reserved at the front of the component. */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	/*
	 * cbc is the number of bytes left in the current component (no
	 * interleave) or in the current interleave stripe; the transfer
	 * is clipped to it.
	 */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* Keep a second copy of the transfer size in bio_caller1. */
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_unit = cs->sc_unit;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %p(u%ld): cbp %p bn %jd addr %p bcnt %ld\n",
		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
		       (intmax_t)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
		       cbp->cb_buf.bio_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O */
		/*
		 * The twin starts as a copy of cb[0] and differs only in
		 * its target device and component index.  ci2 is assigned
		 * only in the multi-disk mirror branch above.
		 */
		cbp = getccdbuf(cb[0]);
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1110 
1111 static void
1112 ccdintr(struct ccd_s *cs, struct bio *bp)
1113 {
1114 #ifdef DEBUG
1115 	if (ccddebug & CCDB_FOLLOW)
1116 		printf("ccdintr(%p, %p)\n", cs, bp);
1117 #endif
1118 	/*
1119 	 * Request is done for better or worse, wakeup the top half.
1120 	 */
1121 	if (bp->bio_flags & BIO_ERROR)
1122 		bp->bio_resid = bp->bio_bcount;
1123 	biofinish(bp, &cs->device_stats, 0);
1124 }
1125 
1126 /*
1127  * Called at interrupt time.
1128  * Mark the component as done and if all components are done,
1129  * take a ccd interrupt.
1130  */
1131 static void
1132 ccdiodone(struct bio *ibp)
1133 {
1134 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1135 	struct bio *bp = cbp->cb_obp;
1136 	int unit = cbp->cb_unit;
1137 	int count, s;
1138 
1139 	s = splbio();
1140 #ifdef DEBUG
1141 	if (ccddebug & CCDB_FOLLOW)
1142 		printf("ccdiodone(%p)\n", cbp);
1143 	if (ccddebug & CCDB_IO) {
1144 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1145 		       bp, bp->bio_bcount, bp->bio_resid);
1146 		printf(" dev %p(u%d), cbp %p bn %jd addr %p bcnt %ld\n",
1147 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1148 		       (intmax_t)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1149 		       cbp->cb_buf.bio_bcount);
1150 	}
1151 #endif
1152 	/*
1153 	 * If an error occured, report it.  If this is a mirrored
1154 	 * configuration and the first of two possible reads, do not
1155 	 * set the error in the bp yet because the second read may
1156 	 * succeed.
1157 	 */
1158 
1159 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1160 		const char *msg = "";
1161 
1162 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1163 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1164 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1165 			/*
1166 			 * We will try our read on the other disk down
1167 			 * below, also reverse the default pick so if we
1168 			 * are doing a scan we do not keep hitting the
1169 			 * bad disk first.
1170 			 */
1171 			struct ccd_s *cs = ccdfind(unit);
1172 
1173 			msg = ", trying other disk";
1174 			cs->sc_pick = 1 - cs->sc_pick;
1175 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1176 		} else {
1177 			bp->bio_flags |= BIO_ERROR;
1178 			bp->bio_error = cbp->cb_buf.bio_error ?
1179 			    cbp->cb_buf.bio_error : EIO;
1180 		}
1181 		printf("ccd%d: error %d on component %d block %jd "
1182 		    "(ccd block %jd)%s\n", unit, bp->bio_error, cbp->cb_comp,
1183 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
1184 		    msg);
1185 	}
1186 
1187 	/*
1188 	 * Process mirror.  If we are writing, I/O has been initiated on both
1189 	 * buffers and we fall through only after both are finished.
1190 	 *
1191 	 * If we are reading only one I/O is initiated at a time.  If an
1192 	 * error occurs we initiate the second I/O and return, otherwise
1193 	 * we free the second I/O without initiating it.
1194 	 */
1195 
1196 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1197 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1198 			/*
1199 			 * When writing, handshake with the second buffer
1200 			 * to determine when both are done.  If both are not
1201 			 * done, return here.
1202 			 */
1203 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1204 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1205 				putccdbuf(cbp);
1206 				splx(s);
1207 				return;
1208 			}
1209 		} else {
1210 			/*
1211 			 * When reading, either dispose of the second buffer
1212 			 * or initiate I/O on the second buffer if an error
1213 			 * occured with this one.
1214 			 */
1215 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1216 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1217 					cbp->cb_mirror->cb_pflags |=
1218 					    CCDPF_MIRROR_DONE;
1219 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1220 					putccdbuf(cbp);
1221 					splx(s);
1222 					return;
1223 				} else {
1224 					putccdbuf(cbp->cb_mirror);
1225 					/* fall through */
1226 				}
1227 			}
1228 		}
1229 	}
1230 
1231 	/*
1232 	 * use bio_caller1 to determine how big the original request was rather
1233 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1234 	 *
1235 	 * XXX We check for an error, but we do not test the resid for an
1236 	 * aligned EOF condition.  This may result in character & block
1237 	 * device access not recognizing EOF properly when read or written
1238 	 * sequentially, but will not effect filesystems.
1239 	 */
1240 	count = (long)cbp->cb_buf.bio_caller1;
1241 	putccdbuf(cbp);
1242 
1243 	/*
1244 	 * If all done, "interrupt".
1245 	 */
1246 	bp->bio_resid -= count;
1247 	if (bp->bio_resid < 0)
1248 		panic("ccdiodone: count");
1249 	if (bp->bio_resid == 0)
1250 		ccdintr(ccdfind(unit), bp);
1251 	splx(s);
1252 }
1253 
1254 static int
1255 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1256 {
1257 	int unit = ccdunit(dev);
1258 	int i, j, lookedup = 0, error = 0;
1259 	int part, pmask, s;
1260 	struct ccd_s *cs;
1261 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1262 	char **cpp;
1263 	struct vnode **vpp;
1264 
1265 	if (!IS_ALLOCATED(unit))
1266 		return (ENXIO);
1267 	cs = ccdfind(unit);
1268 
1269 	switch (cmd) {
1270 	case CCDIOCSET:
1271 		if (IS_INITED(cs))
1272 			return (EBUSY);
1273 
1274 		if ((flag & FWRITE) == 0)
1275 			return (EBADF);
1276 
1277 		if ((error = ccdlock(cs)) != 0)
1278 			return (error);
1279 
1280 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1281 			return (EINVAL);
1282 
1283 		/* Fill in some important bits. */
1284 		cs->sc_ileave = ccio->ccio_ileave;
1285 		if (cs->sc_ileave == 0 &&
1286 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1287 		     (ccio->ccio_flags & CCDF_PARITY))) {
1288 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1289 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1290 		}
1291 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1292 		    (ccio->ccio_flags & CCDF_PARITY)) {
1293 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1294 			ccio->ccio_flags &= ~CCDF_PARITY;
1295 		}
1296 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1297 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1298 			printf("ccd%d: mirror/parity forces uniform flag\n",
1299 			       unit);
1300 			ccio->ccio_flags |= CCDF_UNIFORM;
1301 		}
1302 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1303 
1304 		/*
1305 		 * Allocate space for and copy in the array of
1306 		 * componet pathnames and device numbers.
1307 		 */
1308 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1309 		    M_DEVBUF, M_WAITOK);
1310 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1311 		    M_DEVBUF, M_WAITOK);
1312 
1313 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1314 		    ccio->ccio_ndisks * sizeof(char **));
1315 		if (error) {
1316 			free(vpp, M_DEVBUF);
1317 			free(cpp, M_DEVBUF);
1318 			ccdunlock(cs);
1319 			return (error);
1320 		}
1321 
1322 #ifdef DEBUG
1323 		if (ccddebug & CCDB_INIT)
1324 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1325 				printf("ccdioctl: component %d: %p\n",
1326 				    i, cpp[i]);
1327 #endif
1328 
1329 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1330 #ifdef DEBUG
1331 			if (ccddebug & CCDB_INIT)
1332 				printf("ccdioctl: lookedup = %d\n", lookedup);
1333 #endif
1334 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1335 				for (j = 0; j < lookedup; ++j)
1336 					(void)vn_close(vpp[j], FREAD|FWRITE,
1337 					    td->td_ucred, td);
1338 				free(vpp, M_DEVBUF);
1339 				free(cpp, M_DEVBUF);
1340 				ccdunlock(cs);
1341 				return (error);
1342 			}
1343 			++lookedup;
1344 		}
1345 		cs->sc_vpp = vpp;
1346 		cs->sc_nccdisks = ccio->ccio_ndisks;
1347 
1348 		/*
1349 		 * Initialize the ccd.  Fills in the softc for us.
1350 		 */
1351 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1352 			for (j = 0; j < lookedup; ++j)
1353 				(void)vn_close(vpp[j], FREAD|FWRITE,
1354 				    td->td_ucred, td);
1355 			/*
1356 			 * We can't ccddestroy() cs just yet, because nothing
1357 			 * prevents user-level app to do another ioctl()
1358 			 * without closing the device first, therefore
1359 			 * declare unit null and void and let ccdclose()
1360 			 * destroy it when it is safe to do so.
1361 			 */
1362 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1363 			free(vpp, M_DEVBUF);
1364 			free(cpp, M_DEVBUF);
1365 			ccdunlock(cs);
1366 			return (error);
1367 		}
1368 
1369 		/*
1370 		 * The ccd has been successfully initialized, so
1371 		 * we can place it into the array and read the disklabel.
1372 		 */
1373 		ccio->ccio_unit = unit;
1374 		ccio->ccio_size = cs->sc_size;
1375 		ccdgetdisklabel(dev);
1376 
1377 		ccdunlock(cs);
1378 
1379 		break;
1380 
1381 	case CCDIOCCLR:
1382 		if (!IS_INITED(cs))
1383 			return (ENXIO);
1384 
1385 		if ((flag & FWRITE) == 0)
1386 			return (EBADF);
1387 
1388 		if ((error = ccdlock(cs)) != 0)
1389 			return (error);
1390 
1391 		/* Don't unconfigure if any other partitions are open */
1392 		part = ccdpart(dev);
1393 		pmask = (1 << part);
1394 		if ((cs->sc_openmask & ~pmask)) {
1395 			ccdunlock(cs);
1396 			return (EBUSY);
1397 		}
1398 
1399 		/* Declare unit null and void (reset all flags) */
1400 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1401 
1402 		/* Close the components and free their pathnames. */
1403 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1404 			/*
1405 			 * XXX: this close could potentially fail and
1406 			 * cause Bad Things.  Maybe we need to force
1407 			 * the close to happen?
1408 			 */
1409 #ifdef DEBUG
1410 			if (ccddebug & CCDB_VNODE)
1411 				vprint("CCDIOCCLR: vnode info",
1412 				    cs->sc_cinfo[i].ci_vp);
1413 #endif
1414 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1415 			    td->td_ucred, td);
1416 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1417 		}
1418 
1419 		/* Free interleave index. */
1420 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1421 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1422 
1423 		/* Free component info and interleave table. */
1424 		free(cs->sc_cinfo, M_DEVBUF);
1425 		free(cs->sc_itable, M_DEVBUF);
1426 		free(cs->sc_vpp, M_DEVBUF);
1427 
1428 		/* And remove the devstat entry. */
1429 		devstat_remove_entry(&cs->device_stats);
1430 
1431 		/* This must be atomic. */
1432 		s = splhigh();
1433 		ccdunlock(cs);
1434 		splx(s);
1435 
1436 		break;
1437 
1438 	case CCDCONFINFO:
1439 		{
1440 			int ninit = 0;
1441 			struct ccdconf *conf = (struct ccdconf *)data;
1442 			struct ccd_s *tmpcs;
1443 			struct ccd_s *ubuf = conf->buffer;
1444 
1445 			/* XXX: LOCK(unique unit numbers) */
1446 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1447 				if (IS_INITED(tmpcs))
1448 					ninit++;
1449 
1450 			if (conf->size == 0) {
1451 				conf->size = sizeof(struct ccd_s) * ninit;
1452 				break;
1453 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1454 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1455 				/* XXX: UNLOCK(unique unit numbers) */
1456 				return (EINVAL);
1457 			}
1458 
1459 			ubuf += ninit;
1460 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1461 				if (!IS_INITED(tmpcs))
1462 					continue;
1463 				error = copyout(tmpcs, --ubuf,
1464 				    sizeof(struct ccd_s));
1465 				if (error != 0)
1466 					/* XXX: UNLOCK(unique unit numbers) */
1467 					return (error);
1468 			}
1469 			/* XXX: UNLOCK(unique unit numbers) */
1470 		}
1471 		break;
1472 
1473 	case CCDCPPINFO:
1474 		if (!IS_INITED(cs))
1475 			return (ENXIO);
1476 
1477 		{
1478 			int len = 0;
1479 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1480 			char *ubuf = cpps->buffer;
1481 
1482 
1483 			for (i = 0; i < cs->sc_nccdisks; ++i)
1484 				len += cs->sc_cinfo[i].ci_pathlen;
1485 
1486 			if (cpps->size == 0) {
1487 				cpps->size = len;
1488 				break;
1489 			} else if (cpps->size != len) {
1490 				return (EINVAL);
1491 			}
1492 
1493 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1494 				len = cs->sc_cinfo[i].ci_pathlen;
1495 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1496 				    len);
1497 				if (error != 0)
1498 					return (error);
1499 				ubuf += len;
1500 			}
1501 		}
1502 		break;
1503 
1504 	case DIOCGDINFO:
1505 		if (!IS_INITED(cs))
1506 			return (ENXIO);
1507 
1508 		*(struct disklabel *)data = cs->sc_label;
1509 		break;
1510 
1511 	case DIOCWDINFO:
1512 	case DIOCSDINFO:
1513 		if (!IS_INITED(cs))
1514 			return (ENXIO);
1515 
1516 		if ((flag & FWRITE) == 0)
1517 			return (EBADF);
1518 
1519 		if ((error = ccdlock(cs)) != 0)
1520 			return (error);
1521 
1522 		cs->sc_flags |= CCDF_LABELLING;
1523 
1524 		error = setdisklabel(&cs->sc_label,
1525 		    (struct disklabel *)data, 0);
1526 		if (error == 0) {
1527 			if (cmd == DIOCWDINFO)
1528 				error = writedisklabel(CCDLABELDEV(dev),
1529 				    &cs->sc_label);
1530 		}
1531 
1532 		cs->sc_flags &= ~CCDF_LABELLING;
1533 
1534 		ccdunlock(cs);
1535 
1536 		if (error)
1537 			return (error);
1538 		break;
1539 
1540 	case DIOCWLABEL:
1541 		if (!IS_INITED(cs))
1542 			return (ENXIO);
1543 
1544 		if ((flag & FWRITE) == 0)
1545 			return (EBADF);
1546 		if (*(int *)data != 0)
1547 			cs->sc_flags |= CCDF_WLABEL;
1548 		else
1549 			cs->sc_flags &= ~CCDF_WLABEL;
1550 		break;
1551 
1552 	default:
1553 		return (ENOTTY);
1554 	}
1555 
1556 	return (0);
1557 }
1558 
1559 static int
1560 ccdsize(dev_t dev)
1561 {
1562 	struct ccd_s *cs;
1563 	int part, size;
1564 
1565 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1566 		return (-1);
1567 
1568 	cs = ccdfind(ccdunit(dev));
1569 	part = ccdpart(dev);
1570 
1571 	if (!IS_INITED(cs))
1572 		return (-1);
1573 
1574 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1575 		size = -1;
1576 	else
1577 		size = cs->sc_label.d_partitions[part].p_size;
1578 
1579 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1580 		return (-1);
1581 
1582 	return (size);
1583 }
1584 
1585 /*
1586  * Lookup the provided name in the filesystem.  If the file exists,
1587  * is a valid block device, and isn't being used by anyone else,
1588  * set *vpp to the file's vnode.
1589  */
1590 static int
1591 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1592 {
1593 	struct nameidata nd;
1594 	struct vnode *vp;
1595 	int error, flags;
1596 
1597 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1598 	flags = FREAD | FWRITE;
1599 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1600 #ifdef DEBUG
1601 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1602 			printf("ccdlookup: vn_open error = %d\n", error);
1603 #endif
1604 		return (error);
1605 	}
1606 	vp = nd.ni_vp;
1607 
1608 	if (vrefcnt(vp) > 1) {
1609 		error = EBUSY;
1610 		goto bad;
1611 	}
1612 
1613 	if (!vn_isdisk(vp, &error))
1614 		goto bad;
1615 
1616 #ifdef DEBUG
1617 	if (ccddebug & CCDB_VNODE)
1618 		vprint("ccdlookup: vnode info", vp);
1619 #endif
1620 
1621 	VOP_UNLOCK(vp, 0, td);
1622 	NDFREE(&nd, NDF_ONLY_PNBUF);
1623 	*vpp = vp;
1624 	return (0);
1625 bad:
1626 	VOP_UNLOCK(vp, 0, td);
1627 	NDFREE(&nd, NDF_ONLY_PNBUF);
1628 	/* vn_close does vrele() for vp */
1629 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1630 	return (error);
1631 }
1632 
1633 /*
1634  * Read the disklabel from the ccd.  If one is not present, fake one
1635  * up.
1636  */
1637 static void
1638 ccdgetdisklabel(dev_t dev)
1639 {
1640 	int unit = ccdunit(dev);
1641 	struct ccd_s *cs = ccdfind(unit);
1642 	char *errstring;
1643 	struct disklabel *lp = &cs->sc_label;
1644 	struct ccdgeom *ccg = &cs->sc_geom;
1645 
1646 	bzero(lp, sizeof(*lp));
1647 
1648 	lp->d_secperunit = cs->sc_size;
1649 	lp->d_secsize = ccg->ccg_secsize;
1650 	lp->d_nsectors = ccg->ccg_nsectors;
1651 	lp->d_ntracks = ccg->ccg_ntracks;
1652 	lp->d_ncylinders = ccg->ccg_ncylinders;
1653 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1654 
1655 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1656 	lp->d_type = DTYPE_CCD;
1657 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1658 	lp->d_rpm = 3600;
1659 	lp->d_interleave = 1;
1660 	lp->d_flags = 0;
1661 
1662 	lp->d_partitions[RAW_PART].p_offset = 0;
1663 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1664 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1665 	lp->d_npartitions = RAW_PART + 1;
1666 
1667 	lp->d_bbsize = BBSIZE;				/* XXX */
1668 	lp->d_sbsize = 0;
1669 
1670 	lp->d_magic = DISKMAGIC;
1671 	lp->d_magic2 = DISKMAGIC;
1672 	lp->d_checksum = dkcksum(&cs->sc_label);
1673 
1674 	/*
1675 	 * Call the generic disklabel extraction routine.
1676 	 */
1677 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1678 	if (errstring != NULL)
1679 		ccdmakedisklabel(cs);
1680 
1681 #ifdef DEBUG
1682 	/* It's actually extremely common to have unlabeled ccds. */
1683 	if (ccddebug & CCDB_LABEL)
1684 		if (errstring != NULL)
1685 			printf("ccd%d: %s\n", unit, errstring);
1686 #endif
1687 }
1688 
1689 /*
1690  * Take care of things one might want to take care of in the event
1691  * that a disklabel isn't present.
1692  */
1693 static void
1694 ccdmakedisklabel(struct ccd_s *cs)
1695 {
1696 	struct disklabel *lp = &cs->sc_label;
1697 
1698 	/*
1699 	 * For historical reasons, if there's no disklabel present
1700 	 * the raw partition must be marked FS_BSDFFS.
1701 	 */
1702 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1703 
1704 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1705 }
1706 
1707 /*
1708  * Wait interruptibly for an exclusive lock.
1709  *
1710  * XXX
1711  * Several drivers do this; it should be abstracted and made MP-safe.
1712  */
1713 static int
1714 ccdlock(struct ccd_s *cs)
1715 {
1716 	int error;
1717 
1718 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1719 		cs->sc_flags |= CCDF_WANTED;
1720 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1721 			return (error);
1722 	}
1723 	cs->sc_flags |= CCDF_LOCKED;
1724 	return (0);
1725 }
1726 
1727 /*
1728  * Unlock and wake up any waiters.
1729  */
1730 static void
1731 ccdunlock(struct ccd_s *cs)
1732 {
1733 
1734 	cs->sc_flags &= ~CCDF_LOCKED;
1735 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1736 		cs->sc_flags &= ~CCDF_WANTED;
1737 		wakeup(cs);
1738 	}
1739 }
1740 
1741 #ifdef DEBUG
1742 static void
1743 printiinfo(struct ccdiinfo *ii)
1744 {
1745 	int ix, i;
1746 
1747 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1748 		printf(" itab[%d]: #dk %d sblk %lld soff %lld",
1749 		    ix, ii->ii_ndisk, (long long)ii->ii_startblk,
1750 		    (long long)ii->ii_startoff);
1751 		for (i = 0; i < ii->ii_ndisk; i++)
1752 			printf(" %d", ii->ii_index[i]);
1753 		printf("\n");
1754 	}
1755 }
1756 #endif
1757