xref: /freebsd/sys/geom/geom_ccd.c (revision eacee0ff7ec955b32e09515246bd97b6edcd2b0f)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
106 
107 #include <sys/ccdvar.h>
108 
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
110 
111 #if defined(CCDDEBUG) && !defined(DEBUG)
112 #define DEBUG
113 #endif
114 
115 #ifdef DEBUG
116 #define CCDB_FOLLOW	0x01
117 #define CCDB_INIT	0x02
118 #define CCDB_IO		0x04
119 #define CCDB_LABEL	0x08
120 #define CCDB_VNODE	0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
122     CCDB_VNODE;
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
124 #endif
125 
126 #define	ccdunit(x)	dkunit(x)
127 #define ccdpart(x)	dkpart(x)
128 
/*
 * How mirroring works (only writes are special):
 *
 * When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
 * linked together through their cb_mirror fields.  The
 * CCDPF_MIRROR_DONE bit in cb_pflags is cleared on both of them.
 *
 * When a component buffer completes and reaches ccdiodone(), it checks
 * whether CCDPF_MIRROR_DONE is set in its own cb_pflags.  If it is not,
 * the buffer sets the flag on its partner and returns.  If it is, the
 * partner has already completed, so the buffer goes on to the regular
 * cleanup.
 */
142 
143 struct ccdbuf {
144 	struct bio	cb_buf;		/* new I/O buf */
145 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
146 	struct ccdbuf	*cb_freenext;	/* free list link */
147 	int		cb_unit;	/* target unit */
148 	int		cb_comp;	/* target component */
149 	int		cb_pflags;	/* mirror/parity status flag */
150 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
151 };
152 
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
155 
/*
 * Build a device number from dev's major number and ccd unit, using
 * slice 0 and RAW_PART as the partition.
 */
#define CCDLABELDEV(dev)	\
	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))

/* Convenience predicates for frequently repeated tests. */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
162 
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
169 
170 #define NCCDFREEHIWAT	16
171 
172 #define CDEV_MAJOR 74
173 
174 static struct cdevsw ccd_cdevsw = {
175 	/* open */	ccdopen,
176 	/* close */	ccdclose,
177 	/* read */	physread,
178 	/* write */	physwrite,
179 	/* ioctl */	ccdioctl,
180 	/* poll */	nopoll,
181 	/* mmap */	nommap,
182 	/* strategy */	ccdstrategy,
183 	/* name */	"ccd",
184 	/* maj */	CDEV_MAJOR,
185 	/* dump */	ccddump,
186 	/* psize */	ccdsize,
187 	/* flags */	D_DISK,
188 };
189 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 
191 static struct ccd_s *ccdfind(int);
192 static struct ccd_s *ccdnew(int);
193 static int ccddestroy(struct ccd_s *, struct proc *);
194 
195 /* called during module initialization */
196 static void ccdattach(void);
197 static int ccd_modevent(module_t, int, void *);
198 
199 /* called by biodone() at interrupt time */
200 static void ccdiodone(struct bio *bp);
201 
202 static void ccdstart(struct ccd_s *, struct bio *);
203 static void ccdinterleave(struct ccd_s *, int);
204 static void ccdintr(struct ccd_s *, struct bio *);
205 static int ccdinit(struct ccd_s *, char **, struct thread *);
206 static int ccdlookup(char *, struct thread *p, struct vnode **);
207 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
208 		      struct bio *, daddr_t, caddr_t, long);
209 static void ccdgetdisklabel(dev_t);
210 static void ccdmakedisklabel(struct ccd_s *);
211 static int ccdlock(struct ccd_s *);
212 static void ccdunlock(struct ccd_s *);
213 
214 #ifdef DEBUG
215 static void printiinfo(struct ccdiinfo *);
216 #endif
217 
218 /* Non-private for the benefit of libkvm. */
219 struct ccdbuf *ccdfreebufs;
220 static int numccdfreebufs;
221 
222 /*
223  * getccdbuf() -	Allocate and zero a ccd buffer.
224  *
225  *	This routine is called at splbio().
226  */
227 
228 static __inline
229 struct ccdbuf *
230 getccdbuf(struct ccdbuf *cpy)
231 {
232 	struct ccdbuf *cbp;
233 
234 	/*
235 	 * Allocate from freelist or malloc as necessary
236 	 */
237 	if ((cbp = ccdfreebufs) != NULL) {
238 		ccdfreebufs = cbp->cb_freenext;
239 		--numccdfreebufs;
240 	} else {
241 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
242 	}
243 
244 	/*
245 	 * Used by mirroring code
246 	 */
247 	if (cpy)
248 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 	else
250 		bzero(cbp, sizeof(struct ccdbuf));
251 
252 	/*
253 	 * independant struct bio initialization
254 	 */
255 
256 	return(cbp);
257 }
258 
259 /*
260  * putccdbuf() -	Free a ccd buffer.
261  *
262  *	This routine is called at splbio().
263  */
264 
265 static __inline
266 void
267 putccdbuf(struct ccdbuf *cbp)
268 {
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		free((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
/*
 * Number of blocks left untouched at the front of each component
 * partition.  Skipping them avoids overwriting the component's disklabel
 * area when the partition starts at the beginning of the slice.
 */
#ifndef CCD_OFFSET
#define CCD_OFFSET 16
#endif
288 
289 static struct ccd_s *
290 ccdfind(int unit)
291 {
292 	struct ccd_s *sc = NULL;
293 
294 	/* XXX: LOCK(unique unit numbers) */
295 	LIST_FOREACH(sc, &ccd_softc_list, list) {
296 		if (sc->sc_unit == unit)
297 			break;
298 	}
299 	/* XXX: UNLOCK(unique unit numbers) */
300 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
301 }
302 
303 static struct ccd_s *
304 ccdnew(int unit)
305 {
306 	struct ccd_s *sc;
307 
308 	/* XXX: LOCK(unique unit numbers) */
309 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
310 		return (NULL);
311 
312 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 	sc->sc_unit = unit;
314 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 	/* XXX: UNLOCK(unique unit numbers) */
316 	return (sc);
317 }
318 
319 static int
320 ccddestroy(struct ccd_s *sc, struct proc *p)
321 {
322 
323 	/* XXX: LOCK(unique unit numbers) */
324 	LIST_REMOVE(sc, list);
325 	/* XXX: UNLOCK(unique unit numbers) */
326 	FREE(sc, M_CCD);
327 	return (0);
328 }
329 
330 static void
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
332 {
333 	int i, u;
334 	char *s;
335 
336 	if (*dev != NODEV)
337 		return;
338 	i = dev_stdclone(name, &s, "ccd", &u);
339 	if (i != 2)
340 		return;
341 	if (*s < 'a' || *s > 'h')
342 		return;
343 	if (s[1] != '\0')
344 		return;
345 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 		UID_ROOT, GID_OPERATOR, 0640, name);
347 }
348 
349 /*
350  * Called by main() during pseudo-device attachment.  All we need
351  * to do is to add devsw entries.
352  */
353 static void
354 ccdattach()
355 {
356 
357 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
358 }
359 
360 static int
361 ccd_modevent(module_t mod, int type, void *data)
362 {
363 	int error = 0;
364 
365 	switch (type) {
366 	case MOD_LOAD:
367 		ccdattach();
368 		break;
369 
370 	case MOD_UNLOAD:
371 		printf("ccd0: Unload not supported!\n");
372 		error = EOPNOTSUPP;
373 		break;
374 
375 	case MOD_SHUTDOWN:
376 		break;
377 
378 	default:
379 		error = EOPNOTSUPP;
380 	}
381 	return (error);
382 }
383 
384 DEV_MODULE(ccd, ccd_modevent, NULL);
385 
386 static int
387 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
388 {
389 	struct ccdcinfo *ci = NULL;	/* XXX */
390 	size_t size;
391 	int ix;
392 	struct vnode *vp;
393 	size_t minsize;
394 	int maxsecsize;
395 	struct partinfo dpart;
396 	struct ccdgeom *ccg = &cs->sc_geom;
397 	char *tmppath = NULL;
398 	int error = 0;
399 
400 #ifdef DEBUG
401 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
402 		printf("ccdinit: unit %d\n", cs->sc_unit);
403 #endif
404 
405 	cs->sc_size = 0;
406 
407 	/* Allocate space for the component info. */
408 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
409 	    M_DEVBUF, M_WAITOK);
410 
411 	/*
412 	 * Verify that each component piece exists and record
413 	 * relevant information about it.
414 	 */
415 	maxsecsize = 0;
416 	minsize = 0;
417 	tmppath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
418 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
419 		vp = cs->sc_vpp[ix];
420 		ci = &cs->sc_cinfo[ix];
421 		ci->ci_vp = vp;
422 
423 		/*
424 		 * Copy in the pathname of the component.
425 		 */
426 		if ((error = copyinstr(cpaths[ix], tmppath,
427 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
428 #ifdef DEBUG
429 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
430 				printf("ccd%d: can't copy path, error = %d\n",
431 				    cs->sc_unit, error);
432 #endif
433 			goto fail;
434 		}
435 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
436 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
437 
438 		ci->ci_dev = vn_todev(vp);
439 
440 		/*
441 		 * Get partition information for the component.
442 		 */
443 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
444 		    FREAD, td->td_proc->p_ucred, td)) != 0) {
445 #ifdef DEBUG
446 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
447 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
448 				     cs->sc_unit, ci->ci_path, error);
449 #endif
450 			goto fail;
451 		}
452 		if (dpart.part->p_fstype == FS_BSDFFS) {
453 			maxsecsize =
454 			    ((dpart.disklab->d_secsize > maxsecsize) ?
455 			    dpart.disklab->d_secsize : maxsecsize);
456 			size = dpart.part->p_size - CCD_OFFSET;
457 		} else {
458 #ifdef DEBUG
459 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
460 				printf("ccd%d: %s: incorrect partition type\n",
461 				    cs->sc_unit, ci->ci_path);
462 #endif
463 			error = EFTYPE;
464 			goto fail;
465 		}
466 
467 		/*
468 		 * Calculate the size, truncating to an interleave
469 		 * boundary if necessary.
470 		 */
471 
472 		if (cs->sc_ileave > 1)
473 			size -= size % cs->sc_ileave;
474 
475 		if (size == 0) {
476 #ifdef DEBUG
477 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
478 				printf("ccd%d: %s: size == 0\n",
479 				    cs->sc_unit, ci->ci_path);
480 #endif
481 			error = ENODEV;
482 			goto fail;
483 		}
484 
485 		if (minsize == 0 || size < minsize)
486 			minsize = size;
487 		ci->ci_size = size;
488 		cs->sc_size += size;
489 	}
490 
491 	free(tmppath, M_DEVBUF);
492 	tmppath = NULL;
493 
494 	/*
495 	 * Don't allow the interleave to be smaller than
496 	 * the biggest component sector.
497 	 */
498 	if ((cs->sc_ileave > 0) &&
499 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
500 #ifdef DEBUG
501 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
502 			printf("ccd%d: interleave must be at least %d\n",
503 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
504 #endif
505 		error = EINVAL;
506 		goto fail;
507 	}
508 
509 	/*
510 	 * If uniform interleave is desired set all sizes to that of
511 	 * the smallest component.  This will guarentee that a single
512 	 * interleave table is generated.
513 	 *
514 	 * Lost space must be taken into account when calculating the
515 	 * overall size.  Half the space is lost when CCDF_MIRROR is
516 	 * specified.  One disk is lost when CCDF_PARITY is specified.
517 	 */
518 	if (cs->sc_flags & CCDF_UNIFORM) {
519 		for (ci = cs->sc_cinfo;
520 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
521 			ci->ci_size = minsize;
522 		}
523 		if (cs->sc_flags & CCDF_MIRROR) {
524 			/*
525 			 * Check to see if an even number of components
526 			 * have been specified.  The interleave must also
527 			 * be non-zero in order for us to be able to
528 			 * guarentee the topology.
529 			 */
530 			if (cs->sc_nccdisks % 2) {
531 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
532 				error = EINVAL;
533 				goto fail;
534 			}
535 			if (cs->sc_ileave == 0) {
536 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
537 				error = EINVAL;
538 				goto fail;
539 			}
540 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
541 		} else if (cs->sc_flags & CCDF_PARITY) {
542 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
543 		} else {
544 			if (cs->sc_ileave == 0) {
545 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
546 				error = EINVAL;
547 				goto fail;
548 			}
549 			cs->sc_size = cs->sc_nccdisks * minsize;
550 		}
551 	}
552 
553 	/*
554 	 * Construct the interleave table.
555 	 */
556 	ccdinterleave(cs, cs->sc_unit);
557 
558 	/*
559 	 * Create pseudo-geometry based on 1MB cylinders.  It's
560 	 * pretty close.
561 	 */
562 	ccg->ccg_secsize = maxsecsize;
563 	ccg->ccg_ntracks = 1;
564 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
565 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
566 
567 	/*
568 	 * Add an devstat entry for this device.
569 	 */
570 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
571 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
572 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
573 			  DEVSTAT_PRIORITY_ARRAY);
574 
575 	cs->sc_flags |= CCDF_INITED;
576 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
577 	return (0);
578 fail:
579 	while (ci > cs->sc_cinfo) {
580 		ci--;
581 		free(ci->ci_path, M_DEVBUF);
582 	}
583 	if (tmppath != NULL)
584 		free(tmppath, M_DEVBUF);
585 	free(cs->sc_cinfo, M_DEVBUF);
586 	return (error);
587 }
588 
589 static void
590 ccdinterleave(struct ccd_s *cs, int unit)
591 {
592 	struct ccdcinfo *ci, *smallci;
593 	struct ccdiinfo *ii;
594 	daddr_t bn, lbn;
595 	int ix;
596 	u_long size;
597 
598 #ifdef DEBUG
599 	if (ccddebug & CCDB_INIT)
600 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
601 #endif
602 
603 	/*
604 	 * Allocate an interleave table.  The worst case occurs when each
605 	 * of N disks is of a different size, resulting in N interleave
606 	 * tables.
607 	 *
608 	 * Chances are this is too big, but we don't care.
609 	 */
610 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
611 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
612 	    M_WAITOK | M_ZERO);
613 
614 	/*
615 	 * Trivial case: no interleave (actually interleave of disk size).
616 	 * Each table entry represents a single component in its entirety.
617 	 *
618 	 * An interleave of 0 may not be used with a mirror or parity setup.
619 	 */
620 	if (cs->sc_ileave == 0) {
621 		bn = 0;
622 		ii = cs->sc_itable;
623 
624 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
625 			/* Allocate space for ii_index. */
626 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
627 			ii->ii_ndisk = 1;
628 			ii->ii_startblk = bn;
629 			ii->ii_startoff = 0;
630 			ii->ii_index[0] = ix;
631 			bn += cs->sc_cinfo[ix].ci_size;
632 			ii++;
633 		}
634 		ii->ii_ndisk = 0;
635 #ifdef DEBUG
636 		if (ccddebug & CCDB_INIT)
637 			printiinfo(cs->sc_itable);
638 #endif
639 		return;
640 	}
641 
642 	/*
643 	 * The following isn't fast or pretty; it doesn't have to be.
644 	 */
645 	size = 0;
646 	bn = lbn = 0;
647 	for (ii = cs->sc_itable; ; ii++) {
648 		/*
649 		 * Allocate space for ii_index.  We might allocate more then
650 		 * we use.
651 		 */
652 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
653 		    M_DEVBUF, M_WAITOK);
654 
655 		/*
656 		 * Locate the smallest of the remaining components
657 		 */
658 		smallci = NULL;
659 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
660 		    ci++) {
661 			if (ci->ci_size > size &&
662 			    (smallci == NULL ||
663 			     ci->ci_size < smallci->ci_size)) {
664 				smallci = ci;
665 			}
666 		}
667 
668 		/*
669 		 * Nobody left, all done
670 		 */
671 		if (smallci == NULL) {
672 			ii->ii_ndisk = 0;
673 			break;
674 		}
675 
676 		/*
677 		 * Record starting logical block using an sc_ileave blocksize.
678 		 */
679 		ii->ii_startblk = bn / cs->sc_ileave;
680 
681 		/*
682 		 * Record starting comopnent block using an sc_ileave
683 		 * blocksize.  This value is relative to the beginning of
684 		 * a component disk.
685 		 */
686 		ii->ii_startoff = lbn;
687 
688 		/*
689 		 * Determine how many disks take part in this interleave
690 		 * and record their indices.
691 		 */
692 		ix = 0;
693 		for (ci = cs->sc_cinfo;
694 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
695 			if (ci->ci_size >= smallci->ci_size) {
696 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
697 			}
698 		}
699 		ii->ii_ndisk = ix;
700 		bn += ix * (smallci->ci_size - size);
701 		lbn = smallci->ci_size / cs->sc_ileave;
702 		size = smallci->ci_size;
703 	}
704 #ifdef DEBUG
705 	if (ccddebug & CCDB_INIT)
706 		printiinfo(cs->sc_itable);
707 #endif
708 }
709 
710 /* ARGSUSED */
711 static int
712 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
713 {
714 	int unit = ccdunit(dev);
715 	struct ccd_s *cs;
716 	struct disklabel *lp;
717 	int error = 0, part, pmask;
718 
719 #ifdef DEBUG
720 	if (ccddebug & CCDB_FOLLOW)
721 		printf("ccdopen(%p, %x)\n", dev, flags);
722 #endif
723 
724 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
725 
726 	if ((error = ccdlock(cs)) != 0)
727 		return (error);
728 
729 	lp = &cs->sc_label;
730 
731 	part = ccdpart(dev);
732 	pmask = (1 << part);
733 
734 	/*
735 	 * If we're initialized, check to see if there are any other
736 	 * open partitions.  If not, then it's safe to update
737 	 * the in-core disklabel.
738 	 */
739 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
740 		ccdgetdisklabel(dev);
741 
742 	/* Check that the partition exists. */
743 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
744 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
745 		error = ENXIO;
746 		goto done;
747 	}
748 
749 	cs->sc_openmask |= pmask;
750  done:
751 	ccdunlock(cs);
752 	return (0);
753 }
754 
755 /* ARGSUSED */
756 static int
757 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
758 {
759 	int unit = ccdunit(dev);
760 	struct ccd_s *cs;
761 	int error = 0, part;
762 
763 #ifdef DEBUG
764 	if (ccddebug & CCDB_FOLLOW)
765 		printf("ccdclose(%p, %x)\n", dev, flags);
766 #endif
767 
768 	if (!IS_ALLOCATED(unit))
769 		return (ENXIO);
770 	cs = ccdfind(unit);
771 
772 	if ((error = ccdlock(cs)) != 0)
773 		return (error);
774 
775 	part = ccdpart(dev);
776 
777 	/* ...that much closer to allowing unconfiguration... */
778 	cs->sc_openmask &= ~(1 << part);
779 	/* collect "garbage" if possible */
780 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
781 		ccddestroy(cs, td->td_proc);
782 	else
783 		ccdunlock(cs);
784 	return (0);
785 }
786 
787 static void
788 ccdstrategy(struct bio *bp)
789 {
790 	int unit = ccdunit(bp->bio_dev);
791 	struct ccd_s *cs = ccdfind(unit);
792 	int s;
793 	int wlabel;
794 	struct disklabel *lp;
795 
796 #ifdef DEBUG
797 	if (ccddebug & CCDB_FOLLOW)
798 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
799 #endif
800 	if (!IS_INITED(cs)) {
801 		biofinish(bp, NULL, ENXIO);
802 		return;
803 	}
804 
805 	/* If it's a nil transfer, wake up the top half now. */
806 	if (bp->bio_bcount == 0) {
807 		biodone(bp);
808 		return;
809 	}
810 
811 	lp = &cs->sc_label;
812 
813 	/*
814 	 * Do bounds checking and adjust transfer.  If there's an
815 	 * error, the bounds check will flag that for us.
816 	 */
817 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
818 	if (ccdpart(bp->bio_dev) != RAW_PART) {
819 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
820 			biodone(bp);
821 			return;
822 		}
823 	} else {
824 		int pbn;        /* in sc_secsize chunks */
825 		long sz;        /* in sc_secsize chunks */
826 
827 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
828 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
829 
830 		/*
831 		 * If out of bounds return an error. If at the EOF point,
832 		 * simply read or write less.
833 		 */
834 
835 		if (pbn < 0 || pbn >= cs->sc_size) {
836 			bp->bio_resid = bp->bio_bcount;
837 			if (pbn != cs->sc_size)
838 				biofinish(bp, NULL, EINVAL);
839 			else
840 				biodone(bp);
841 			return;
842 		}
843 
844 		/*
845 		 * If the request crosses EOF, truncate the request.
846 		 */
847 		if (pbn + sz > cs->sc_size) {
848 			bp->bio_bcount = (cs->sc_size - pbn) *
849 			    cs->sc_geom.ccg_secsize;
850 		}
851 	}
852 
853 	bp->bio_resid = bp->bio_bcount;
854 
855 	/*
856 	 * "Start" the unit.
857 	 */
858 	s = splbio();
859 	ccdstart(cs, bp);
860 	splx(s);
861 	return;
862 }
863 
864 static void
865 ccdstart(struct ccd_s *cs, struct bio *bp)
866 {
867 	long bcount, rcount;
868 	struct ccdbuf *cbp[4];
869 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
870 	caddr_t addr;
871 	daddr_t bn;
872 	struct partition *pp;
873 
874 #ifdef DEBUG
875 	if (ccddebug & CCDB_FOLLOW)
876 		printf("ccdstart(%p, %p)\n", cs, bp);
877 #endif
878 
879 	/* Record the transaction start  */
880 	devstat_start_transaction(&cs->device_stats);
881 
882 	/*
883 	 * Translate the partition-relative block number to an absolute.
884 	 */
885 	bn = bp->bio_blkno;
886 	if (ccdpart(bp->bio_dev) != RAW_PART) {
887 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
888 		bn += pp->p_offset;
889 	}
890 
891 	/*
892 	 * Allocate component buffers and fire off the requests
893 	 */
894 	addr = bp->bio_data;
895 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
896 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
897 		rcount = cbp[0]->cb_buf.bio_bcount;
898 
899 		if (cs->sc_cflags & CCDF_MIRROR) {
900 			/*
901 			 * Mirroring.  Writes go to both disks, reads are
902 			 * taken from whichever disk seems most appropriate.
903 			 *
904 			 * We attempt to localize reads to the disk whos arm
905 			 * is nearest the read request.  We ignore seeks due
906 			 * to writes when making this determination and we
907 			 * also try to avoid hogging.
908 			 */
909 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
910 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
911 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
912 			} else {
913 				int pick = cs->sc_pick;
914 				daddr_t range = cs->sc_size / 16;
915 
916 				if (bn < cs->sc_blk[pick] - range ||
917 				    bn > cs->sc_blk[pick] + range
918 				) {
919 					cs->sc_pick = pick = 1 - pick;
920 				}
921 				cs->sc_blk[pick] = bn + btodb(rcount);
922 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
923 			}
924 		} else {
925 			/*
926 			 * Not mirroring
927 			 */
928 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
929 		}
930 		bn += btodb(rcount);
931 		addr += rcount;
932 	}
933 }
934 
935 /*
936  * Build a component buffer header.
937  */
938 static void
939 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
940 {
941 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
942 	struct ccdbuf *cbp;
943 	daddr_t cbn, cboff;
944 	off_t cbc;
945 
946 #ifdef DEBUG
947 	if (ccddebug & CCDB_IO)
948 		printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
949 		       cs, bp, bn, addr, bcount);
950 #endif
951 	/*
952 	 * Determine which component bn falls in.
953 	 */
954 	cbn = bn;
955 	cboff = 0;
956 
957 	if (cs->sc_ileave == 0) {
958 		/*
959 		 * Serially concatenated and neither a mirror nor a parity
960 		 * config.  This is a special case.
961 		 */
962 		daddr_t sblk;
963 
964 		sblk = 0;
965 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
966 			sblk += ci->ci_size;
967 		cbn -= sblk;
968 	} else {
969 		struct ccdiinfo *ii;
970 		int ccdisk, off;
971 
972 		/*
973 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
974 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
975 		 * to cbn.
976 		 */
977 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
978 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
979 
980 		/*
981 		 * Figure out which interleave table to use.
982 		 */
983 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
984 			if (ii->ii_startblk > cbn)
985 				break;
986 		}
987 		ii--;
988 
989 		/*
990 		 * off is the logical superblock relative to the beginning
991 		 * of this interleave block.
992 		 */
993 		off = cbn - ii->ii_startblk;
994 
995 		/*
996 		 * We must calculate which disk component to use (ccdisk),
997 		 * and recalculate cbn to be the superblock relative to
998 		 * the beginning of the component.  This is typically done by
999 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1000 		 * must typically be divided by the number of components in
1001 		 * this interleave array to be properly convert it from a
1002 		 * CCD-relative logical superblock number to a
1003 		 * component-relative superblock number.
1004 		 */
1005 		if (ii->ii_ndisk == 1) {
1006 			/*
1007 			 * When we have just one disk, it can't be a mirror
1008 			 * or a parity config.
1009 			 */
1010 			ccdisk = ii->ii_index[0];
1011 			cbn = ii->ii_startoff + off;
1012 		} else {
1013 			if (cs->sc_cflags & CCDF_MIRROR) {
1014 				/*
1015 				 * We have forced a uniform mapping, resulting
1016 				 * in a single interleave array.  We double
1017 				 * up on the first half of the available
1018 				 * components and our mirror is in the second
1019 				 * half.  This only works with a single
1020 				 * interleave array because doubling up
1021 				 * doubles the number of sectors, so there
1022 				 * cannot be another interleave array because
1023 				 * the next interleave array's calculations
1024 				 * would be off.
1025 				 */
1026 				int ndisk2 = ii->ii_ndisk / 2;
1027 				ccdisk = ii->ii_index[off % ndisk2];
1028 				cbn = ii->ii_startoff + off / ndisk2;
1029 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1030 			} else if (cs->sc_cflags & CCDF_PARITY) {
1031 				/*
1032 				 * XXX not implemented yet
1033 				 */
1034 				int ndisk2 = ii->ii_ndisk - 1;
1035 				ccdisk = ii->ii_index[off % ndisk2];
1036 				cbn = ii->ii_startoff + off / ndisk2;
1037 				if (cbn % ii->ii_ndisk <= ccdisk)
1038 					ccdisk++;
1039 			} else {
1040 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1041 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1042 			}
1043 		}
1044 
1045 		ci = &cs->sc_cinfo[ccdisk];
1046 
1047 		/*
1048 		 * Convert cbn from a superblock to a normal block so it
1049 		 * can be used to calculate (along with cboff) the normal
1050 		 * block index into this particular disk.
1051 		 */
1052 		cbn *= cs->sc_ileave;
1053 	}
1054 
1055 	/*
1056 	 * Fill in the component buf structure.
1057 	 */
1058 	cbp = getccdbuf(NULL);
1059 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
1060 	cbp->cb_buf.bio_done = ccdiodone;
1061 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
1062 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1063 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1064 	cbp->cb_buf.bio_data = addr;
1065 	if (cs->sc_ileave == 0)
1066               cbc = dbtob((off_t)(ci->ci_size - cbn));
1067 	else
1068               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1069 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1070  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1071 
1072 	/*
1073 	 * context for ccdiodone
1074 	 */
1075 	cbp->cb_obp = bp;
1076 	cbp->cb_unit = cs->sc_unit;
1077 	cbp->cb_comp = ci - cs->sc_cinfo;
1078 
1079 #ifdef DEBUG
1080 	if (ccddebug & CCDB_IO)
1081 		printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1082 		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1083 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1084 		       cbp->cb_buf.bio_bcount);
1085 #endif
1086 	cb[0] = cbp;
1087 
1088 	/*
1089 	 * Note: both I/O's setup when reading from mirror, but only one
1090 	 * will be executed.
1091 	 */
1092 	if (cs->sc_cflags & CCDF_MIRROR) {
1093 		/* mirror, setup second I/O */
1094 		cbp = getccdbuf(cb[0]);
1095 		cbp->cb_buf.bio_dev = ci2->ci_dev;
1096 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1097 		cb[1] = cbp;
1098 		/* link together the ccdbuf's and clear "mirror done" flag */
1099 		cb[0]->cb_mirror = cb[1];
1100 		cb[1]->cb_mirror = cb[0];
1101 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1102 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1103 	}
1104 }
1105 
1106 static void
1107 ccdintr(struct ccd_s *cs, struct bio *bp)
1108 {
1109 #ifdef DEBUG
1110 	if (ccddebug & CCDB_FOLLOW)
1111 		printf("ccdintr(%p, %p)\n", cs, bp);
1112 #endif
1113 	/*
1114 	 * Request is done for better or worse, wakeup the top half.
1115 	 */
1116 	if (bp->bio_flags & BIO_ERROR)
1117 		bp->bio_resid = bp->bio_bcount;
1118 	biofinish(bp, &cs->device_stats, 0);
1119 }
1120 
1121 /*
1122  * Called at interrupt time.
1123  * Mark the component as done and if all components are done,
1124  * take a ccd interrupt.
1125  */
1126 static void
1127 ccdiodone(struct bio *ibp)
1128 {
1129 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1130 	struct bio *bp = cbp->cb_obp;
1131 	int unit = cbp->cb_unit;
1132 	int count, s;
1133 
1134 	s = splbio();
1135 #ifdef DEBUG
1136 	if (ccddebug & CCDB_FOLLOW)
1137 		printf("ccdiodone(%p)\n", cbp);
1138 	if (ccddebug & CCDB_IO) {
1139 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1140 		       bp, bp->bio_bcount, bp->bio_resid);
1141 		printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1142 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1143 		       cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1144 		       cbp->cb_buf.bio_bcount);
1145 	}
1146 #endif
1147 	/*
1148 	 * If an error occured, report it.  If this is a mirrored
1149 	 * configuration and the first of two possible reads, do not
1150 	 * set the error in the bp yet because the second read may
1151 	 * succeed.
1152 	 */
1153 
1154 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1155 		const char *msg = "";
1156 
1157 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1158 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1159 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1160 			/*
1161 			 * We will try our read on the other disk down
1162 			 * below, also reverse the default pick so if we
1163 			 * are doing a scan we do not keep hitting the
1164 			 * bad disk first.
1165 			 */
1166 			struct ccd_s *cs = ccdfind(unit);
1167 
1168 			msg = ", trying other disk";
1169 			cs->sc_pick = 1 - cs->sc_pick;
1170 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1171 		} else {
1172 			bp->bio_flags |= BIO_ERROR;
1173 			bp->bio_error = cbp->cb_buf.bio_error ?
1174 			    cbp->cb_buf.bio_error : EIO;
1175 		}
1176 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1177 		       unit, bp->bio_error, cbp->cb_comp,
1178 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1179 	}
1180 
1181 	/*
1182 	 * Process mirror.  If we are writing, I/O has been initiated on both
1183 	 * buffers and we fall through only after both are finished.
1184 	 *
1185 	 * If we are reading only one I/O is initiated at a time.  If an
1186 	 * error occurs we initiate the second I/O and return, otherwise
1187 	 * we free the second I/O without initiating it.
1188 	 */
1189 
1190 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1191 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1192 			/*
1193 			 * When writing, handshake with the second buffer
1194 			 * to determine when both are done.  If both are not
1195 			 * done, return here.
1196 			 */
1197 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1198 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1199 				putccdbuf(cbp);
1200 				splx(s);
1201 				return;
1202 			}
1203 		} else {
1204 			/*
1205 			 * When reading, either dispose of the second buffer
1206 			 * or initiate I/O on the second buffer if an error
1207 			 * occured with this one.
1208 			 */
1209 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1210 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1211 					cbp->cb_mirror->cb_pflags |=
1212 					    CCDPF_MIRROR_DONE;
1213 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1214 					putccdbuf(cbp);
1215 					splx(s);
1216 					return;
1217 				} else {
1218 					putccdbuf(cbp->cb_mirror);
1219 					/* fall through */
1220 				}
1221 			}
1222 		}
1223 	}
1224 
1225 	/*
1226 	 * use bio_caller1 to determine how big the original request was rather
1227 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1228 	 *
1229 	 * XXX We check for an error, but we do not test the resid for an
1230 	 * aligned EOF condition.  This may result in character & block
1231 	 * device access not recognizing EOF properly when read or written
1232 	 * sequentially, but will not effect filesystems.
1233 	 */
1234 	count = (long)cbp->cb_buf.bio_caller1;
1235 	putccdbuf(cbp);
1236 
1237 	/*
1238 	 * If all done, "interrupt".
1239 	 */
1240 	bp->bio_resid -= count;
1241 	if (bp->bio_resid < 0)
1242 		panic("ccdiodone: count");
1243 	if (bp->bio_resid == 0)
1244 		ccdintr(ccdfind(unit), bp);
1245 	splx(s);
1246 }
1247 
1248 static int
1249 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1250 {
1251 	int unit = ccdunit(dev);
1252 	int i, j, lookedup = 0, error = 0;
1253 	int part, pmask, s;
1254 	struct ccd_s *cs;
1255 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1256 	char **cpp;
1257 	struct vnode **vpp;
1258 
1259 	if (!IS_ALLOCATED(unit))
1260 		return (ENXIO);
1261 	cs = ccdfind(unit);
1262 
1263 	switch (cmd) {
1264 	case CCDIOCSET:
1265 		if (IS_INITED(cs))
1266 			return (EBUSY);
1267 
1268 		if ((flag & FWRITE) == 0)
1269 			return (EBADF);
1270 
1271 		if ((error = ccdlock(cs)) != 0)
1272 			return (error);
1273 
1274 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1275 			return (EINVAL);
1276 
1277 		/* Fill in some important bits. */
1278 		cs->sc_ileave = ccio->ccio_ileave;
1279 		if (cs->sc_ileave == 0 &&
1280 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1281 		     (ccio->ccio_flags & CCDF_PARITY))) {
1282 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1283 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1284 		}
1285 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1286 		    (ccio->ccio_flags & CCDF_PARITY)) {
1287 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1288 			ccio->ccio_flags &= ~CCDF_PARITY;
1289 		}
1290 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1291 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1292 			printf("ccd%d: mirror/parity forces uniform flag\n",
1293 			       unit);
1294 			ccio->ccio_flags |= CCDF_UNIFORM;
1295 		}
1296 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1297 
1298 		/*
1299 		 * Allocate space for and copy in the array of
1300 		 * componet pathnames and device numbers.
1301 		 */
1302 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1303 		    M_DEVBUF, M_WAITOK);
1304 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1305 		    M_DEVBUF, M_WAITOK);
1306 
1307 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1308 		    ccio->ccio_ndisks * sizeof(char **));
1309 		if (error) {
1310 			free(vpp, M_DEVBUF);
1311 			free(cpp, M_DEVBUF);
1312 			ccdunlock(cs);
1313 			return (error);
1314 		}
1315 
1316 #ifdef DEBUG
1317 		if (ccddebug & CCDB_INIT)
1318 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1319 				printf("ccdioctl: component %d: %p\n",
1320 				    i, cpp[i]);
1321 #endif
1322 
1323 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1324 #ifdef DEBUG
1325 			if (ccddebug & CCDB_INIT)
1326 				printf("ccdioctl: lookedup = %d\n", lookedup);
1327 #endif
1328 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1329 				for (j = 0; j < lookedup; ++j)
1330 					(void)vn_close(vpp[j], FREAD|FWRITE,
1331 					    td->td_proc->p_ucred, td);
1332 				free(vpp, M_DEVBUF);
1333 				free(cpp, M_DEVBUF);
1334 				ccdunlock(cs);
1335 				return (error);
1336 			}
1337 			++lookedup;
1338 		}
1339 		cs->sc_vpp = vpp;
1340 		cs->sc_nccdisks = ccio->ccio_ndisks;
1341 
1342 		/*
1343 		 * Initialize the ccd.  Fills in the softc for us.
1344 		 */
1345 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1346 			for (j = 0; j < lookedup; ++j)
1347 				(void)vn_close(vpp[j], FREAD|FWRITE,
1348 				    td->td_proc->p_ucred, td);
1349 			/*
1350 			 * We can't ccddestroy() cs just yet, because nothing
1351 			 * prevents user-level app to do another ioctl()
1352 			 * without closing the device first, therefore
1353 			 * declare unit null and void and let ccdclose()
1354 			 * destroy it when it is safe to do so.
1355 			 */
1356 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1357 			free(vpp, M_DEVBUF);
1358 			free(cpp, M_DEVBUF);
1359 			ccdunlock(cs);
1360 			return (error);
1361 		}
1362 
1363 		/*
1364 		 * The ccd has been successfully initialized, so
1365 		 * we can place it into the array and read the disklabel.
1366 		 */
1367 		ccio->ccio_unit = unit;
1368 		ccio->ccio_size = cs->sc_size;
1369 		ccdgetdisklabel(dev);
1370 
1371 		ccdunlock(cs);
1372 
1373 		break;
1374 
1375 	case CCDIOCCLR:
1376 		if (!IS_INITED(cs))
1377 			return (ENXIO);
1378 
1379 		if ((flag & FWRITE) == 0)
1380 			return (EBADF);
1381 
1382 		if ((error = ccdlock(cs)) != 0)
1383 			return (error);
1384 
1385 		/* Don't unconfigure if any other partitions are open */
1386 		part = ccdpart(dev);
1387 		pmask = (1 << part);
1388 		if ((cs->sc_openmask & ~pmask)) {
1389 			ccdunlock(cs);
1390 			return (EBUSY);
1391 		}
1392 
1393 		/* Declare unit null and void (reset all flags) */
1394 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1395 
1396 		/* Close the components and free their pathnames. */
1397 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1398 			/*
1399 			 * XXX: this close could potentially fail and
1400 			 * cause Bad Things.  Maybe we need to force
1401 			 * the close to happen?
1402 			 */
1403 #ifdef DEBUG
1404 			if (ccddebug & CCDB_VNODE)
1405 				vprint("CCDIOCCLR: vnode info",
1406 				    cs->sc_cinfo[i].ci_vp);
1407 #endif
1408 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1409 			    td->td_proc->p_ucred, td);
1410 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1411 		}
1412 
1413 		/* Free interleave index. */
1414 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1415 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1416 
1417 		/* Free component info and interleave table. */
1418 		free(cs->sc_cinfo, M_DEVBUF);
1419 		free(cs->sc_itable, M_DEVBUF);
1420 		free(cs->sc_vpp, M_DEVBUF);
1421 
1422 		/* And remove the devstat entry. */
1423 		devstat_remove_entry(&cs->device_stats);
1424 
1425 		/* This must be atomic. */
1426 		s = splhigh();
1427 		ccdunlock(cs);
1428 		splx(s);
1429 
1430 		break;
1431 
1432 	case CCDCONFINFO:
1433 		{
1434 			int ninit = 0;
1435 			struct ccdconf *conf = (struct ccdconf *)data;
1436 			struct ccd_s *tmpcs;
1437 			struct ccd_s *ubuf = conf->buffer;
1438 
1439 			/* XXX: LOCK(unique unit numbers) */
1440 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1441 				if (IS_INITED(tmpcs))
1442 					ninit++;
1443 
1444 			if (conf->size == 0) {
1445 				conf->size = sizeof(struct ccd_s) * ninit;
1446 				break;
1447 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1448 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1449 				/* XXX: UNLOCK(unique unit numbers) */
1450 				return (EINVAL);
1451 			}
1452 
1453 			ubuf += ninit;
1454 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1455 				if (!IS_INITED(tmpcs))
1456 					continue;
1457 				error = copyout(tmpcs, --ubuf,
1458 				    sizeof(struct ccd_s));
1459 				if (error != 0)
1460 					/* XXX: UNLOCK(unique unit numbers) */
1461 					return (error);
1462 			}
1463 			/* XXX: UNLOCK(unique unit numbers) */
1464 		}
1465 		break;
1466 
1467 	case CCDCPPINFO:
1468 		if (!IS_INITED(cs))
1469 			return (ENXIO);
1470 
1471 		{
1472 			int len = 0;
1473 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1474 			char *ubuf = cpps->buffer;
1475 
1476 
1477 			for (i = 0; i < cs->sc_nccdisks; ++i)
1478 				len += cs->sc_cinfo[i].ci_pathlen;
1479 
1480 			if (cpps->size == 0) {
1481 				cpps->size = len;
1482 				break;
1483 			} else if (cpps->size != len) {
1484 				return (EINVAL);
1485 			}
1486 
1487 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1488 				len = cs->sc_cinfo[i].ci_pathlen;
1489 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1490 				    len);
1491 				if (error != 0)
1492 					return (error);
1493 				ubuf += len;
1494 			}
1495 		}
1496 		break;
1497 
1498 	case DIOCGDINFO:
1499 		if (!IS_INITED(cs))
1500 			return (ENXIO);
1501 
1502 		*(struct disklabel *)data = cs->sc_label;
1503 		break;
1504 
1505 	case DIOCGPART:
1506 		if (!IS_INITED(cs))
1507 			return (ENXIO);
1508 
1509 		((struct partinfo *)data)->disklab = &cs->sc_label;
1510 		((struct partinfo *)data)->part =
1511 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1512 		break;
1513 
1514 	case DIOCWDINFO:
1515 	case DIOCSDINFO:
1516 		if (!IS_INITED(cs))
1517 			return (ENXIO);
1518 
1519 		if ((flag & FWRITE) == 0)
1520 			return (EBADF);
1521 
1522 		if ((error = ccdlock(cs)) != 0)
1523 			return (error);
1524 
1525 		cs->sc_flags |= CCDF_LABELLING;
1526 
1527 		error = setdisklabel(&cs->sc_label,
1528 		    (struct disklabel *)data, 0);
1529 		if (error == 0) {
1530 			if (cmd == DIOCWDINFO)
1531 				error = writedisklabel(CCDLABELDEV(dev),
1532 				    &cs->sc_label);
1533 		}
1534 
1535 		cs->sc_flags &= ~CCDF_LABELLING;
1536 
1537 		ccdunlock(cs);
1538 
1539 		if (error)
1540 			return (error);
1541 		break;
1542 
1543 	case DIOCWLABEL:
1544 		if (!IS_INITED(cs))
1545 			return (ENXIO);
1546 
1547 		if ((flag & FWRITE) == 0)
1548 			return (EBADF);
1549 		if (*(int *)data != 0)
1550 			cs->sc_flags |= CCDF_WLABEL;
1551 		else
1552 			cs->sc_flags &= ~CCDF_WLABEL;
1553 		break;
1554 
1555 	default:
1556 		return (ENOTTY);
1557 	}
1558 
1559 	return (0);
1560 }
1561 
1562 static int
1563 ccdsize(dev_t dev)
1564 {
1565 	struct ccd_s *cs;
1566 	int part, size;
1567 
1568 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1569 		return (-1);
1570 
1571 	cs = ccdfind(ccdunit(dev));
1572 	part = ccdpart(dev);
1573 
1574 	if (!IS_INITED(cs))
1575 		return (-1);
1576 
1577 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1578 		size = -1;
1579 	else
1580 		size = cs->sc_label.d_partitions[part].p_size;
1581 
1582 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1583 		return (-1);
1584 
1585 	return (size);
1586 }
1587 
/*
 * Dump entry point.  Not implemented for ccd: dev is ignored and the
 * call always fails with ENXIO.
 */
static int
ccddump(dev_t dev)
{
	const int unsupported = ENXIO;

	return (unsupported);
}
1595 
1596 /*
1597  * Lookup the provided name in the filesystem.  If the file exists,
1598  * is a valid block device, and isn't being used by anyone else,
1599  * set *vpp to the file's vnode.
1600  */
1601 static int
1602 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1603 {
1604 	struct nameidata nd;
1605 	struct vnode *vp;
1606 	int error, flags;
1607 
1608 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1609 	flags = FREAD | FWRITE;
1610 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1611 #ifdef DEBUG
1612 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1613 			printf("ccdlookup: vn_open error = %d\n", error);
1614 #endif
1615 		return (error);
1616 	}
1617 	vp = nd.ni_vp;
1618 
1619 	if (vp->v_usecount > 1) {
1620 		error = EBUSY;
1621 		goto bad;
1622 	}
1623 
1624 	if (!vn_isdisk(vp, &error))
1625 		goto bad;
1626 
1627 #ifdef DEBUG
1628 	if (ccddebug & CCDB_VNODE)
1629 		vprint("ccdlookup: vnode info", vp);
1630 #endif
1631 
1632 	VOP_UNLOCK(vp, 0, td);
1633 	NDFREE(&nd, NDF_ONLY_PNBUF);
1634 	*vpp = vp;
1635 	return (0);
1636 bad:
1637 	VOP_UNLOCK(vp, 0, td);
1638 	NDFREE(&nd, NDF_ONLY_PNBUF);
1639 	/* vn_close does vrele() for vp */
1640 	(void)vn_close(vp, FREAD|FWRITE, td->td_proc->p_ucred, td);
1641 	return (error);
1642 }
1643 
1644 /*
1645  * Read the disklabel from the ccd.  If one is not present, fake one
1646  * up.
1647  */
1648 static void
1649 ccdgetdisklabel(dev_t dev)
1650 {
1651 	int unit = ccdunit(dev);
1652 	struct ccd_s *cs = ccdfind(unit);
1653 	char *errstring;
1654 	struct disklabel *lp = &cs->sc_label;
1655 	struct ccdgeom *ccg = &cs->sc_geom;
1656 
1657 	bzero(lp, sizeof(*lp));
1658 
1659 	lp->d_secperunit = cs->sc_size;
1660 	lp->d_secsize = ccg->ccg_secsize;
1661 	lp->d_nsectors = ccg->ccg_nsectors;
1662 	lp->d_ntracks = ccg->ccg_ntracks;
1663 	lp->d_ncylinders = ccg->ccg_ncylinders;
1664 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1665 
1666 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1667 	lp->d_type = DTYPE_CCD;
1668 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1669 	lp->d_rpm = 3600;
1670 	lp->d_interleave = 1;
1671 	lp->d_flags = 0;
1672 
1673 	lp->d_partitions[RAW_PART].p_offset = 0;
1674 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1675 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1676 	lp->d_npartitions = RAW_PART + 1;
1677 
1678 	lp->d_bbsize = BBSIZE;				/* XXX */
1679 	lp->d_sbsize = SBSIZE;				/* XXX */
1680 
1681 	lp->d_magic = DISKMAGIC;
1682 	lp->d_magic2 = DISKMAGIC;
1683 	lp->d_checksum = dkcksum(&cs->sc_label);
1684 
1685 	/*
1686 	 * Call the generic disklabel extraction routine.
1687 	 */
1688 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1689 	if (errstring != NULL)
1690 		ccdmakedisklabel(cs);
1691 
1692 #ifdef DEBUG
1693 	/* It's actually extremely common to have unlabeled ccds. */
1694 	if (ccddebug & CCDB_LABEL)
1695 		if (errstring != NULL)
1696 			printf("ccd%d: %s\n", unit, errstring);
1697 #endif
1698 }
1699 
1700 /*
1701  * Take care of things one might want to take care of in the event
1702  * that a disklabel isn't present.
1703  */
1704 static void
1705 ccdmakedisklabel(struct ccd_s *cs)
1706 {
1707 	struct disklabel *lp = &cs->sc_label;
1708 
1709 	/*
1710 	 * For historical reasons, if there's no disklabel present
1711 	 * the raw partition must be marked FS_BSDFFS.
1712 	 */
1713 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1714 
1715 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1716 }
1717 
1718 /*
1719  * Wait interruptibly for an exclusive lock.
1720  *
1721  * XXX
1722  * Several drivers do this; it should be abstracted and made MP-safe.
1723  */
1724 static int
1725 ccdlock(struct ccd_s *cs)
1726 {
1727 	int error;
1728 
1729 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1730 		cs->sc_flags |= CCDF_WANTED;
1731 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1732 			return (error);
1733 	}
1734 	cs->sc_flags |= CCDF_LOCKED;
1735 	return (0);
1736 }
1737 
1738 /*
1739  * Unlock and wake up any waiters.
1740  */
1741 static void
1742 ccdunlock(struct ccd_s *cs)
1743 {
1744 
1745 	cs->sc_flags &= ~CCDF_LOCKED;
1746 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1747 		cs->sc_flags &= ~CCDF_WANTED;
1748 		wakeup(cs);
1749 	}
1750 }
1751 
1752 #ifdef DEBUG
1753 static void
1754 printiinfo(struct ccdiinfo *ii)
1755 {
1756 	int ix, i;
1757 
1758 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1759 		printf(" itab[%d]: #dk %d sblk %d soff %d",
1760 		       ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1761 		for (i = 0; i < ii->ii_ndisk; i++)
1762 			printf(" %d", ii->ii_index[i]);
1763 		printf("\n");
1764 	}
1765 }
1766 #endif
1767