xref: /freebsd/sys/geom/geom_ccd.c (revision 0b87f79976047c8f4332bbf7dc03146f6b0de79f)
1 /* $FreeBSD$ */
2 
3 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed for the NetBSD Project
20  *	by Jason R. Thorpe.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (c) 1988 University of Utah.
39  * Copyright (c) 1990, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * This code is derived from software contributed to Berkeley by
43  * the Systems Programming Group of the University of Utah Computer
44  * Science Department.
45  *
46  * Redistribution and use in source and binary forms, with or without
47  * modification, are permitted provided that the following conditions
48  * are met:
49  * 1. Redistributions of source code must retain the above copyright
50  *    notice, this list of conditions and the following disclaimer.
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  * 3. All advertising materials mentioning features or use of this software
55  *    must display the following acknowledgement:
56  *	This product includes software developed by the University of
57  *	California, Berkeley and its contributors.
58  * 4. Neither the name of the University nor the names of its contributors
59  *    may be used to endorse or promote products derived from this software
60  *    without specific prior written permission.
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72  * SUCH DAMAGE.
73  *
74  * from: Utah $Hdr: cd.c 1.6 90/11/28$
75  *
76  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
77  */
78 
79 /*
80  * "Concatenated" disk driver.
81  *
82  * Dynamic configuration and disklabel support by:
83  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
84  *	Numerical Aerodynamic Simulation Facility
85  *	Mail Stop 258-6
86  *	NASA Ames Research Center
87  *	Moffett Field, CA 94035
88  */
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
94 #include <sys/proc.h>
95 #include <sys/bio.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
98 #include <sys/conf.h>
99 #include <sys/stat.h>
100 #include <sys/sysctl.h>
101 #include <sys/disk.h>
102 #include <sys/devicestat.h>
103 #include <sys/fcntl.h>
104 #include <sys/vnode.h>
105 
106 #include <sys/ccdvar.h>
107 
108 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
109 
110 #if defined(CCDDEBUG) && !defined(DEBUG)
111 #define DEBUG
112 #endif
113 
114 #ifdef DEBUG
115 #define CCDB_FOLLOW	0x01
116 #define CCDB_INIT	0x02
117 #define CCDB_IO		0x04
118 #define CCDB_LABEL	0x08
119 #define CCDB_VNODE	0x10
120 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
121     CCDB_VNODE;
122 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
123 #endif
124 
125 #define	ccdunit(x)	dkunit(x)
126 #define ccdpart(x)	dkpart(x)
127 
128 /*
129    This is how mirroring works (only writes are special):
130 
131    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
132    linked together by the cb_mirror field.  "cb_pflags &
133    CCDPF_MIRROR_DONE" is set to 0 on both of them.
134 
135    When a component returns to ccdiodone(), it checks if "cb_pflags &
136    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
137    flag and returns.  If it is, it means its partner has already
138    returned, so it will go to the regular cleanup.
139 
140  */
141 
/*
 * Per-component I/O descriptor.  ccdbuffer() fills one of these in for
 * every request it issues to a component on behalf of an original bio.
 *
 * cb_buf has to stay the first member: ccdiodone() is installed as the
 * bio_done handler of cb_buf and casts the struct bio pointer it is
 * handed straight back to a struct ccdbuf pointer.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
151 
152 /* bits in cb_pflags */
153 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
154 
155 #define CCDLABELDEV(dev)	\
156 	(makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
157 
/* convenient macros for often-used statements */
159 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
160 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
161 
162 static d_open_t ccdopen;
163 static d_close_t ccdclose;
164 static d_strategy_t ccdstrategy;
165 static d_ioctl_t ccdioctl;
166 static d_psize_t ccdsize;
167 
168 #define NCCDFREEHIWAT	16
169 
170 #define CDEV_MAJOR 74
171 
/*
 * Character device switch for ccd devices.  The initializer is
 * positional, so the per-entry comments name the slot each value fills.
 * ccd_clone() hands this table to make_dev() when it creates a device
 * node.
 */
static struct cdevsw ccd_cdevsw = {
	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	ccdsize,
	/* flags */	D_DISK,
};
187 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
188 
189 static struct ccd_s *ccdfind(int);
190 static struct ccd_s *ccdnew(int);
191 static int ccddestroy(struct ccd_s *, struct proc *);
192 
193 /* called during module initialization */
194 static void ccdattach(void);
195 static int ccd_modevent(module_t, int, void *);
196 
197 /* called by biodone() at interrupt time */
198 static void ccdiodone(struct bio *bp);
199 
200 static void ccdstart(struct ccd_s *, struct bio *);
201 static void ccdinterleave(struct ccd_s *, int);
202 static void ccdintr(struct ccd_s *, struct bio *);
203 static int ccdinit(struct ccd_s *, char **, struct thread *);
204 static int ccdlookup(char *, struct thread *p, struct vnode **);
205 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
206 		      struct bio *, daddr_t, caddr_t, long);
207 static void ccdgetdisklabel(dev_t);
208 static void ccdmakedisklabel(struct ccd_s *);
209 static int ccdlock(struct ccd_s *);
210 static void ccdunlock(struct ccd_s *);
211 
212 #ifdef DEBUG
213 static void printiinfo(struct ccdiinfo *);
214 #endif
215 
216 /* Non-private for the benefit of libkvm. */
217 struct ccdbuf *ccdfreebufs;
218 static int numccdfreebufs;
219 
220 /*
221  * getccdbuf() -	Allocate and zero a ccd buffer.
222  *
223  *	This routine is called at splbio().
224  */
225 
226 static __inline
227 struct ccdbuf *
228 getccdbuf(struct ccdbuf *cpy)
229 {
230 	struct ccdbuf *cbp;
231 
232 	/*
233 	 * Allocate from freelist or malloc as necessary
234 	 */
235 	if ((cbp = ccdfreebufs) != NULL) {
236 		ccdfreebufs = cbp->cb_freenext;
237 		--numccdfreebufs;
238 	} else {
239 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
240 	}
241 
242 	/*
243 	 * Used by mirroring code
244 	 */
245 	if (cpy)
246 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
247 	else
248 		bzero(cbp, sizeof(struct ccdbuf));
249 
250 	/*
251 	 * independant struct bio initialization
252 	 */
253 
254 	return(cbp);
255 }
256 
257 /*
258  * putccdbuf() -	Free a ccd buffer.
259  *
260  *	This routine is called at splbio().
261  */
262 
263 static __inline
264 void
265 putccdbuf(struct ccdbuf *cbp)
266 {
267 
268 	if (numccdfreebufs < NCCDFREEHIWAT) {
269 		cbp->cb_freenext = ccdfreebufs;
270 		ccdfreebufs = cbp;
271 		++numccdfreebufs;
272 	} else {
273 		free((caddr_t)cbp, M_DEVBUF);
274 	}
275 }
276 
277 
/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
283 #if !defined(CCD_OFFSET)
284 #define CCD_OFFSET 16
285 #endif
286 
287 static struct ccd_s *
288 ccdfind(int unit)
289 {
290 	struct ccd_s *sc = NULL;
291 
292 	/* XXX: LOCK(unique unit numbers) */
293 	LIST_FOREACH(sc, &ccd_softc_list, list) {
294 		if (sc->sc_unit == unit)
295 			break;
296 	}
297 	/* XXX: UNLOCK(unique unit numbers) */
298 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
299 }
300 
301 static struct ccd_s *
302 ccdnew(int unit)
303 {
304 	struct ccd_s *sc;
305 
306 	/* XXX: LOCK(unique unit numbers) */
307 	if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
308 		return (NULL);
309 
310 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
311 	sc->sc_unit = unit;
312 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
313 	/* XXX: UNLOCK(unique unit numbers) */
314 	return (sc);
315 }
316 
317 static int
318 ccddestroy(struct ccd_s *sc, struct proc *p)
319 {
320 
321 	/* XXX: LOCK(unique unit numbers) */
322 	LIST_REMOVE(sc, list);
323 	/* XXX: UNLOCK(unique unit numbers) */
324 	FREE(sc, M_CCD);
325 	return (0);
326 }
327 
328 static void
329 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
330 {
331 	int i, u;
332 	char *s;
333 
334 	if (*dev != NODEV)
335 		return;
336 	i = dev_stdclone(name, &s, "ccd", &u);
337 	if (i != 2)
338 		return;
339 	if (*s < 'a' || *s > 'h')
340 		return;
341 	if (s[1] != '\0')
342 		return;
343 	*dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
344 		UID_ROOT, GID_OPERATOR, 0640, name);
345 }
346 
347 /*
348  * Called by main() during pseudo-device attachment.  All we need
349  * to do is to add devsw entries.
350  */
351 static void
352 ccdattach()
353 {
354 
355 	EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
356 }
357 
358 static int
359 ccd_modevent(module_t mod, int type, void *data)
360 {
361 	int error = 0;
362 
363 	switch (type) {
364 	case MOD_LOAD:
365 		ccdattach();
366 		break;
367 
368 	case MOD_UNLOAD:
369 		printf("ccd0: Unload not supported!\n");
370 		error = EOPNOTSUPP;
371 		break;
372 
373 	case MOD_SHUTDOWN:
374 		break;
375 
376 	default:
377 		error = EOPNOTSUPP;
378 	}
379 	return (error);
380 }
381 
382 DEV_MODULE(ccd, ccd_modevent, NULL);
383 
384 static int
385 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
386 {
387 	struct ccdcinfo *ci = NULL;	/* XXX */
388 	size_t size;
389 	int ix;
390 	struct vnode *vp;
391 	size_t minsize;
392 	int maxsecsize;
393 	struct ccdgeom *ccg = &cs->sc_geom;
394 	char *tmppath = NULL;
395 	int error = 0;
396 	off_t mediasize;
397 	u_int sectorsize;
398 
399 #ifdef DEBUG
400 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
401 		printf("ccdinit: unit %d\n", cs->sc_unit);
402 #endif
403 
404 	cs->sc_size = 0;
405 
406 	/* Allocate space for the component info. */
407 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
408 	    M_DEVBUF, M_WAITOK);
409 
410 	/*
411 	 * Verify that each component piece exists and record
412 	 * relevant information about it.
413 	 */
414 	maxsecsize = 0;
415 	minsize = 0;
416 	tmppath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
417 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
418 		vp = cs->sc_vpp[ix];
419 		ci = &cs->sc_cinfo[ix];
420 		ci->ci_vp = vp;
421 
422 		/*
423 		 * Copy in the pathname of the component.
424 		 */
425 		if ((error = copyinstr(cpaths[ix], tmppath,
426 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
427 #ifdef DEBUG
428 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
429 				printf("ccd%d: can't copy path, error = %d\n",
430 				    cs->sc_unit, error);
431 #endif
432 			goto fail;
433 		}
434 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
435 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
436 
437 		ci->ci_dev = vn_todev(vp);
438 
439 		/*
440 		 * Get partition information for the component.
441 		 */
442 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
443 		    FREAD, td->td_ucred, td);
444 		if (error != 0) {
445 #ifdef DEBUG
446 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
447 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
448 				     cs->sc_unit, ci->ci_path, error);
449 #endif
450 			goto fail;
451 		}
452 		/*
453 		 * Get partition information for the component.
454 		 */
455 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
456 		    FREAD, td->td_ucred, td);
457 		if (error != 0) {
458 #ifdef DEBUG
459 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
460 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
461 				     cs->sc_unit, ci->ci_path, error);
462 #endif
463 			goto fail;
464 		}
465 		if (sectorsize > maxsecsize)
466 			maxsecsize = sectorsize;
467 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
468 
469 		/*
470 		 * Calculate the size, truncating to an interleave
471 		 * boundary if necessary.
472 		 */
473 
474 		if (cs->sc_ileave > 1)
475 			size -= size % cs->sc_ileave;
476 
477 		if (size == 0) {
478 #ifdef DEBUG
479 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
480 				printf("ccd%d: %s: size == 0\n",
481 				    cs->sc_unit, ci->ci_path);
482 #endif
483 			error = ENODEV;
484 			goto fail;
485 		}
486 
487 		if (minsize == 0 || size < minsize)
488 			minsize = size;
489 		ci->ci_size = size;
490 		cs->sc_size += size;
491 	}
492 
493 	free(tmppath, M_DEVBUF);
494 	tmppath = NULL;
495 
496 	/*
497 	 * Don't allow the interleave to be smaller than
498 	 * the biggest component sector.
499 	 */
500 	if ((cs->sc_ileave > 0) &&
501 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
502 #ifdef DEBUG
503 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
504 			printf("ccd%d: interleave must be at least %d\n",
505 			    cs->sc_unit, (maxsecsize / DEV_BSIZE));
506 #endif
507 		error = EINVAL;
508 		goto fail;
509 	}
510 
511 	/*
512 	 * If uniform interleave is desired set all sizes to that of
513 	 * the smallest component.  This will guarentee that a single
514 	 * interleave table is generated.
515 	 *
516 	 * Lost space must be taken into account when calculating the
517 	 * overall size.  Half the space is lost when CCDF_MIRROR is
518 	 * specified.  One disk is lost when CCDF_PARITY is specified.
519 	 */
520 	if (cs->sc_flags & CCDF_UNIFORM) {
521 		for (ci = cs->sc_cinfo;
522 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
523 			ci->ci_size = minsize;
524 		}
525 		if (cs->sc_flags & CCDF_MIRROR) {
526 			/*
527 			 * Check to see if an even number of components
528 			 * have been specified.  The interleave must also
529 			 * be non-zero in order for us to be able to
530 			 * guarentee the topology.
531 			 */
532 			if (cs->sc_nccdisks % 2) {
533 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
534 				error = EINVAL;
535 				goto fail;
536 			}
537 			if (cs->sc_ileave == 0) {
538 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
539 				error = EINVAL;
540 				goto fail;
541 			}
542 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
543 		} else if (cs->sc_flags & CCDF_PARITY) {
544 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
545 		} else {
546 			if (cs->sc_ileave == 0) {
547 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
548 				error = EINVAL;
549 				goto fail;
550 			}
551 			cs->sc_size = cs->sc_nccdisks * minsize;
552 		}
553 	}
554 
555 	/*
556 	 * Construct the interleave table.
557 	 */
558 	ccdinterleave(cs, cs->sc_unit);
559 
560 	/*
561 	 * Create pseudo-geometry based on 1MB cylinders.  It's
562 	 * pretty close.
563 	 */
564 	ccg->ccg_secsize = maxsecsize;
565 	ccg->ccg_ntracks = 1;
566 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
567 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
568 
569 	/*
570 	 * Add an devstat entry for this device.
571 	 */
572 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
573 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
574 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
575 			  DEVSTAT_PRIORITY_ARRAY);
576 
577 	cs->sc_flags |= CCDF_INITED;
578 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
579 	return (0);
580 fail:
581 	while (ci > cs->sc_cinfo) {
582 		ci--;
583 		free(ci->ci_path, M_DEVBUF);
584 	}
585 	if (tmppath != NULL)
586 		free(tmppath, M_DEVBUF);
587 	free(cs->sc_cinfo, M_DEVBUF);
588 	return (error);
589 }
590 
591 static void
592 ccdinterleave(struct ccd_s *cs, int unit)
593 {
594 	struct ccdcinfo *ci, *smallci;
595 	struct ccdiinfo *ii;
596 	daddr_t bn, lbn;
597 	int ix;
598 	u_long size;
599 
600 #ifdef DEBUG
601 	if (ccddebug & CCDB_INIT)
602 		printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
603 #endif
604 
605 	/*
606 	 * Allocate an interleave table.  The worst case occurs when each
607 	 * of N disks is of a different size, resulting in N interleave
608 	 * tables.
609 	 *
610 	 * Chances are this is too big, but we don't care.
611 	 */
612 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
613 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
614 	    M_WAITOK | M_ZERO);
615 
616 	/*
617 	 * Trivial case: no interleave (actually interleave of disk size).
618 	 * Each table entry represents a single component in its entirety.
619 	 *
620 	 * An interleave of 0 may not be used with a mirror or parity setup.
621 	 */
622 	if (cs->sc_ileave == 0) {
623 		bn = 0;
624 		ii = cs->sc_itable;
625 
626 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
627 			/* Allocate space for ii_index. */
628 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
629 			ii->ii_ndisk = 1;
630 			ii->ii_startblk = bn;
631 			ii->ii_startoff = 0;
632 			ii->ii_index[0] = ix;
633 			bn += cs->sc_cinfo[ix].ci_size;
634 			ii++;
635 		}
636 		ii->ii_ndisk = 0;
637 #ifdef DEBUG
638 		if (ccddebug & CCDB_INIT)
639 			printiinfo(cs->sc_itable);
640 #endif
641 		return;
642 	}
643 
644 	/*
645 	 * The following isn't fast or pretty; it doesn't have to be.
646 	 */
647 	size = 0;
648 	bn = lbn = 0;
649 	for (ii = cs->sc_itable; ; ii++) {
650 		/*
651 		 * Allocate space for ii_index.  We might allocate more then
652 		 * we use.
653 		 */
654 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
655 		    M_DEVBUF, M_WAITOK);
656 
657 		/*
658 		 * Locate the smallest of the remaining components
659 		 */
660 		smallci = NULL;
661 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
662 		    ci++) {
663 			if (ci->ci_size > size &&
664 			    (smallci == NULL ||
665 			     ci->ci_size < smallci->ci_size)) {
666 				smallci = ci;
667 			}
668 		}
669 
670 		/*
671 		 * Nobody left, all done
672 		 */
673 		if (smallci == NULL) {
674 			ii->ii_ndisk = 0;
675 			break;
676 		}
677 
678 		/*
679 		 * Record starting logical block using an sc_ileave blocksize.
680 		 */
681 		ii->ii_startblk = bn / cs->sc_ileave;
682 
683 		/*
684 		 * Record starting comopnent block using an sc_ileave
685 		 * blocksize.  This value is relative to the beginning of
686 		 * a component disk.
687 		 */
688 		ii->ii_startoff = lbn;
689 
690 		/*
691 		 * Determine how many disks take part in this interleave
692 		 * and record their indices.
693 		 */
694 		ix = 0;
695 		for (ci = cs->sc_cinfo;
696 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
697 			if (ci->ci_size >= smallci->ci_size) {
698 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
699 			}
700 		}
701 		ii->ii_ndisk = ix;
702 		bn += ix * (smallci->ci_size - size);
703 		lbn = smallci->ci_size / cs->sc_ileave;
704 		size = smallci->ci_size;
705 	}
706 #ifdef DEBUG
707 	if (ccddebug & CCDB_INIT)
708 		printiinfo(cs->sc_itable);
709 #endif
710 }
711 
712 /* ARGSUSED */
713 static int
714 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
715 {
716 	int unit = ccdunit(dev);
717 	struct ccd_s *cs;
718 	struct disklabel *lp;
719 	int error = 0, part, pmask;
720 
721 #ifdef DEBUG
722 	if (ccddebug & CCDB_FOLLOW)
723 		printf("ccdopen(%p, %x)\n", dev, flags);
724 #endif
725 
726 	cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
727 
728 	if ((error = ccdlock(cs)) != 0)
729 		return (error);
730 
731 	lp = &cs->sc_label;
732 
733 	part = ccdpart(dev);
734 	pmask = (1 << part);
735 
736 	/*
737 	 * If we're initialized, check to see if there are any other
738 	 * open partitions.  If not, then it's safe to update
739 	 * the in-core disklabel.
740 	 */
741 	if (IS_INITED(cs) && (cs->sc_openmask == 0))
742 		ccdgetdisklabel(dev);
743 
744 	/* Check that the partition exists. */
745 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
746 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
747 		error = ENXIO;
748 		goto done;
749 	}
750 
751 	cs->sc_openmask |= pmask;
752  done:
753 	ccdunlock(cs);
754 	return (0);
755 }
756 
757 /* ARGSUSED */
758 static int
759 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
760 {
761 	int unit = ccdunit(dev);
762 	struct ccd_s *cs;
763 	int error = 0, part;
764 
765 #ifdef DEBUG
766 	if (ccddebug & CCDB_FOLLOW)
767 		printf("ccdclose(%p, %x)\n", dev, flags);
768 #endif
769 
770 	if (!IS_ALLOCATED(unit))
771 		return (ENXIO);
772 	cs = ccdfind(unit);
773 
774 	if ((error = ccdlock(cs)) != 0)
775 		return (error);
776 
777 	part = ccdpart(dev);
778 
779 	/* ...that much closer to allowing unconfiguration... */
780 	cs->sc_openmask &= ~(1 << part);
781 	/* collect "garbage" if possible */
782 	if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
783 		ccddestroy(cs, td->td_proc);
784 	else
785 		ccdunlock(cs);
786 	return (0);
787 }
788 
789 static void
790 ccdstrategy(struct bio *bp)
791 {
792 	int unit = ccdunit(bp->bio_dev);
793 	struct ccd_s *cs = ccdfind(unit);
794 	int s;
795 	int wlabel;
796 	struct disklabel *lp;
797 
798 #ifdef DEBUG
799 	if (ccddebug & CCDB_FOLLOW)
800 		printf("ccdstrategy(%p): unit %d\n", bp, unit);
801 #endif
802 	if (!IS_INITED(cs)) {
803 		biofinish(bp, NULL, ENXIO);
804 		return;
805 	}
806 
807 	/* If it's a nil transfer, wake up the top half now. */
808 	if (bp->bio_bcount == 0) {
809 		biodone(bp);
810 		return;
811 	}
812 
813 	lp = &cs->sc_label;
814 
815 	/*
816 	 * Do bounds checking and adjust transfer.  If there's an
817 	 * error, the bounds check will flag that for us.
818 	 */
819 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
820 	if (ccdpart(bp->bio_dev) != RAW_PART) {
821 		if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
822 			biodone(bp);
823 			return;
824 		}
825 	} else {
826 		int pbn;        /* in sc_secsize chunks */
827 		long sz;        /* in sc_secsize chunks */
828 
829 		pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
830 		sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
831 
832 		/*
833 		 * If out of bounds return an error. If at the EOF point,
834 		 * simply read or write less.
835 		 */
836 
837 		if (pbn < 0 || pbn >= cs->sc_size) {
838 			bp->bio_resid = bp->bio_bcount;
839 			if (pbn != cs->sc_size)
840 				biofinish(bp, NULL, EINVAL);
841 			else
842 				biodone(bp);
843 			return;
844 		}
845 
846 		/*
847 		 * If the request crosses EOF, truncate the request.
848 		 */
849 		if (pbn + sz > cs->sc_size) {
850 			bp->bio_bcount = (cs->sc_size - pbn) *
851 			    cs->sc_geom.ccg_secsize;
852 		}
853 	}
854 
855 	bp->bio_resid = bp->bio_bcount;
856 
857 	/*
858 	 * "Start" the unit.
859 	 */
860 	s = splbio();
861 	ccdstart(cs, bp);
862 	splx(s);
863 	return;
864 }
865 
866 static void
867 ccdstart(struct ccd_s *cs, struct bio *bp)
868 {
869 	long bcount, rcount;
870 	struct ccdbuf *cbp[4];
871 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
872 	caddr_t addr;
873 	daddr_t bn;
874 	struct partition *pp;
875 
876 #ifdef DEBUG
877 	if (ccddebug & CCDB_FOLLOW)
878 		printf("ccdstart(%p, %p)\n", cs, bp);
879 #endif
880 
881 	/* Record the transaction start  */
882 	devstat_start_transaction(&cs->device_stats);
883 
884 	/*
885 	 * Translate the partition-relative block number to an absolute.
886 	 */
887 	bn = bp->bio_blkno;
888 	if (ccdpart(bp->bio_dev) != RAW_PART) {
889 		pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
890 		bn += pp->p_offset;
891 	}
892 
893 	/*
894 	 * Allocate component buffers and fire off the requests
895 	 */
896 	addr = bp->bio_data;
897 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
898 		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
899 		rcount = cbp[0]->cb_buf.bio_bcount;
900 
901 		if (cs->sc_cflags & CCDF_MIRROR) {
902 			/*
903 			 * Mirroring.  Writes go to both disks, reads are
904 			 * taken from whichever disk seems most appropriate.
905 			 *
906 			 * We attempt to localize reads to the disk whos arm
907 			 * is nearest the read request.  We ignore seeks due
908 			 * to writes when making this determination and we
909 			 * also try to avoid hogging.
910 			 */
911 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
912 				BIO_STRATEGY(&cbp[0]->cb_buf, 0);
913 				BIO_STRATEGY(&cbp[1]->cb_buf, 0);
914 			} else {
915 				int pick = cs->sc_pick;
916 				daddr_t range = cs->sc_size / 16;
917 
918 				if (bn < cs->sc_blk[pick] - range ||
919 				    bn > cs->sc_blk[pick] + range
920 				) {
921 					cs->sc_pick = pick = 1 - pick;
922 				}
923 				cs->sc_blk[pick] = bn + btodb(rcount);
924 				BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
925 			}
926 		} else {
927 			/*
928 			 * Not mirroring
929 			 */
930 			BIO_STRATEGY(&cbp[0]->cb_buf, 0);
931 		}
932 		bn += btodb(rcount);
933 		addr += rcount;
934 	}
935 }
936 
/*
 * Build a component buffer header.
 *
 * Maps block 'bn' of the ccd onto a component and a block within that
 * component, then fills in a struct ccdbuf describing the first piece
 * of the transfer.
 *
 *	cb	- output array; cb[0] is always set, cb[1] is set as well
 *		  when the unit is configured with CCDF_MIRROR
 *	cs	- softc of the unit
 *	bp	- original request, supplies bio_cmd and is recorded in
 *		  cb_obp for ccdiodone()
 *	bn	- block number within the ccd
 *	addr	- data address for this piece
 *	bcount	- number of bytes still to be transferred
 *
 * The piece placed in cb[0] (and cb[1]) is never longer than 'bcount'
 * and never extends past the end of the component (no interleave) or
 * past the end of the current interleave chunk.  The caller finds the
 * resulting length in cb[0]->cb_buf.bio_bcount.
 */
static void
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	/* ci2 is only assigned in the mirror branch below. */
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%p, %p, %lld, %p, %ld)\n",
		    (void *)cs, (void *)bp, (long long)bn, (void *)addr,
		    bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		/*
		 * Walk the components in order, accumulating their sizes
		 * in sblk, until the one containing cbn is reached.
		 */
		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.  The loop
		 * stops on the first entry that starts beyond cbn (or on
		 * the terminator), so the wanted entry is the one before.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to be properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.  CCD_OFFSET skips the
	 * blocks reserved at the front of every component.
	 */
	cbp = getccdbuf(NULL);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	/*
	 * cbc is the number of bytes left in the component (no
	 * interleave) or in the current interleave chunk; the transfer
	 * is clipped to it.
	 */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* The byte count is also stashed in bio_caller1. */
 	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_unit = cs->sc_unit;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %p(u%ld): cbp %p bn %lld addr %p bcnt %ld\n",
		       ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
		       (long long)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
		       cbp->cb_buf.bio_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/*
		 * mirror, setup second I/O: a copy of the first buffer
		 * retargeted at the partner component ci2.
		 */
		cbp = getccdbuf(cb[0]);
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1108 
1109 static void
1110 ccdintr(struct ccd_s *cs, struct bio *bp)
1111 {
1112 #ifdef DEBUG
1113 	if (ccddebug & CCDB_FOLLOW)
1114 		printf("ccdintr(%p, %p)\n", cs, bp);
1115 #endif
1116 	/*
1117 	 * Request is done for better or worse, wakeup the top half.
1118 	 */
1119 	if (bp->bio_flags & BIO_ERROR)
1120 		bp->bio_resid = bp->bio_bcount;
1121 	biofinish(bp, &cs->device_stats, 0);
1122 }
1123 
1124 /*
1125  * Called at interrupt time.
1126  * Mark the component as done and if all components are done,
1127  * take a ccd interrupt.
1128  */
1129 static void
1130 ccdiodone(struct bio *ibp)
1131 {
1132 	struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1133 	struct bio *bp = cbp->cb_obp;
1134 	int unit = cbp->cb_unit;
1135 	int count, s;
1136 
1137 	s = splbio();
1138 #ifdef DEBUG
1139 	if (ccddebug & CCDB_FOLLOW)
1140 		printf("ccdiodone(%p)\n", cbp);
1141 	if (ccddebug & CCDB_IO) {
1142 		printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1143 		       bp, bp->bio_bcount, bp->bio_resid);
1144 		printf(" dev %p(u%d), cbp %p bn %lld addr %p bcnt %ld\n",
1145 		       cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1146 		       (long long)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1147 		       cbp->cb_buf.bio_bcount);
1148 	}
1149 #endif
1150 	/*
1151 	 * If an error occured, report it.  If this is a mirrored
1152 	 * configuration and the first of two possible reads, do not
1153 	 * set the error in the bp yet because the second read may
1154 	 * succeed.
1155 	 */
1156 
1157 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1158 		const char *msg = "";
1159 
1160 		if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1161 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
1162 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1163 			/*
1164 			 * We will try our read on the other disk down
1165 			 * below, also reverse the default pick so if we
1166 			 * are doing a scan we do not keep hitting the
1167 			 * bad disk first.
1168 			 */
1169 			struct ccd_s *cs = ccdfind(unit);
1170 
1171 			msg = ", trying other disk";
1172 			cs->sc_pick = 1 - cs->sc_pick;
1173 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1174 		} else {
1175 			bp->bio_flags |= BIO_ERROR;
1176 			bp->bio_error = cbp->cb_buf.bio_error ?
1177 			    cbp->cb_buf.bio_error : EIO;
1178 		}
1179 		printf("ccd%d: error %d on component %d block %d (ccd block %lld)%s\n",
1180 		       unit, bp->bio_error, cbp->cb_comp,
1181 		       (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1182 	}
1183 
1184 	/*
1185 	 * Process mirror.  If we are writing, I/O has been initiated on both
1186 	 * buffers and we fall through only after both are finished.
1187 	 *
1188 	 * If we are reading only one I/O is initiated at a time.  If an
1189 	 * error occurs we initiate the second I/O and return, otherwise
1190 	 * we free the second I/O without initiating it.
1191 	 */
1192 
1193 	if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1194 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1195 			/*
1196 			 * When writing, handshake with the second buffer
1197 			 * to determine when both are done.  If both are not
1198 			 * done, return here.
1199 			 */
1200 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1201 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1202 				putccdbuf(cbp);
1203 				splx(s);
1204 				return;
1205 			}
1206 		} else {
1207 			/*
1208 			 * When reading, either dispose of the second buffer
1209 			 * or initiate I/O on the second buffer if an error
1210 			 * occured with this one.
1211 			 */
1212 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1213 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1214 					cbp->cb_mirror->cb_pflags |=
1215 					    CCDPF_MIRROR_DONE;
1216 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1217 					putccdbuf(cbp);
1218 					splx(s);
1219 					return;
1220 				} else {
1221 					putccdbuf(cbp->cb_mirror);
1222 					/* fall through */
1223 				}
1224 			}
1225 		}
1226 	}
1227 
1228 	/*
1229 	 * use bio_caller1 to determine how big the original request was rather
1230 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1231 	 *
1232 	 * XXX We check for an error, but we do not test the resid for an
1233 	 * aligned EOF condition.  This may result in character & block
1234 	 * device access not recognizing EOF properly when read or written
1235 	 * sequentially, but will not effect filesystems.
1236 	 */
1237 	count = (long)cbp->cb_buf.bio_caller1;
1238 	putccdbuf(cbp);
1239 
1240 	/*
1241 	 * If all done, "interrupt".
1242 	 */
1243 	bp->bio_resid -= count;
1244 	if (bp->bio_resid < 0)
1245 		panic("ccdiodone: count");
1246 	if (bp->bio_resid == 0)
1247 		ccdintr(ccdfind(unit), bp);
1248 	splx(s);
1249 }
1250 
1251 static int
1252 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1253 {
1254 	int unit = ccdunit(dev);
1255 	int i, j, lookedup = 0, error = 0;
1256 	int part, pmask, s;
1257 	struct ccd_s *cs;
1258 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1259 	char **cpp;
1260 	struct vnode **vpp;
1261 
1262 	if (!IS_ALLOCATED(unit))
1263 		return (ENXIO);
1264 	cs = ccdfind(unit);
1265 
1266 	switch (cmd) {
1267 	case CCDIOCSET:
1268 		if (IS_INITED(cs))
1269 			return (EBUSY);
1270 
1271 		if ((flag & FWRITE) == 0)
1272 			return (EBADF);
1273 
1274 		if ((error = ccdlock(cs)) != 0)
1275 			return (error);
1276 
1277 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1278 			return (EINVAL);
1279 
1280 		/* Fill in some important bits. */
1281 		cs->sc_ileave = ccio->ccio_ileave;
1282 		if (cs->sc_ileave == 0 &&
1283 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1284 		     (ccio->ccio_flags & CCDF_PARITY))) {
1285 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1286 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1287 		}
1288 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1289 		    (ccio->ccio_flags & CCDF_PARITY)) {
1290 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1291 			ccio->ccio_flags &= ~CCDF_PARITY;
1292 		}
1293 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1294 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1295 			printf("ccd%d: mirror/parity forces uniform flag\n",
1296 			       unit);
1297 			ccio->ccio_flags |= CCDF_UNIFORM;
1298 		}
1299 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1300 
1301 		/*
1302 		 * Allocate space for and copy in the array of
1303 		 * componet pathnames and device numbers.
1304 		 */
1305 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1306 		    M_DEVBUF, M_WAITOK);
1307 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1308 		    M_DEVBUF, M_WAITOK);
1309 
1310 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1311 		    ccio->ccio_ndisks * sizeof(char **));
1312 		if (error) {
1313 			free(vpp, M_DEVBUF);
1314 			free(cpp, M_DEVBUF);
1315 			ccdunlock(cs);
1316 			return (error);
1317 		}
1318 
1319 #ifdef DEBUG
1320 		if (ccddebug & CCDB_INIT)
1321 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1322 				printf("ccdioctl: component %d: %p\n",
1323 				    i, cpp[i]);
1324 #endif
1325 
1326 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1327 #ifdef DEBUG
1328 			if (ccddebug & CCDB_INIT)
1329 				printf("ccdioctl: lookedup = %d\n", lookedup);
1330 #endif
1331 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1332 				for (j = 0; j < lookedup; ++j)
1333 					(void)vn_close(vpp[j], FREAD|FWRITE,
1334 					    td->td_ucred, td);
1335 				free(vpp, M_DEVBUF);
1336 				free(cpp, M_DEVBUF);
1337 				ccdunlock(cs);
1338 				return (error);
1339 			}
1340 			++lookedup;
1341 		}
1342 		cs->sc_vpp = vpp;
1343 		cs->sc_nccdisks = ccio->ccio_ndisks;
1344 
1345 		/*
1346 		 * Initialize the ccd.  Fills in the softc for us.
1347 		 */
1348 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1349 			for (j = 0; j < lookedup; ++j)
1350 				(void)vn_close(vpp[j], FREAD|FWRITE,
1351 				    td->td_ucred, td);
1352 			/*
1353 			 * We can't ccddestroy() cs just yet, because nothing
1354 			 * prevents user-level app to do another ioctl()
1355 			 * without closing the device first, therefore
1356 			 * declare unit null and void and let ccdclose()
1357 			 * destroy it when it is safe to do so.
1358 			 */
1359 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1360 			free(vpp, M_DEVBUF);
1361 			free(cpp, M_DEVBUF);
1362 			ccdunlock(cs);
1363 			return (error);
1364 		}
1365 
1366 		/*
1367 		 * The ccd has been successfully initialized, so
1368 		 * we can place it into the array and read the disklabel.
1369 		 */
1370 		ccio->ccio_unit = unit;
1371 		ccio->ccio_size = cs->sc_size;
1372 		ccdgetdisklabel(dev);
1373 
1374 		ccdunlock(cs);
1375 
1376 		break;
1377 
1378 	case CCDIOCCLR:
1379 		if (!IS_INITED(cs))
1380 			return (ENXIO);
1381 
1382 		if ((flag & FWRITE) == 0)
1383 			return (EBADF);
1384 
1385 		if ((error = ccdlock(cs)) != 0)
1386 			return (error);
1387 
1388 		/* Don't unconfigure if any other partitions are open */
1389 		part = ccdpart(dev);
1390 		pmask = (1 << part);
1391 		if ((cs->sc_openmask & ~pmask)) {
1392 			ccdunlock(cs);
1393 			return (EBUSY);
1394 		}
1395 
1396 		/* Declare unit null and void (reset all flags) */
1397 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1398 
1399 		/* Close the components and free their pathnames. */
1400 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1401 			/*
1402 			 * XXX: this close could potentially fail and
1403 			 * cause Bad Things.  Maybe we need to force
1404 			 * the close to happen?
1405 			 */
1406 #ifdef DEBUG
1407 			if (ccddebug & CCDB_VNODE)
1408 				vprint("CCDIOCCLR: vnode info",
1409 				    cs->sc_cinfo[i].ci_vp);
1410 #endif
1411 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1412 			    td->td_ucred, td);
1413 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1414 		}
1415 
1416 		/* Free interleave index. */
1417 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1418 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1419 
1420 		/* Free component info and interleave table. */
1421 		free(cs->sc_cinfo, M_DEVBUF);
1422 		free(cs->sc_itable, M_DEVBUF);
1423 		free(cs->sc_vpp, M_DEVBUF);
1424 
1425 		/* And remove the devstat entry. */
1426 		devstat_remove_entry(&cs->device_stats);
1427 
1428 		/* This must be atomic. */
1429 		s = splhigh();
1430 		ccdunlock(cs);
1431 		splx(s);
1432 
1433 		break;
1434 
1435 	case CCDCONFINFO:
1436 		{
1437 			int ninit = 0;
1438 			struct ccdconf *conf = (struct ccdconf *)data;
1439 			struct ccd_s *tmpcs;
1440 			struct ccd_s *ubuf = conf->buffer;
1441 
1442 			/* XXX: LOCK(unique unit numbers) */
1443 			LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1444 				if (IS_INITED(tmpcs))
1445 					ninit++;
1446 
1447 			if (conf->size == 0) {
1448 				conf->size = sizeof(struct ccd_s) * ninit;
1449 				break;
1450 			} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1451 			    (conf->size % sizeof(struct ccd_s) != 0)) {
1452 				/* XXX: UNLOCK(unique unit numbers) */
1453 				return (EINVAL);
1454 			}
1455 
1456 			ubuf += ninit;
1457 			LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1458 				if (!IS_INITED(tmpcs))
1459 					continue;
1460 				error = copyout(tmpcs, --ubuf,
1461 				    sizeof(struct ccd_s));
1462 				if (error != 0)
1463 					/* XXX: UNLOCK(unique unit numbers) */
1464 					return (error);
1465 			}
1466 			/* XXX: UNLOCK(unique unit numbers) */
1467 		}
1468 		break;
1469 
1470 	case CCDCPPINFO:
1471 		if (!IS_INITED(cs))
1472 			return (ENXIO);
1473 
1474 		{
1475 			int len = 0;
1476 			struct ccdcpps *cpps = (struct ccdcpps *)data;
1477 			char *ubuf = cpps->buffer;
1478 
1479 
1480 			for (i = 0; i < cs->sc_nccdisks; ++i)
1481 				len += cs->sc_cinfo[i].ci_pathlen;
1482 
1483 			if (cpps->size == 0) {
1484 				cpps->size = len;
1485 				break;
1486 			} else if (cpps->size != len) {
1487 				return (EINVAL);
1488 			}
1489 
1490 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1491 				len = cs->sc_cinfo[i].ci_pathlen;
1492 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1493 				    len);
1494 				if (error != 0)
1495 					return (error);
1496 				ubuf += len;
1497 			}
1498 		}
1499 		break;
1500 
1501 	case DIOCGDINFO:
1502 		if (!IS_INITED(cs))
1503 			return (ENXIO);
1504 
1505 		*(struct disklabel *)data = cs->sc_label;
1506 		break;
1507 
1508 	case DIOCWDINFO:
1509 	case DIOCSDINFO:
1510 		if (!IS_INITED(cs))
1511 			return (ENXIO);
1512 
1513 		if ((flag & FWRITE) == 0)
1514 			return (EBADF);
1515 
1516 		if ((error = ccdlock(cs)) != 0)
1517 			return (error);
1518 
1519 		cs->sc_flags |= CCDF_LABELLING;
1520 
1521 		error = setdisklabel(&cs->sc_label,
1522 		    (struct disklabel *)data, 0);
1523 		if (error == 0) {
1524 			if (cmd == DIOCWDINFO)
1525 				error = writedisklabel(CCDLABELDEV(dev),
1526 				    &cs->sc_label);
1527 		}
1528 
1529 		cs->sc_flags &= ~CCDF_LABELLING;
1530 
1531 		ccdunlock(cs);
1532 
1533 		if (error)
1534 			return (error);
1535 		break;
1536 
1537 	case DIOCWLABEL:
1538 		if (!IS_INITED(cs))
1539 			return (ENXIO);
1540 
1541 		if ((flag & FWRITE) == 0)
1542 			return (EBADF);
1543 		if (*(int *)data != 0)
1544 			cs->sc_flags |= CCDF_WLABEL;
1545 		else
1546 			cs->sc_flags &= ~CCDF_WLABEL;
1547 		break;
1548 
1549 	default:
1550 		return (ENOTTY);
1551 	}
1552 
1553 	return (0);
1554 }
1555 
1556 static int
1557 ccdsize(dev_t dev)
1558 {
1559 	struct ccd_s *cs;
1560 	int part, size;
1561 
1562 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1563 		return (-1);
1564 
1565 	cs = ccdfind(ccdunit(dev));
1566 	part = ccdpart(dev);
1567 
1568 	if (!IS_INITED(cs))
1569 		return (-1);
1570 
1571 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1572 		size = -1;
1573 	else
1574 		size = cs->sc_label.d_partitions[part].p_size;
1575 
1576 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1577 		return (-1);
1578 
1579 	return (size);
1580 }
1581 
1582 /*
1583  * Lookup the provided name in the filesystem.  If the file exists,
1584  * is a valid block device, and isn't being used by anyone else,
1585  * set *vpp to the file's vnode.
1586  */
1587 static int
1588 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1589 {
1590 	struct nameidata nd;
1591 	struct vnode *vp;
1592 	int error, flags;
1593 
1594 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1595 	flags = FREAD | FWRITE;
1596 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1597 #ifdef DEBUG
1598 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1599 			printf("ccdlookup: vn_open error = %d\n", error);
1600 #endif
1601 		return (error);
1602 	}
1603 	vp = nd.ni_vp;
1604 
1605 	if (vp->v_usecount > 1) {
1606 		error = EBUSY;
1607 		goto bad;
1608 	}
1609 
1610 	if (!vn_isdisk(vp, &error))
1611 		goto bad;
1612 
1613 #ifdef DEBUG
1614 	if (ccddebug & CCDB_VNODE)
1615 		vprint("ccdlookup: vnode info", vp);
1616 #endif
1617 
1618 	VOP_UNLOCK(vp, 0, td);
1619 	NDFREE(&nd, NDF_ONLY_PNBUF);
1620 	*vpp = vp;
1621 	return (0);
1622 bad:
1623 	VOP_UNLOCK(vp, 0, td);
1624 	NDFREE(&nd, NDF_ONLY_PNBUF);
1625 	/* vn_close does vrele() for vp */
1626 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1627 	return (error);
1628 }
1629 
1630 /*
1631  * Read the disklabel from the ccd.  If one is not present, fake one
1632  * up.
1633  */
1634 static void
1635 ccdgetdisklabel(dev_t dev)
1636 {
1637 	int unit = ccdunit(dev);
1638 	struct ccd_s *cs = ccdfind(unit);
1639 	char *errstring;
1640 	struct disklabel *lp = &cs->sc_label;
1641 	struct ccdgeom *ccg = &cs->sc_geom;
1642 
1643 	bzero(lp, sizeof(*lp));
1644 
1645 	lp->d_secperunit = cs->sc_size;
1646 	lp->d_secsize = ccg->ccg_secsize;
1647 	lp->d_nsectors = ccg->ccg_nsectors;
1648 	lp->d_ntracks = ccg->ccg_ntracks;
1649 	lp->d_ncylinders = ccg->ccg_ncylinders;
1650 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1651 
1652 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1653 	lp->d_type = DTYPE_CCD;
1654 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1655 	lp->d_rpm = 3600;
1656 	lp->d_interleave = 1;
1657 	lp->d_flags = 0;
1658 
1659 	lp->d_partitions[RAW_PART].p_offset = 0;
1660 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1661 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1662 	lp->d_npartitions = RAW_PART + 1;
1663 
1664 	lp->d_bbsize = BBSIZE;				/* XXX */
1665 	lp->d_sbsize = 0;
1666 
1667 	lp->d_magic = DISKMAGIC;
1668 	lp->d_magic2 = DISKMAGIC;
1669 	lp->d_checksum = dkcksum(&cs->sc_label);
1670 
1671 	/*
1672 	 * Call the generic disklabel extraction routine.
1673 	 */
1674 	errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1675 	if (errstring != NULL)
1676 		ccdmakedisklabel(cs);
1677 
1678 #ifdef DEBUG
1679 	/* It's actually extremely common to have unlabeled ccds. */
1680 	if (ccddebug & CCDB_LABEL)
1681 		if (errstring != NULL)
1682 			printf("ccd%d: %s\n", unit, errstring);
1683 #endif
1684 }
1685 
1686 /*
1687  * Take care of things one might want to take care of in the event
1688  * that a disklabel isn't present.
1689  */
1690 static void
1691 ccdmakedisklabel(struct ccd_s *cs)
1692 {
1693 	struct disklabel *lp = &cs->sc_label;
1694 
1695 	/*
1696 	 * For historical reasons, if there's no disklabel present
1697 	 * the raw partition must be marked FS_BSDFFS.
1698 	 */
1699 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1700 
1701 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1702 }
1703 
1704 /*
1705  * Wait interruptibly for an exclusive lock.
1706  *
1707  * XXX
1708  * Several drivers do this; it should be abstracted and made MP-safe.
1709  */
1710 static int
1711 ccdlock(struct ccd_s *cs)
1712 {
1713 	int error;
1714 
1715 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1716 		cs->sc_flags |= CCDF_WANTED;
1717 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1718 			return (error);
1719 	}
1720 	cs->sc_flags |= CCDF_LOCKED;
1721 	return (0);
1722 }
1723 
1724 /*
1725  * Unlock and wake up any waiters.
1726  */
1727 static void
1728 ccdunlock(struct ccd_s *cs)
1729 {
1730 
1731 	cs->sc_flags &= ~CCDF_LOCKED;
1732 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1733 		cs->sc_flags &= ~CCDF_WANTED;
1734 		wakeup(cs);
1735 	}
1736 }
1737 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table.  The table
 * ends at the first entry whose ii_ndisk is 0.  Each line shows the
 * entry's position, disk count, start block, start offset and its
 * ii_index values.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	struct ccdiinfo *entry;
	int slot;

	for (entry = ii; entry->ii_ndisk != 0; entry++) {
		printf(" itab[%d]: #dk %d sblk %lld soff %lld",
		    (int)(entry - ii), entry->ii_ndisk,
		    (long long)entry->ii_startblk,
		    (long long)entry->ii_startoff);
		slot = 0;
		while (slot < entry->ii_ndisk) {
			printf(" %d", entry->ii_index[slot]);
			slot++;
		}
		printf("\n");
	}
}
#endif
1754