xref: /freebsd/sys/geom/geom_ccd.c (revision f4c5766baa461767ccb595252b1614f1ecc6f1a7)
1 /*
2  * Copyright (c) 2003 Poul-Henning Kamp.
3  * Copyright (c) 1995 Jason R. Thorpe.
4  * Copyright (c) 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * All rights reserved.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed for the NetBSD Project
24  *	by Jason R. Thorpe.
25  * 4. The names of the authors may not be used to endorse or promote products
26  *    derived from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * Dynamic configuration and disklabel support by:
41  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42  *	Numerical Aerodynamic Simulation Facility
43  *	Mail Stop 258-6
44  *	NASA Ames Research Center
45  *	Moffett Field, CA 94035
46  *
47  * from: Utah $Hdr: cd.c 1.6 90/11/28$
48  *
49  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50  *
51  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52  *
53  * $FreeBSD$
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/module.h>
60 #include <sys/proc.h>
61 #include <sys/bio.h>
62 #include <sys/malloc.h>
63 #include <sys/namei.h>
64 #include <sys/conf.h>
65 #include <sys/stat.h>
66 #include <sys/sysctl.h>
67 #include <sys/disk.h>
68 #include <sys/fcntl.h>
69 #include <sys/vnode.h>
70 #include <geom/geom_disk.h>
71 
72 #include <sys/ccdvar.h>
73 
74 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
75 
76 /*
77    This is how mirroring works (only writes are special):
78 
79    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
80    linked together by the cb_mirror field.  "cb_pflags &
81    CCDPF_MIRROR_DONE" is set to 0 on both of them.
82 
83    When a component returns to ccdiodone(), it checks if "cb_pflags &
84    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
85    flag and returns.  If it is, it means its partner has already
86    returned, so it will go to the regular cleanup.
87 
88  */
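/*
 * Reads, by contrast, are dispatched to only one component of the pair
 * (see ccdstart() below); if that read fails, ccdiodone() reissues it
 * on the mirror partner before reporting an error to the caller.
 */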
89 
90 struct ccdbuf {
91 	struct bio	cb_buf;		/* new I/O buf */
92 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
93 	struct ccdbuf	*cb_freenext;	/* free list link */
94 	struct ccd_s	*cb_softc;
95 	int		cb_comp;	/* target component */
96 	int		cb_pflags;	/* mirror/parity status flag */
97 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
98 };
99 
100 /* bits in cb_pflags */
101 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
102 
103 /* convenient macros for often-used statements */
104 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
105 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
106 
107 static dev_t	ccdctldev;
108 
109 static disk_strategy_t ccdstrategy;
110 static d_ioctl_t ccdctlioctl;
111 
112 #define NCCDFREEHIWAT	16
113 
114 #define CDEV_MAJOR 74
115 
116 static struct cdevsw ccdctl_cdevsw = {
117 	.d_open =	nullopen,
118 	.d_close =	nullclose,
119 	.d_ioctl =	ccdctlioctl,
120 	.d_name =	"ccdctl",
121 	.d_maj =	CDEV_MAJOR,
122 };
123 
124 static LIST_HEAD(, ccd_s) ccd_softc_list =
125 	LIST_HEAD_INITIALIZER(&ccd_softc_list);
126 
127 static struct ccd_s *ccdfind(int);
128 static struct ccd_s *ccdnew(int);
129 static int ccddestroy(struct ccd_s *);
130 
131 /* called during module initialization */
132 static void ccdattach(void);
133 static int ccd_modevent(module_t, int, void *);
134 
135 /* called by biodone() at interrupt time */
136 static void ccdiodone(struct bio *bp);
137 
138 static void ccdstart(struct ccd_s *, struct bio *);
139 static void ccdinterleave(struct ccd_s *, int);
140 static int ccdinit(struct ccd_s *, char **, struct thread *);
141 static int ccdlookup(char *, struct thread *p, struct vnode **);
142 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
143 		      struct bio *, daddr_t, caddr_t, long);
144 static int ccdlock(struct ccd_s *);
145 static void ccdunlock(struct ccd_s *);
146 
147 
148 /*
149  * Number of blocks to leave untouched in front of a component partition.
150  * This is to avoid violating its disklabel area when it starts at the
151  * beginning of the slice.
152  */
153 #if !defined(CCD_OFFSET)
154 #define CCD_OFFSET 16
155 #endif
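/*
 * For illustration: with the default of 16 and DEV_BSIZE (512 bytes)
 * this leaves 8 kB in front of each component, covering the
 * traditional disklabel/boot area mentioned above.
 */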
156 
157 static struct ccd_s *
158 ccdfind(int unit)
159 {
160 	struct ccd_s *sc = NULL;
161 
162 	/* XXX: LOCK(unique unit numbers) */
163 	LIST_FOREACH(sc, &ccd_softc_list, list) {
164 		if (sc->sc_unit == unit)
165 			break;
166 	}
167 	/* XXX: UNLOCK(unique unit numbers) */
168 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
169 }
170 
171 static struct ccd_s *
172 ccdnew(int unit)
173 {
174 	struct ccd_s *sc;
175 
176 	/* XXX: LOCK(unique unit numbers) */
177 	if (IS_ALLOCATED(unit) || unit > 32)
178 		return (NULL);
179 
180 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
181 	sc->sc_unit = unit;
182 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
183 	/* XXX: UNLOCK(unique unit numbers) */
184 	return (sc);
185 }
186 
187 static int
188 ccddestroy(struct ccd_s *sc)
189 {
190 
191 	/* XXX: LOCK(unique unit numbers) */
192 	LIST_REMOVE(sc, list);
193 	/* XXX: UNLOCK(unique unit numbers) */
194 	FREE(sc, M_CCD);
195 	return (0);
196 }
197 
198 /*
199  * Called from ccd_modevent() at module load time.  All we need
200  * to do is create the ccd control device.
201  */
202 static void
203 ccdattach()
204 {
205 
206 	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
207 		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
208 	ccdctldev->si_drv1 = ccdctldev;
209 }
210 
211 static int
212 ccd_modevent(module_t mod, int type, void *data)
213 {
214 	int error = 0;
215 
216 	switch (type) {
217 	case MOD_LOAD:
218 		ccdattach();
219 		break;
220 
221 	case MOD_UNLOAD:
222 		printf("ccd0: Unload not supported!\n");
223 		error = EOPNOTSUPP;
224 		break;
225 
226 	case MOD_SHUTDOWN:
227 		break;
228 
229 	default:
230 		error = EOPNOTSUPP;
231 	}
232 	return (error);
233 }
234 
235 DEV_MODULE(ccd, ccd_modevent, NULL);
236 
237 static int
238 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
239 {
240 	struct ccdcinfo *ci = NULL;	/* XXX */
241 	size_t size;
242 	int ix;
243 	struct vnode *vp;
244 	size_t minsize;
245 	int maxsecsize;
246 	struct ccdgeom *ccg = &cs->sc_geom;
247 	char *tmppath = NULL;
248 	int error = 0;
249 	off_t mediasize;
250 	u_int sectorsize;
251 
252 
253 	cs->sc_size = 0;
254 
255 	/* Allocate space for the component info. */
256 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
257 	    M_CCD, M_WAITOK);
258 
259 	/*
260 	 * Verify that each component piece exists and record
261 	 * relevant information about it.
262 	 */
263 	maxsecsize = 0;
264 	minsize = 0;
265 	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
266 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
267 		vp = cs->sc_vpp[ix];
268 		ci = &cs->sc_cinfo[ix];
269 		ci->ci_vp = vp;
270 
271 		/*
272 		 * Copy in the pathname of the component.
273 		 */
274 		if ((error = copyinstr(cpaths[ix], tmppath,
275 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
276 			goto fail;
277 		}
278 		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
279 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
280 
281 		ci->ci_dev = vn_todev(vp);
282 
283 		/*
284 		 * Get partition information for the component.
285 		 */
286 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
287 		    FREAD, td->td_ucred, td);
288 		if (error != 0) {
289 			goto fail;
290 		}
291 		/*
292 		 * Get the sector size of the component.
293 		 */
294 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
295 		    FREAD, td->td_ucred, td);
296 		if (error != 0) {
297 			goto fail;
298 		}
299 		if (sectorsize > maxsecsize)
300 			maxsecsize = sectorsize;
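		/*
		 * Component sizes are kept in DEV_BSIZE (512 byte) blocks;
		 * e.g. a 1 GB (2^30 byte) component yields 2097152 blocks
		 * before the CCD_OFFSET blocks are subtracted.
		 */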
301 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
302 
303 		/*
304 		 * Calculate the size, truncating to an interleave
305 		 * boundary if necessary.
306 		 */
307 
308 		if (cs->sc_ileave > 1)
309 			size -= size % cs->sc_ileave;
310 
311 		if (size == 0) {
312 			error = ENODEV;
313 			goto fail;
314 		}
315 
316 		if (minsize == 0 || size < minsize)
317 			minsize = size;
318 		ci->ci_size = size;
319 		cs->sc_size += size;
320 	}
321 
322 	free(tmppath, M_CCD);
323 	tmppath = NULL;
324 
325 	/*
326 	 * Don't allow the interleave to be smaller than
327 	 * the biggest component sector.
328 	 */
329 	if ((cs->sc_ileave > 0) &&
330 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
331 		error = EINVAL;
332 		goto fail;
333 	}
334 
335 	/*
336 	 * If uniform interleave is desired, set all sizes to that of
337 	 * the smallest component.  This will guarantee that a single
338 	 * interleave table is generated.
339 	 *
340 	 * Lost space must be taken into account when calculating the
341 	 * overall size.  Half the space is lost when CCDF_MIRROR is
342 	 * specified.
343 	 */
344 	if (cs->sc_flags & CCDF_UNIFORM) {
345 		for (ci = cs->sc_cinfo;
346 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
347 			ci->ci_size = minsize;
348 		}
349 		if (cs->sc_flags & CCDF_MIRROR) {
350 			/*
351 			 * Check to see if an even number of components
352 			 * have been specified.  The interleave must also
353 			 * be non-zero in order for us to be able to
354 			 * guarantee the topology.
355 			 */
356 			if (cs->sc_nccdisks % 2) {
357 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
358 				error = EINVAL;
359 				goto fail;
360 			}
361 			if (cs->sc_ileave == 0) {
362 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
363 				error = EINVAL;
364 				goto fail;
365 			}
366 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
367 		} else {
368 			if (cs->sc_ileave == 0) {
369 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
370 				error = EINVAL;
371 				goto fail;
372 			}
373 			cs->sc_size = cs->sc_nccdisks * minsize;
374 		}
375 	}
376 
377 	/*
378 	 * Construct the interleave table.
379 	 */
380 	ccdinterleave(cs, cs->sc_unit);
381 
382 	/*
383 	 * Create pseudo-geometry based on 1MB cylinders.  It's
384 	 * pretty close.
385 	 */
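	/*
	 * E.g. with 512 byte sectors: 1024 * 1024 / 512 = 2048 sectors per
	 * track and one track per cylinder, so each cylinder is exactly 1 MB.
	 */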
386 	ccg->ccg_secsize = maxsecsize;
387 	ccg->ccg_ntracks = 1;
388 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
389 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
390 
391 	cs->sc_flags |= CCDF_INITED;
392 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
393 	return (0);
394 fail:
395 	while (ci > cs->sc_cinfo) {
396 		ci--;
397 		free(ci->ci_path, M_CCD);
398 	}
399 	if (tmppath != NULL)
400 		free(tmppath, M_CCD);
401 	free(cs->sc_cinfo, M_CCD);
402 	/* Our caller still references and unlocks cs; do not destroy it here. */
403 	return (error);
404 }
405 
406 static void
407 ccdinterleave(struct ccd_s *cs, int unit)
408 {
409 	struct ccdcinfo *ci, *smallci;
410 	struct ccdiinfo *ii;
411 	daddr_t bn, lbn;
412 	int ix;
413 	u_long size;
414 
415 
416 	/*
417 	 * Allocate an interleave table.  The worst case occurs when each
418 	 * of N disks is of a different size, resulting in N interleave
419 	 * tables.
420 	 *
421 	 * Chances are this is too big, but we don't care.
422 	 */
423 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
424 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
425 	    M_WAITOK | M_ZERO);
426 
427 	/*
428 	 * Trivial case: no interleave (actually interleave of disk size).
429 	 * Each table entry represents a single component in its entirety.
430 	 *
431 	 * An interleave of 0 may not be used with a mirror setup.
432 	 */
433 	if (cs->sc_ileave == 0) {
434 		bn = 0;
435 		ii = cs->sc_itable;
436 
437 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
438 			/* Allocate space for ii_index. */
439 			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
440 			ii->ii_ndisk = 1;
441 			ii->ii_startblk = bn;
442 			ii->ii_startoff = 0;
443 			ii->ii_index[0] = ix;
444 			bn += cs->sc_cinfo[ix].ci_size;
445 			ii++;
446 		}
447 		ii->ii_ndisk = 0;
448 		return;
449 	}
450 
451 	/*
452 	 * The following isn't fast or pretty; it doesn't have to be.
453 	 */
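	/*
	 * Worked example (sizes in sc_ileave chunks): components of 1000,
	 * 2000 and 3000 chunks yield three tables.  Table 0 maps ccd
	 * chunks 0-2999 across all three disks starting at component
	 * offset 0, table 1 maps chunks 3000-4999 across the two larger
	 * disks starting at offset 1000, and table 2 maps chunks
	 * 5000-5999 onto the largest disk alone starting at offset 2000.
	 * An entry with ii_ndisk == 0 terminates the table.
	 */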
454 	size = 0;
455 	bn = lbn = 0;
456 	for (ii = cs->sc_itable; ; ii++) {
457 		/*
458 		 * Allocate space for ii_index.  We might allocate more than
459 		 * we use.
460 		 */
461 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
462 		    M_CCD, M_WAITOK);
463 
464 		/*
465 		 * Locate the smallest of the remaining components
466 		 */
467 		smallci = NULL;
468 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
469 		    ci++) {
470 			if (ci->ci_size > size &&
471 			    (smallci == NULL ||
472 			     ci->ci_size < smallci->ci_size)) {
473 				smallci = ci;
474 			}
475 		}
476 
477 		/*
478 		 * Nobody left, all done
479 		 */
480 		if (smallci == NULL) {
481 			ii->ii_ndisk = 0;
482 			free(ii->ii_index, M_CCD);
483 			break;
484 		}
485 
486 		/*
487 		 * Record starting logical block using an sc_ileave blocksize.
488 		 */
489 		ii->ii_startblk = bn / cs->sc_ileave;
490 
491 		/*
492 		 * Record starting component block using an sc_ileave
493 		 * blocksize.  This value is relative to the beginning of
494 		 * a component disk.
495 		 */
496 		ii->ii_startoff = lbn;
497 
498 		/*
499 		 * Determine how many disks take part in this interleave
500 		 * and record their indices.
501 		 */
502 		ix = 0;
503 		for (ci = cs->sc_cinfo;
504 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
505 			if (ci->ci_size >= smallci->ci_size) {
506 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
507 			}
508 		}
509 		ii->ii_ndisk = ix;
510 		bn += ix * (smallci->ci_size - size);
511 		lbn = smallci->ci_size / cs->sc_ileave;
512 		size = smallci->ci_size;
513 	}
514 }
515 
516 static void
517 ccdstrategy(struct bio *bp)
518 {
519 	struct ccd_s *cs;
520 	int pbn;        /* in sc_secsize chunks */
521 	long sz;        /* in sc_secsize chunks */
522 
523 	cs = bp->bio_disk->d_drv1;
524 
525 	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
526 	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
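	/*
	 * E.g. with 2048 byte ccd sectors (DEV_BSIZE 512), a request at
	 * bio_blkno 100 for 6000 bytes gives pbn 25 and sz 3.
	 */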
527 
528 	/*
529 	 * If out of bounds return an error. If at the EOF point,
530 	 * simply read or write less.
531 	 */
532 
533 	if (pbn < 0 || pbn >= cs->sc_size) {
534 		bp->bio_resid = bp->bio_bcount;
535 		if (pbn != cs->sc_size)
536 			biofinish(bp, NULL, EINVAL);
537 		else
538 			biodone(bp);
539 		return;
540 	}
541 
542 	/*
543 	 * If the request crosses EOF, truncate the request.
544 	 */
545 	if (pbn + sz > cs->sc_size) {
546 		bp->bio_bcount = (cs->sc_size - pbn) *
547 		    cs->sc_geom.ccg_secsize;
548 	}
549 
550 	bp->bio_resid = bp->bio_bcount;
551 
552 	/*
553 	 * "Start" the unit.
554 	 */
555 	ccdstart(cs, bp);
556 	return;
557 }
558 
559 static void
560 ccdstart(struct ccd_s *cs, struct bio *bp)
561 {
562 	long bcount, rcount;
563 	struct ccdbuf *cbp[2];
564 	caddr_t addr;
565 	daddr_t bn;
566 	int err;
567 	int sent;
568 
569 	/*
570 	 * Translate the partition-relative block number to an absolute.
571 	 */
572 	bn = bp->bio_blkno;
573 
574 	/*
575 	 * Allocate component buffers and fire off the requests
576 	 */
577 	addr = bp->bio_data;
578 	sent = 0;
579 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
580 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
581 		if (err) {
582 			printf("ccdbuffer error %d\n", err);
583 			if (!sent)
584 				biofinish(bp, NULL, err);
585 			else {
586 				/*
587 				 * XXX: maybe a race where the partners
588 				 * XXX: we sent already have been in
589 				 * XXX: ccdiodone().  Single-threaded g_down
590 				 * XXX: may protect against this.
591 				 */
592 				bp->bio_resid -= bcount;
593 				bp->bio_error = err;
594 				bp->bio_flags |= BIO_ERROR;
595 			}
596 			return;
597 		}
598 		rcount = cbp[0]->cb_buf.bio_bcount;
599 
600 		if (cs->sc_cflags & CCDF_MIRROR) {
601 			/*
602 			 * Mirroring.  Writes go to both disks, reads are
603 			 * taken from whichever disk seems most appropriate.
604 			 *
605 			 * We attempt to localize reads to the disk whose arm
606 			 * is nearest the read request.  We ignore seeks due
607 			 * to writes when making this determination and we
608 			 * also try to avoid hogging.
609 			 */
610 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
611 				BIO_STRATEGY(&cbp[0]->cb_buf);
612 				BIO_STRATEGY(&cbp[1]->cb_buf);
613 				sent++;
614 			} else {
615 				int pick = cs->sc_pick;
616 				daddr_t range = cs->sc_size / 16;
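				/*
				 * Stay with the currently preferred disk
				 * unless this read is more than 1/16th of
				 * the ccd away from the last read issued
				 * to it; otherwise flip the preference.
				 */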
617 
618 				if (bn < cs->sc_blk[pick] - range ||
619 				    bn > cs->sc_blk[pick] + range
620 				) {
621 					cs->sc_pick = pick = 1 - pick;
622 				}
623 				cs->sc_blk[pick] = bn + btodb(rcount);
624 				BIO_STRATEGY(&cbp[pick]->cb_buf);
625 				sent++;
626 			}
627 		} else {
628 			/*
629 			 * Not mirroring
630 			 */
631 			BIO_STRATEGY(&cbp[0]->cb_buf);
632 			sent++;
633 		}
634 		bn += btodb(rcount);
635 		addr += rcount;
636 	}
637 }
638 
639 /*
640  * Build a component buffer header.
641  */
642 static int
643 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
644 {
645 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
646 	struct ccdbuf *cbp;
647 	daddr_t cbn, cboff;
648 	off_t cbc;
649 
650 	/*
651 	 * Determine which component bn falls in.
652 	 */
653 	cbn = bn;
654 	cboff = 0;
655 
656 	if (cs->sc_ileave == 0) {
657 		/*
658 		 * Serially concatenated and neither a mirror nor a parity
659 		 * config.  This is a special case.
660 		 */
661 		daddr_t sblk;
662 
663 		sblk = 0;
664 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
665 			sblk += ci->ci_size;
666 		cbn -= sblk;
667 	} else {
668 		struct ccdiinfo *ii;
669 		int ccdisk, off;
670 
671 		/*
672 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
673 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
674 		 * to cbn.
675 		 */
676 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
677 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
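		/*
		 * E.g. with sc_ileave 32 and bn 100: cboff = 100 % 32 = 4,
		 * cbn = 100 / 32 = 3, i.e. block 4 within logical
		 * superblock 3.
		 */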
678 
679 		/*
680 		 * Figure out which interleave table to use.
681 		 */
682 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
683 			if (ii->ii_startblk > cbn)
684 				break;
685 		}
686 		ii--;
687 
688 		/*
689 		 * off is the logical superblock relative to the beginning
690 		 * of this interleave block.
691 		 */
692 		off = cbn - ii->ii_startblk;
693 
694 		/*
695 		 * We must calculate which disk component to use (ccdisk),
696 		 * and recalculate cbn to be the superblock relative to
697 		 * the beginning of the component.  This is typically done by
698 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
699 		 * must typically be divided by the number of components in
700 		 * this interleave array to properly convert it from a
701 		 * CCD-relative logical superblock number to a
702 		 * component-relative superblock number.
703 		 */
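		/*
		 * E.g. for a plain 3-disk stripe (ii_ndisk == 3) and
		 * off == 7: ccdisk = ii_index[7 % 3] = ii_index[1] and
		 * cbn = ii_startoff + 7 / 3 = ii_startoff + 2.
		 */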
704 		if (ii->ii_ndisk == 1) {
705 			/*
706 			 * When we have just one disk, it can't be a mirror
707 			 * or a parity config.
708 			 */
709 			ccdisk = ii->ii_index[0];
710 			cbn = ii->ii_startoff + off;
711 		} else {
712 			if (cs->sc_cflags & CCDF_MIRROR) {
713 				/*
714 				 * We have forced a uniform mapping, resulting
715 				 * in a single interleave array.  We double
716 				 * up on the first half of the available
717 				 * components and our mirror is in the second
718 				 * half.  This only works with a single
719 				 * interleave array because doubling up
720 				 * doubles the number of sectors, so there
721 				 * cannot be another interleave array because
722 				 * the next interleave array's calculations
723 				 * would be off.
724 				 */
725 				int ndisk2 = ii->ii_ndisk / 2;
726 				ccdisk = ii->ii_index[off % ndisk2];
727 				cbn = ii->ii_startoff + off / ndisk2;
728 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
729 			} else {
730 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
731 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
732 			}
733 		}
734 
735 		ci = &cs->sc_cinfo[ccdisk];
736 
737 		/*
738 		 * Convert cbn from a superblock to a normal block so it
739 		 * can be used to calculate (along with cboff) the normal
740 		 * block index into this particular disk.
741 		 */
742 		cbn *= cs->sc_ileave;
743 	}
744 
745 	/*
746 	 * Fill in the component buf structure.
747 	 */
748 	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
749 	if (cbp == NULL)
750 		return (ENOMEM);
751 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
752 	cbp->cb_buf.bio_done = ccdiodone;
753 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
754 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
755 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
756 	cbp->cb_buf.bio_data = addr;
757 	cbp->cb_buf.bio_caller2 = cbp;
758 	if (cs->sc_ileave == 0)
759               cbc = dbtob((off_t)(ci->ci_size - cbn));
760 	else
761               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
762 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
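	/*
	 * Remember the size we asked for; ccdiodone() uses this rather
	 * than bio_bcount, which may be truncated at EOF by the component.
	 */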
763  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
764 
765 	/*
766 	 * context for ccdiodone
767 	 */
768 	cbp->cb_obp = bp;
769 	cbp->cb_softc = cs;
770 	cbp->cb_comp = ci - cs->sc_cinfo;
771 
772 	cb[0] = cbp;
773 
774 	/*
775 	 * Note: both I/Os are set up when reading from a mirror, but only
776 	 * one will be executed.
777 	 */
778 	if (cs->sc_cflags & CCDF_MIRROR) {
779 		/* mirror, setup second I/O */
780 		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
781 		if (cbp == NULL) {
782 			free(cb[0], M_CCD);
783 			cb[0] = NULL;
784 			return (ENOMEM);
785 		}
786 		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
787 		cbp->cb_buf.bio_caller2 = cbp;
788 		cbp->cb_buf.bio_dev = ci2->ci_dev;
789 		cbp->cb_comp = ci2 - cs->sc_cinfo;
790 		cb[1] = cbp;
791 		/* link together the ccdbuf's and clear "mirror done" flag */
792 		cb[0]->cb_mirror = cb[1];
793 		cb[1]->cb_mirror = cb[0];
794 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
795 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
796 	}
797 	return (0);
798 }
799 
800 /*
801  * Called at interrupt time.
802  * Mark the component as done and if all components are done,
803  * take a ccd interrupt.
804  */
805 static void
806 ccdiodone(struct bio *ibp)
807 {
808 	struct ccdbuf *cbp;
809 	struct bio *bp;
810 	struct ccd_s *cs;
811 	int count;
812 
813 	cbp = ibp->bio_caller2;
814 	cs = cbp->cb_softc;
815 	bp = cbp->cb_obp;
816 	/*
817 	 * If an error occurred, report it.  If this is a mirrored
818 	 * configuration and the first of two possible reads, do not
819 	 * set the error in the bp yet because the second read may
820 	 * succeed.
821 	 */
822 
823 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
824 		const char *msg = "";
825 
826 		if ((cs->sc_cflags & CCDF_MIRROR) &&
827 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
828 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
829 			/*
830 			 * We will try our read on the other disk down
831 			 * below; also reverse the default pick so if we
832 			 * are doing a scan we do not keep hitting the
833 			 * bad disk first.
834 			 */
835 
836 			msg = ", trying other disk";
837 			cs->sc_pick = 1 - cs->sc_pick;
838 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
839 		} else {
840 			bp->bio_flags |= BIO_ERROR;
841 			bp->bio_error = cbp->cb_buf.bio_error ?
842 			    cbp->cb_buf.bio_error : EIO;
843 		}
844 		printf("ccd%d: error %d on component %d block %jd "
845 		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
846 		    cbp->cb_comp,
847 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
848 		    msg);
849 	}
850 
851 	/*
852 	 * Process mirror.  If we are writing, I/O has been initiated on both
853 	 * buffers and we fall through only after both are finished.
854 	 *
855 	 * If we are reading only one I/O is initiated at a time.  If an
856 	 * error occurs we initiate the second I/O and return, otherwise
857 	 * we free the second I/O without initiating it.
858 	 */
859 
860 	if (cs->sc_cflags & CCDF_MIRROR) {
861 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
862 			/*
863 			 * When writing, handshake with the second buffer
864 			 * to determine when both are done.  If both are not
865 			 * done, return here.
866 			 */
867 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
868 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
869 				free(cbp, M_CCD);
870 				return;
871 			}
872 		} else {
873 			/*
874 			 * When reading, either dispose of the second buffer
875 			 * or initiate I/O on the second buffer if an error
876 			 * occurred with this one.
877 			 */
878 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
879 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
880 					cbp->cb_mirror->cb_pflags |=
881 					    CCDPF_MIRROR_DONE;
882 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
883 					free(cbp, M_CCD);
884 					return;
885 				} else {
886 					free(cbp->cb_mirror, M_CCD);
887 				}
888 			}
889 		}
890 	}
891 
892 	/*
893 	 * Use bio_caller1 to determine how big the original request was rather
894 	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
895 	 *
896 	 * XXX We check for an error, but we do not test the resid for an
897 	 * aligned EOF condition.  This may result in character & block
898 	 * device access not recognizing EOF properly when read or written
899 	 * sequentially, but will not affect filesystems.
900 	 */
901 	count = (long)cbp->cb_buf.bio_caller1;
902 	free(cbp, M_CCD);
903 
904 	/*
905 	 * If all done, "interrupt".
906 	 */
907 	bp->bio_resid -= count;
908 	if (bp->bio_resid < 0)
909 		panic("ccdiodone: count");
910 	if (bp->bio_resid == 0) {
911 		if (bp->bio_flags & BIO_ERROR)
912 			bp->bio_resid = bp->bio_bcount;
913 		biodone(bp);
914 	}
915 }
916 
917 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
918 
919 static int
920 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
921 {
922 	struct ccd_ioctl *ccio;
923 	u_int unit;
924 	dev_t dev2;
925 	int error;
926 
927 	switch (cmd) {
928 	case CCDIOCSET:
929 	case CCDIOCCLR:
930 		ccio = (struct ccd_ioctl *)data;
931 		unit = ccio->ccio_size;
932 		return (ccdioctltoo(unit, cmd, data, flag, td));
933 	case CCDCONFINFO:
934 		{
935 		int ninit = 0;
936 		struct ccdconf *conf = (struct ccdconf *)data;
937 		struct ccd_s *tmpcs;
938 		struct ccd_s *ubuf = conf->buffer;
939 
940 		/* XXX: LOCK(unique unit numbers) */
941 		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
942 			if (IS_INITED(tmpcs))
943 				ninit++;
944 
945 		if (conf->size == 0) {
946 			conf->size = sizeof(struct ccd_s) * ninit;
947 			return (0);
948 		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
949 		    (conf->size % sizeof(struct ccd_s) != 0)) {
950 			/* XXX: UNLOCK(unique unit numbers) */
951 			return (EINVAL);
952 		}
953 
954 		ubuf += ninit;
955 		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
956 			if (!IS_INITED(tmpcs))
957 				continue;
958 			error = copyout(tmpcs, --ubuf,
959 			    sizeof(struct ccd_s));
960 			if (error != 0)
961 				/* XXX: UNLOCK(unique unit numbers) */
962 				return (error);
963 		}
964 		/* XXX: UNLOCK(unique unit numbers) */
965 		return (0);
966 		}
967 
968 	case CCDCPPINFO:
969 		{
970 		struct ccdcpps *cpps = (struct ccdcpps *)data;
971 		char *ubuf = cpps->buffer;
972 		struct ccd_s *cs;
973 
974 
975 		error = copyin(ubuf, &unit, sizeof (unit));
976 		if (error)
977 			return (error);
978 
979 		if (!IS_ALLOCATED(unit))
980 			return (ENXIO);
981 		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
982 		cs = ccdfind(unit);
983 		if (!IS_INITED(cs))
984 			return (ENXIO);
985 
986 		{
987 			int len = 0, i;
988 			struct ccdcpps *cpps = (struct ccdcpps *)data;
989 			char *ubuf = cpps->buffer;
990 
991 
992 			for (i = 0; i < cs->sc_nccdisks; ++i)
993 				len += cs->sc_cinfo[i].ci_pathlen;
994 
995 			if (cpps->size < len)
996 				return (ENOMEM);
997 
998 			for (i = 0; i < cs->sc_nccdisks; ++i) {
999 				len = cs->sc_cinfo[i].ci_pathlen;
1000 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1001 				    len);
1002 				if (error != 0)
1003 					return (error);
1004 				ubuf += len;
1005 			}
1006 			return(copyout("", ubuf, 1));
1007 		}
1008 		break;
1009 		}
1010 
1011 	default:
1012 		return (ENXIO);
1013 	}
1014 }
1015 
1016 static int
1017 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1018 {
1019 	int i, j, lookedup = 0, error = 0;
1020 	struct ccd_s *cs;
1021 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1022 	struct ccdgeom *ccg;
1023 	char **cpp;
1024 	struct vnode **vpp;
1025 
1026 	cs = ccdfind(unit);
1027 	switch (cmd) {
1028 	case CCDIOCSET:
1029 		if (cs == NULL && (cs = ccdnew(unit)) == NULL)
1030 			return (ENXIO);
1031 		if (IS_INITED(cs))
1032 			return (EBUSY);
1033 
1034 		if ((flag & FWRITE) == 0)
1035 			return (EBADF);
1036 
1037 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1038 			return (EINVAL);
1039 
1040 		if ((error = ccdlock(cs)) != 0)
1041 			return (error);
1042 
1043 		/* Fill in some important bits. */
1044 		cs->sc_ileave = ccio->ccio_ileave;
1045 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1046 			printf("ccd%d: disabling mirror, interleave is 0\n",
1047 			    unit);
1048 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1049 		}
1050 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1051 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1052 			printf("ccd%d: mirror/parity forces uniform flag\n",
1053 			       unit);
1054 			ccio->ccio_flags |= CCDF_UNIFORM;
1055 		}
1056 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1057 
1058 		/*
1059 		 * Allocate space for and copy in the array of
1060 		 * component pathnames and device numbers.
1061 		 */
1062 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1063 		    M_CCD, M_WAITOK);
1064 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1065 		    M_CCD, M_WAITOK);
1066 
1067 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1068 		    ccio->ccio_ndisks * sizeof(char **));
1069 		if (error) {
1070 			free(vpp, M_CCD);
1071 			free(cpp, M_CCD);
1072 			ccdunlock(cs);
1073 			return (error);
1074 		}
1075 
1076 
1077 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1078 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1079 				for (j = 0; j < lookedup; ++j)
1080 					(void)vn_close(vpp[j], FREAD|FWRITE,
1081 					    td->td_ucred, td);
1082 				free(vpp, M_CCD);
1083 				free(cpp, M_CCD);
1084 				ccdunlock(cs);
1085 				return (error);
1086 			}
1087 			++lookedup;
1088 		}
1089 		cs->sc_vpp = vpp;
1090 		cs->sc_nccdisks = ccio->ccio_ndisks;
1091 
1092 		/*
1093 		 * Initialize the ccd.  Fills in the softc for us.
1094 		 */
1095 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1096 			for (j = 0; j < lookedup; ++j)
1097 				(void)vn_close(vpp[j], FREAD|FWRITE,
1098 				    td->td_ucred, td);
1099 			/*
1100 			 * We can't ccddestroy() cs just yet, because nothing
1101 			 * prevents a user-level app from doing another ioctl()
1102 			 * without closing the device first; therefore
1103 			 * declare the unit null and void and let ccdclose()
1104 			 * destroy it when it is safe to do so.
1105 			 */
1106 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1107 			free(vpp, M_CCD);
1108 			free(cpp, M_CCD);
1109 			ccdunlock(cs);
1110 			return (error);
1111 		}
1112 		free(cpp, M_CCD);
1113 
1114 		/*
1115 		 * The ccd has been successfully initialized, so
1116 		 * we can place it into the array and read the disklabel.
1117 		 */
1118 		ccio->ccio_unit = unit;
1119 		ccio->ccio_size = cs->sc_size;
1120 		ccg = &cs->sc_geom;
1121 		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1122 		    M_ZERO | M_WAITOK);
1123 		cs->sc_disk->d_strategy = ccdstrategy;
1124 		cs->sc_disk->d_name = "ccd";
1125 		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1126 		cs->sc_disk->d_mediasize =
1127 		    cs->sc_size * (off_t)ccg->ccg_secsize;
1128 		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1129 		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1130 		cs->sc_disk->d_drv1 = cs;
1131 		cs->sc_disk->d_maxsize = MAXPHYS;
1132 		disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1133 
1134 		ccdunlock(cs);
1135 
1136 		break;
1137 
1138 	case CCDIOCCLR:
1139 		if (cs == NULL)
1140 			return (ENXIO);
1141 
1142 		if (!IS_INITED(cs))
1143 			return (ENXIO);
1144 
1145 		if ((flag & FWRITE) == 0)
1146 			return (EBADF);
1147 
1148 		if ((error = ccdlock(cs)) != 0)
1149 			return (error);
1150 
1151 		/* Don't unconfigure if any other partitions are open */
1152 		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1153 			ccdunlock(cs);
1154 			return (EBUSY);
1155 		}
1156 
1157 		disk_destroy(cs->sc_disk);
1158 		free(cs->sc_disk, M_CCD);
1159 		cs->sc_disk = NULL;
1160 		/* Declare unit null and void (reset all flags) */
1161 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1162 
1163 		/* Close the components and free their pathnames. */
1164 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1165 			/*
1166 			 * XXX: this close could potentially fail and
1167 			 * cause Bad Things.  Maybe we need to force
1168 			 * the close to happen?
1169 			 */
1170 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1171 			    td->td_ucred, td);
1172 			free(cs->sc_cinfo[i].ci_path, M_CCD);
1173 		}
1174 
1175 		/* Free interleave index. */
1176 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1177 			free(cs->sc_itable[i].ii_index, M_CCD);
1178 
1179 		/* Free component info and interleave table. */
1180 		free(cs->sc_cinfo, M_CCD);
1181 		free(cs->sc_itable, M_CCD);
1182 		free(cs->sc_vpp, M_CCD);
1183 
1184 		/* This must be atomic. */
1185 		ccdunlock(cs);
1186 		ccddestroy(cs);
1187 
1188 		break;
1189 	}
1190 
1191 	return (0);
1192 }
1193 
1194 
1195 /*
1196  * Look up the provided name in the filesystem.  If the file exists,
1197  * is a valid block device, and isn't being used by anyone else,
1198  * set *vpp to the file's vnode.
1199  */
1200 static int
1201 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1202 {
1203 	struct nameidata nd;
1204 	struct vnode *vp;
1205 	int error, flags;
1206 
1207 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1208 	flags = FREAD | FWRITE;
1209 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1210 		return (error);
1211 	}
1212 	vp = nd.ni_vp;
1213 
1214 	if (vrefcnt(vp) > 1) {
1215 		error = EBUSY;
1216 		goto bad;
1217 	}
1218 
1219 	if (!vn_isdisk(vp, &error))
1220 		goto bad;
1221 
1222 
1223 	VOP_UNLOCK(vp, 0, td);
1224 	NDFREE(&nd, NDF_ONLY_PNBUF);
1225 	*vpp = vp;
1226 	return (0);
1227 bad:
1228 	VOP_UNLOCK(vp, 0, td);
1229 	NDFREE(&nd, NDF_ONLY_PNBUF);
1230 	/* vn_close does vrele() for vp */
1231 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1232 	return (error);
1233 }
1234 
1235 /*
1236  *
1237  * Wait interruptibly for an exclusive lock.
1238  *
1239  * XXX
1240  * Several drivers do this; it should be abstracted and made MP-safe.
1241  */
1242 static int
1243 ccdlock(struct ccd_s *cs)
1244 {
1245 	int error;
1246 
1247 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1248 		cs->sc_flags |= CCDF_WANTED;
1249 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1250 			return (error);
1251 	}
1252 	cs->sc_flags |= CCDF_LOCKED;
1253 	return (0);
1254 }
1255 
1256 /*
1257  * Unlock and wake up any waiters.
1258  */
1259 static void
1260 ccdunlock(struct ccd_s *cs)
1261 {
1262 
1263 	cs->sc_flags &= ~CCDF_LOCKED;
1264 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1265 		cs->sc_flags &= ~CCDF_WANTED;
1266 		wakeup(cs);
1267 	}
1268 }
1269