xref: /freebsd/sys/geom/geom_ccd.c (revision 628f583ce90d3587595c2f4dd16d57eec3511af3)
1 /*
2  * Copyright (c) 2003 Poul-Henning Kamp.
3  * Copyright (c) 1995 Jason R. Thorpe.
4  * Copyright (c) 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * All rights reserved.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed for the NetBSD Project
24  *	by Jason R. Thorpe.
25  * 4. The names of the authors may not be used to endorse or promote products
26  *    derived from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * Dynamic configuration and disklabel support by:
41  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42  *	Numerical Aerodynamic Simulation Facility
43  *	Mail Stop 258-6
44  *	NASA Ames Research Center
45  *	Moffett Field, CA 94035
46  *
47  * from: Utah $Hdr: cd.c 1.6 90/11/28$
48  *
49  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50  *
51  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52  *
53  * $FreeBSD$
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/module.h>
60 #include <sys/proc.h>
61 #include <sys/bio.h>
62 #include <sys/malloc.h>
63 #include <sys/namei.h>
64 #include <sys/conf.h>
65 #include <sys/stat.h>
66 #include <sys/sysctl.h>
67 #include <sys/disk.h>
68 #include <sys/fcntl.h>
69 #include <sys/vnode.h>
70 #include <geom/geom_disk.h>
71 
72 #include <sys/ccdvar.h>
73 
74 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
75 
76 /*
77    This is how mirroring works (only writes are special):
78 
79    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
80    linked together by the cb_mirror field.  "cb_pflags &
81    CCDPF_MIRROR_DONE" is set to 0 on both of them.
82 
83    When a component returns to ccdiodone(), it checks if "cb_pflags &
84    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
85    flag and returns.  If it is, it means its partner has already
86    returned, so it will go to the regular cleanup.
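
   Reads from a mirror normally go to just one of the two disks; if
   that read fails, ccdiodone() marks the partner "done" and issues
   the request on the other disk instead (see ccdstart() and
   ccdiodone() below).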
87 
88  */
89 
90 struct ccdbuf {
91 	struct bio	cb_buf;		/* new I/O buf */
92 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
93 	struct ccdbuf	*cb_freenext;	/* free list link */
94 	struct ccd_s	*cb_softc;
95 	int		cb_comp;	/* target component */
96 	int		cb_pflags;	/* mirror/parity status flag */
97 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
98 };
99 
100 /* bits in cb_pflags */
101 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
102 
103 /* convenient macros for often-used statements */
104 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
105 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
106 
107 static dev_t	ccdctldev;
108 
109 static disk_strategy_t ccdstrategy;
110 static d_ioctl_t ccdctlioctl;
111 
112 #define NCCDFREEHIWAT	16
113 
114 #define CDEV_MAJOR 74
115 
116 static struct cdevsw ccdctl_cdevsw = {
117 	.d_open =	nullopen,
118 	.d_close =	nullclose,
119 	.d_ioctl =	ccdctlioctl,
120 	.d_name =	"ccdctl",
121 	.d_maj =	CDEV_MAJOR,
122 };
123 
124 static LIST_HEAD(, ccd_s) ccd_softc_list =
125 	LIST_HEAD_INITIALIZER(&ccd_softc_list);
126 
127 static struct ccd_s *ccdfind(int);
128 static struct ccd_s *ccdnew(int);
129 static int ccddestroy(struct ccd_s *);
130 
131 /* called during module initialization */
132 static void ccdattach(void);
133 static int ccd_modevent(module_t, int, void *);
134 
135 /* called by biodone() at interrupt time */
136 static void ccdiodone(struct bio *bp);
137 
138 static void ccdstart(struct ccd_s *, struct bio *);
139 static void ccdinterleave(struct ccd_s *, int);
140 static int ccdinit(struct ccd_s *, char **, struct thread *);
141 static int ccdlookup(char *, struct thread *p, struct vnode **);
142 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
143 		      struct bio *, daddr_t, caddr_t, long);
144 static int ccdlock(struct ccd_s *);
145 static void ccdunlock(struct ccd_s *);
146 
147 
148 /*
149  * Number of blocks to leave untouched in front of a component partition.
150  * This is to avoid violating its disklabel area when it starts at the
151  * beginning of the slice.
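 * With the default CCD_OFFSET of 16, the first 16 DEV_BSIZE (512-byte)
 * blocks, i.e. 8KB, of each component are skipped.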
152  */
153 #if !defined(CCD_OFFSET)
154 #define CCD_OFFSET 16
155 #endif
156 
157 static struct ccd_s *
158 ccdfind(int unit)
159 {
160 	struct ccd_s *sc = NULL;
161 
162 	/* XXX: LOCK(unique unit numbers) */
163 	LIST_FOREACH(sc, &ccd_softc_list, list) {
164 		if (sc->sc_unit == unit)
165 			break;
166 	}
167 	/* XXX: UNLOCK(unique unit numbers) */
168 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
169 }
170 
171 static struct ccd_s *
172 ccdnew(int unit)
173 {
174 	struct ccd_s *sc;
175 
176 	/* XXX: LOCK(unique unit numbers) */
177 	if (IS_ALLOCATED(unit) || unit > 32)
178 		return (NULL);
179 
180 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
181 	sc->sc_unit = unit;
182 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
183 	/* XXX: UNLOCK(unique unit numbers) */
184 	return (sc);
185 }
186 
187 static int
188 ccddestroy(struct ccd_s *sc)
189 {
190 
191 	/* XXX: LOCK(unique unit numbers) */
192 	LIST_REMOVE(sc, list);
193 	/* XXX: UNLOCK(unique unit numbers) */
194 	FREE(sc, M_CCD);
195 	return (0);
196 }
197 
198 /*
199  * Called from ccd_modevent() during module load.  All we need
200  * to do is create the control device.
201  */
202 static void
203 ccdattach()
204 {
205 
206 	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
207 		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
208 	ccdctldev->si_drv1 = ccdctldev;
209 }
210 
211 static int
212 ccd_modevent(module_t mod, int type, void *data)
213 {
214 	int error = 0;
215 
216 	switch (type) {
217 	case MOD_LOAD:
218 		ccdattach();
219 		break;
220 
221 	case MOD_UNLOAD:
222 		printf("ccd0: Unload not supported!\n");
223 		error = EOPNOTSUPP;
224 		break;
225 
226 	case MOD_SHUTDOWN:
227 		break;
228 
229 	default:
230 		error = EOPNOTSUPP;
231 	}
232 	return (error);
233 }
234 
235 DEV_MODULE(ccd, ccd_modevent, NULL);
236 
237 static int
238 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
239 {
240 	struct ccdcinfo *ci = NULL;	/* XXX */
241 	size_t size;
242 	int ix;
243 	struct vnode *vp;
244 	size_t minsize;
245 	int maxsecsize;
246 	struct ccdgeom *ccg = &cs->sc_geom;
247 	char *tmppath = NULL;
248 	int error = 0;
249 	off_t mediasize;
250 	u_int sectorsize;
251 
252 
253 	cs->sc_size = 0;
254 
255 	/* Allocate space for the component info. */
256 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
257 	    M_CCD, M_WAITOK);
258 
259 	/*
260 	 * Verify that each component piece exists and record
261 	 * relevant information about it.
262 	 */
263 	maxsecsize = 0;
264 	minsize = 0;
265 	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
266 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
267 		vp = cs->sc_vpp[ix];
268 		ci = &cs->sc_cinfo[ix];
269 		ci->ci_vp = vp;
270 
271 		/*
272 		 * Copy in the pathname of the component.
273 		 */
274 		if ((error = copyinstr(cpaths[ix], tmppath,
275 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
276 			goto fail;
277 		}
278 		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
279 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
280 
281 		ci->ci_dev = vn_todev(vp);
282 
283 		/*
284 		 * Get the media size of the component.
285 		 */
286 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
287 		    FREAD, td->td_ucred, td);
288 		if (error != 0) {
289 			goto fail;
290 		}
291 		/*
292 		 * Get the sector size of the component.
293 		 */
294 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
295 		    FREAD, td->td_ucred, td);
296 		if (error != 0) {
297 			goto fail;
298 		}
299 		if (sectorsize > maxsecsize)
300 			maxsecsize = sectorsize;
301 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
302 
303 		/*
304 		 * Calculate the size, truncating to an interleave
305 		 * boundary if necessary.
306 		 */
307 
308 		if (cs->sc_ileave > 1)
309 			size -= size % cs->sc_ileave;
310 
311 		if (size == 0) {
312 			error = ENODEV;
313 			goto fail;
314 		}
315 
316 		if (minsize == 0 || size < minsize)
317 			minsize = size;
318 		ci->ci_size = size;
319 		cs->sc_size += size;
320 	}
321 
322 	free(tmppath, M_CCD);
323 	tmppath = NULL;
324 
325 	/*
326 	 * Don't allow the interleave to be smaller than
327 	 * the biggest component sector.
328 	 */
329 	if ((cs->sc_ileave > 0) &&
330 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
331 		error = EINVAL;
332 		goto fail;
333 	}
334 
335 	/*
336 	 * If uniform interleave is desired set all sizes to that of
337 	 * the smallest component.  This will guarantee that a single
338 	 * interleave table is generated.
339 	 *
340 	 * Lost space must be taken into account when calculating the
341 	 * overall size.  Half the space is lost when CCDF_MIRROR is
342 	 * specified.
343 	 */
344 	if (cs->sc_flags & CCDF_UNIFORM) {
345 		for (ci = cs->sc_cinfo;
346 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
347 			ci->ci_size = minsize;
348 		}
349 		if (cs->sc_flags & CCDF_MIRROR) {
350 			/*
351 			 * Check to see if an even number of components
352 			 * have been specified.  The interleave must also
353 			 * be non-zero in order for us to be able to
354 	 * guarantee the topology.
355 			 */
356 			if (cs->sc_nccdisks % 2) {
357 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit);
358 				error = EINVAL;
359 				goto fail;
360 			}
361 			if (cs->sc_ileave == 0) {
362 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
363 				error = EINVAL;
364 				goto fail;
365 			}
366 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
367 		} else {
368 			if (cs->sc_ileave == 0) {
369 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
370 				error = EINVAL;
371 				goto fail;
372 			}
373 			cs->sc_size = cs->sc_nccdisks * minsize;
374 		}
375 	}
376 
377 	/*
378 	 * Construct the interleave table.
379 	 */
380 	ccdinterleave(cs, cs->sc_unit);
381 
382 	/*
383 	 * Create pseudo-geometry based on 1MB cylinders.  It's
384 	 * pretty close.
385 	 */
386 	ccg->ccg_secsize = maxsecsize;
387 	ccg->ccg_ntracks = 1;
388 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
389 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
390 
391 	cs->sc_flags |= CCDF_INITED;
392 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
393 	return (0);
394 fail:
395 	while (ci > cs->sc_cinfo) {
396 		ci--;
397 		free(ci->ci_path, M_CCD);
398 	}
399 	if (tmppath != NULL)
400 		free(tmppath, M_CCD);
401 	free(cs->sc_cinfo, M_CCD);
402 	ccddestroy(cs);
403 	return (error);
404 }
405 
406 static void
407 ccdinterleave(struct ccd_s *cs, int unit)
408 {
409 	struct ccdcinfo *ci, *smallci;
410 	struct ccdiinfo *ii;
411 	daddr_t bn, lbn;
412 	int ix;
413 	u_long size;
414 
415 
416 	/*
417 	 * Allocate an interleave table.  The worst case occurs when each
418 	 * of N disks is of a different size, resulting in N interleave
419 	 * tables.
420 	 *
421 	 * Chances are this is too big, but we don't care.
422 	 */
423 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
424 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
425 	    M_WAITOK | M_ZERO);
426 
427 	/*
428 	 * Trivial case: no interleave (actually interleave of disk size).
429 	 * Each table entry represents a single component in its entirety.
430 	 *
431 	 * An interleave of 0 may not be used with a mirror setup.
432 	 */
433 	if (cs->sc_ileave == 0) {
434 		bn = 0;
435 		ii = cs->sc_itable;
436 
437 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
438 			/* Allocate space for ii_index. */
439 			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
440 			ii->ii_ndisk = 1;
441 			ii->ii_startblk = bn;
442 			ii->ii_startoff = 0;
443 			ii->ii_index[0] = ix;
444 			bn += cs->sc_cinfo[ix].ci_size;
445 			ii++;
446 		}
447 		ii->ii_ndisk = 0;
448 		return;
449 	}
450 
451 	/*
452 	 * The following isn't fast or pretty; it doesn't have to be.
453 	 */
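	/*
	 * For example, three components of 1000, 2000 and 2000 blocks with
	 * sc_ileave = 100 produce two interleave tables plus a terminator:
	 *
	 *	entry 0: ii_startblk =  0, ii_startoff =  0, ii_ndisk = 3
	 *	entry 1: ii_startblk = 30, ii_startoff = 10, ii_ndisk = 2
	 *	entry 2: ii_ndisk = 0 (end marker)
	 *
	 * ii_startblk and ii_startoff count sc_ileave-sized chunks, so the
	 * second table covers ccd blocks starting at 3000 and maps them to
	 * the two larger components starting at their block 1000.
	 */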
454 	size = 0;
455 	bn = lbn = 0;
456 	for (ii = cs->sc_itable; ; ii++) {
457 		/*
458 		 * Allocate space for ii_index.  We might allocate more than
459 		 * we use.
460 		 */
461 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
462 		    M_CCD, M_WAITOK);
463 
464 		/*
465 		 * Locate the smallest of the remaining components
466 		 */
467 		smallci = NULL;
468 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
469 		    ci++) {
470 			if (ci->ci_size > size &&
471 			    (smallci == NULL ||
472 			     ci->ci_size < smallci->ci_size)) {
473 				smallci = ci;
474 			}
475 		}
476 
477 		/*
478 		 * Nobody left, all done
479 		 */
480 		if (smallci == NULL) {
481 			ii->ii_ndisk = 0;
482 			free(ii->ii_index, M_CCD);
483 			break;
484 		}
485 
486 		/*
487 		 * Record starting logical block using an sc_ileave blocksize.
488 		 */
489 		ii->ii_startblk = bn / cs->sc_ileave;
490 
491 		/*
492 		 * Record starting component block using an sc_ileave
493 		 * blocksize.  This value is relative to the beginning of
494 		 * a component disk.
495 		 */
496 		ii->ii_startoff = lbn;
497 
498 		/*
499 		 * Determine how many disks take part in this interleave
500 		 * and record their indices.
501 		 */
502 		ix = 0;
503 		for (ci = cs->sc_cinfo;
504 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
505 			if (ci->ci_size >= smallci->ci_size) {
506 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
507 			}
508 		}
509 		ii->ii_ndisk = ix;
510 		bn += ix * (smallci->ci_size - size);
511 		lbn = smallci->ci_size / cs->sc_ileave;
512 		size = smallci->ci_size;
513 	}
514 }
515 
516 static void
517 ccdstrategy(struct bio *bp)
518 {
519 	struct ccd_s *cs;
520 	int pbn;        /* in sc_secsize chunks */
521 	long sz;        /* in sc_secsize chunks */
522 
523 	cs = bp->bio_disk->d_drv1;
524 
525 	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
526 	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
527 
528 	/*
529 	 * If out of bounds return an error. If at the EOF point,
530 	 * simply read or write less.
531 	 */
532 
533 	if (pbn < 0 || pbn >= cs->sc_size) {
534 		bp->bio_resid = bp->bio_bcount;
535 		if (pbn != cs->sc_size)
536 			biofinish(bp, NULL, EINVAL);
537 		else
538 			biodone(bp);
539 		return;
540 	}
541 
542 	/*
543 	 * If the request crosses EOF, truncate the request.
544 	 */
545 	if (pbn + sz > cs->sc_size) {
546 		bp->bio_bcount = (cs->sc_size - pbn) *
547 		    cs->sc_geom.ccg_secsize;
548 	}
549 
550 	bp->bio_resid = bp->bio_bcount;
551 
552 	/*
553 	 * "Start" the unit.
554 	 */
555 	ccdstart(cs, bp);
556 	return;
557 }
558 
559 static void
560 ccdstart(struct ccd_s *cs, struct bio *bp)
561 {
562 	long bcount, rcount;
563 	struct ccdbuf *cbp[2];
564 	caddr_t addr;
565 	daddr_t bn;
566 	int err;
567 
568 	/*
569 	 * The request's bio_blkno is already ccd-relative.
570 	 */
571 	bn = bp->bio_blkno;
572 
573 	/*
574 	 * Allocate component buffers and fire off the requests
575 	 */
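	/*
	 * Each pass through this loop maps at most one interleave chunk
	 * (or, with no interleave, the remainder of one component), so a
	 * large request is split into several component I/Os; ccdbuffer()
	 * returns the size it actually mapped in cb_buf.bio_bcount.
	 */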
576 	addr = bp->bio_data;
577 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
578 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
579 		if (err) {
580 			printf("ccdbuffer error %d\n", err);
581 			/* We're screwed */
582 			bp->bio_resid -= bcount;
583 			bp->bio_error = ENOMEM;
584 			bp->bio_flags |= BIO_ERROR;
585 			return;
586 		}
587 		rcount = cbp[0]->cb_buf.bio_bcount;
588 
589 		if (cs->sc_cflags & CCDF_MIRROR) {
590 			/*
591 			 * Mirroring.  Writes go to both disks, reads are
592 			 * taken from whichever disk seems most appropriate.
593 			 *
594 			 * We attempt to localize reads to the disk whose arm
595 			 * is nearest the read request.  We ignore seeks due
596 			 * to writes when making this determination and we
597 			 * also try to avoid hogging.
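			 * Concretely: a read stays on the current pick while
			 * it starts within sc_size/16 blocks of that disk's
			 * last read position; otherwise the pick flips to
			 * the other disk.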
598 			 */
599 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
600 				BIO_STRATEGY(&cbp[0]->cb_buf);
601 				BIO_STRATEGY(&cbp[1]->cb_buf);
602 			} else {
603 				int pick = cs->sc_pick;
604 				daddr_t range = cs->sc_size / 16;
605 
606 				if (bn < cs->sc_blk[pick] - range ||
607 				    bn > cs->sc_blk[pick] + range
608 				) {
609 					cs->sc_pick = pick = 1 - pick;
610 				}
611 				cs->sc_blk[pick] = bn + btodb(rcount);
612 				BIO_STRATEGY(&cbp[pick]->cb_buf);
613 			}
614 		} else {
615 			/*
616 			 * Not mirroring
617 			 */
618 			BIO_STRATEGY(&cbp[0]->cb_buf);
619 		}
620 		bn += btodb(rcount);
621 		addr += rcount;
622 	}
623 }
624 
625 /*
626  * Build a component buffer header.
627  */
628 static int
629 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
630 {
631 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
632 	struct ccdbuf *cbp;
633 	daddr_t cbn, cboff;
634 	off_t cbc;
635 
636 	/*
637 	 * Determine which component bn falls in.
638 	 */
639 	cbn = bn;
640 	cboff = 0;
641 
642 	if (cs->sc_ileave == 0) {
643 		/*
644 		 * Serially concatenated and neither a mirror nor a parity
645 		 * config.  This is a special case.
646 		 */
647 		daddr_t sblk;
648 
649 		sblk = 0;
650 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
651 			sblk += ci->ci_size;
652 		cbn -= sblk;
653 	} else {
654 		struct ccdiinfo *ii;
655 		int ccdisk, off;
656 
657 		/*
658 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
659 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
660 		 * to cbn.
661 		 */
662 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
663 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
664 
665 		/*
666 		 * Figure out which interleave table to use.
667 		 */
668 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
669 			if (ii->ii_startblk > cbn)
670 				break;
671 		}
672 		ii--;
673 
674 		/*
675 		 * off is the logical superblock relative to the beginning
676 		 * of this interleave block.
677 		 */
678 		off = cbn - ii->ii_startblk;
679 
680 		/*
681 		 * We must calculate which disk component to use (ccdisk),
682 		 * and recalculate cbn to be the superblock relative to
683 		 * the beginning of the component.  This is typically done by
684 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
685 		 * must typically be divided by the number of components in
686 		 * this interleave array to properly convert it from a
687 		 * CCD-relative logical superblock number to a
688 		 * component-relative superblock number.
689 		 */
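		/*
		 * For example (non-mirrored case), with sc_ileave = 100 and
		 * three equally sized components, bn = 1234 yields cboff = 34
		 * and cbn = 12; off = 12 in the single interleave table, so
		 * ccdisk = ii_index[12 % 3] and cbn becomes 12 / 3 = 4
		 * superblocks, i.e. block 400 of that component before cboff
		 * and CCD_OFFSET are added.
		 */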
690 		if (ii->ii_ndisk == 1) {
691 			/*
692 			 * When we have just one disk, it can't be a mirror
693 			 * or a parity config.
694 			 */
695 			ccdisk = ii->ii_index[0];
696 			cbn = ii->ii_startoff + off;
697 		} else {
698 			if (cs->sc_cflags & CCDF_MIRROR) {
699 				/*
700 				 * We have forced a uniform mapping, resulting
701 				 * in a single interleave array.  We double
702 				 * up on the first half of the available
703 				 * components and our mirror is in the second
704 				 * half.  This only works with a single
705 				 * interleave array because doubling up
706 				 * doubles the number of sectors, so there
707 				 * cannot be another interleave array because
708 				 * the next interleave array's calculations
709 				 * would be off.
710 				 */
711 				int ndisk2 = ii->ii_ndisk / 2;
712 				ccdisk = ii->ii_index[off % ndisk2];
713 				cbn = ii->ii_startoff + off / ndisk2;
714 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
715 			} else {
716 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
717 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
718 			}
719 		}
720 
721 		ci = &cs->sc_cinfo[ccdisk];
722 
723 		/*
724 		 * Convert cbn from a superblock to a normal block so it
725 		 * can be used to calculate (along with cboff) the normal
726 		 * block index into this particular disk.
727 		 */
728 		cbn *= cs->sc_ileave;
729 	}
730 
731 	/*
732 	 * Fill in the component buf structure.
733 	 */
734 	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
735 	if (cbp == NULL)
736 		return (ENOMEM);
737 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
738 	cbp->cb_buf.bio_done = ccdiodone;
739 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
740 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
741 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
742 	cbp->cb_buf.bio_data = addr;
743 	cbp->cb_buf.bio_caller2 = cbp;
744 	if (cs->sc_ileave == 0)
745               cbc = dbtob((off_t)(ci->ci_size - cbn));
746 	else
747               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
748 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
749  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
750 
751 	/*
752 	 * context for ccdiodone
753 	 */
754 	cbp->cb_obp = bp;
755 	cbp->cb_softc = cs;
756 	cbp->cb_comp = ci - cs->sc_cinfo;
757 
758 	cb[0] = cbp;
759 
760 	/*
761 	 * Note: both I/O's setup when reading from mirror, but only one
762 	 * will be executed.
763 	 */
764 	if (cs->sc_cflags & CCDF_MIRROR) {
765 		/* mirror, setup second I/O */
766 		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
767 		if (cbp == NULL) {
768 			free(cb[0], M_CCD);
769 			cb[0] = NULL;
770 			return (ENOMEM);
771 		}
772 		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
773 		cbp->cb_buf.bio_caller2 = cbp;
774 		cbp->cb_buf.bio_dev = ci2->ci_dev;
775 		cbp->cb_comp = ci2 - cs->sc_cinfo;
776 		cb[1] = cbp;
777 		/* link together the ccdbuf's and clear "mirror done" flag */
778 		cb[0]->cb_mirror = cb[1];
779 		cb[1]->cb_mirror = cb[0];
780 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
781 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
782 	}
783 	return (0);
784 }
785 
786 /*
787  * Called at interrupt time.
788  * Mark the component as done and if all components are done,
789  * take a ccd interrupt.
790  */
791 static void
792 ccdiodone(struct bio *ibp)
793 {
794 	struct ccdbuf *cbp;
795 	struct bio *bp;
796 	struct ccd_s *cs;
797 	int count;
798 
799 	cbp = ibp->bio_caller2;
800 	cs = cbp->cb_softc;
801 	bp = cbp->cb_obp;
802 	/*
803 	 * If an error occurred, report it.  If this is a mirrored
804 	 * configuration and the first of two possible reads, do not
805 	 * set the error in the bp yet because the second read may
806 	 * succeed.
807 	 */
808 
809 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
810 		const char *msg = "";
811 
812 		if ((cs->sc_cflags & CCDF_MIRROR) &&
813 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
814 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
815 			/*
816 			 * We will try our read on the other disk down
817 			 * below, also reverse the default pick so if we
818 			 * are doing a scan we do not keep hitting the
819 			 * bad disk first.
820 			 */
821 
822 			msg = ", trying other disk";
823 			cs->sc_pick = 1 - cs->sc_pick;
824 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
825 		} else {
826 			bp->bio_flags |= BIO_ERROR;
827 			bp->bio_error = cbp->cb_buf.bio_error ?
828 			    cbp->cb_buf.bio_error : EIO;
829 		}
830 		printf("ccd%d: error %d on component %d block %jd "
831 		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
832 		    cbp->cb_comp,
833 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
834 		    msg);
835 	}
836 
837 	/*
838 	 * Process mirror.  If we are writing, I/O has been initiated on both
839 	 * buffers and we fall through only after both are finished.
840 	 *
841 	 * If we are reading only one I/O is initiated at a time.  If an
842 	 * error occurs we initiate the second I/O and return, otherwise
843 	 * we free the second I/O without initiating it.
844 	 */
845 
846 	if (cs->sc_cflags & CCDF_MIRROR) {
847 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
848 			/*
849 			 * When writing, handshake with the second buffer
850 			 * to determine when both are done.  If both are not
851 			 * done, return here.
852 			 */
853 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
854 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
855 				free(cbp, M_CCD);
856 				return;
857 			}
858 		} else {
859 			/*
860 			 * When reading, either dispose of the second buffer
861 			 * or initiate I/O on the second buffer if an error
862 			 * occurred with this one.
863 			 */
864 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
865 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
866 					cbp->cb_mirror->cb_pflags |=
867 					    CCDPF_MIRROR_DONE;
868 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
869 					free(cbp, M_CCD);
870 					return;
871 				} else {
872 					free(cbp->cb_mirror, M_CCD);
873 				}
874 			}
875 		}
876 	}
877 
878 	/*
879 	 * Use bio_caller1 to determine how big the original request was rather
880 	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
881 	 *
882 	 * XXX We check for an error, but we do not test the resid for an
883 	 * aligned EOF condition.  This may result in character & block
884 	 * device access not recognizing EOF properly when read or written
885 	 * sequentially, but will not affect filesystems.
886 	 */
887 	count = (long)cbp->cb_buf.bio_caller1;
888 	free(cbp, M_CCD);
889 
890 	/*
891 	 * If all done, "interrupt".
892 	 */
893 	bp->bio_resid -= count;
894 	if (bp->bio_resid < 0)
895 		panic("ccdiodone: count");
896 	if (bp->bio_resid == 0) {
897 		if (bp->bio_flags & BIO_ERROR)
898 			bp->bio_resid = bp->bio_bcount;
899 		biodone(bp);
900 	}
901 }
902 
903 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
904 
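/*
 * Ioctl handler for the ccd.ctl control device created in ccdattach().
 * Configuration requests (typically issued by ccdconfig(8)) arrive here.
 */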
905 static int
906 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
907 {
908 	struct ccd_ioctl *ccio;
909 	u_int unit;
910 	dev_t dev2;
911 	int error;
912 
913 	switch (cmd) {
914 	case CCDIOCSET:
915 	case CCDIOCCLR:
916 		ccio = (struct ccd_ioctl *)data;
917 		unit = ccio->ccio_size;
918 		return (ccdioctltoo(unit, cmd, data, flag, td));
919 	case CCDCONFINFO:
920 		{
921 		int ninit = 0;
922 		struct ccdconf *conf = (struct ccdconf *)data;
923 		struct ccd_s *tmpcs;
924 		struct ccd_s *ubuf = conf->buffer;
925 
926 		/* XXX: LOCK(unique unit numbers) */
927 		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
928 			if (IS_INITED(tmpcs))
929 				ninit++;
930 
931 		if (conf->size == 0) {
932 			conf->size = sizeof(struct ccd_s) * ninit;
933 			return (0);
934 		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
935 		    (conf->size % sizeof(struct ccd_s) != 0)) {
936 			/* XXX: UNLOCK(unique unit numbers) */
937 			return (EINVAL);
938 		}
939 
940 		ubuf += ninit;
941 		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
942 			if (!IS_INITED(tmpcs))
943 				continue;
944 			error = copyout(tmpcs, --ubuf,
945 			    sizeof(struct ccd_s));
946 			if (error != 0)
947 				/* XXX: UNLOCK(unique unit numbers) */
948 				return (error);
949 		}
950 		/* XXX: UNLOCK(unique unit numbers) */
951 		return (0);
952 		}
953 
954 	case CCDCPPINFO:
955 		{
956 		struct ccdcpps *cpps = (struct ccdcpps *)data;
957 		char *ubuf = cpps->buffer;
958 		struct ccd_s *cs;
959 
960 
961 		error = copyin(ubuf, &unit, sizeof (unit));
962 		if (error)
963 			return (error);
964 
965 		if (!IS_ALLOCATED(unit))
966 			return (ENXIO);
967 		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
968 		cs = ccdfind(unit);
969 		if (!IS_INITED(cs))
970 			return (ENXIO);
971 
972 		{
973 			int len = 0, i;
974 			struct ccdcpps *cpps = (struct ccdcpps *)data;
975 			char *ubuf = cpps->buffer;
976 
977 
978 			for (i = 0; i < cs->sc_nccdisks; ++i)
979 				len += cs->sc_cinfo[i].ci_pathlen;
980 
981 			if (cpps->size < len)
982 				return (ENOMEM);
983 
984 			for (i = 0; i < cs->sc_nccdisks; ++i) {
985 				len = cs->sc_cinfo[i].ci_pathlen;
986 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
987 				    len);
988 				if (error != 0)
989 					return (error);
990 				ubuf += len;
991 			}
992 			return(copyout("", ubuf, 1));
993 		}
994 		break;
995 		}
996 
997 	default:
998 		return (ENXIO);
999 	}
1000 }
1001 
1002 static int
1003 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1004 {
1005 	int i, j, lookedup = 0, error = 0;
1006 	struct ccd_s *cs;
1007 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1008 	struct ccdgeom *ccg;
1009 	char **cpp;
1010 	struct vnode **vpp;
1011 
1012 	cs = ccdfind(unit);
1013 	switch (cmd) {
1014 	case CCDIOCSET:
1015 		if (cs == NULL)
1016 			cs = ccdnew(unit);
1017 		if (IS_INITED(cs))
1018 			return (EBUSY);
1019 
1020 		if ((flag & FWRITE) == 0)
1021 			return (EBADF);
1022 
1023 		if ((error = ccdlock(cs)) != 0)
1024 			return (error);
1025 
1026 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1027 			return (EINVAL);
1028 
1029 		/* Fill in some important bits. */
1030 		cs->sc_ileave = ccio->ccio_ileave;
1031 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1032 			printf("ccd%d: disabling mirror, interleave is 0\n",
1033 			    unit);
1034 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1035 		}
1036 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1037 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1038 			printf("ccd%d: mirror/parity forces uniform flag\n",
1039 			       unit);
1040 			ccio->ccio_flags |= CCDF_UNIFORM;
1041 		}
1042 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1043 
1044 		/*
1045 		 * Allocate space for and copy in the array of
1046 		 * component pathnames and device numbers.
1047 		 */
1048 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1049 		    M_CCD, M_WAITOK);
1050 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1051 		    M_CCD, M_WAITOK);
1052 
1053 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1054 		    ccio->ccio_ndisks * sizeof(char **));
1055 		if (error) {
1056 			free(vpp, M_CCD);
1057 			free(cpp, M_CCD);
1058 			ccdunlock(cs);
1059 			return (error);
1060 		}
1061 
1062 
1063 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1064 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1065 				for (j = 0; j < lookedup; ++j)
1066 					(void)vn_close(vpp[j], FREAD|FWRITE,
1067 					    td->td_ucred, td);
1068 				free(vpp, M_CCD);
1069 				free(cpp, M_CCD);
1070 				ccdunlock(cs);
1071 				return (error);
1072 			}
1073 			++lookedup;
1074 		}
1075 		cs->sc_vpp = vpp;
1076 		cs->sc_nccdisks = ccio->ccio_ndisks;
1077 
1078 		/*
1079 		 * Initialize the ccd.  Fills in the softc for us.
1080 		 */
1081 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1082 			for (j = 0; j < lookedup; ++j)
1083 				(void)vn_close(vpp[j], FREAD|FWRITE,
1084 				    td->td_ucred, td);
1085 			/*
1086 			 * We can't ccddestroy() cs just yet, because nothing
1087 			 * prevents a user-level app from doing another ioctl()
1088 			 * without closing the device first, therefore
1089 			 * declare unit null and void and let ccdclose()
1090 			 * destroy it when it is safe to do so.
1091 			 */
1092 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1093 			free(vpp, M_CCD);
1094 			free(cpp, M_CCD);
1095 			ccdunlock(cs);
1096 			return (error);
1097 		}
1098 		free(cpp, M_CCD);
1099 
1100 		/*
1101 		 * The ccd has been successfully initialized, so
1102 		 * we can fill in the ioctl reply and create its disk device.
1103 		 */
1104 		ccio->ccio_unit = unit;
1105 		ccio->ccio_size = cs->sc_size;
1106 		ccg = &cs->sc_geom;
1107 		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1108 		    M_ZERO | M_WAITOK);
1109 		cs->sc_disk->d_strategy = ccdstrategy;
1110 		cs->sc_disk->d_name = "ccd";
1111 		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1112 		cs->sc_disk->d_mediasize =
1113 		    cs->sc_size * (off_t)ccg->ccg_secsize;
1114 		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1115 		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1116 		cs->sc_disk->d_drv1 = cs;
1117 		cs->sc_disk->d_maxsize = MAXPHYS;
1118 		disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1119 
1120 		ccdunlock(cs);
1121 
1122 		break;
1123 
1124 	case CCDIOCCLR:
1125 		if (cs == NULL)
1126 			return (ENXIO);
1127 
1128 		if (!IS_INITED(cs))
1129 			return (ENXIO);
1130 
1131 		if ((flag & FWRITE) == 0)
1132 			return (EBADF);
1133 
1134 		if ((error = ccdlock(cs)) != 0)
1135 			return (error);
1136 
1137 		/* Don't unconfigure if any other partitions are open */
1138 		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1139 			ccdunlock(cs);
1140 			return (EBUSY);
1141 		}
1142 
1143 		disk_destroy(cs->sc_disk);
1144 		free(cs->sc_disk, M_CCD);
1145 		cs->sc_disk = NULL;
1146 		/* Declare unit null and void (reset all flags) */
1147 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1148 
1149 		/* Close the components and free their pathnames. */
1150 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1151 			/*
1152 			 * XXX: this close could potentially fail and
1153 			 * cause Bad Things.  Maybe we need to force
1154 			 * the close to happen?
1155 			 */
1156 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1157 			    td->td_ucred, td);
1158 			free(cs->sc_cinfo[i].ci_path, M_CCD);
1159 		}
1160 
1161 		/* Free interleave index. */
1162 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1163 			free(cs->sc_itable[i].ii_index, M_CCD);
1164 
1165 		/* Free component info and interleave table. */
1166 		free(cs->sc_cinfo, M_CCD);
1167 		free(cs->sc_itable, M_CCD);
1168 		free(cs->sc_vpp, M_CCD);
1169 
1170 		/* This must be atomic. */
1171 		ccdunlock(cs);
1172 		ccddestroy(cs);
1173 
1174 		break;
1175 	}
1176 
1177 	return (0);
1178 }
1179 
1180 
1181 /*
1182  * Lookup the provided name in the filesystem.  If the file exists,
1183  * is a valid block device, and isn't being used by anyone else,
1184  * set *vpp to the file's vnode.
1185  */
1186 static int
1187 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1188 {
1189 	struct nameidata nd;
1190 	struct vnode *vp;
1191 	int error, flags;
1192 
1193 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1194 	flags = FREAD | FWRITE;
1195 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1196 		return (error);
1197 	}
1198 	vp = nd.ni_vp;
1199 
1200 	if (vrefcnt(vp) > 1) {
1201 		error = EBUSY;
1202 		goto bad;
1203 	}
1204 
1205 	if (!vn_isdisk(vp, &error))
1206 		goto bad;
1207 
1208 
1209 	VOP_UNLOCK(vp, 0, td);
1210 	NDFREE(&nd, NDF_ONLY_PNBUF);
1211 	*vpp = vp;
1212 	return (0);
1213 bad:
1214 	VOP_UNLOCK(vp, 0, td);
1215 	NDFREE(&nd, NDF_ONLY_PNBUF);
1216 	/* vn_close does vrele() for vp */
1217 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1218 	return (error);
1219 }
1220 
1221 /*
1222  *
1223  * Wait interruptibly for an exclusive lock.
1224  *
1225  * XXX
1226  * Several drivers do this; it should be abstracted and made MP-safe.
1227  */
1228 static int
1229 ccdlock(struct ccd_s *cs)
1230 {
1231 	int error;
1232 
1233 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1234 		cs->sc_flags |= CCDF_WANTED;
1235 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1236 			return (error);
1237 	}
1238 	cs->sc_flags |= CCDF_LOCKED;
1239 	return (0);
1240 }
1241 
1242 /*
1243  * Unlock and wake up any waiters.
1244  */
1245 static void
1246 ccdunlock(struct ccd_s *cs)
1247 {
1248 
1249 	cs->sc_flags &= ~CCDF_LOCKED;
1250 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1251 		cs->sc_flags &= ~CCDF_WANTED;
1252 		wakeup(cs);
1253 	}
1254 }
1255