xref: /freebsd/sys/geom/geom_ccd.c (revision 729362425c09cf6b362366aabc6fb547eee8035a)
1 /*
2  * Copyright (c) 2003 Poul-Henning Kamp.
3  * Copyright (c) 1995 Jason R. Thorpe.
4  * Copyright (c) 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * All rights reserved.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed for the NetBSD Project
24  *	by Jason R. Thorpe.
25  * 4. The names of the authors may not be used to endorse or promote products
26  *    derived from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * Dynamic configuration and disklabel support by:
41  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42  *	Numerical Aerodynamic Simulation Facility
43  *	Mail Stop 258-6
44  *	NASA Ames Research Center
45  *	Moffett Field, CA 94035
46  *
47  * from: Utah $Hdr: cd.c 1.6 90/11/28$
48  *
49  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50  *
51  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52  *
53  * $FreeBSD$
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/module.h>
60 #include <sys/proc.h>
61 #include <sys/bio.h>
62 #include <sys/malloc.h>
63 #include <sys/namei.h>
64 #include <sys/conf.h>
65 #include <sys/stat.h>
66 #include <sys/sysctl.h>
67 #include <sys/disk.h>
68 #include <sys/fcntl.h>
69 #include <sys/vnode.h>
70 
71 #include <sys/ccdvar.h>
72 
73 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
74 
75 /*
76    This is how mirroring works (only writes are special):
77 
78    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
79    linked together by the cb_mirror field.  "cb_pflags &
80    CCDPF_MIRROR_DONE" is set to 0 on both of them.
81 
82    When a component returns to ccdiodone(), it checks if "cb_pflags &
83    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
84    flag and returns.  If it is, it means its partner has already
85    returned, so it will go to the regular cleanup.
86 
87  */
88 
/*
 * Per-component request state.  One ccdbuf is allocated for each slice
 * of an original bio directed at a single component; for mirrored
 * configurations two are allocated and linked through cb_mirror.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* back-pointer to owning softc */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
98 
99 /* bits in cb_pflags */
100 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
101 
/* convenient macros for often-used statements */
103 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
104 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
105 
106 static dev_t	ccdctldev;
107 
108 static disk_strategy_t ccdstrategy;
109 static d_ioctl_t ccdctlioctl;
110 
111 #define NCCDFREEHIWAT	16
112 
113 #define CDEV_MAJOR 74
114 
115 static struct cdevsw ccdctl_cdevsw = {
116 	.d_open =	nullopen,
117 	.d_close =	nullclose,
118 	.d_ioctl =	ccdctlioctl,
119 	.d_name =	"ccdctl",
120 	.d_maj =	CDEV_MAJOR,
121 };
122 
123 static LIST_HEAD(, ccd_s) ccd_softc_list =
124 	LIST_HEAD_INITIALIZER(&ccd_softc_list);
125 
126 static struct ccd_s *ccdfind(int);
127 static struct ccd_s *ccdnew(int);
128 static int ccddestroy(struct ccd_s *);
129 
130 /* called during module initialization */
131 static void ccdattach(void);
132 static int ccd_modevent(module_t, int, void *);
133 
134 /* called by biodone() at interrupt time */
135 static void ccdiodone(struct bio *bp);
136 
137 static void ccdstart(struct ccd_s *, struct bio *);
138 static void ccdinterleave(struct ccd_s *, int);
139 static int ccdinit(struct ccd_s *, char **, struct thread *);
140 static int ccdlookup(char *, struct thread *p, struct vnode **);
141 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
142 		      struct bio *, daddr_t, caddr_t, long);
143 static int ccdlock(struct ccd_s *);
144 static void ccdunlock(struct ccd_s *);
145 
146 
147 /*
 * Number of blocks to leave untouched in front of a component partition.
149  * This is to avoid violating its disklabel area when it starts at the
150  * beginning of the slice.
151  */
152 #if !defined(CCD_OFFSET)
153 #define CCD_OFFSET 16
154 #endif
155 
156 static struct ccd_s *
157 ccdfind(int unit)
158 {
159 	struct ccd_s *sc = NULL;
160 
161 	/* XXX: LOCK(unique unit numbers) */
162 	LIST_FOREACH(sc, &ccd_softc_list, list) {
163 		if (sc->sc_unit == unit)
164 			break;
165 	}
166 	/* XXX: UNLOCK(unique unit numbers) */
167 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
168 }
169 
170 static struct ccd_s *
171 ccdnew(int unit)
172 {
173 	struct ccd_s *sc;
174 
175 	/* XXX: LOCK(unique unit numbers) */
176 	if (IS_ALLOCATED(unit) || unit > 32)
177 		return (NULL);
178 
179 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
180 	sc->sc_unit = unit;
181 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
182 	/* XXX: UNLOCK(unique unit numbers) */
183 	return (sc);
184 }
185 
/*
 * Unlink a softc from the global list and release its memory.
 * Always returns 0.
 */
static int
ccddestroy(struct ccd_s *sc)
{

	/* XXX: LOCK(unique unit numbers) */
	LIST_REMOVE(sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	FREE(sc, M_CCD);
	return (0);
}
196 
/*
 * Called from ccd_modevent() at module load time.  Creates the
 * control device node through which ccd units are configured.
 */
static void
ccdattach()
{

	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
	/*
	 * NOTE(review): si_drv1 is set to point at the device itself;
	 * no consumer of this self-pointer is visible in this file —
	 * confirm it is still needed.
	 */
	ccdctldev->si_drv1 = ccdctldev;
}
209 
210 static int
211 ccd_modevent(module_t mod, int type, void *data)
212 {
213 	int error = 0;
214 
215 	switch (type) {
216 	case MOD_LOAD:
217 		ccdattach();
218 		break;
219 
220 	case MOD_UNLOAD:
221 		printf("ccd0: Unload not supported!\n");
222 		error = EOPNOTSUPP;
223 		break;
224 
225 	case MOD_SHUTDOWN:
226 		break;
227 
228 	default:
229 		error = EOPNOTSUPP;
230 	}
231 	return (error);
232 }
233 
234 DEV_MODULE(ccd, ccd_modevent, NULL);
235 
/*
 * Initialize a ccd from its (already vnode-opened) components.
 *
 * Sizes each component via DIOCGMEDIASIZE/DIOCGSECTORSIZE, records the
 * component pathnames, enforces the interleave/mirror/uniform
 * constraints, builds the interleave table and a pseudo-geometry, and
 * marks the softc CCDF_INITED.
 *
 * On failure, per-component allocations and the softc itself are
 * released (via ccddestroy()) and an errno value is returned.
 */
static int
ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
{
	struct ccdcinfo *ci = NULL;	/* XXX */
	size_t size;			/* component size, DEV_BSIZE blocks */
	int ix;
	struct vnode *vp;
	size_t minsize;			/* size of the smallest component */
	int maxsecsize;			/* largest component sector size */
	struct ccdgeom *ccg = &cs->sc_geom;
	char *tmppath = NULL;
	int error = 0;
	off_t mediasize;
	u_int sectorsize;


	cs->sc_size = 0;

	/* Allocate space for the component info. */
	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
	    M_CCD, M_WAITOK);

	/*
	 * Verify that each component piece exists and record
	 * relevant information about it.
	 */
	maxsecsize = 0;
	minsize = 0;
	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
		vp = cs->sc_vpp[ix];
		ci = &cs->sc_cinfo[ix];
		ci->ci_vp = vp;

		/*
		 * Copy in the pathname of the component.
		 */
		if ((error = copyinstr(cpaths[ix], tmppath,
		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
			goto fail;
		}
		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);

		ci->ci_dev = vn_todev(vp);

		/*
		 * Get the media size of the component.
		 */
		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
		    FREAD, td->td_ucred, td);
		if (error != 0) {
			goto fail;
		}
		/*
		 * Get the sector size of the component.
		 */
		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
		    FREAD, td->td_ucred, td);
		if (error != 0) {
			goto fail;
		}
		if (sectorsize > maxsecsize)
			maxsecsize = sectorsize;
		/* Reserve CCD_OFFSET blocks at the front of the component. */
		size = mediasize / DEV_BSIZE - CCD_OFFSET;

		/*
		 * Calculate the size, truncating to an interleave
		 * boundary if necessary.
		 */

		if (cs->sc_ileave > 1)
			size -= size % cs->sc_ileave;

		if (size == 0) {
			error = ENODEV;
			goto fail;
		}

		if (minsize == 0 || size < minsize)
			minsize = size;
		ci->ci_size = size;
		cs->sc_size += size;
	}

	free(tmppath, M_CCD);
	tmppath = NULL;

	/*
	 * Don't allow the interleave to be smaller than
	 * the biggest component sector.
	 */
	if ((cs->sc_ileave > 0) &&
	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
		error = EINVAL;
		goto fail;
	}

	/*
	 * If uniform interleave is desired set all sizes to that of
	 * the smallest component.  This will guarantee that a single
	 * interleave table is generated.
	 *
	 * Lost space must be taken into account when calculating the
	 * overall size.  Half the space is lost when CCDF_MIRROR is
	 * specified.
	 */
	if (cs->sc_flags & CCDF_UNIFORM) {
		for (ci = cs->sc_cinfo;
		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			ci->ci_size = minsize;
		}
		if (cs->sc_flags & CCDF_MIRROR) {
			/*
			 * Check to see if an even number of components
			 * have been specified.  The interleave must also
			 * be non-zero in order for us to be able to
			 * guarantee the topology.
			 */
			if (cs->sc_nccdisks % 2) {
				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
				error = EINVAL;
				goto fail;
			}
			if (cs->sc_ileave == 0) {
				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
				error = EINVAL;
				goto fail;
			}
			/* Half the blocks hold the mirror copies. */
			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
		} else {
			if (cs->sc_ileave == 0) {
				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
				error = EINVAL;
				goto fail;
			}
			cs->sc_size = cs->sc_nccdisks * minsize;
		}
	}

	/*
	 * Construct the interleave table.
	 */
	ccdinterleave(cs, cs->sc_unit);

	/*
	 * Create pseudo-geometry based on 1MB cylinders.  It's
	 * pretty close.
	 */
	ccg->ccg_secsize = maxsecsize;
	ccg->ccg_ntracks = 1;
	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;

	cs->sc_flags |= CCDF_INITED;
	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
	return (0);
fail:
	/*
	 * Walk back down the component array freeing the path strings
	 * allocated so far.
	 *
	 * NOTE(review): if the failure happened after ci_path was
	 * allocated for the component 'ci' currently points at (or
	 * after the sizing loop completed), that entry's path string is
	 * never freed by this walk-down — looks like a small memory
	 * leak; confirm.
	 */
	while (ci > cs->sc_cinfo) {
		ci--;
		free(ci->ci_path, M_CCD);
	}
	if (tmppath != NULL)
		free(tmppath, M_CCD);
	free(cs->sc_cinfo, M_CCD);
	ccddestroy(cs);
	return (error);
}
404 
/*
 * Build the interleave table for the ccd.
 *
 * With no interleave (sc_ileave == 0) each table entry maps one whole
 * component.  Otherwise one entry is produced per distinct component
 * size: each entry describes a band of the address space in which all
 * still-remaining components participate.  The table is terminated by
 * an entry with ii_ndisk == 0.
 */
static void
ccdinterleave(struct ccd_s *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;


	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
	    M_WAITOK | M_ZERO);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* Terminator entry. */
		ii->ii_ndisk = 0;
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_CCD, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			free(ii->ii_index, M_CCD);
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
}
514 
/*
 * Disk strategy entry point: validate the request against the ccd's
 * size, truncate at EOF if needed, and hand it to ccdstart().
 */
static void
ccdstrategy(struct bio *bp)
{
	struct ccd_s *cs;
	int pbn;        /* in sc_secsize chunks */
	long sz;        /* in sc_secsize chunks */

	cs = bp->bio_disk->d_drv1;

	/*
	 * NOTE(review): pbn is an int while bio_blkno is a daddr_t;
	 * a sufficiently large ccd could overflow this — confirm
	 * against the supported maximum media size.
	 */
	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);

	/*
	 * If out of bounds return an error. If at the EOF point,
	 * simply read or write less.
	 */

	if (pbn < 0 || pbn >= cs->sc_size) {
		bp->bio_resid = bp->bio_bcount;
		if (pbn != cs->sc_size)
			biofinish(bp, NULL, EINVAL);
		else
			biodone(bp);	/* exactly at EOF: zero-length success */
		return;
	}

	/*
	 * If the request crosses EOF, truncate the request.
	 */
	if (pbn + sz > cs->sc_size) {
		bp->bio_bcount = (cs->sc_size - pbn) *
		    cs->sc_geom.ccg_secsize;
	}

	bp->bio_resid = bp->bio_bcount;

	/*
	 * "Start" the unit.
	 */
	ccdstart(cs, bp);
	return;
}
557 
/*
 * Carve the original request up into per-component requests with
 * ccdbuffer() and dispatch them.  For mirrored writes both halves are
 * started; for mirrored reads one disk is chosen by arm proximity.
 */
static void
ccdstart(struct ccd_s *cs, struct bio *bp)
{
	long bcount, rcount;
	struct ccdbuf *cbp[2];
	caddr_t addr;
	daddr_t bn;
	int err;

	/*
	 * Translate the partition-relative block number to an absolute.
	 */
	bn = bp->bio_blkno;

	/*
	 * Allocate component buffers and fire off the requests
	 */
	addr = bp->bio_data;
	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
		if (err) {
			printf("ccdbuffer error %d\n", err);
			/* We're screwed */
			/*
			 * NOTE(review): the bio is flagged as failed but
			 * biodone() is not called here; if this was the
			 * first slice no component I/O is in flight to
			 * complete it — confirm against the accounting in
			 * ccdiodone().
			 */
			bp->bio_resid -= bcount;
			bp->bio_error = ENOMEM;
			bp->bio_flags |= BIO_ERROR;
			return;
		}
		rcount = cbp[0]->cb_buf.bio_bcount;

		if (cs->sc_cflags & CCDF_MIRROR) {
			/*
			 * Mirroring.  Writes go to both disks, reads are
			 * taken from whichever disk seems most appropriate.
			 *
			 * We attempt to localize reads to the disk whose arm
			 * is nearest the read request.  We ignore seeks due
			 * to writes when making this determination and we
			 * also try to avoid hogging.
			 */
			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
				BIO_STRATEGY(&cbp[0]->cb_buf);
				BIO_STRATEGY(&cbp[1]->cb_buf);
			} else {
				int pick = cs->sc_pick;
				daddr_t range = cs->sc_size / 16;

				/*
				 * Switch disks when the request falls
				 * outside the current disk's locality
				 * window.
				 */
				if (bn < cs->sc_blk[pick] - range ||
				    bn > cs->sc_blk[pick] + range
				) {
					cs->sc_pick = pick = 1 - pick;
				}
				cs->sc_blk[pick] = bn + btodb(rcount);
				BIO_STRATEGY(&cbp[pick]->cb_buf);
			}
		} else {
			/*
			 * Not mirroring
			 */
			BIO_STRATEGY(&cbp[0]->cb_buf);
		}
		bn += btodb(rcount);
		addr += rcount;
	}
}
623 
/*
 * Build a component buffer header.
 *
 * Map the ccd-relative block 'bn' onto a component: choose the
 * component, the component-relative block, and the transfer length
 * (clipped at the interleave or component boundary).  The resulting
 * ccdbuf is returned in cb[0]; for mirrors a second, linked ccdbuf is
 * returned in cb[1].  Returns 0 on success or ENOMEM.
 */
static int
ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->cb_buf.bio_cmd = bp->bio_cmd;
	cbp->cb_buf.bio_done = ccdiodone;
	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
	/* CCD_OFFSET skips the reserved blocks at the component front. */
	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.bio_data = addr;
	cbp->cb_buf.bio_caller2 = cbp;
	if (cs->sc_ileave == 0)
		cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
	/* Stash the untruncated length for ccdiodone()'s accounting. */
	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_softc = cs;
	cbp->cb_comp = ci - cs->sc_cinfo;

	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/* mirror, setup second I/O */
		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
		if (cbp == NULL) {
			free(cb[0], M_CCD);
			cb[0] = NULL;
			return (ENOMEM);
		}
		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
		cbp->cb_buf.bio_caller2 = cbp;
		cbp->cb_buf.bio_dev = ci2->ci_dev;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
	return (0);
}
784 
/*
 * Called at interrupt time when a component I/O completes.
 * Handles mirror retry/handshake, accounts the completed bytes
 * against the original bio, and finishes it when all slices are done.
 */
static void
ccdiodone(struct bio *ibp)
{
	struct ccdbuf *cbp;
	struct bio *bp;		/* the original request */
	struct ccd_s *cs;
	int count;

	cbp = ibp->bio_caller2;
	cs = cbp->cb_softc;
	bp = cbp->cb_obp;
	/*
	 * If an error occurred, report it.  If this is a mirrored
	 * configuration and the first of two possible reads, do not
	 * set the error in the bp yet because the second read may
	 * succeed.
	 */

	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
		const char *msg = "";

		if ((cs->sc_cflags & CCDF_MIRROR) &&
		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
			/*
			 * We will try our read on the other disk down
			 * below, also reverse the default pick so if we
			 * are doing a scan we do not keep hitting the
			 * bad disk first.
			 */

			msg = ", trying other disk";
			cs->sc_pick = 1 - cs->sc_pick;
			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
		} else {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = cbp->cb_buf.bio_error ?
			    cbp->cb_buf.bio_error : EIO;
		}
		printf("ccd%d: error %d on component %d block %jd "
		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
		    cbp->cb_comp,
		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
		    msg);
	}

	/*
	 * Process mirror.  If we are writing, I/O has been initiated on both
	 * buffers and we fall through only after both are finished.
	 *
	 * If we are reading only one I/O is initiated at a time.  If an
	 * error occurs we initiate the second I/O and return, otherwise
	 * we free the second I/O without initiating it.
	 */

	if (cs->sc_cflags & CCDF_MIRROR) {
		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
			/*
			 * When writing, handshake with the second buffer
			 * to determine when both are done.  If both are not
			 * done, return here.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
				free(cbp, M_CCD);
				return;
			}
		} else {
			/*
			 * When reading, either dispose of the second buffer
			 * or initiate I/O on the second buffer if an error
			 * occurred with this one.
			 */
			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
					cbp->cb_mirror->cb_pflags |=
					    CCDPF_MIRROR_DONE;
					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
					free(cbp, M_CCD);
					return;
				} else {
					free(cbp->cb_mirror, M_CCD);
				}
			}
		}
	}

	/*
	 * use bio_caller1 to determine how big the original request was rather
	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
	 *
	 * XXX We check for an error, but we do not test the resid for an
	 * aligned EOF condition.  This may result in character & block
	 * device access not recognizing EOF properly when read or written
	 * sequentially, but will not affect filesystems.
	 */
	count = (long)cbp->cb_buf.bio_caller1;
	free(cbp, M_CCD);

	/*
	 * If all done, "interrupt".
	 */
	bp->bio_resid -= count;
	if (bp->bio_resid < 0)
		panic("ccdiodone: count");
	if (bp->bio_resid == 0) {
		/* On error report the whole request as not transferred. */
		if (bp->bio_flags & BIO_ERROR)
			bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}
901 
902 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
903 
/*
 * Ioctl handler for the ccd control device.  Configuration requests
 * (CCDIOCSET/CCDIOCCLR) are forwarded to ccdioctltoo(); the two info
 * requests copy configuration data out to userland.
 */
static int
ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
	struct ccd_ioctl *ccio;
	u_int unit;
	dev_t dev2;
	int error;

	switch (cmd) {
	case CCDIOCSET:
	case CCDIOCCLR:
		ccio = (struct ccd_ioctl *)data;
		/*
		 * NOTE(review): the unit number is taken from ccio_size
		 * here — presumably the userland side passes it in that
		 * field (ccdioctltoo() later overwrites ccio_size with
		 * the real size); confirm against the ioctl ABI.
		 */
		unit = ccio->ccio_size;
		return (ccdioctltoo(unit, cmd, data, flag, td));
	case CCDCONFINFO:
		{
		int ninit = 0;
		struct ccdconf *conf = (struct ccdconf *)data;
		struct ccd_s *tmpcs;
		struct ccd_s *ubuf = conf->buffer;

		/* XXX: LOCK(unique unit numbers) */
		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
			if (IS_INITED(tmpcs))
				ninit++;

		/* size == 0 is a probe: report the space required. */
		if (conf->size == 0) {
			conf->size = sizeof(struct ccd_s) * ninit;
			return (0);
		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
		    (conf->size % sizeof(struct ccd_s) != 0)) {
			/* XXX: UNLOCK(unique unit numbers) */
			return (EINVAL);
		}

		/* Copy out the initialized softcs, filling from the end. */
		ubuf += ninit;
		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
			if (!IS_INITED(tmpcs))
				continue;
			error = copyout(tmpcs, --ubuf,
			    sizeof(struct ccd_s));
			if (error != 0)
				/* XXX: UNLOCK(unique unit numbers) */
				return (error);
		}
		/* XXX: UNLOCK(unique unit numbers) */
		return (0);
		}

	case CCDCPPINFO:
		{
		struct ccdcpps *cpps = (struct ccdcpps *)data;
		char *ubuf = cpps->buffer;
		struct ccd_s *cs;


		/* The first word of the user buffer selects the unit. */
		error = copyin(ubuf, &unit, sizeof (unit));
		if (error)
			return (error);

		if (!IS_ALLOCATED(unit))
			return (ENXIO);
		/*
		 * NOTE(review): dev2 is computed but never used below —
		 * looks like dead code; confirm makedev() has no needed
		 * side effect here before removing.
		 */
		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
		cs = ccdfind(unit);
		if (!IS_INITED(cs))
			return (ENXIO);

		{
			int len = 0, i;
			/*
			 * NOTE(review): these redeclarations shadow the
			 * outer cpps/ubuf with identical values.
			 */
			struct ccdcpps *cpps = (struct ccdcpps *)data;
			char *ubuf = cpps->buffer;


			/* Total space needed for all component paths. */
			for (i = 0; i < cs->sc_nccdisks; ++i)
				len += cs->sc_cinfo[i].ci_pathlen;

			if (cpps->size < len)
				return (ENOMEM);

			/* Copy the NUL-terminated paths out back to back. */
			for (i = 0; i < cs->sc_nccdisks; ++i) {
				len = cs->sc_cinfo[i].ci_pathlen;
				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
				    len);
				if (error != 0)
					return (error);
				ubuf += len;
			}
			/* Terminate the list with an empty string. */
			return(copyout("", ubuf, 1));
		}
		break;
		}

	default:
		return (ENXIO);
	}
}
1000 
1001 static int
1002 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1003 {
1004 	int i, j, lookedup = 0, error = 0;
1005 	struct ccd_s *cs;
1006 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1007 	struct ccdgeom *ccg;
1008 	char **cpp;
1009 	struct vnode **vpp;
1010 
1011 	cs = ccdfind(unit);
1012 	switch (cmd) {
1013 	case CCDIOCSET:
1014 		if (cs == NULL)
1015 			cs = ccdnew(unit);
1016 		if (IS_INITED(cs))
1017 			return (EBUSY);
1018 
1019 		if ((flag & FWRITE) == 0)
1020 			return (EBADF);
1021 
1022 		if ((error = ccdlock(cs)) != 0)
1023 			return (error);
1024 
1025 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1026 			return (EINVAL);
1027 
1028 		/* Fill in some important bits. */
1029 		cs->sc_ileave = ccio->ccio_ileave;
1030 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1031 			printf("ccd%d: disabling mirror, interleave is 0\n",
1032 			    unit);
1033 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1034 		}
1035 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1036 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1037 			printf("ccd%d: mirror/parity forces uniform flag\n",
1038 			       unit);
1039 			ccio->ccio_flags |= CCDF_UNIFORM;
1040 		}
1041 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1042 
1043 		/*
1044 		 * Allocate space for and copy in the array of
1045 		 * componet pathnames and device numbers.
1046 		 */
1047 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1048 		    M_CCD, M_WAITOK);
1049 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1050 		    M_CCD, M_WAITOK);
1051 
1052 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1053 		    ccio->ccio_ndisks * sizeof(char **));
1054 		if (error) {
1055 			free(vpp, M_CCD);
1056 			free(cpp, M_CCD);
1057 			ccdunlock(cs);
1058 			return (error);
1059 		}
1060 
1061 
1062 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1063 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1064 				for (j = 0; j < lookedup; ++j)
1065 					(void)vn_close(vpp[j], FREAD|FWRITE,
1066 					    td->td_ucred, td);
1067 				free(vpp, M_CCD);
1068 				free(cpp, M_CCD);
1069 				ccdunlock(cs);
1070 				return (error);
1071 			}
1072 			++lookedup;
1073 		}
1074 		cs->sc_vpp = vpp;
1075 		cs->sc_nccdisks = ccio->ccio_ndisks;
1076 
1077 		/*
1078 		 * Initialize the ccd.  Fills in the softc for us.
1079 		 */
1080 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1081 			for (j = 0; j < lookedup; ++j)
1082 				(void)vn_close(vpp[j], FREAD|FWRITE,
1083 				    td->td_ucred, td);
1084 			/*
1085 			 * We can't ccddestroy() cs just yet, because nothing
1086 			 * prevents user-level app to do another ioctl()
1087 			 * without closing the device first, therefore
1088 			 * declare unit null and void and let ccdclose()
1089 			 * destroy it when it is safe to do so.
1090 			 */
1091 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1092 			free(vpp, M_CCD);
1093 			free(cpp, M_CCD);
1094 			ccdunlock(cs);
1095 			return (error);
1096 		}
1097 		free(cpp, M_CCD);
1098 
1099 		/*
1100 		 * The ccd has been successfully initialized, so
1101 		 * we can place it into the array and read the disklabel.
1102 		 */
1103 		ccio->ccio_unit = unit;
1104 		ccio->ccio_size = cs->sc_size;
1105 		ccg = &cs->sc_geom;
1106 		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1107 		    M_ZERO | M_WAITOK);
1108 		cs->sc_disk->d_strategy = ccdstrategy;
1109 		cs->sc_disk->d_name = "ccd";
1110 		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1111 		cs->sc_disk->d_mediasize =
1112 		    cs->sc_size * (off_t)ccg->ccg_secsize;
1113 		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1114 		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1115 		cs->sc_disk->d_drv1 = cs;
1116 		cs->sc_disk->d_maxsize = MAXPHYS;
1117 		disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1118 
1119 		ccdunlock(cs);
1120 
1121 		break;
1122 
1123 	case CCDIOCCLR:
1124 		if (cs == NULL)
1125 			return (ENXIO);
1126 
1127 		if (!IS_INITED(cs))
1128 			return (ENXIO);
1129 
1130 		if ((flag & FWRITE) == 0)
1131 			return (EBADF);
1132 
1133 		if ((error = ccdlock(cs)) != 0)
1134 			return (error);
1135 
1136 		/* Don't unconfigure if any other partitions are open */
1137 		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1138 			ccdunlock(cs);
1139 			return (EBUSY);
1140 		}
1141 
1142 		disk_destroy(cs->sc_disk);
1143 		free(cs->sc_disk, M_CCD);
1144 		cs->sc_disk = NULL;
1145 		/* Declare unit null and void (reset all flags) */
1146 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1147 
1148 		/* Close the components and free their pathnames. */
1149 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1150 			/*
1151 			 * XXX: this close could potentially fail and
1152 			 * cause Bad Things.  Maybe we need to force
1153 			 * the close to happen?
1154 			 */
1155 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1156 			    td->td_ucred, td);
1157 			free(cs->sc_cinfo[i].ci_path, M_CCD);
1158 		}
1159 
1160 		/* Free interleave index. */
1161 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1162 			free(cs->sc_itable[i].ii_index, M_CCD);
1163 
1164 		/* Free component info and interleave table. */
1165 		free(cs->sc_cinfo, M_CCD);
1166 		free(cs->sc_itable, M_CCD);
1167 		free(cs->sc_vpp, M_CCD);
1168 
1169 		/* This must be atomic. */
1170 		ccdunlock(cs);
1171 		ccddestroy(cs);
1172 
1173 		break;
1174 	}
1175 
1176 	return (0);
1177 }
1178 
1179 
1180 /*
1181  * Lookup the provided name in the filesystem.  If the file exists,
1182  * is a valid block device, and isn't being used by anyone else,
1183  * set *vpp to the file's vnode.
1184  */
static int
ccdlookup(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	struct vnode *vp;
	int error, flags;

	/* Resolve the user-space pathname, following symlinks. */
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
	flags = FREAD | FWRITE;
	/* On success vn_open leaves the vnode locked and referenced. */
	if ((error = vn_open(&nd, &flags, 0)) != 0) {
		return (error);
	}
	vp = nd.ni_vp;

	/* Reject a component that anyone else already has open. */
	if (vrefcnt(vp) > 1) {
		error = EBUSY;
		goto bad;
	}

	/* Only disk devices may be components; vn_isdisk sets 'error'. */
	if (!vn_isdisk(vp, &error))
		goto bad;


	/* Success: return the vnode unlocked but still referenced. */
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	*vpp = vp;
	return (0);
bad:
	VOP_UNLOCK(vp, 0, td);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_close does vrele() for vp */
	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
	return (error);
}
1219 
1220 /*
1221 
1222  * Wait interruptibly for an exclusive lock.
1223  *
1224  * XXX
1225  * Several drivers do this; it should be abstracted and made MP-safe.
1226  */
1227 static int
1228 ccdlock(struct ccd_s *cs)
1229 {
1230 	int error;
1231 
1232 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1233 		cs->sc_flags |= CCDF_WANTED;
1234 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1235 			return (error);
1236 	}
1237 	cs->sc_flags |= CCDF_LOCKED;
1238 	return (0);
1239 }
1240 
1241 /*
1242  * Unlock and wake up any waiters.
1243  */
1244 static void
1245 ccdunlock(struct ccd_s *cs)
1246 {
1247 
1248 	cs->sc_flags &= ~CCDF_LOCKED;
1249 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1250 		cs->sc_flags &= ~CCDF_WANTED;
1251 		wakeup(cs);
1252 	}
1253 }
1254