xref: /freebsd/sys/geom/geom_ccd.c (revision f9218d3d4fd34f082473b3a021c6d4d109fb47cf)
1 /*
2  * Copyright (c) 2003 Poul-Henning Kamp.
3  * Copyright (c) 1995 Jason R. Thorpe.
4  * Copyright (c) 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * All rights reserved.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed for the NetBSD Project
24  *	by Jason R. Thorpe.
25  * 4. The names of the authors may not be used to endorse or promote products
26  *    derived from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * Dynamic configuration and disklabel support by:
41  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42  *	Numerical Aerodynamic Simulation Facility
43  *	Mail Stop 258-6
44  *	NASA Ames Research Center
45  *	Moffett Field, CA 94035
46  *
47  * from: Utah $Hdr: cd.c 1.6 90/11/28$
48  *
49  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50  *
51  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52  *
53  * $FreeBSD$
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/module.h>
60 #include <sys/proc.h>
61 #include <sys/bio.h>
62 #include <sys/malloc.h>
63 #include <sys/namei.h>
64 #include <sys/conf.h>
65 #include <sys/stat.h>
66 #include <sys/stdint.h>
67 #include <sys/sysctl.h>
68 #include <sys/disk.h>
69 #include <sys/devicestat.h>
70 #include <sys/fcntl.h>
71 #include <sys/vnode.h>
72 
73 #include <sys/ccdvar.h>
74 
75 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
76 
77 /*
78    This is how mirroring works (only writes are special):
79 
80    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
81    linked together by the cb_mirror field.  "cb_pflags &
82    CCDPF_MIRROR_DONE" is set to 0 on both of them.
83 
84    When a component returns to ccdiodone(), it checks if "cb_pflags &
85    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
86    flag and returns.  If it is, it means its partner has already
87    returned, so it will go to the regular cleanup.
88 
89  */
90 
91 struct ccdbuf {
92 	struct bio	cb_buf;		/* new I/O buf */
93 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
94 	struct ccdbuf	*cb_freenext;	/* free list link */
95 	struct ccd_s	*cb_softc;
96 	int		cb_comp;	/* target component */
97 	int		cb_pflags;	/* mirror/parity status flag */
98 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
99 };
100 
101 /* bits in cb_pflags */
102 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
103 
/* convenient macros for often-used statements */
105 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
106 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
107 
108 static dev_t	ccdctldev;
109 
110 static disk_strategy_t ccdstrategy;
111 static d_ioctl_t ccdctlioctl;
112 
113 #define NCCDFREEHIWAT	16
114 
115 #define CDEV_MAJOR 74
116 
117 static struct cdevsw ccdctl_cdevsw = {
118 	.d_open =	nullopen,
119 	.d_close =	nullclose,
120 	.d_ioctl =	ccdctlioctl,
121 	.d_name =	"ccdctl",
122 	.d_maj =	CDEV_MAJOR,
123 };
124 
125 static LIST_HEAD(, ccd_s) ccd_softc_list =
126 	LIST_HEAD_INITIALIZER(&ccd_softc_list);
127 
128 static struct ccd_s *ccdfind(int);
129 static struct ccd_s *ccdnew(int);
130 static int ccddestroy(struct ccd_s *);
131 
132 /* called during module initialization */
133 static void ccdattach(void);
134 static int ccd_modevent(module_t, int, void *);
135 
136 /* called by biodone() at interrupt time */
137 static void ccdiodone(struct bio *bp);
138 
139 static void ccdstart(struct ccd_s *, struct bio *);
140 static void ccdinterleave(struct ccd_s *, int);
141 static int ccdinit(struct ccd_s *, char **, struct thread *);
142 static int ccdlookup(char *, struct thread *p, struct vnode **);
143 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
144 		      struct bio *, daddr_t, caddr_t, long);
145 static int ccdlock(struct ccd_s *);
146 static void ccdunlock(struct ccd_s *);
147 
148 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This avoids overwriting the component's disklabel area
 * when the partition starts at the beginning of the slice.
 */
154 #if !defined(CCD_OFFSET)
155 #define CCD_OFFSET 16
156 #endif
157 
158 static struct ccd_s *
159 ccdfind(int unit)
160 {
161 	struct ccd_s *sc = NULL;
162 
163 	/* XXX: LOCK(unique unit numbers) */
164 	LIST_FOREACH(sc, &ccd_softc_list, list) {
165 		if (sc->sc_unit == unit)
166 			break;
167 	}
168 	/* XXX: UNLOCK(unique unit numbers) */
169 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
170 }
171 
172 static struct ccd_s *
173 ccdnew(int unit)
174 {
175 	struct ccd_s *sc;
176 
177 	/* XXX: LOCK(unique unit numbers) */
178 	if (IS_ALLOCATED(unit) || unit > 32)
179 		return (NULL);
180 
181 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
182 	sc->sc_unit = unit;
183 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
184 	/* XXX: UNLOCK(unique unit numbers) */
185 	return (sc);
186 }
187 
188 static int
189 ccddestroy(struct ccd_s *sc)
190 {
191 
192 	/* XXX: LOCK(unique unit numbers) */
193 	LIST_REMOVE(sc, list);
194 	/* XXX: UNLOCK(unique unit numbers) */
195 	FREE(sc, M_CCD);
196 	return (0);
197 }
198 
199 /*
200  * Called by main() during pseudo-device attachment.  All we need
201  * to do is to add devsw entries.
202  */
203 static void
204 ccdattach()
205 {
206 
207 	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
208 		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
209 	ccdctldev->si_drv1 = ccdctldev;
210 }
211 
212 static int
213 ccd_modevent(module_t mod, int type, void *data)
214 {
215 	int error = 0;
216 
217 	switch (type) {
218 	case MOD_LOAD:
219 		ccdattach();
220 		break;
221 
222 	case MOD_UNLOAD:
223 		printf("ccd0: Unload not supported!\n");
224 		error = EOPNOTSUPP;
225 		break;
226 
227 	case MOD_SHUTDOWN:
228 		break;
229 
230 	default:
231 		error = EOPNOTSUPP;
232 	}
233 	return (error);
234 }
235 
236 DEV_MODULE(ccd, ccd_modevent, NULL);
237 
238 static int
239 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
240 {
241 	struct ccdcinfo *ci = NULL;	/* XXX */
242 	size_t size;
243 	int ix;
244 	struct vnode *vp;
245 	size_t minsize;
246 	int maxsecsize;
247 	struct ccdgeom *ccg = &cs->sc_geom;
248 	char *tmppath = NULL;
249 	int error = 0;
250 	off_t mediasize;
251 	u_int sectorsize;
252 
253 
254 	cs->sc_size = 0;
255 
256 	/* Allocate space for the component info. */
257 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
258 	    M_CCD, M_WAITOK);
259 
260 	/*
261 	 * Verify that each component piece exists and record
262 	 * relevant information about it.
263 	 */
264 	maxsecsize = 0;
265 	minsize = 0;
266 	tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK);
267 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
268 		vp = cs->sc_vpp[ix];
269 		ci = &cs->sc_cinfo[ix];
270 		ci->ci_vp = vp;
271 
272 		/*
273 		 * Copy in the pathname of the component.
274 		 */
275 		if ((error = copyinstr(cpaths[ix], tmppath,
276 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
277 			goto fail;
278 		}
279 		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK);
280 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
281 
282 		ci->ci_dev = vn_todev(vp);
283 
284 		/*
285 		 * Get partition information for the component.
286 		 */
287 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
288 		    FREAD, td->td_ucred, td);
289 		if (error != 0) {
290 			goto fail;
291 		}
292 		/*
293 		 * Get partition information for the component.
294 		 */
295 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
296 		    FREAD, td->td_ucred, td);
297 		if (error != 0) {
298 			goto fail;
299 		}
300 		if (sectorsize > maxsecsize)
301 			maxsecsize = sectorsize;
302 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
303 
304 		/*
305 		 * Calculate the size, truncating to an interleave
306 		 * boundary if necessary.
307 		 */
308 
309 		if (cs->sc_ileave > 1)
310 			size -= size % cs->sc_ileave;
311 
312 		if (size == 0) {
313 			error = ENODEV;
314 			goto fail;
315 		}
316 
317 		if (minsize == 0 || size < minsize)
318 			minsize = size;
319 		ci->ci_size = size;
320 		cs->sc_size += size;
321 	}
322 
323 	free(tmppath, M_CCD);
324 	tmppath = NULL;
325 
326 	/*
327 	 * Don't allow the interleave to be smaller than
328 	 * the biggest component sector.
329 	 */
330 	if ((cs->sc_ileave > 0) &&
331 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
332 		error = EINVAL;
333 		goto fail;
334 	}
335 
336 	/*
337 	 * If uniform interleave is desired set all sizes to that of
338 	 * the smallest component.  This will guarentee that a single
339 	 * interleave table is generated.
340 	 *
341 	 * Lost space must be taken into account when calculating the
342 	 * overall size.  Half the space is lost when CCDF_MIRROR is
343 	 * specified.
344 	 */
345 	if (cs->sc_flags & CCDF_UNIFORM) {
346 		for (ci = cs->sc_cinfo;
347 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
348 			ci->ci_size = minsize;
349 		}
350 		if (cs->sc_flags & CCDF_MIRROR) {
351 			/*
352 			 * Check to see if an even number of components
353 			 * have been specified.  The interleave must also
354 			 * be non-zero in order for us to be able to
355 			 * guarentee the topology.
356 			 */
357 			if (cs->sc_nccdisks % 2) {
358 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
359 				error = EINVAL;
360 				goto fail;
361 			}
362 			if (cs->sc_ileave == 0) {
363 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
364 				error = EINVAL;
365 				goto fail;
366 			}
367 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
368 		} else {
369 			if (cs->sc_ileave == 0) {
370 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
371 				error = EINVAL;
372 				goto fail;
373 			}
374 			cs->sc_size = cs->sc_nccdisks * minsize;
375 		}
376 	}
377 
378 	/*
379 	 * Construct the interleave table.
380 	 */
381 	ccdinterleave(cs, cs->sc_unit);
382 
383 	/*
384 	 * Create pseudo-geometry based on 1MB cylinders.  It's
385 	 * pretty close.
386 	 */
387 	ccg->ccg_secsize = maxsecsize;
388 	ccg->ccg_ntracks = 1;
389 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
390 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
391 
392 	/*
393 	 * Add a devstat entry for this device.
394 	 */
395 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
396 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
397 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
398 			  DEVSTAT_PRIORITY_ARRAY);
399 
400 	cs->sc_flags |= CCDF_INITED;
401 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
402 	return (0);
403 fail:
404 	while (ci > cs->sc_cinfo) {
405 		ci--;
406 		free(ci->ci_path, M_CCD);
407 	}
408 	if (tmppath != NULL)
409 		free(tmppath, M_CCD);
410 	free(cs->sc_cinfo, M_CCD);
411 	ccddestroy(cs);
412 	return (error);
413 }
414 
415 static void
416 ccdinterleave(struct ccd_s *cs, int unit)
417 {
418 	struct ccdcinfo *ci, *smallci;
419 	struct ccdiinfo *ii;
420 	daddr_t bn, lbn;
421 	int ix;
422 	u_long size;
423 
424 
425 	/*
426 	 * Allocate an interleave table.  The worst case occurs when each
427 	 * of N disks is of a different size, resulting in N interleave
428 	 * tables.
429 	 *
430 	 * Chances are this is too big, but we don't care.
431 	 */
432 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
433 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
434 	    M_WAITOK | M_ZERO);
435 
436 	/*
437 	 * Trivial case: no interleave (actually interleave of disk size).
438 	 * Each table entry represents a single component in its entirety.
439 	 *
440 	 * An interleave of 0 may not be used with a mirror setup.
441 	 */
442 	if (cs->sc_ileave == 0) {
443 		bn = 0;
444 		ii = cs->sc_itable;
445 
446 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
447 			/* Allocate space for ii_index. */
448 			ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK);
449 			ii->ii_ndisk = 1;
450 			ii->ii_startblk = bn;
451 			ii->ii_startoff = 0;
452 			ii->ii_index[0] = ix;
453 			bn += cs->sc_cinfo[ix].ci_size;
454 			ii++;
455 		}
456 		ii->ii_ndisk = 0;
457 		return;
458 	}
459 
460 	/*
461 	 * The following isn't fast or pretty; it doesn't have to be.
462 	 */
463 	size = 0;
464 	bn = lbn = 0;
465 	for (ii = cs->sc_itable; ; ii++) {
466 		/*
467 		 * Allocate space for ii_index.  We might allocate more then
468 		 * we use.
469 		 */
470 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
471 		    M_CCD, M_WAITOK);
472 
473 		/*
474 		 * Locate the smallest of the remaining components
475 		 */
476 		smallci = NULL;
477 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
478 		    ci++) {
479 			if (ci->ci_size > size &&
480 			    (smallci == NULL ||
481 			     ci->ci_size < smallci->ci_size)) {
482 				smallci = ci;
483 			}
484 		}
485 
486 		/*
487 		 * Nobody left, all done
488 		 */
489 		if (smallci == NULL) {
490 			ii->ii_ndisk = 0;
491 			free(ii->ii_index, M_CCD);
492 			break;
493 		}
494 
495 		/*
496 		 * Record starting logical block using an sc_ileave blocksize.
497 		 */
498 		ii->ii_startblk = bn / cs->sc_ileave;
499 
500 		/*
501 		 * Record starting comopnent block using an sc_ileave
502 		 * blocksize.  This value is relative to the beginning of
503 		 * a component disk.
504 		 */
505 		ii->ii_startoff = lbn;
506 
507 		/*
508 		 * Determine how many disks take part in this interleave
509 		 * and record their indices.
510 		 */
511 		ix = 0;
512 		for (ci = cs->sc_cinfo;
513 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
514 			if (ci->ci_size >= smallci->ci_size) {
515 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
516 			}
517 		}
518 		ii->ii_ndisk = ix;
519 		bn += ix * (smallci->ci_size - size);
520 		lbn = smallci->ci_size / cs->sc_ileave;
521 		size = smallci->ci_size;
522 	}
523 }
524 
525 static void
526 ccdstrategy(struct bio *bp)
527 {
528 	struct ccd_s *cs;
529 	int pbn;        /* in sc_secsize chunks */
530 	long sz;        /* in sc_secsize chunks */
531 
532 	cs = bp->bio_disk->d_drv1;
533 
534 	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
535 	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
536 
537 	/*
538 	 * If out of bounds return an error. If at the EOF point,
539 	 * simply read or write less.
540 	 */
541 
542 	if (pbn < 0 || pbn >= cs->sc_size) {
543 		bp->bio_resid = bp->bio_bcount;
544 		if (pbn != cs->sc_size)
545 			biofinish(bp, NULL, EINVAL);
546 		else
547 			biodone(bp);
548 		return;
549 	}
550 
551 	/*
552 	 * If the request crosses EOF, truncate the request.
553 	 */
554 	if (pbn + sz > cs->sc_size) {
555 		bp->bio_bcount = (cs->sc_size - pbn) *
556 		    cs->sc_geom.ccg_secsize;
557 	}
558 
559 	bp->bio_resid = bp->bio_bcount;
560 
561 	/*
562 	 * "Start" the unit.
563 	 */
564 	ccdstart(cs, bp);
565 	return;
566 }
567 
568 static void
569 ccdstart(struct ccd_s *cs, struct bio *bp)
570 {
571 	long bcount, rcount;
572 	struct ccdbuf *cbp[2];
573 	caddr_t addr;
574 	daddr_t bn;
575 	int err;
576 
577 
578 	/* Record the transaction start  */
579 	devstat_start_transaction(&cs->device_stats);
580 
581 	/*
582 	 * Translate the partition-relative block number to an absolute.
583 	 */
584 	bn = bp->bio_blkno;
585 
586 	/*
587 	 * Allocate component buffers and fire off the requests
588 	 */
589 	addr = bp->bio_data;
590 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
591 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
592 		if (err) {
593 			printf("ccdbuffer error %d\n", err);
594 			/* We're screwed */
595 			bp->bio_resid -= bcount;
596 			bp->bio_error = ENOMEM;
597 			bp->bio_flags |= BIO_ERROR;
598 			return;
599 		}
600 		rcount = cbp[0]->cb_buf.bio_bcount;
601 
602 		if (cs->sc_cflags & CCDF_MIRROR) {
603 			/*
604 			 * Mirroring.  Writes go to both disks, reads are
605 			 * taken from whichever disk seems most appropriate.
606 			 *
607 			 * We attempt to localize reads to the disk whos arm
608 			 * is nearest the read request.  We ignore seeks due
609 			 * to writes when making this determination and we
610 			 * also try to avoid hogging.
611 			 */
612 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
613 				BIO_STRATEGY(&cbp[0]->cb_buf);
614 				BIO_STRATEGY(&cbp[1]->cb_buf);
615 			} else {
616 				int pick = cs->sc_pick;
617 				daddr_t range = cs->sc_size / 16;
618 
619 				if (bn < cs->sc_blk[pick] - range ||
620 				    bn > cs->sc_blk[pick] + range
621 				) {
622 					cs->sc_pick = pick = 1 - pick;
623 				}
624 				cs->sc_blk[pick] = bn + btodb(rcount);
625 				BIO_STRATEGY(&cbp[pick]->cb_buf);
626 			}
627 		} else {
628 			/*
629 			 * Not mirroring
630 			 */
631 			BIO_STRATEGY(&cbp[0]->cb_buf);
632 		}
633 		bn += btodb(rcount);
634 		addr += rcount;
635 	}
636 }
637 
638 /*
639  * Build a component buffer header.
640  */
641 static int
642 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
643 {
644 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
645 	struct ccdbuf *cbp;
646 	daddr_t cbn, cboff;
647 	off_t cbc;
648 
649 	/*
650 	 * Determine which component bn falls in.
651 	 */
652 	cbn = bn;
653 	cboff = 0;
654 
655 	if (cs->sc_ileave == 0) {
656 		/*
657 		 * Serially concatenated and neither a mirror nor a parity
658 		 * config.  This is a special case.
659 		 */
660 		daddr_t sblk;
661 
662 		sblk = 0;
663 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
664 			sblk += ci->ci_size;
665 		cbn -= sblk;
666 	} else {
667 		struct ccdiinfo *ii;
668 		int ccdisk, off;
669 
670 		/*
671 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
672 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
673 		 * to cbn.
674 		 */
675 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
676 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
677 
678 		/*
679 		 * Figure out which interleave table to use.
680 		 */
681 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
682 			if (ii->ii_startblk > cbn)
683 				break;
684 		}
685 		ii--;
686 
687 		/*
688 		 * off is the logical superblock relative to the beginning
689 		 * of this interleave block.
690 		 */
691 		off = cbn - ii->ii_startblk;
692 
693 		/*
694 		 * We must calculate which disk component to use (ccdisk),
695 		 * and recalculate cbn to be the superblock relative to
696 		 * the beginning of the component.  This is typically done by
697 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
698 		 * must typically be divided by the number of components in
699 		 * this interleave array to be properly convert it from a
700 		 * CCD-relative logical superblock number to a
701 		 * component-relative superblock number.
702 		 */
703 		if (ii->ii_ndisk == 1) {
704 			/*
705 			 * When we have just one disk, it can't be a mirror
706 			 * or a parity config.
707 			 */
708 			ccdisk = ii->ii_index[0];
709 			cbn = ii->ii_startoff + off;
710 		} else {
711 			if (cs->sc_cflags & CCDF_MIRROR) {
712 				/*
713 				 * We have forced a uniform mapping, resulting
714 				 * in a single interleave array.  We double
715 				 * up on the first half of the available
716 				 * components and our mirror is in the second
717 				 * half.  This only works with a single
718 				 * interleave array because doubling up
719 				 * doubles the number of sectors, so there
720 				 * cannot be another interleave array because
721 				 * the next interleave array's calculations
722 				 * would be off.
723 				 */
724 				int ndisk2 = ii->ii_ndisk / 2;
725 				ccdisk = ii->ii_index[off % ndisk2];
726 				cbn = ii->ii_startoff + off / ndisk2;
727 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
728 			} else {
729 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
730 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
731 			}
732 		}
733 
734 		ci = &cs->sc_cinfo[ccdisk];
735 
736 		/*
737 		 * Convert cbn from a superblock to a normal block so it
738 		 * can be used to calculate (along with cboff) the normal
739 		 * block index into this particular disk.
740 		 */
741 		cbn *= cs->sc_ileave;
742 	}
743 
744 	/*
745 	 * Fill in the component buf structure.
746 	 */
747 	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
748 	if (cbp == NULL)
749 		return (ENOMEM);
750 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
751 	cbp->cb_buf.bio_done = ccdiodone;
752 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
753 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
754 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
755 	cbp->cb_buf.bio_data = addr;
756 	cbp->cb_buf.bio_caller2 = cbp;
757 	if (cs->sc_ileave == 0)
758               cbc = dbtob((off_t)(ci->ci_size - cbn));
759 	else
760               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
761 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
762  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
763 
764 	/*
765 	 * context for ccdiodone
766 	 */
767 	cbp->cb_obp = bp;
768 	cbp->cb_softc = cs;
769 	cbp->cb_comp = ci - cs->sc_cinfo;
770 
771 	cb[0] = cbp;
772 
773 	/*
774 	 * Note: both I/O's setup when reading from mirror, but only one
775 	 * will be executed.
776 	 */
777 	if (cs->sc_cflags & CCDF_MIRROR) {
778 		/* mirror, setup second I/O */
779 		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
780 		if (cbp == NULL) {
781 			free(cb[0], M_CCD);
782 			cb[0] = NULL;
783 			return (ENOMEM);
784 		}
785 		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
786 		cbp->cb_buf.bio_dev = ci2->ci_dev;
787 		cbp->cb_comp = ci2 - cs->sc_cinfo;
788 		cb[1] = cbp;
789 		/* link together the ccdbuf's and clear "mirror done" flag */
790 		cb[0]->cb_mirror = cb[1];
791 		cb[1]->cb_mirror = cb[0];
792 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
793 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
794 	}
795 	return (0);
796 }
797 
798 /*
799  * Called at interrupt time.
800  * Mark the component as done and if all components are done,
801  * take a ccd interrupt.
802  */
803 static void
804 ccdiodone(struct bio *ibp)
805 {
806 	struct ccdbuf *cbp;
807 	struct bio *bp;
808 	struct ccd_s *cs;
809 	int count;
810 
811 	cbp = ibp->bio_caller2;
812 	cs = cbp->cb_softc;
813 	bp = cbp->cb_obp;
814 	/*
815 	 * If an error occured, report it.  If this is a mirrored
816 	 * configuration and the first of two possible reads, do not
817 	 * set the error in the bp yet because the second read may
818 	 * succeed.
819 	 */
820 
821 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
822 		const char *msg = "";
823 
824 		if ((cs->sc_cflags & CCDF_MIRROR) &&
825 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
826 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
827 			/*
828 			 * We will try our read on the other disk down
829 			 * below, also reverse the default pick so if we
830 			 * are doing a scan we do not keep hitting the
831 			 * bad disk first.
832 			 */
833 
834 			msg = ", trying other disk";
835 			cs->sc_pick = 1 - cs->sc_pick;
836 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
837 		} else {
838 			bp->bio_flags |= BIO_ERROR;
839 			bp->bio_error = cbp->cb_buf.bio_error ?
840 			    cbp->cb_buf.bio_error : EIO;
841 		}
842 		printf("ccd%d: error %d on component %d block %jd "
843 		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
844 		    cbp->cb_comp,
845 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
846 		    msg);
847 	}
848 
849 	/*
850 	 * Process mirror.  If we are writing, I/O has been initiated on both
851 	 * buffers and we fall through only after both are finished.
852 	 *
853 	 * If we are reading only one I/O is initiated at a time.  If an
854 	 * error occurs we initiate the second I/O and return, otherwise
855 	 * we free the second I/O without initiating it.
856 	 */
857 
858 	if (cs->sc_cflags & CCDF_MIRROR) {
859 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
860 			/*
861 			 * When writing, handshake with the second buffer
862 			 * to determine when both are done.  If both are not
863 			 * done, return here.
864 			 */
865 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
866 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
867 				free(cbp, M_CCD);
868 				return;
869 			}
870 		} else {
871 			/*
872 			 * When reading, either dispose of the second buffer
873 			 * or initiate I/O on the second buffer if an error
874 			 * occured with this one.
875 			 */
876 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
877 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
878 					cbp->cb_mirror->cb_pflags |=
879 					    CCDPF_MIRROR_DONE;
880 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
881 					free(cbp, M_CCD);
882 					return;
883 				} else {
884 					free(cbp->cb_mirror, M_CCD);
885 				}
886 			}
887 		}
888 	}
889 
890 	/*
891 	 * use bio_caller1 to determine how big the original request was rather
892 	 * then bio_bcount, because bio_bcount may have been truncated for EOF.
893 	 *
894 	 * XXX We check for an error, but we do not test the resid for an
895 	 * aligned EOF condition.  This may result in character & block
896 	 * device access not recognizing EOF properly when read or written
897 	 * sequentially, but will not effect filesystems.
898 	 */
899 	count = (long)cbp->cb_buf.bio_caller1;
900 	free(cbp, M_CCD);
901 
902 	/*
903 	 * If all done, "interrupt".
904 	 */
905 	bp->bio_resid -= count;
906 	if (bp->bio_resid < 0)
907 		panic("ccdiodone: count");
908 	if (bp->bio_resid == 0) {
909 		if (bp->bio_flags & BIO_ERROR)
910 			bp->bio_resid = bp->bio_bcount;
911 		biofinish(bp, &cs->device_stats, 0);
912 	}
913 }
914 
915 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
916 
917 static int
918 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
919 {
920 	struct ccd_ioctl *ccio;
921 	u_int unit;
922 	dev_t dev2;
923 	int error;
924 
925 	switch (cmd) {
926 	case CCDIOCSET:
927 	case CCDIOCCLR:
928 		ccio = (struct ccd_ioctl *)data;
929 		unit = ccio->ccio_size;
930 		return (ccdioctltoo(unit, cmd, data, flag, td));
931 	case CCDCONFINFO:
932 		{
933 		int ninit = 0;
934 		struct ccdconf *conf = (struct ccdconf *)data;
935 		struct ccd_s *tmpcs;
936 		struct ccd_s *ubuf = conf->buffer;
937 
938 		/* XXX: LOCK(unique unit numbers) */
939 		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
940 			if (IS_INITED(tmpcs))
941 				ninit++;
942 
943 		if (conf->size == 0) {
944 			conf->size = sizeof(struct ccd_s) * ninit;
945 			return (0);
946 		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
947 		    (conf->size % sizeof(struct ccd_s) != 0)) {
948 			/* XXX: UNLOCK(unique unit numbers) */
949 			return (EINVAL);
950 		}
951 
952 		ubuf += ninit;
953 		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
954 			if (!IS_INITED(tmpcs))
955 				continue;
956 			error = copyout(tmpcs, --ubuf,
957 			    sizeof(struct ccd_s));
958 			if (error != 0)
959 				/* XXX: UNLOCK(unique unit numbers) */
960 				return (error);
961 		}
962 		/* XXX: UNLOCK(unique unit numbers) */
963 		return (0);
964 		}
965 
966 	case CCDCPPINFO:
967 		{
968 		struct ccdcpps *cpps = (struct ccdcpps *)data;
969 		char *ubuf = cpps->buffer;
970 		struct ccd_s *cs;
971 
972 
973 		error = copyin(ubuf, &unit, sizeof (unit));
974 		if (error)
975 			return (error);
976 
977 		if (!IS_ALLOCATED(unit))
978 			return (ENXIO);
979 		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
980 		cs = ccdfind(unit);
981 		if (!IS_INITED(cs))
982 			return (ENXIO);
983 
984 		{
985 			int len = 0, i;
986 			struct ccdcpps *cpps = (struct ccdcpps *)data;
987 			char *ubuf = cpps->buffer;
988 
989 
990 			for (i = 0; i < cs->sc_nccdisks; ++i)
991 				len += cs->sc_cinfo[i].ci_pathlen;
992 
993 			if (cpps->size < len)
994 				return (ENOMEM);
995 
996 			for (i = 0; i < cs->sc_nccdisks; ++i) {
997 				len = cs->sc_cinfo[i].ci_pathlen;
998 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
999 				    len);
1000 				if (error != 0)
1001 					return (error);
1002 				ubuf += len;
1003 			}
1004 			return(copyout("", ubuf, 1));
1005 		}
1006 		break;
1007 		}
1008 
1009 	default:
1010 		return (ENXIO);
1011 	}
1012 }
1013 
1014 static int
1015 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1016 {
1017 	int i, j, lookedup = 0, error = 0;
1018 	struct ccd_s *cs;
1019 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1020 	struct ccdgeom *ccg;
1021 	char **cpp;
1022 	struct vnode **vpp;
1023 
1024 	cs = ccdfind(unit);
1025 	switch (cmd) {
1026 	case CCDIOCSET:
1027 		if (cs == NULL)
1028 			cs = ccdnew(unit);
1029 		if (IS_INITED(cs))
1030 			return (EBUSY);
1031 
1032 		if ((flag & FWRITE) == 0)
1033 			return (EBADF);
1034 
1035 		if ((error = ccdlock(cs)) != 0)
1036 			return (error);
1037 
1038 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1039 			return (EINVAL);
1040 
1041 		/* Fill in some important bits. */
1042 		cs->sc_ileave = ccio->ccio_ileave;
1043 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1044 			printf("ccd%d: disabling mirror, interleave is 0\n",
1045 			    unit);
1046 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1047 		}
1048 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1049 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1050 			printf("ccd%d: mirror/parity forces uniform flag\n",
1051 			       unit);
1052 			ccio->ccio_flags |= CCDF_UNIFORM;
1053 		}
1054 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1055 
1056 		/*
1057 		 * Allocate space for and copy in the array of
1058 		 * componet pathnames and device numbers.
1059 		 */
1060 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1061 		    M_CCD, M_WAITOK);
1062 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1063 		    M_CCD, M_WAITOK);
1064 
1065 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1066 		    ccio->ccio_ndisks * sizeof(char **));
1067 		if (error) {
1068 			free(vpp, M_CCD);
1069 			free(cpp, M_CCD);
1070 			ccdunlock(cs);
1071 			return (error);
1072 		}
1073 
1074 
1075 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1076 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1077 				for (j = 0; j < lookedup; ++j)
1078 					(void)vn_close(vpp[j], FREAD|FWRITE,
1079 					    td->td_ucred, td);
1080 				free(vpp, M_CCD);
1081 				free(cpp, M_CCD);
1082 				ccdunlock(cs);
1083 				return (error);
1084 			}
1085 			++lookedup;
1086 		}
1087 		cs->sc_vpp = vpp;
1088 		cs->sc_nccdisks = ccio->ccio_ndisks;
1089 
1090 		/*
1091 		 * Initialize the ccd.  Fills in the softc for us.
1092 		 */
1093 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1094 			for (j = 0; j < lookedup; ++j)
1095 				(void)vn_close(vpp[j], FREAD|FWRITE,
1096 				    td->td_ucred, td);
1097 			/*
1098 			 * We can't ccddestroy() cs just yet, because nothing
1099 			 * prevents user-level app to do another ioctl()
1100 			 * without closing the device first, therefore
1101 			 * declare unit null and void and let ccdclose()
1102 			 * destroy it when it is safe to do so.
1103 			 */
1104 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1105 			free(vpp, M_CCD);
1106 			free(cpp, M_CCD);
1107 			ccdunlock(cs);
1108 			return (error);
1109 		}
1110 		free(cpp, M_CCD);
1111 
1112 		/*
1113 		 * The ccd has been successfully initialized, so
1114 		 * we can place it into the array and read the disklabel.
1115 		 */
1116 		ccio->ccio_unit = unit;
1117 		ccio->ccio_size = cs->sc_size;
1118 		ccg = &cs->sc_geom;
1119 		cs->sc_disk = malloc(sizeof(struct disk), M_CCD,
1120 		    M_ZERO | M_WAITOK);
1121 		cs->sc_disk->d_strategy = ccdstrategy;
1122 		cs->sc_disk->d_name = "ccd";
1123 		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1124 		cs->sc_disk->d_mediasize =
1125 		    cs->sc_size * (off_t)ccg->ccg_secsize;
1126 		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1127 		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1128 		cs->sc_disk->d_drv1 = cs;
1129 		cs->sc_disk->d_maxsize = MAXPHYS;
1130 		disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1131 
1132 		ccdunlock(cs);
1133 
1134 		break;
1135 
1136 	case CCDIOCCLR:
1137 		if (cs == NULL)
1138 			return (ENXIO);
1139 
1140 		if (!IS_INITED(cs))
1141 			return (ENXIO);
1142 
1143 		if ((flag & FWRITE) == 0)
1144 			return (EBADF);
1145 
1146 		if ((error = ccdlock(cs)) != 0)
1147 			return (error);
1148 
1149 		/* Don't unconfigure if any other partitions are open */
1150 		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1151 			ccdunlock(cs);
1152 			return (EBUSY);
1153 		}
1154 
1155 		disk_destroy(cs->sc_disk);
1156 		free(cs->sc_disk, M_CCD);
1157 		cs->sc_disk = NULL;
1158 		/* Declare unit null and void (reset all flags) */
1159 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1160 
1161 		/* Close the components and free their pathnames. */
1162 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1163 			/*
1164 			 * XXX: this close could potentially fail and
1165 			 * cause Bad Things.  Maybe we need to force
1166 			 * the close to happen?
1167 			 */
1168 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1169 			    td->td_ucred, td);
1170 			free(cs->sc_cinfo[i].ci_path, M_CCD);
1171 		}
1172 
1173 		/* Free interleave index. */
1174 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1175 			free(cs->sc_itable[i].ii_index, M_CCD);
1176 
1177 		/* Free component info and interleave table. */
1178 		free(cs->sc_cinfo, M_CCD);
1179 		free(cs->sc_itable, M_CCD);
1180 		free(cs->sc_vpp, M_CCD);
1181 
1182 		/* And remove the devstat entry. */
1183 		devstat_remove_entry(&cs->device_stats);
1184 
1185 		/* This must be atomic. */
1186 		ccdunlock(cs);
1187 		ccddestroy(cs);
1188 
1189 		break;
1190 	}
1191 
1192 	return (0);
1193 }
1194 
1195 
1196 /*
1197  * Lookup the provided name in the filesystem.  If the file exists,
1198  * is a valid block device, and isn't being used by anyone else,
1199  * set *vpp to the file's vnode.
1200  */
1201 static int
1202 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1203 {
1204 	struct nameidata nd;
1205 	struct vnode *vp;
1206 	int error, flags;
1207 
1208 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1209 	flags = FREAD | FWRITE;
1210 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1211 		return (error);
1212 	}
1213 	vp = nd.ni_vp;
1214 
1215 	if (vrefcnt(vp) > 1) {
1216 		error = EBUSY;
1217 		goto bad;
1218 	}
1219 
1220 	if (!vn_isdisk(vp, &error))
1221 		goto bad;
1222 
1223 
1224 	VOP_UNLOCK(vp, 0, td);
1225 	NDFREE(&nd, NDF_ONLY_PNBUF);
1226 	*vpp = vp;
1227 	return (0);
1228 bad:
1229 	VOP_UNLOCK(vp, 0, td);
1230 	NDFREE(&nd, NDF_ONLY_PNBUF);
1231 	/* vn_close does vrele() for vp */
1232 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1233 	return (error);
1234 }
1235 
1236 /*
1237 
1238  * Wait interruptibly for an exclusive lock.
1239  *
1240  * XXX
1241  * Several drivers do this; it should be abstracted and made MP-safe.
1242  */
1243 static int
1244 ccdlock(struct ccd_s *cs)
1245 {
1246 	int error;
1247 
1248 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1249 		cs->sc_flags |= CCDF_WANTED;
1250 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1251 			return (error);
1252 	}
1253 	cs->sc_flags |= CCDF_LOCKED;
1254 	return (0);
1255 }
1256 
1257 /*
1258  * Unlock and wake up any waiters.
1259  */
1260 static void
1261 ccdunlock(struct ccd_s *cs)
1262 {
1263 
1264 	cs->sc_flags &= ~CCDF_LOCKED;
1265 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1266 		cs->sc_flags &= ~CCDF_WANTED;
1267 		wakeup(cs);
1268 	}
1269 }
1270