xref: /freebsd/sys/geom/geom_ccd.c (revision 6780ab54325a71e7e70112b11657973edde8655e)
1 /*
2  * Copyright (c) 2003 Poul-Henning Kamp.
3  * Copyright (c) 1995 Jason R. Thorpe.
4  * Copyright (c) 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * All rights reserved.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * the Systems Programming Group of the University of Utah Computer
11  * Science Department.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed for the NetBSD Project
24  *	by Jason R. Thorpe.
25  * 4. The names of the authors may not be used to endorse or promote products
26  *    derived from this software without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
35  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
36  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * Dynamic configuration and disklabel support by:
41  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
42  *	Numerical Aerodynamic Simulation Facility
43  *	Mail Stop 258-6
44  *	NASA Ames Research Center
45  *	Moffett Field, CA 94035
46  *
47  * from: Utah $Hdr: cd.c 1.6 90/11/28$
48  *
49  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
50  *
51  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
52  *
53  * $FreeBSD$
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/module.h>
60 #include <sys/proc.h>
61 #include <sys/bio.h>
62 #include <sys/malloc.h>
63 #include <sys/namei.h>
64 #include <sys/conf.h>
65 #include <sys/stat.h>
66 #include <sys/stdint.h>
67 #include <sys/sysctl.h>
68 #include <sys/disk.h>
69 #include <sys/devicestat.h>
70 #include <sys/fcntl.h>
71 #include <sys/vnode.h>
72 
73 #include <sys/ccdvar.h>
74 
75 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
76 
77 /*
78    This is how mirroring works (only writes are special):
79 
80    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
81    linked together by the cb_mirror field.  "cb_pflags &
82    CCDPF_MIRROR_DONE" is set to 0 on both of them.
83 
84    When a component returns to ccdiodone(), it checks whether "cb_pflags &
85    CCDPF_MIRROR_DONE" is set.  If not, it sets the partner's
86    flag and returns.  If it is, it means its partner has already
87    returned, so it will go to the regular cleanup.
88 
89  */
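
/*
   Worked example of the write handshake above (editorial illustration,
   not new code): suppose the two mirrored writes go to components A
   and B.  If A's I/O completes first, ccdiodone() finds
   CCDPF_MIRROR_DONE clear in A's ccdbuf, sets the flag in B's ccdbuf,
   frees A's ccdbuf and returns.  When B completes later, it sees its
   own CCDPF_MIRROR_DONE already set and proceeds to the regular
   cleanup, finishing the original bio.
 */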
90 
91 struct ccdbuf {
92 	struct bio	cb_buf;		/* new I/O buf */
93 	struct bio	*cb_obp;	/* ptr. to original I/O buf */
94 	struct ccdbuf	*cb_freenext;	/* free list link */
95 	struct ccd_s	*cb_softc;
96 	int		cb_comp;	/* target component */
97 	int		cb_pflags;	/* mirror/parity status flag */
98 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
99 };
100 
101 /* bits in cb_pflags */
102 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
103 
104 /* convenient macros for often-used statements */
105 #define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
106 #define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)
107 
108 static dev_t	ccdctldev;
109 
110 static d_strategy_t ccdstrategy;
111 static d_ioctl_t ccdctlioctl;
112 
113 #define NCCDFREEHIWAT	16
114 
115 #define CDEV_MAJOR 74
116 
117 static struct cdevsw ccdctl_cdevsw = {
118 	/* open */	nullopen,
119 	/* close */	nullclose,
120 	/* read */	noread,
121 	/* write */	nowrite,
122 	/* ioctl */	ccdctlioctl,
123 	/* poll */	nopoll,
124 	/* mmap */	nommap,
125 	/* strategy */	nostrategy,
126 	/* name */	"ccdctl",
127 	/* maj */	CDEV_MAJOR,
128 	/* dump */	nodump,
129 	/* psize */	nopsize,
130 	/* flags */	0
131 };
132 
133 static LIST_HEAD(, ccd_s) ccd_softc_list =
134 	LIST_HEAD_INITIALIZER(&ccd_softc_list);
135 
136 static struct ccd_s *ccdfind(int);
137 static struct ccd_s *ccdnew(int);
138 static int ccddestroy(struct ccd_s *);
139 
140 /* called during module initialization */
141 static void ccdattach(void);
142 static int ccd_modevent(module_t, int, void *);
143 
144 /* called by biodone() at interrupt time */
145 static void ccdiodone(struct bio *bp);
146 
147 static void ccdstart(struct ccd_s *, struct bio *);
148 static void ccdinterleave(struct ccd_s *, int);
149 static int ccdinit(struct ccd_s *, char **, struct thread *);
150 static int ccdlookup(char *, struct thread *p, struct vnode **);
151 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
152 		      struct bio *, daddr_t, caddr_t, long);
153 static int ccdlock(struct ccd_s *);
154 static void ccdunlock(struct ccd_s *);
155 
156 
157 /*
158  * Number of blocks to leave untouched in front of a component partition.
159  * This is to avoid violating its disklabel area when it starts at the
160  * beginning of the slice.
161  */
162 #if !defined(CCD_OFFSET)
163 #define CCD_OFFSET 16
164 #endif
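
/*
 * Illustrative sizing note (hypothetical numbers): ccdinit() computes a
 * component's usable size as mediasize / DEV_BSIZE - CCD_OFFSET, so a
 * 1 GB component with 512-byte DEV_BSIZE contributes
 * 2097152 - 16 = 2097136 blocks before any interleave truncation.
 */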
165 
166 static struct ccd_s *
167 ccdfind(int unit)
168 {
169 	struct ccd_s *sc = NULL;
170 
171 	/* XXX: LOCK(unique unit numbers) */
172 	LIST_FOREACH(sc, &ccd_softc_list, list) {
173 		if (sc->sc_unit == unit)
174 			break;
175 	}
176 	/* XXX: UNLOCK(unique unit numbers) */
177 	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
178 }
179 
180 static struct ccd_s *
181 ccdnew(int unit)
182 {
183 	struct ccd_s *sc;
184 
185 	/* XXX: LOCK(unique unit numbers) */
186 	if (IS_ALLOCATED(unit) || unit > 32)
187 		return (NULL);
188 
189 	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_ZERO);
190 	sc->sc_unit = unit;
191 	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
192 	/* XXX: UNLOCK(unique unit numbers) */
193 	return (sc);
194 }
195 
196 static int
197 ccddestroy(struct ccd_s *sc)
198 {
199 
200 	/* XXX: LOCK(unique unit numbers) */
201 	LIST_REMOVE(sc, list);
202 	/* XXX: UNLOCK(unique unit numbers) */
203 	FREE(sc, M_CCD);
204 	return (0);
205 }
206 
207 /*
208  * Called by main() during pseudo-device attachment.  All we need
209  * to do is to add devsw entries.
210  */
211 static void
212 ccdattach()
213 {
214 
215 	ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff,
216 		UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl");
217 	ccdctldev->si_drv1 = ccdctldev;
218 }
219 
220 static int
221 ccd_modevent(module_t mod, int type, void *data)
222 {
223 	int error = 0;
224 
225 	switch (type) {
226 	case MOD_LOAD:
227 		ccdattach();
228 		break;
229 
230 	case MOD_UNLOAD:
231 		printf("ccd0: Unload not supported!\n");
232 		error = EOPNOTSUPP;
233 		break;
234 
235 	case MOD_SHUTDOWN:
236 		break;
237 
238 	default:
239 		error = EOPNOTSUPP;
240 	}
241 	return (error);
242 }
243 
244 DEV_MODULE(ccd, ccd_modevent, NULL);
245 
246 static int
247 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
248 {
249 	struct ccdcinfo *ci = NULL;	/* XXX */
250 	size_t size;
251 	int ix;
252 	struct vnode *vp;
253 	size_t minsize;
254 	int maxsecsize;
255 	struct ccdgeom *ccg = &cs->sc_geom;
256 	char *tmppath = NULL;
257 	int error = 0;
258 	off_t mediasize;
259 	u_int sectorsize;
260 
261 
262 	cs->sc_size = 0;
263 
264 	/* Allocate space for the component info. */
265 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
266 	    M_CCD, 0);
267 
268 	/*
269 	 * Verify that each component piece exists and record
270 	 * relevant information about it.
271 	 */
272 	maxsecsize = 0;
273 	minsize = 0;
274 	tmppath = malloc(MAXPATHLEN, M_CCD, 0);
275 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
276 		vp = cs->sc_vpp[ix];
277 		ci = &cs->sc_cinfo[ix];
278 		ci->ci_vp = vp;
279 
280 		/*
281 		 * Copy in the pathname of the component.
282 		 */
283 		if ((error = copyinstr(cpaths[ix], tmppath,
284 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
285 			goto fail;
286 		}
287 		ci->ci_path = malloc(ci->ci_pathlen, M_CCD, 0);
288 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
289 
290 		ci->ci_dev = vn_todev(vp);
291 
292 		/*
293 		 * Get partition information for the component.
294 		 */
295 		error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
296 		    FREAD, td->td_ucred, td);
297 		if (error != 0) {
298 			goto fail;
299 		}
300 		/*
301 		 * Get the sector size of the component.
302 		 */
303 		error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)&sectorsize,
304 		    FREAD, td->td_ucred, td);
305 		if (error != 0) {
306 			goto fail;
307 		}
308 		if (sectorsize > maxsecsize)
309 			maxsecsize = sectorsize;
310 		size = mediasize / DEV_BSIZE - CCD_OFFSET;
311 
312 		/*
313 		 * Calculate the size, truncating to an interleave
314 		 * boundary if necessary.
315 		 */
316 
317 		if (cs->sc_ileave > 1)
318 			size -= size % cs->sc_ileave;
319 
320 		if (size == 0) {
321 			error = ENODEV;
322 			goto fail;
323 		}
324 
325 		if (minsize == 0 || size < minsize)
326 			minsize = size;
327 		ci->ci_size = size;
328 		cs->sc_size += size;
329 	}
330 
331 	free(tmppath, M_CCD);
332 	tmppath = NULL;
333 
334 	/*
335 	 * Don't allow the interleave to be smaller than
336 	 * the biggest component sector.
337 	 */
338 	if ((cs->sc_ileave > 0) &&
339 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
340 		error = EINVAL;
341 		goto fail;
342 	}
343 
344 	/*
345 	 * If uniform interleave is desired, set all sizes to that of
346 	 * the smallest component.  This will guarantee that a single
347 	 * interleave table is generated.
348 	 *
349 	 * Lost space must be taken into account when calculating the
350 	 * overall size.  Half the space is lost when CCDF_MIRROR is
351 	 * specified.
352 	 */
353 	if (cs->sc_flags & CCDF_UNIFORM) {
354 		for (ci = cs->sc_cinfo;
355 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
356 			ci->ci_size = minsize;
357 		}
358 		if (cs->sc_flags & CCDF_MIRROR) {
359 			/*
360 			 * Check to see if an even number of components
361 			 * have been specified.  The interleave must also
362 			 * be non-zero in order for us to be able to
363 			 * guarantee the topology.
364 			 */
365 			if (cs->sc_nccdisks % 2) {
366 				printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
367 				error = EINVAL;
368 				goto fail;
369 			}
370 			if (cs->sc_ileave == 0) {
371 				printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
372 				error = EINVAL;
373 				goto fail;
374 			}
375 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
376 		} else {
377 			if (cs->sc_ileave == 0) {
378 				printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
379 				error = EINVAL;
380 				goto fail;
381 			}
382 			cs->sc_size = cs->sc_nccdisks * minsize;
383 		}
384 	}
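
	/*
	 * Illustrative arithmetic for the uniform sizing above (hypothetical
	 * numbers): with four components and minsize = 1000000 blocks, a
	 * CCDF_MIRROR configuration gives sc_size = (4 / 2) * 1000000 =
	 * 2000000 blocks, while a plain interleaved configuration gives
	 * sc_size = 4 * 1000000 = 4000000 blocks.
	 */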
385 
386 	/*
387 	 * Construct the interleave table.
388 	 */
389 	ccdinterleave(cs, cs->sc_unit);
390 
391 	/*
392 	 * Create pseudo-geometry based on 1MB cylinders.  It's
393 	 * pretty close.
394 	 */
395 	ccg->ccg_secsize = maxsecsize;
396 	ccg->ccg_ntracks = 1;
397 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
398 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
399 
400 	/*
401 	 * Add a devstat entry for this device.
402 	 */
403 	devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
404 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
405 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
406 			  DEVSTAT_PRIORITY_ARRAY);
407 
408 	cs->sc_flags |= CCDF_INITED;
409 	cs->sc_cflags = cs->sc_flags;	/* So we can find out later... */
410 	return (0);
411 fail:
412 	while (ci > cs->sc_cinfo) {
413 		ci--;
414 		free(ci->ci_path, M_CCD);
415 	}
416 	if (tmppath != NULL)
417 		free(tmppath, M_CCD);
418 	free(cs->sc_cinfo, M_CCD);
419 	ccddestroy(cs);
420 	return (error);
421 }
422 
423 static void
424 ccdinterleave(struct ccd_s *cs, int unit)
425 {
426 	struct ccdcinfo *ci, *smallci;
427 	struct ccdiinfo *ii;
428 	daddr_t bn, lbn;
429 	int ix;
430 	u_long size;
431 
432 
433 	/*
434 	 * Allocate an interleave table.  The worst case occurs when each
435 	 * of N disks is of a different size, resulting in N interleave
436 	 * tables.
437 	 *
438 	 * Chances are this is too big, but we don't care.
439 	 */
440 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
441 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD,
442 	    M_ZERO);
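
	/*
	 * Example of the worst case mentioned above (illustrative only):
	 * three components of three different sizes need up to three
	 * interleave table entries, plus the terminating entry whose
	 * ii_ndisk is 0, hence the (sc_nccdisks + 1) allocation.
	 */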
443 
444 	/*
445 	 * Trivial case: no interleave (actually interleave of disk size).
446 	 * Each table entry represents a single component in its entirety.
447 	 *
448 	 * An interleave of 0 may not be used with a mirror setup.
449 	 */
450 	if (cs->sc_ileave == 0) {
451 		bn = 0;
452 		ii = cs->sc_itable;
453 
454 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
455 			/* Allocate space for ii_index. */
456 			ii->ii_index = malloc(sizeof(int), M_CCD, 0);
457 			ii->ii_ndisk = 1;
458 			ii->ii_startblk = bn;
459 			ii->ii_startoff = 0;
460 			ii->ii_index[0] = ix;
461 			bn += cs->sc_cinfo[ix].ci_size;
462 			ii++;
463 		}
464 		ii->ii_ndisk = 0;
465 		return;
466 	}
467 
468 	/*
469 	 * The following isn't fast or pretty; it doesn't have to be.
470 	 */
471 	size = 0;
472 	bn = lbn = 0;
473 	for (ii = cs->sc_itable; ; ii++) {
474 		/*
475 		 * Allocate space for ii_index.  We might allocate more than
476 		 * we use.
477 		 */
478 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
479 		    M_CCD, 0);
480 
481 		/*
482 		 * Locate the smallest of the remaining components
483 		 */
484 		smallci = NULL;
485 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
486 		    ci++) {
487 			if (ci->ci_size > size &&
488 			    (smallci == NULL ||
489 			     ci->ci_size < smallci->ci_size)) {
490 				smallci = ci;
491 			}
492 		}
493 
494 		/*
495 		 * Nobody left, all done
496 		 */
497 		if (smallci == NULL) {
498 			ii->ii_ndisk = 0;
499 			free(ii->ii_index, M_CCD);
500 			break;
501 		}
502 
503 		/*
504 		 * Record starting logical block using an sc_ileave blocksize.
505 		 */
506 		ii->ii_startblk = bn / cs->sc_ileave;
507 
508 		/*
509 		 * Record starting component block using an sc_ileave
510 		 * blocksize.  This value is relative to the beginning of
511 		 * a component disk.
512 		 */
513 		ii->ii_startoff = lbn;
514 
515 		/*
516 		 * Determine how many disks take part in this interleave
517 		 * and record their indices.
518 		 */
519 		ix = 0;
520 		for (ci = cs->sc_cinfo;
521 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
522 			if (ci->ci_size >= smallci->ci_size) {
523 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
524 			}
525 		}
526 		ii->ii_ndisk = ix;
527 		bn += ix * (smallci->ci_size - size);
528 		lbn = smallci->ci_size / cs->sc_ileave;
529 		size = smallci->ci_size;
530 	}
531 }
532 
533 static void
534 ccdstrategy(struct bio *bp)
535 {
536 	struct ccd_s *cs;
537 	int pbn;        /* in sc_secsize chunks */
538 	long sz;        /* in sc_secsize chunks */
539 
540 	cs = bp->bio_dev->si_drv1;
541 
542 	pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
543 	sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
544 
545 	/*
546 	 * If out of bounds, return an error.  If at the EOF point,
547 	 * simply read or write less.
548 	 */
549 
550 	if (pbn < 0 || pbn >= cs->sc_size) {
551 		bp->bio_resid = bp->bio_bcount;
552 		if (pbn != cs->sc_size)
553 			biofinish(bp, NULL, EINVAL);
554 		else
555 			biodone(bp);
556 		return;
557 	}
558 
559 	/*
560 	 * If the request crosses EOF, truncate the request.
561 	 */
562 	if (pbn + sz > cs->sc_size) {
563 		bp->bio_bcount = (cs->sc_size - pbn) *
564 		    cs->sc_geom.ccg_secsize;
565 	}
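
	/*
	 * Truncation example (hypothetical numbers): with sc_size = 1000
	 * sectors, a request at pbn = 990 for sz = 20 sectors is trimmed to
	 * (1000 - 990) * ccg_secsize bytes, so only the 10 sectors up to
	 * EOF are transferred.
	 */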
566 
567 	bp->bio_resid = bp->bio_bcount;
568 
569 	/*
570 	 * "Start" the unit.
571 	 */
572 	ccdstart(cs, bp);
573 	return;
574 }
575 
576 static void
577 ccdstart(struct ccd_s *cs, struct bio *bp)
578 {
579 	long bcount, rcount;
580 	struct ccdbuf *cbp[2];
581 	caddr_t addr;
582 	daddr_t bn;
583 	int err;
584 
585 
586 	/* Record the transaction start  */
587 	devstat_start_transaction(&cs->device_stats);
588 
589 	/*
590 	 * Translate the partition-relative block number to an absolute.
591 	 */
592 	bn = bp->bio_blkno;
593 
594 	/*
595 	 * Allocate component buffers and fire off the requests
596 	 */
597 	addr = bp->bio_data;
598 	for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
599 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
600 		if (err) {
601 			printf("ccdbuffer error %d\n", err);
602 			/* We're screwed */
603 			bp->bio_resid -= bcount;
604 			bp->bio_error = ENOMEM;
605 			bp->bio_flags |= BIO_ERROR;
606 			return;
607 		}
608 		rcount = cbp[0]->cb_buf.bio_bcount;
609 
610 		if (cs->sc_cflags & CCDF_MIRROR) {
611 			/*
612 			 * Mirroring.  Writes go to both disks, reads are
613 			 * taken from whichever disk seems most appropriate.
614 			 *
615 			 * We attempt to localize reads to the disk whose arm
616 			 * is nearest the read request.  We ignore seeks due
617 			 * to writes when making this determination and we
618 			 * also try to avoid hogging.
619 			 */
620 			if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
621 				BIO_STRATEGY(&cbp[0]->cb_buf);
622 				BIO_STRATEGY(&cbp[1]->cb_buf);
623 			} else {
624 				int pick = cs->sc_pick;
625 				daddr_t range = cs->sc_size / 16;
626 
627 				if (bn < cs->sc_blk[pick] - range ||
628 				    bn > cs->sc_blk[pick] + range
629 				) {
630 					cs->sc_pick = pick = 1 - pick;
631 				}
632 				cs->sc_blk[pick] = bn + btodb(rcount);
633 				BIO_STRATEGY(&cbp[pick]->cb_buf);
634 			}
635 		} else {
636 			/*
637 			 * Not mirroring
638 			 */
639 			BIO_STRATEGY(&cbp[0]->cb_buf);
640 		}
641 		bn += btodb(rcount);
642 		addr += rcount;
643 	}
644 }
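
/*
 * Read-balancing example for ccdstart() above (hypothetical numbers):
 * with sc_size = 1048576 sectors, range is 65536.  A mirrored read whose
 * bn lies more than 65536 sectors from sc_blk[pick] flips sc_pick to the
 * other half of the mirror; otherwise the currently preferred disk keeps
 * the request, which localizes seeks without hogging one spindle.
 */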
645 
646 /*
647  * Build a component buffer header.
648  */
649 static int
650 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
651 {
652 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
653 	struct ccdbuf *cbp;
654 	daddr_t cbn, cboff;
655 	off_t cbc;
656 
657 	/*
658 	 * Determine which component bn falls in.
659 	 */
660 	cbn = bn;
661 	cboff = 0;
662 
663 	if (cs->sc_ileave == 0) {
664 		/*
665 		 * Serially concatenated and neither a mirror nor a parity
666 		 * config.  This is a special case.
667 		 */
668 		daddr_t sblk;
669 
670 		sblk = 0;
671 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
672 			sblk += ci->ci_size;
673 		cbn -= sblk;
674 	} else {
675 		struct ccdiinfo *ii;
676 		int ccdisk, off;
677 
678 		/*
679 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
680 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
681 		 * to cbn.
682 		 */
683 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
684 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
685 
686 		/*
687 		 * Figure out which interleave table to use.
688 		 */
689 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
690 			if (ii->ii_startblk > cbn)
691 				break;
692 		}
693 		ii--;
694 
695 		/*
696 		 * off is the logical superblock relative to the beginning
697 		 * of this interleave block.
698 		 */
699 		off = cbn - ii->ii_startblk;
700 
701 		/*
702 		 * We must calculate which disk component to use (ccdisk),
703 		 * and recalculate cbn to be the superblock relative to
704 		 * the beginning of the component.  This is typically done by
705 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
706 		 * must typically be divided by the number of components in
707 		 * this interleave array to properly convert it from a
708 		 * CCD-relative logical superblock number to a
709 		 * component-relative superblock number.
710 		 */
711 		if (ii->ii_ndisk == 1) {
712 			/*
713 			 * When we have just one disk, it can't be a mirror
714 			 * or a parity config.
715 			 */
716 			ccdisk = ii->ii_index[0];
717 			cbn = ii->ii_startoff + off;
718 		} else {
719 			if (cs->sc_cflags & CCDF_MIRROR) {
720 				/*
721 				 * We have forced a uniform mapping, resulting
722 				 * in a single interleave array.  We double
723 				 * up on the first half of the available
724 				 * components and our mirror is in the second
725 				 * half.  This only works with a single
726 				 * interleave array because doubling up
727 				 * doubles the number of sectors, so there
728 				 * cannot be another interleave array because
729 				 * the next interleave array's calculations
730 				 * would be off.
731 				 */
732 				int ndisk2 = ii->ii_ndisk / 2;
733 				ccdisk = ii->ii_index[off % ndisk2];
734 				cbn = ii->ii_startoff + off / ndisk2;
735 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
736 			} else {
737 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
738 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
739 			}
740 		}
741 
742 		ci = &cs->sc_cinfo[ccdisk];
743 
744 		/*
745 		 * Convert cbn from a superblock to a normal block so it
746 		 * can be used to calculate (along with cboff) the normal
747 		 * block index into this particular disk.
748 		 */
749 		cbn *= cs->sc_ileave;
750 	}
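
	/*
	 * Worked example of the mapping above (hypothetical numbers,
	 * non-mirror case): with sc_ileave = 32, a single interleave table
	 * entry with ii_ndisk = 3, ii_startblk = 0, ii_startoff = 0, and
	 * bn = 1000: cboff = 1000 % 32 = 8, cbn = 1000 / 32 = 31, off = 31,
	 * ccdisk = ii_index[31 % 3], cbn = 0 + 31 / 3 = 10, then
	 * cbn *= 32 gives 320, so the component request starts at block
	 * 320 + 8 + CCD_OFFSET.  With CCDF_MIRROR and ii_ndisk = 4
	 * (ndisk2 = 2), the primary comes from ii_index[off % 2] and its
	 * mirror is the component ndisk2 slots later in sc_cinfo.
	 */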
751 
752 	/*
753 	 * Fill in the component buf structure.
754 	 */
755 	cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO);
756 	if (cbp == NULL)
757 		return (ENOMEM);
758 	cbp->cb_buf.bio_cmd = bp->bio_cmd;
759 	cbp->cb_buf.bio_done = ccdiodone;
760 	cbp->cb_buf.bio_dev = ci->ci_dev;		/* XXX */
761 	cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
762 	cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
763 	cbp->cb_buf.bio_data = addr;
764 	cbp->cb_buf.bio_caller2 = cbp;
765 	if (cs->sc_ileave == 0)
766               cbc = dbtob((off_t)(ci->ci_size - cbn));
767 	else
768               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
769 	cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
770  	cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
771 
772 	/*
773 	 * context for ccdiodone
774 	 */
775 	cbp->cb_obp = bp;
776 	cbp->cb_softc = cs;
777 	cbp->cb_comp = ci - cs->sc_cinfo;
778 
779 	cb[0] = cbp;
780 
781 	/*
782 	 * Note: both I/Os are set up when reading from a mirror, but only one
783 	 * will be executed.
784 	 */
785 	if (cs->sc_cflags & CCDF_MIRROR) {
786 		/* mirror, setup second I/O */
787 		cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT);
788 		if (cbp == NULL) {
789 			free(cb[0], M_CCD);
790 			cb[0] = NULL;
791 			return (ENOMEM);
792 		}
793 		bcopy(cb[0], cbp, sizeof(struct ccdbuf));
794 		cbp->cb_buf.bio_dev = ci2->ci_dev;
795 		cbp->cb_comp = ci2 - cs->sc_cinfo;
796 		cb[1] = cbp;
797 		/* link together the ccdbuf's and clear "mirror done" flag */
798 		cb[0]->cb_mirror = cb[1];
799 		cb[1]->cb_mirror = cb[0];
800 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
801 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
802 	}
803 	return (0);
804 }
805 
806 /*
807  * Called at interrupt time.
808  * Mark the component as done and if all components are done,
809  * take a ccd interrupt.
810  */
811 static void
812 ccdiodone(struct bio *ibp)
813 {
814 	struct ccdbuf *cbp;
815 	struct bio *bp;
816 	struct ccd_s *cs;
817 	int count;
818 
819 	cbp = ibp->bio_caller2;
820 	cs = cbp->cb_softc;
821 	bp = cbp->cb_obp;
822 	/*
823 	 * If an error occurred, report it.  If this is a mirrored
824 	 * configuration and the first of two possible reads, do not
825 	 * set the error in the bp yet because the second read may
826 	 * succeed.
827 	 */
828 
829 	if (cbp->cb_buf.bio_flags & BIO_ERROR) {
830 		const char *msg = "";
831 
832 		if ((cs->sc_cflags & CCDF_MIRROR) &&
833 		    (cbp->cb_buf.bio_cmd == BIO_READ) &&
834 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
835 			/*
836 			 * We will try our read on the other disk down
837 			 * below, also reverse the default pick so if we
838 			 * are doing a scan we do not keep hitting the
839 			 * bad disk first.
840 			 */
841 
842 			msg = ", trying other disk";
843 			cs->sc_pick = 1 - cs->sc_pick;
844 			cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
845 		} else {
846 			bp->bio_flags |= BIO_ERROR;
847 			bp->bio_error = cbp->cb_buf.bio_error ?
848 			    cbp->cb_buf.bio_error : EIO;
849 		}
850 		printf("ccd%d: error %d on component %d block %jd "
851 		    "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error,
852 		    cbp->cb_comp,
853 		    (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno,
854 		    msg);
855 	}
856 
857 	/*
858 	 * Process mirror.  If we are writing, I/O has been initiated on both
859 	 * buffers and we fall through only after both are finished.
860 	 *
861 	 * If we are reading, only one I/O is initiated at a time.  If an
862 	 * error occurs we initiate the second I/O and return, otherwise
863 	 * we free the second I/O without initiating it.
864 	 */
865 
866 	if (cs->sc_cflags & CCDF_MIRROR) {
867 		if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
868 			/*
869 			 * When writing, handshake with the second buffer
870 			 * to determine when both are done.  If both are not
871 			 * done, return here.
872 			 */
873 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
874 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
875 				free(cbp, M_CCD);
876 				return;
877 			}
878 		} else {
879 			/*
880 			 * When reading, either dispose of the second buffer
881 			 * or initiate I/O on the second buffer if an error
882 			 * occurred with this one.
883 			 */
884 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
885 				if (cbp->cb_buf.bio_flags & BIO_ERROR) {
886 					cbp->cb_mirror->cb_pflags |=
887 					    CCDPF_MIRROR_DONE;
888 					BIO_STRATEGY(&cbp->cb_mirror->cb_buf);
889 					free(cbp, M_CCD);
890 					return;
891 				} else {
892 					free(cbp->cb_mirror, M_CCD);
893 				}
894 			}
895 		}
896 	}
897 
898 	/*
899 	 * Use bio_caller1 to determine how big the original request was rather
900 	 * than bio_bcount, because bio_bcount may have been truncated for EOF.
901 	 *
902 	 * XXX We check for an error, but we do not test the resid for an
903 	 * aligned EOF condition.  This may result in character & block
904 	 * device access not recognizing EOF properly when read or written
905 	 * sequentially, but will not affect filesystems.
906 	 */
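	/*
	 * Completion-accounting example (illustrative): a 64 KB ccd request
	 * split by ccdstart() into two 32 KB component chunks drives
	 * bp->bio_resid from 65536 to 32768 to 0 as each chunk's stashed
	 * bio_caller1 count is subtracted below; biofinish() runs only once
	 * it reaches 0.
	 */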
907 	count = (long)cbp->cb_buf.bio_caller1;
908 	free(cbp, M_CCD);
909 
910 	/*
911 	 * If all done, "interrupt".
912 	 */
913 	bp->bio_resid -= count;
914 	if (bp->bio_resid < 0)
915 		panic("ccdiodone: count");
916 	if (bp->bio_resid == 0) {
917 		if (bp->bio_flags & BIO_ERROR)
918 			bp->bio_resid = bp->bio_bcount;
919 		biofinish(bp, &cs->device_stats, 0);
920 	}
921 }
922 
923 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td);
924 
925 static int
926 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
927 {
928 	struct ccd_ioctl *ccio;
929 	u_int unit;
930 	dev_t dev2;
931 	int error;
932 
933 	switch (cmd) {
934 	case CCDIOCSET:
935 	case CCDIOCCLR:
936 		ccio = (struct ccd_ioctl *)data;
937 		unit = ccio->ccio_size;
938 		return (ccdioctltoo(unit, cmd, data, flag, td));
939 	case CCDCONFINFO:
940 		{
941 		int ninit = 0;
942 		struct ccdconf *conf = (struct ccdconf *)data;
943 		struct ccd_s *tmpcs;
944 		struct ccd_s *ubuf = conf->buffer;
945 
946 		/* XXX: LOCK(unique unit numbers) */
947 		LIST_FOREACH(tmpcs, &ccd_softc_list, list)
948 			if (IS_INITED(tmpcs))
949 				ninit++;
950 
951 		if (conf->size == 0) {
952 			conf->size = sizeof(struct ccd_s) * ninit;
953 			return (0);
954 		} else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
955 		    (conf->size % sizeof(struct ccd_s) != 0)) {
956 			/* XXX: UNLOCK(unique unit numbers) */
957 			return (EINVAL);
958 		}
959 
960 		ubuf += ninit;
961 		LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
962 			if (!IS_INITED(tmpcs))
963 				continue;
964 			error = copyout(tmpcs, --ubuf,
965 			    sizeof(struct ccd_s));
966 			if (error != 0)
967 				/* XXX: UNLOCK(unique unit numbers) */
968 				return (error);
969 		}
970 		/* XXX: UNLOCK(unique unit numbers) */
971 		return (0);
972 		}
973 
974 	case CCDCPPINFO:
975 		{
976 		struct ccdcpps *cpps = (struct ccdcpps *)data;
977 		char *ubuf = cpps->buffer;
978 		struct ccd_s *cs;
979 
980 
981 		error = copyin(ubuf, &unit, sizeof (unit));
982 		if (error)
983 			return (error);
984 
985 		if (!IS_ALLOCATED(unit))
986 			return (ENXIO);
987 		dev2 = makedev(CDEV_MAJOR, unit * 8 + 2);
988 		cs = ccdfind(unit);
989 		if (!IS_INITED(cs))
990 			return (ENXIO);
991 
992 		{
993 			int len = 0, i;
994 			struct ccdcpps *cpps = (struct ccdcpps *)data;
995 			char *ubuf = cpps->buffer;
996 
997 
998 			for (i = 0; i < cs->sc_nccdisks; ++i)
999 				len += cs->sc_cinfo[i].ci_pathlen;
1000 
1001 			if (cpps->size < len)
1002 				return (ENOMEM);
1003 
1004 			for (i = 0; i < cs->sc_nccdisks; ++i) {
1005 				len = cs->sc_cinfo[i].ci_pathlen;
1006 				error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1007 				    len);
1008 				if (error != 0)
1009 					return (error);
1010 				ubuf += len;
1011 			}
1012 			return(copyout("", ubuf, 1));
1013 		}
1014 		break;
1015 		}
1016 
1017 	default:
1018 		return (ENXIO);
1019 	}
1020 }
1021 
1022 static int
1023 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td)
1024 {
1025 	int i, j, lookedup = 0, error = 0;
1026 	struct ccd_s *cs;
1027 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1028 	struct ccdgeom *ccg;
1029 	char **cpp;
1030 	struct vnode **vpp;
1031 
1032 	cs = ccdfind(unit);
1033 	switch (cmd) {
1034 	case CCDIOCSET:
1035 		if (cs == NULL)
1036 			cs = ccdnew(unit);
1037 		if (IS_INITED(cs))
1038 			return (EBUSY);
1039 
1040 		if ((flag & FWRITE) == 0)
1041 			return (EBADF);
1042 
1043 		if ((error = ccdlock(cs)) != 0)
1044 			return (error);
1045 
1046 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1047 			return (EINVAL);
1048 
1049 		/* Fill in some important bits. */
1050 		cs->sc_ileave = ccio->ccio_ileave;
1051 		if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) {
1052 			printf("ccd%d: disabling mirror, interleave is 0\n",
1053 			    unit);
1054 			ccio->ccio_flags &= ~(CCDF_MIRROR);
1055 		}
1056 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1057 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1058 			printf("ccd%d: mirror/parity forces uniform flag\n",
1059 			       unit);
1060 			ccio->ccio_flags |= CCDF_UNIFORM;
1061 		}
1062 		cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1063 
1064 		/*
1065 		 * Allocate space for and copy in the array of
1066 		 * component pathnames and device numbers.
1067 		 */
1068 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1069 		    M_CCD, 0);
1070 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1071 		    M_CCD, 0);
1072 
1073 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1074 		    ccio->ccio_ndisks * sizeof(char **));
1075 		if (error) {
1076 			free(vpp, M_CCD);
1077 			free(cpp, M_CCD);
1078 			ccdunlock(cs);
1079 			return (error);
1080 		}
1081 
1082 
1083 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1084 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1085 				for (j = 0; j < lookedup; ++j)
1086 					(void)vn_close(vpp[j], FREAD|FWRITE,
1087 					    td->td_ucred, td);
1088 				free(vpp, M_CCD);
1089 				free(cpp, M_CCD);
1090 				ccdunlock(cs);
1091 				return (error);
1092 			}
1093 			++lookedup;
1094 		}
1095 		cs->sc_vpp = vpp;
1096 		cs->sc_nccdisks = ccio->ccio_ndisks;
1097 
1098 		/*
1099 		 * Initialize the ccd.  Fills in the softc for us.
1100 		 */
1101 		if ((error = ccdinit(cs, cpp, td)) != 0) {
1102 			for (j = 0; j < lookedup; ++j)
1103 				(void)vn_close(vpp[j], FREAD|FWRITE,
1104 				    td->td_ucred, td);
1105 			/*
1106 			 * We can't ccddestroy() cs just yet, because nothing
1107 			 * prevents a user-level app from doing another ioctl()
1108 			 * without closing the device first.  Therefore, declare
1109 			 * the unit null and void and let ccdclose()
1110 			 * destroy it when it is safe to do so.
1111 			 */
1112 			cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1113 			free(vpp, M_CCD);
1114 			free(cpp, M_CCD);
1115 			ccdunlock(cs);
1116 			return (error);
1117 		}
1118 		free(cpp, M_CCD);
1119 
1120 		/*
1121 		 * The ccd has been successfully initialized, so
1122 		 * we can place it into the array and read the disklabel.
1123 		 */
1124 		ccio->ccio_unit = unit;
1125 		ccio->ccio_size = cs->sc_size;
1126 		ccg = &cs->sc_geom;
1127 		cs->sc_disk = malloc(sizeof(struct disk), M_CCD, M_ZERO);
1128 		cs->sc_disk->d_strategy = ccdstrategy;
1129 		cs->sc_disk->d_name = "ccd";
1130 		cs->sc_disk->d_sectorsize = ccg->ccg_secsize;
1131 		cs->sc_disk->d_mediasize =
1132 		    cs->sc_size * (off_t)ccg->ccg_secsize;
1133 		cs->sc_disk->d_fwsectors = ccg->ccg_nsectors;
1134 		cs->sc_disk->d_fwheads = ccg->ccg_ntracks;
1135 		cs->sc_dev = disk_create(unit, cs->sc_disk, 0, NULL, NULL);
1136 		cs->sc_dev->si_drv1 = cs;
1137 
1138 		ccdunlock(cs);
1139 
1140 		break;
1141 
1142 	case CCDIOCCLR:
1143 		if (cs == NULL)
1144 			return (ENXIO);
1145 
1146 		if (!IS_INITED(cs))
1147 			return (ENXIO);
1148 
1149 		if ((flag & FWRITE) == 0)
1150 			return (EBADF);
1151 
1152 		if ((error = ccdlock(cs)) != 0)
1153 			return (error);
1154 
1155 		/* Don't unconfigure if any other partitions are open */
1156 		if (cs->sc_disk->d_flags & DISKFLAG_OPEN) {
1157 			ccdunlock(cs);
1158 			return (EBUSY);
1159 		}
1160 
1161 		disk_destroy(cs->sc_dev);
1162 		free(cs->sc_disk, M_CCD);
1163 		cs->sc_disk = NULL;
1164 		/* Declare unit null and void (reset all flags) */
1165 		cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1166 
1167 		/* Close the components and free their pathnames. */
1168 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1169 			/*
1170 			 * XXX: this close could potentially fail and
1171 			 * cause Bad Things.  Maybe we need to force
1172 			 * the close to happen?
1173 			 */
1174 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1175 			    td->td_ucred, td);
1176 			free(cs->sc_cinfo[i].ci_path, M_CCD);
1177 		}
1178 
1179 		/* Free interleave index. */
1180 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1181 			free(cs->sc_itable[i].ii_index, M_CCD);
1182 
1183 		/* Free component info and interleave table. */
1184 		free(cs->sc_cinfo, M_CCD);
1185 		free(cs->sc_itable, M_CCD);
1186 		free(cs->sc_vpp, M_CCD);
1187 
1188 		/* And remove the devstat entry. */
1189 		devstat_remove_entry(&cs->device_stats);
1190 
1191 		/* This must be atomic. */
1192 		ccdunlock(cs);
1193 		ccddestroy(cs);
1194 
1195 		break;
1196 	}
1197 
1198 	return (0);
1199 }
1200 
1201 
1202 /*
1203  * Look up the provided name in the filesystem.  If the file exists,
1204  * is a valid block device, and isn't being used by anyone else,
1205  * set *vpp to the file's vnode.
1206  */
1207 static int
1208 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1209 {
1210 	struct nameidata nd;
1211 	struct vnode *vp;
1212 	int error, flags;
1213 
1214 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1215 	flags = FREAD | FWRITE;
1216 	if ((error = vn_open(&nd, &flags, 0)) != 0) {
1217 		return (error);
1218 	}
1219 	vp = nd.ni_vp;
1220 
1221 	if (vrefcnt(vp) > 1) {
1222 		error = EBUSY;
1223 		goto bad;
1224 	}
1225 
1226 	if (!vn_isdisk(vp, &error))
1227 		goto bad;
1228 
1229 
1230 	VOP_UNLOCK(vp, 0, td);
1231 	NDFREE(&nd, NDF_ONLY_PNBUF);
1232 	*vpp = vp;
1233 	return (0);
1234 bad:
1235 	VOP_UNLOCK(vp, 0, td);
1236 	NDFREE(&nd, NDF_ONLY_PNBUF);
1237 	/* vn_close does vrele() for vp */
1238 	(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1239 	return (error);
1240 }
1241 
1242 /*
1243  *
1244  * Wait interruptibly for an exclusive lock.
1245  *
1246  * XXX
1247  * Several drivers do this; it should be abstracted and made MP-safe.
1248  */
1249 static int
1250 ccdlock(struct ccd_s *cs)
1251 {
1252 	int error;
1253 
1254 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1255 		cs->sc_flags |= CCDF_WANTED;
1256 		if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1257 			return (error);
1258 	}
1259 	cs->sc_flags |= CCDF_LOCKED;
1260 	return (0);
1261 }
1262 
1263 /*
1264  * Unlock and wake up any waiters.
1265  */
1266 static void
1267 ccdunlock(struct ccd_s *cs)
1268 {
1269 
1270 	cs->sc_flags &= ~CCDF_LOCKED;
1271 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1272 		cs->sc_flags &= ~CCDF_WANTED;
1273 		wakeup(cs);
1274 	}
1275 }
1276