xref: /freebsd/sys/geom/geom_ccd.c (revision f5147e312f43a9050468de539aeafa072caa1a60)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-NetBSD AND BSD-3-Clause)
3  *
4  * Copyright (c) 2003 Poul-Henning Kamp.
5  * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
33  */
34 
35 /*-
36  * Copyright (c) 1988 University of Utah.
37  * Copyright (c) 1990, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * This code is derived from software contributed to Berkeley by
41  * the Systems Programming Group of the University of Utah Computer
42  * Science Department.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  * from: Utah $Hdr: cd.c 1.6 90/11/28$
69  *
70  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
71  */
72 
73 /*
74  * Dynamic configuration and disklabel support by:
75  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
76  *	Numerical Aerodynamic Simulation Facility
77  *	Mail Stop 258-6
78  *	NASA Ames Research Center
79  *	Moffett Field, CA 94035
80  */
81 
82 #include <sys/cdefs.h>
83 __FBSDID("$FreeBSD$");
84 
85 #include <sys/param.h>
86 #include <sys/systm.h>
87 #include <sys/kernel.h>
88 #include <sys/module.h>
89 #include <sys/bio.h>
90 #include <sys/malloc.h>
91 #include <sys/sbuf.h>
92 #include <geom/geom.h>
93 
/*
 * Number of blocks to leave untouched in front of a component partition.
 * This keeps us from overwriting the component's disklabel area when the
 * component starts at the beginning of the slice.  The value may be
 * overridden by defining CCD_OFFSET before this point.
 */
#ifndef CCD_OFFSET
#define CCD_OFFSET 16
#endif

/* Bits kept in the sc_flags member of struct ccd_s. */
#define CCDF_UNIFORM	0x02	/* use LCCD of sizes for uniform interleave */
#define CCDF_MIRROR	0x04	/* use mirroring */
#define CCDF_NO_OFFSET	0x08	/* do not leave space in front */
#define CCDF_LINUX	0x10	/* use Linux compatibility mode */

/* The subset of flags reported back to the user by the "list" verb. */
#define CCDF_USERMASK	(CCDF_UNIFORM | CCDF_MIRROR)
111 
112 /*
113  * Interleave description table.
 * Computed when a ccd is configured to speed irregular-interleave lookups.
115  * The idea is that we interleave in "groups".  First we interleave
116  * evenly over all component disks up to the size of the smallest
117  * component (the first group), then we interleave evenly over all
118  * remaining disks up to the size of the next-smallest (second group),
119  * and so on.
120  *
121  * Each table entry describes the interleave characteristics of one
122  * of these groups.  For example if a concatenated disk consisted of
123  * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
124  * DEV_BSIZE (1), the table would have three entries:
125  *
126  *	ndisk	startblk	startoff	dev
127  *	3	0		0		0, 1, 2
128  *	2	9		3		0, 2
129  *	1	13		5		2
130  *	0	-		-		-
131  *
132  * which says that the first nine blocks (0-8) are interleaved over
133  * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
134  * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
135  * at component block 3, and the remaining blocks (13-14) are on disk
136  * 2 starting at offset 5.
137  */
138 struct ccdiinfo {
139 	int	ii_ndisk;	/* # of disks range is interleaved over */
140 	daddr_t	ii_startblk;	/* starting scaled block # for range */
141 	daddr_t	ii_startoff;	/* starting component offset (block #) */
142 	int	*ii_index;	/* ordered list of components in range */
143 };
144 
145 /*
146  * Component info table.
147  * Describes a single component of a concatenated disk.
148  */
149 struct ccdcinfo {
150 	daddr_t		ci_size; 		/* size */
151 	struct g_provider *ci_provider;		/* provider */
152 	struct g_consumer *ci_consumer;		/* consumer */
153 };
154 
155 /*
156  * A concatenated disk is described by this structure.
157  */
158 
159 struct ccd_s {
160 	LIST_ENTRY(ccd_s) list;
161 
162 	int		 sc_unit;		/* logical unit number */
163 	int		 sc_flags;		/* flags */
164 	daddr_t		 sc_size;		/* size of ccd */
165 	int		 sc_ileave;		/* interleave */
166 	u_int		 sc_ndisks;		/* number of components */
167 	struct ccdcinfo	 *sc_cinfo;		/* component info */
168 	struct ccdiinfo	 *sc_itable;		/* interleave table */
169 	u_int32_t	 sc_secsize;		/* # bytes per sector */
170 	int		 sc_pick;		/* side of mirror picked */
171 	daddr_t		 sc_blk[2];		/* mirror localization */
172 	u_int32_t	 sc_offset;		/* actual offset used */
173 };
174 
175 static g_start_t g_ccd_start;
176 static void ccdiodone(struct bio *bp);
177 static void ccdinterleave(struct ccd_s *);
178 static int ccdinit(struct gctl_req *req, struct ccd_s *);
179 static int ccdbuffer(struct bio **ret, struct ccd_s *,
180 		      struct bio *, daddr_t, caddr_t, long);
181 
/*
 * GEOM orphan method: invoked for a consumer of ours (see the .orphan
 * member of g_ccd_class).  Deliberately a no-op.
 */
static void
g_ccd_orphan(struct g_consumer *cp)
{

	/*
	 * XXX: Nothing is done here.  It is not obvious what the right
	 * XXX: reaction would be, so we keep the historical behaviour:
	 * XXX: ignore the event and let the user cope.
	 */
	(void)cp;
}
191 
192 static int
193 g_ccd_access(struct g_provider *pp, int dr, int dw, int de)
194 {
195 	struct g_geom *gp;
196 	struct g_consumer *cp1, *cp2;
197 	int error;
198 
199 	de += dr;
200 	de += dw;
201 
202 	gp = pp->geom;
203 	error = ENXIO;
204 	LIST_FOREACH(cp1, &gp->consumer, consumer) {
205 		error = g_access(cp1, dr, dw, de);
206 		if (error) {
207 			LIST_FOREACH(cp2, &gp->consumer, consumer) {
208 				if (cp1 == cp2)
209 					break;
210 				g_access(cp2, -dr, -dw, -de);
211 			}
212 			break;
213 		}
214 	}
215 	return (error);
216 }
217 
218 /*
219  * Free the softc and its substructures.
220  */
221 static void
222 g_ccd_freesc(struct ccd_s *sc)
223 {
224 	struct ccdiinfo *ii;
225 
226 	g_free(sc->sc_cinfo);
227 	if (sc->sc_itable != NULL) {
228 		for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++)
229 			if (ii->ii_index != NULL)
230 				g_free(ii->ii_index);
231 		g_free(sc->sc_itable);
232 	}
233 	g_free(sc);
234 }
235 
236 
237 static int
238 ccdinit(struct gctl_req *req, struct ccd_s *cs)
239 {
240 	struct ccdcinfo *ci;
241 	daddr_t size;
242 	int ix;
243 	daddr_t minsize;
244 	int maxsecsize;
245 	off_t mediasize;
246 	u_int sectorsize;
247 
248 	cs->sc_size = 0;
249 
250 	maxsecsize = 0;
251 	minsize = 0;
252 
253 	if (cs->sc_flags & CCDF_LINUX) {
254 		cs->sc_offset = 0;
255 		cs->sc_ileave *= 2;
256 		if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2)
257 			gctl_error(req, "Mirror mode for Linux raids is "
258 			                "only supported with 2 devices");
259 	} else {
260 		if (cs->sc_flags & CCDF_NO_OFFSET)
261 			cs->sc_offset = 0;
262 		else
263 			cs->sc_offset = CCD_OFFSET;
264 
265 	}
266 	for (ix = 0; ix < cs->sc_ndisks; ix++) {
267 		ci = &cs->sc_cinfo[ix];
268 
269 		mediasize = ci->ci_provider->mediasize;
270 		sectorsize = ci->ci_provider->sectorsize;
271 		if (sectorsize > maxsecsize)
272 			maxsecsize = sectorsize;
273 		size = mediasize / DEV_BSIZE - cs->sc_offset;
274 
275 		/* Truncate to interleave boundary */
276 
277 		if (cs->sc_ileave > 1)
278 			size -= size % cs->sc_ileave;
279 
280 		if (size == 0) {
281 			gctl_error(req, "Component %s has effective size zero",
282 			    ci->ci_provider->name);
283 			return(ENODEV);
284 		}
285 
286 		if (minsize == 0 || size < minsize)
287 			minsize = size;
288 		ci->ci_size = size;
289 		cs->sc_size += size;
290 	}
291 
292 	/*
293 	 * Don't allow the interleave to be smaller than
294 	 * the biggest component sector.
295 	 */
296 	if ((cs->sc_ileave > 0) &&
297 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
298 		gctl_error(req, "Interleave to small for sector size");
299 		return(EINVAL);
300 	}
301 
302 	/*
303 	 * If uniform interleave is desired set all sizes to that of
304 	 * the smallest component.  This will guarantee that a single
305 	 * interleave table is generated.
306 	 *
307 	 * Lost space must be taken into account when calculating the
308 	 * overall size.  Half the space is lost when CCDF_MIRROR is
309 	 * specified.
310 	 */
311 	if (cs->sc_flags & CCDF_UNIFORM) {
312 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
313 			ci = &cs->sc_cinfo[ix];
314 			ci->ci_size = minsize;
315 		}
316 		cs->sc_size = cs->sc_ndisks * minsize;
317 	}
318 
319 	if (cs->sc_flags & CCDF_MIRROR) {
320 		/*
321 		 * Check to see if an even number of components
322 		 * have been specified.  The interleave must also
323 		 * be non-zero in order for us to be able to
324 		 * guarantee the topology.
325 		 */
326 		if (cs->sc_ndisks % 2) {
327 			gctl_error(req,
328 			      "Mirroring requires an even number of disks");
329 			return(EINVAL);
330 		}
331 		if (cs->sc_ileave == 0) {
332 			gctl_error(req,
333 			     "An interleave must be specified when mirroring");
334 			return(EINVAL);
335 		}
336 		cs->sc_size = (cs->sc_ndisks/2) * minsize;
337 	}
338 
339 	/*
340 	 * Construct the interleave table.
341 	 */
342 	ccdinterleave(cs);
343 
344 	/*
345 	 * Create pseudo-geometry based on 1MB cylinders.  It's
346 	 * pretty close.
347 	 */
348 	cs->sc_secsize = maxsecsize;
349 
350 	return (0);
351 }
352 
353 static void
354 ccdinterleave(struct ccd_s *cs)
355 {
356 	struct ccdcinfo *ci, *smallci;
357 	struct ccdiinfo *ii;
358 	daddr_t bn, lbn;
359 	int ix;
360 	daddr_t size;
361 
362 
363 	/*
364 	 * Allocate an interleave table.  The worst case occurs when each
365 	 * of N disks is of a different size, resulting in N interleave
366 	 * tables.
367 	 *
368 	 * Chances are this is too big, but we don't care.
369 	 */
370 	size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo);
371 	cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO);
372 
373 	/*
374 	 * Trivial case: no interleave (actually interleave of disk size).
375 	 * Each table entry represents a single component in its entirety.
376 	 *
377 	 * An interleave of 0 may not be used with a mirror setup.
378 	 */
379 	if (cs->sc_ileave == 0) {
380 		bn = 0;
381 		ii = cs->sc_itable;
382 
383 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
384 			/* Allocate space for ii_index. */
385 			ii->ii_index = g_malloc(sizeof(int), M_WAITOK);
386 			ii->ii_ndisk = 1;
387 			ii->ii_startblk = bn;
388 			ii->ii_startoff = 0;
389 			ii->ii_index[0] = ix;
390 			bn += cs->sc_cinfo[ix].ci_size;
391 			ii++;
392 		}
393 		ii->ii_ndisk = 0;
394 		return;
395 	}
396 
397 	/*
398 	 * The following isn't fast or pretty; it doesn't have to be.
399 	 */
400 	size = 0;
401 	bn = lbn = 0;
402 	for (ii = cs->sc_itable; ; ii++) {
403 		/*
404 		 * Allocate space for ii_index.  We might allocate more then
405 		 * we use.
406 		 */
407 		ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks),
408 		    M_WAITOK);
409 
410 		/*
411 		 * Locate the smallest of the remaining components
412 		 */
413 		smallci = NULL;
414 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks];
415 		    ci++) {
416 			if (ci->ci_size > size &&
417 			    (smallci == NULL ||
418 			     ci->ci_size < smallci->ci_size)) {
419 				smallci = ci;
420 			}
421 		}
422 
423 		/*
424 		 * Nobody left, all done
425 		 */
426 		if (smallci == NULL) {
427 			ii->ii_ndisk = 0;
428 			g_free(ii->ii_index);
429 			ii->ii_index = NULL;
430 			break;
431 		}
432 
433 		/*
434 		 * Record starting logical block using an sc_ileave blocksize.
435 		 */
436 		ii->ii_startblk = bn / cs->sc_ileave;
437 
438 		/*
439 		 * Record starting component block using an sc_ileave
440 		 * blocksize.  This value is relative to the beginning of
441 		 * a component disk.
442 		 */
443 		ii->ii_startoff = lbn;
444 
445 		/*
446 		 * Determine how many disks take part in this interleave
447 		 * and record their indices.
448 		 */
449 		ix = 0;
450 		for (ci = cs->sc_cinfo;
451 		    ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) {
452 			if (ci->ci_size >= smallci->ci_size) {
453 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
454 			}
455 		}
456 		ii->ii_ndisk = ix;
457 		bn += ix * (smallci->ci_size - size);
458 		lbn = smallci->ci_size / cs->sc_ileave;
459 		size = smallci->ci_size;
460 	}
461 }
462 
463 static void
464 g_ccd_start(struct bio *bp)
465 {
466 	long bcount, rcount;
467 	struct bio *cbp[2];
468 	caddr_t addr;
469 	daddr_t bn;
470 	int err;
471 	struct ccd_s *cs;
472 
473 	cs = bp->bio_to->geom->softc;
474 
475 	/*
476 	 * Block all GETATTR requests, we wouldn't know which of our
477 	 * subdevices we should ship it off to.
478 	 * XXX: this may not be the right policy.
479 	 */
480 	if(bp->bio_cmd == BIO_GETATTR) {
481 		g_io_deliver(bp, EINVAL);
482 		return;
483 	}
484 
485 	/*
486 	 * Translate the partition-relative block number to an absolute.
487 	 */
488 	bn = bp->bio_offset / cs->sc_secsize;
489 
490 	/*
491 	 * Allocate component buffers and fire off the requests
492 	 */
493 	addr = bp->bio_data;
494 	for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
495 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
496 		if (err) {
497 			bp->bio_completed += bcount;
498 			if (bp->bio_error == 0)
499 				bp->bio_error = err;
500 			if (bp->bio_completed == bp->bio_length)
501 				g_io_deliver(bp, bp->bio_error);
502 			return;
503 		}
504 		rcount = cbp[0]->bio_length;
505 
506 		if (cs->sc_flags & CCDF_MIRROR) {
507 			/*
508 			 * Mirroring.  Writes go to both disks, reads are
509 			 * taken from whichever disk seems most appropriate.
510 			 *
511 			 * We attempt to localize reads to the disk whos arm
512 			 * is nearest the read request.  We ignore seeks due
513 			 * to writes when making this determination and we
514 			 * also try to avoid hogging.
515 			 */
516 			if (cbp[0]->bio_cmd != BIO_READ) {
517 				g_io_request(cbp[0], cbp[0]->bio_from);
518 				g_io_request(cbp[1], cbp[1]->bio_from);
519 			} else {
520 				int pick = cs->sc_pick;
521 				daddr_t range = cs->sc_size / 16;
522 
523 				if (bn < cs->sc_blk[pick] - range ||
524 				    bn > cs->sc_blk[pick] + range
525 				) {
526 					cs->sc_pick = pick = 1 - pick;
527 				}
528 				cs->sc_blk[pick] = bn + btodb(rcount);
529 				g_io_request(cbp[pick], cbp[pick]->bio_from);
530 			}
531 		} else {
532 			/*
533 			 * Not mirroring
534 			 */
535 			g_io_request(cbp[0], cbp[0]->bio_from);
536 		}
537 		bn += btodb(rcount);
538 		addr += rcount;
539 	}
540 }
541 
542 /*
543  * Build a component buffer header.
544  */
545 static int
546 ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
547 {
548 	struct ccdcinfo *ci, *ci2 = NULL;
549 	struct bio *cbp;
550 	daddr_t cbn, cboff;
551 	off_t cbc;
552 
553 	/*
554 	 * Determine which component bn falls in.
555 	 */
556 	cbn = bn;
557 	cboff = 0;
558 
559 	if (cs->sc_ileave == 0) {
560 		/*
561 		 * Serially concatenated and neither a mirror nor a parity
562 		 * config.  This is a special case.
563 		 */
564 		daddr_t sblk;
565 
566 		sblk = 0;
567 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
568 			sblk += ci->ci_size;
569 		cbn -= sblk;
570 	} else {
571 		struct ccdiinfo *ii;
572 		int ccdisk, off;
573 
574 		/*
575 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
576 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
577 		 * to cbn.
578 		 */
579 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
580 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
581 
582 		/*
583 		 * Figure out which interleave table to use.
584 		 */
585 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
586 			if (ii->ii_startblk > cbn)
587 				break;
588 		}
589 		ii--;
590 
591 		/*
592 		 * off is the logical superblock relative to the beginning
593 		 * of this interleave block.
594 		 */
595 		off = cbn - ii->ii_startblk;
596 
597 		/*
598 		 * We must calculate which disk component to use (ccdisk),
599 		 * and recalculate cbn to be the superblock relative to
600 		 * the beginning of the component.  This is typically done by
601 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
602 		 * must typically be divided by the number of components in
603 		 * this interleave array to be properly convert it from a
604 		 * CCD-relative logical superblock number to a
605 		 * component-relative superblock number.
606 		 */
607 		if (ii->ii_ndisk == 1) {
608 			/*
609 			 * When we have just one disk, it can't be a mirror
610 			 * or a parity config.
611 			 */
612 			ccdisk = ii->ii_index[0];
613 			cbn = ii->ii_startoff + off;
614 		} else {
615 			if (cs->sc_flags & CCDF_MIRROR) {
616 				/*
617 				 * We have forced a uniform mapping, resulting
618 				 * in a single interleave array.  We double
619 				 * up on the first half of the available
620 				 * components and our mirror is in the second
621 				 * half.  This only works with a single
622 				 * interleave array because doubling up
623 				 * doubles the number of sectors, so there
624 				 * cannot be another interleave array because
625 				 * the next interleave array's calculations
626 				 * would be off.
627 				 */
628 				int ndisk2 = ii->ii_ndisk / 2;
629 				ccdisk = ii->ii_index[off % ndisk2];
630 				cbn = ii->ii_startoff + off / ndisk2;
631 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
632 			} else {
633 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
634 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
635 			}
636 		}
637 
638 		ci = &cs->sc_cinfo[ccdisk];
639 
640 		/*
641 		 * Convert cbn from a superblock to a normal block so it
642 		 * can be used to calculate (along with cboff) the normal
643 		 * block index into this particular disk.
644 		 */
645 		cbn *= cs->sc_ileave;
646 	}
647 
648 	/*
649 	 * Fill in the component buf structure.
650 	 */
651 	cbp = g_clone_bio(bp);
652 	if (cbp == NULL)
653 		return (ENOMEM);
654 	cbp->bio_done = g_std_done;
655 	cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset);
656 	cbp->bio_data = addr;
657 	if (cs->sc_ileave == 0)
658               cbc = dbtob((off_t)(ci->ci_size - cbn));
659 	else
660               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
661 	cbp->bio_length = (cbc < bcount) ? cbc : bcount;
662 
663 	cbp->bio_from = ci->ci_consumer;
664 	cb[0] = cbp;
665 
666 	if (cs->sc_flags & CCDF_MIRROR) {
667 		cbp = g_clone_bio(bp);
668 		if (cbp == NULL)
669 			return (ENOMEM);
670 		cbp->bio_done = cb[0]->bio_done = ccdiodone;
671 		cbp->bio_offset = cb[0]->bio_offset;
672 		cbp->bio_data = cb[0]->bio_data;
673 		cbp->bio_length = cb[0]->bio_length;
674 		cbp->bio_from = ci2->ci_consumer;
675 		cbp->bio_caller1 = cb[0];
676 		cb[0]->bio_caller1 = cbp;
677 		cb[1] = cbp;
678 	}
679 	return (0);
680 }
681 
682 /*
683  * Called only for mirrored operations.
684  */
685 static void
686 ccdiodone(struct bio *cbp)
687 {
688 	struct bio *mbp, *pbp;
689 
690 	mbp = cbp->bio_caller1;
691 	pbp = cbp->bio_parent;
692 
693 	if (pbp->bio_cmd == BIO_READ) {
694 		if (cbp->bio_error == 0) {
695 			/* We will not be needing the partner bio */
696 			if (mbp != NULL) {
697 				pbp->bio_inbed++;
698 				g_destroy_bio(mbp);
699 			}
700 			g_std_done(cbp);
701 			return;
702 		}
703 		if (mbp != NULL) {
704 			/* Try partner the bio instead */
705 			mbp->bio_caller1 = NULL;
706 			pbp->bio_inbed++;
707 			g_destroy_bio(cbp);
708 			g_io_request(mbp, mbp->bio_from);
709 			/*
710 			 * XXX: If this comes back OK, we should actually
711 			 * try to write the good data on the failed mirror
712 			 */
713 			return;
714 		}
715 		g_std_done(cbp);
716 		return;
717 	}
718 	if (mbp != NULL) {
719 		mbp->bio_caller1 = NULL;
720 		pbp->bio_inbed++;
721 		if (cbp->bio_error != 0 && pbp->bio_error == 0)
722 			pbp->bio_error = cbp->bio_error;
723 		g_destroy_bio(cbp);
724 		return;
725 	}
726 	g_std_done(cbp);
727 }
728 
729 static void
730 g_ccd_create(struct gctl_req *req, struct g_class *mp)
731 {
732 	int *unit, *ileave, *nprovider;
733 	struct g_geom *gp;
734 	struct g_consumer *cp;
735 	struct g_provider *pp;
736 	struct ccd_s *sc;
737 	struct sbuf *sb;
738 	char buf[20];
739 	int i, error;
740 
741 	g_topology_assert();
742 	unit = gctl_get_paraml(req, "unit", sizeof (*unit));
743 	if (unit == NULL) {
744 		gctl_error(req, "unit parameter not given");
745 		return;
746 	}
747 	ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave));
748 	if (ileave == NULL) {
749 		gctl_error(req, "ileave parameter not given");
750 		return;
751 	}
752 	nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider));
753 	if (nprovider == NULL) {
754 		gctl_error(req, "nprovider parameter not given");
755 		return;
756 	}
757 
758 	/* Check for duplicate unit */
759 	LIST_FOREACH(gp, &mp->geom, geom) {
760 		sc = gp->softc;
761 		if (sc != NULL && sc->sc_unit == *unit) {
762 			gctl_error(req, "Unit %d already configured", *unit);
763 			return;
764 		}
765 	}
766 
767 	if (*nprovider <= 0) {
768 		gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider);
769 		return;
770 	}
771 
772 	/* Check all providers are valid */
773 	for (i = 0; i < *nprovider; i++) {
774 		sprintf(buf, "provider%d", i);
775 		pp = gctl_get_provider(req, buf);
776 		if (pp == NULL)
777 			return;
778 	}
779 
780 	gp = g_new_geomf(mp, "ccd%d", *unit);
781 	sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO);
782 	gp->softc = sc;
783 	sc->sc_ndisks = *nprovider;
784 
785 	/* Allocate space for the component info. */
786 	sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo),
787 	    M_WAITOK | M_ZERO);
788 
789 	/* Create consumers and attach to all providers */
790 	for (i = 0; i < *nprovider; i++) {
791 		sprintf(buf, "provider%d", i);
792 		pp = gctl_get_provider(req, buf);
793 		cp = g_new_consumer(gp);
794 		error = g_attach(cp, pp);
795 		KASSERT(error == 0, ("attach to %s failed", pp->name));
796 		sc->sc_cinfo[i].ci_consumer = cp;
797 		sc->sc_cinfo[i].ci_provider = pp;
798 	}
799 
800 	sc->sc_unit = *unit;
801 	sc->sc_ileave = *ileave;
802 
803 	if (gctl_get_param(req, "no_offset", NULL))
804 		sc->sc_flags |= CCDF_NO_OFFSET;
805 	if (gctl_get_param(req, "linux", NULL))
806 		sc->sc_flags |= CCDF_LINUX;
807 
808 	if (gctl_get_param(req, "uniform", NULL))
809 		sc->sc_flags |= CCDF_UNIFORM;
810 	if (gctl_get_param(req, "mirror", NULL))
811 		sc->sc_flags |= CCDF_MIRROR;
812 
813 	if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) {
814 		printf("%s: disabling mirror, interleave is 0\n", gp->name);
815 		sc->sc_flags &= ~(CCDF_MIRROR);
816 	}
817 
818 	if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) {
819 		printf("%s: mirror/parity forces uniform flag\n", gp->name);
820 		sc->sc_flags |= CCDF_UNIFORM;
821 	}
822 
823 	error = ccdinit(req, sc);
824 	if (error != 0) {
825 		g_ccd_freesc(sc);
826 		gp->softc = NULL;
827 		g_wither_geom(gp, ENXIO);
828 		return;
829 	}
830 
831 	pp = g_new_providerf(gp, "%s", gp->name);
832 	pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize;
833 	pp->sectorsize = sc->sc_secsize;
834 	g_error_provider(pp, 0);
835 
836 	sb = sbuf_new_auto();
837 	sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider);
838 	for (i = 0; i < *nprovider; i++) {
839 		sbuf_printf(sb, "%s%s",
840 		    i == 0 ? "(" : ", ",
841 		    sc->sc_cinfo[i].ci_provider->name);
842 	}
843 	sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE);
844 	if (sc->sc_ileave != 0)
845 		sbuf_printf(sb, "interleaved at %d blocks\n",
846 			sc->sc_ileave);
847 	else
848 		sbuf_printf(sb, "concatenated\n");
849 	sbuf_finish(sb);
850 	gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
851 	sbuf_delete(sb);
852 }
853 
854 static int
855 g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
856 {
857 	struct g_provider *pp;
858 	struct ccd_s *sc;
859 
860 	g_topology_assert();
861 	sc = gp->softc;
862 	pp = LIST_FIRST(&gp->provider);
863 	if (sc == NULL || pp == NULL)
864 		return (EBUSY);
865 	if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
866 		gctl_error(req, "%s is open(r%dw%de%d)", gp->name,
867 		    pp->acr, pp->acw, pp->ace);
868 		return (EBUSY);
869 	}
870 	g_ccd_freesc(sc);
871 	gp->softc = NULL;
872 	g_wither_geom(gp, ENXIO);
873 	return (0);
874 }
875 
876 static void
877 g_ccd_list(struct gctl_req *req, struct g_class *mp)
878 {
879 	struct sbuf *sb;
880 	struct ccd_s *cs;
881 	struct g_geom *gp;
882 	int i, unit, *up;
883 
884 	up = gctl_get_paraml(req, "unit", sizeof (*up));
885 	if (up == NULL) {
886 		gctl_error(req, "unit parameter not given");
887 		return;
888 	}
889 	unit = *up;
890 	sb = sbuf_new_auto();
891 	LIST_FOREACH(gp, &mp->geom, geom) {
892 		cs = gp->softc;
893 		if (cs == NULL || (unit >= 0 && unit != cs->sc_unit))
894 			continue;
895 		sbuf_printf(sb, "ccd%d\t\t%d\t%d\t",
896 		    cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK);
897 
898 		for (i = 0; i < cs->sc_ndisks; ++i) {
899 			sbuf_printf(sb, "%s/dev/%s", i == 0 ? "" : " ",
900 			    cs->sc_cinfo[i].ci_provider->name);
901 		}
902 		sbuf_printf(sb, "\n");
903 	}
904 	sbuf_finish(sb);
905 	gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
906 	sbuf_delete(sb);
907 }
908 
909 static void
910 g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb)
911 {
912 	struct g_geom *gp;
913 
914 	g_topology_assert();
915 	if (!strcmp(verb, "create geom")) {
916 		g_ccd_create(req, mp);
917 	} else if (!strcmp(verb, "destroy geom")) {
918 		gp = gctl_get_geom(req, mp, "geom");
919 		if (gp != NULL)
920 		g_ccd_destroy_geom(req, mp, gp);
921 	} else if (!strcmp(verb, "list")) {
922 		g_ccd_list(req, mp);
923 	} else {
924 		gctl_error(req, "unknown verb");
925 	}
926 }
927 
928 static struct g_class g_ccd_class = {
929 	.name = "CCD",
930 	.version = G_VERSION,
931 	.ctlreq = g_ccd_config,
932 	.destroy_geom = g_ccd_destroy_geom,
933 	.start = g_ccd_start,
934 	.orphan = g_ccd_orphan,
935 	.access = g_ccd_access,
936 };
937 
938 DECLARE_GEOM_CLASS(g_ccd_class, g_ccd);
939