xref: /freebsd/sys/geom/geom_ccd.c (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
3  *
4  * Copyright (c) 2003 Poul-Henning Kamp.
5  * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
33  */
34 
35 /*-
36  * Copyright (c) 1988 University of Utah.
37  * Copyright (c) 1990, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  *
40  * This code is derived from software contributed to Berkeley by
41  * the Systems Programming Group of the University of Utah Computer
42  * Science Department.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  * from: Utah $Hdr: cd.c 1.6 90/11/28$
69  */
70 
71 /*
72  * Dynamic configuration and disklabel support by:
73  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
74  *	Numerical Aerodynamic Simulation Facility
75  *	Mail Stop 258-6
76  *	NASA Ames Research Center
77  *	Moffett Field, CA 94035
78  */
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/kernel.h>
83 #include <sys/module.h>
84 #include <sys/bio.h>
85 #include <sys/malloc.h>
86 #include <sys/sbuf.h>
87 #include <geom/geom.h>
88 
/*
 * Number of blocks left untouched at the front of every component
 * partition.  Skipping them avoids overwriting the component's disklabel
 * area when the component begins at the start of its slice.  The value
 * may be overridden by defining CCD_OFFSET before this point.
 */
#ifndef CCD_OFFSET
#define CCD_OFFSET 16
#endif
97 
/*
 * Bits kept in ccd_s.sc_flags.
 */
#define CCDF_UNIFORM	0x02	/* uniform interleave: size all components alike */
#define CCDF_MIRROR	0x04	/* mirror the components */
#define CCDF_NO_OFFSET	0x08	/* reserve no blocks in front of components */
#define CCDF_LINUX	0x10	/* Linux compatibility mode */

/*
 * Subset of the flag bits that is reported back to the user
 * (see the "list" verb).
 */
#define CCDF_USERMASK	(CCDF_UNIFORM | CCDF_MIRROR)
106 
107 /*
108  * Interleave description table.
109  * Computed at boot time to speed irregular-interleave lookups.
110  * The idea is that we interleave in "groups".  First we interleave
111  * evenly over all component disks up to the size of the smallest
112  * component (the first group), then we interleave evenly over all
113  * remaining disks up to the size of the next-smallest (second group),
114  * and so on.
115  *
116  * Each table entry describes the interleave characteristics of one
117  * of these groups.  For example if a concatenated disk consisted of
118  * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
119  * DEV_BSIZE (1), the table would have three entries:
120  *
121  *	ndisk	startblk	startoff	dev
122  *	3	0		0		0, 1, 2
123  *	2	9		3		0, 2
124  *	1	13		5		2
125  *	0	-		-		-
126  *
127  * which says that the first nine blocks (0-8) are interleaved over
128  * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
129  * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
130  * at component block 3, and the remaining blocks (13-14) are on disk
131  * 2 starting at offset 5.
132  */
133 struct ccdiinfo {
134 	int	ii_ndisk;	/* # of disks range is interleaved over */
135 	daddr_t	ii_startblk;	/* starting scaled block # for range */
136 	daddr_t	ii_startoff;	/* starting component offset (block #) */
137 	int	*ii_index;	/* ordered list of components in range */
138 };
139 
140 /*
141  * Component info table.
142  * Describes a single component of a concatenated disk.
143  */
144 struct ccdcinfo {
145 	daddr_t		ci_size; 		/* size */
146 	struct g_provider *ci_provider;		/* provider */
147 	struct g_consumer *ci_consumer;		/* consumer */
148 };
149 
150 /*
151  * A concatenated disk is described by this structure.
152  */
153 
154 struct ccd_s {
155 	LIST_ENTRY(ccd_s) list;
156 
157 	int		 sc_unit;		/* logical unit number */
158 	int		 sc_flags;		/* flags */
159 	daddr_t		 sc_size;		/* size of ccd */
160 	int		 sc_ileave;		/* interleave */
161 	u_int		 sc_ndisks;		/* number of components */
162 	struct ccdcinfo	 *sc_cinfo;		/* component info */
163 	struct ccdiinfo	 *sc_itable;		/* interleave table */
164 	uint32_t	 sc_secsize;		/* # bytes per sector */
165 	int		 sc_pick;		/* side of mirror picked */
166 	daddr_t		 sc_blk[2];		/* mirror localization */
167 	uint32_t	 sc_offset;		/* actual offset used */
168 };
169 
170 static g_start_t g_ccd_start;
171 static void ccdiodone(struct bio *bp);
172 static void ccdinterleave(struct ccd_s *);
173 static int ccdinit(struct gctl_req *req, struct ccd_s *);
174 static int ccdbuffer(struct bio **ret, struct ccd_s *,
175 		      struct bio *, daddr_t, caddr_t, long);
176 
/*
 * Orphan method registered in g_ccd_class.  Intentionally a no-op.
 */
static void
g_ccd_orphan(struct g_consumer *cp)
{
	/*
	 * XXX: Nothing is done here on purpose.  It is unclear what the
	 * XXX: right reaction would be, so this keeps the behaviour of
	 * XXX: the earlier code: ignore the event and leave it to the
	 * XXX: user to deal with.
	 */
}
186 
187 static int
188 g_ccd_access(struct g_provider *pp, int dr, int dw, int de)
189 {
190 	struct g_geom *gp;
191 	struct g_consumer *cp1, *cp2;
192 	int error;
193 
194 	de += dr;
195 	de += dw;
196 
197 	gp = pp->geom;
198 	error = ENXIO;
199 	LIST_FOREACH(cp1, &gp->consumer, consumer) {
200 		error = g_access(cp1, dr, dw, de);
201 		if (error) {
202 			LIST_FOREACH(cp2, &gp->consumer, consumer) {
203 				if (cp1 == cp2)
204 					break;
205 				g_access(cp2, -dr, -dw, -de);
206 			}
207 			break;
208 		}
209 	}
210 	return (error);
211 }
212 
213 /*
214  * Free the softc and its substructures.
215  */
216 static void
217 g_ccd_freesc(struct ccd_s *sc)
218 {
219 	struct ccdiinfo *ii;
220 
221 	g_free(sc->sc_cinfo);
222 	if (sc->sc_itable != NULL) {
223 		for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++)
224 			g_free(ii->ii_index);
225 		g_free(sc->sc_itable);
226 	}
227 	g_free(sc);
228 }
229 
230 static int
231 ccdinit(struct gctl_req *req, struct ccd_s *cs)
232 {
233 	struct ccdcinfo *ci;
234 	daddr_t size;
235 	int ix;
236 	daddr_t minsize;
237 	int maxsecsize;
238 	off_t mediasize;
239 	u_int sectorsize;
240 
241 	cs->sc_size = 0;
242 
243 	maxsecsize = 0;
244 	minsize = 0;
245 
246 	if (cs->sc_flags & CCDF_LINUX) {
247 		cs->sc_offset = 0;
248 		cs->sc_ileave *= 2;
249 		if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2)
250 			gctl_error(req, "Mirror mode for Linux raids is "
251 			                "only supported with 2 devices");
252 	} else {
253 		if (cs->sc_flags & CCDF_NO_OFFSET)
254 			cs->sc_offset = 0;
255 		else
256 			cs->sc_offset = CCD_OFFSET;
257 	}
258 	for (ix = 0; ix < cs->sc_ndisks; ix++) {
259 		ci = &cs->sc_cinfo[ix];
260 
261 		mediasize = ci->ci_provider->mediasize;
262 		sectorsize = ci->ci_provider->sectorsize;
263 		if (sectorsize > maxsecsize)
264 			maxsecsize = sectorsize;
265 		size = mediasize / DEV_BSIZE - cs->sc_offset;
266 
267 		/* Truncate to interleave boundary */
268 
269 		if (cs->sc_ileave > 1)
270 			size -= size % cs->sc_ileave;
271 
272 		if (size == 0) {
273 			gctl_error(req, "Component %s has effective size zero",
274 			    ci->ci_provider->name);
275 			return(ENODEV);
276 		}
277 
278 		if (minsize == 0 || size < minsize)
279 			minsize = size;
280 		ci->ci_size = size;
281 		cs->sc_size += size;
282 	}
283 
284 	/*
285 	 * Don't allow the interleave to be smaller than
286 	 * the biggest component sector.
287 	 */
288 	if ((cs->sc_ileave > 0) &&
289 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
290 		gctl_error(req, "Interleave to small for sector size");
291 		return(EINVAL);
292 	}
293 
294 	/*
295 	 * If uniform interleave is desired set all sizes to that of
296 	 * the smallest component.  This will guarantee that a single
297 	 * interleave table is generated.
298 	 *
299 	 * Lost space must be taken into account when calculating the
300 	 * overall size.  Half the space is lost when CCDF_MIRROR is
301 	 * specified.
302 	 */
303 	if (cs->sc_flags & CCDF_UNIFORM) {
304 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
305 			ci = &cs->sc_cinfo[ix];
306 			ci->ci_size = minsize;
307 		}
308 		cs->sc_size = cs->sc_ndisks * minsize;
309 	}
310 
311 	if (cs->sc_flags & CCDF_MIRROR) {
312 		/*
313 		 * Check to see if an even number of components
314 		 * have been specified.  The interleave must also
315 		 * be non-zero in order for us to be able to
316 		 * guarantee the topology.
317 		 */
318 		if (cs->sc_ndisks % 2) {
319 			gctl_error(req,
320 			      "Mirroring requires an even number of disks");
321 			return(EINVAL);
322 		}
323 		if (cs->sc_ileave == 0) {
324 			gctl_error(req,
325 			     "An interleave must be specified when mirroring");
326 			return(EINVAL);
327 		}
328 		cs->sc_size = (cs->sc_ndisks/2) * minsize;
329 	}
330 
331 	/*
332 	 * Construct the interleave table.
333 	 */
334 	ccdinterleave(cs);
335 
336 	/*
337 	 * Create pseudo-geometry based on 1MB cylinders.  It's
338 	 * pretty close.
339 	 */
340 	cs->sc_secsize = maxsecsize;
341 
342 	return (0);
343 }
344 
345 static void
346 ccdinterleave(struct ccd_s *cs)
347 {
348 	struct ccdcinfo *ci, *smallci;
349 	struct ccdiinfo *ii;
350 	daddr_t bn, lbn;
351 	int ix;
352 	daddr_t size;
353 
354 	/*
355 	 * Allocate an interleave table.  The worst case occurs when each
356 	 * of N disks is of a different size, resulting in N interleave
357 	 * tables.
358 	 *
359 	 * Chances are this is too big, but we don't care.
360 	 */
361 	size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo);
362 	cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO);
363 
364 	/*
365 	 * Trivial case: no interleave (actually interleave of disk size).
366 	 * Each table entry represents a single component in its entirety.
367 	 *
368 	 * An interleave of 0 may not be used with a mirror setup.
369 	 */
370 	if (cs->sc_ileave == 0) {
371 		bn = 0;
372 		ii = cs->sc_itable;
373 
374 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
375 			/* Allocate space for ii_index. */
376 			ii->ii_index = g_malloc(sizeof(int), M_WAITOK);
377 			ii->ii_ndisk = 1;
378 			ii->ii_startblk = bn;
379 			ii->ii_startoff = 0;
380 			ii->ii_index[0] = ix;
381 			bn += cs->sc_cinfo[ix].ci_size;
382 			ii++;
383 		}
384 		ii->ii_ndisk = 0;
385 		return;
386 	}
387 
388 	/*
389 	 * The following isn't fast or pretty; it doesn't have to be.
390 	 */
391 	size = 0;
392 	bn = lbn = 0;
393 	for (ii = cs->sc_itable; ; ii++) {
394 		/*
395 		 * Allocate space for ii_index.  We might allocate more then
396 		 * we use.
397 		 */
398 		ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks),
399 		    M_WAITOK);
400 
401 		/*
402 		 * Locate the smallest of the remaining components
403 		 */
404 		smallci = NULL;
405 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks];
406 		    ci++) {
407 			if (ci->ci_size > size &&
408 			    (smallci == NULL ||
409 			     ci->ci_size < smallci->ci_size)) {
410 				smallci = ci;
411 			}
412 		}
413 
414 		/*
415 		 * Nobody left, all done
416 		 */
417 		if (smallci == NULL) {
418 			ii->ii_ndisk = 0;
419 			g_free(ii->ii_index);
420 			ii->ii_index = NULL;
421 			break;
422 		}
423 
424 		/*
425 		 * Record starting logical block using an sc_ileave blocksize.
426 		 */
427 		ii->ii_startblk = bn / cs->sc_ileave;
428 
429 		/*
430 		 * Record starting component block using an sc_ileave
431 		 * blocksize.  This value is relative to the beginning of
432 		 * a component disk.
433 		 */
434 		ii->ii_startoff = lbn;
435 
436 		/*
437 		 * Determine how many disks take part in this interleave
438 		 * and record their indices.
439 		 */
440 		ix = 0;
441 		for (ci = cs->sc_cinfo;
442 		    ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) {
443 			if (ci->ci_size >= smallci->ci_size) {
444 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
445 			}
446 		}
447 		ii->ii_ndisk = ix;
448 		bn += ix * (smallci->ci_size - size);
449 		lbn = smallci->ci_size / cs->sc_ileave;
450 		size = smallci->ci_size;
451 	}
452 }
453 
454 static void
455 g_ccd_start(struct bio *bp)
456 {
457 	long bcount, rcount;
458 	struct bio *cbp[2];
459 	caddr_t addr;
460 	daddr_t bn;
461 	int err;
462 	struct ccd_s *cs;
463 
464 	cs = bp->bio_to->geom->softc;
465 
466 	/*
467 	 * Block all GETATTR requests, we wouldn't know which of our
468 	 * subdevices we should ship it off to.
469 	 * XXX: this may not be the right policy.
470 	 */
471 	if(bp->bio_cmd == BIO_GETATTR) {
472 		g_io_deliver(bp, EINVAL);
473 		return;
474 	}
475 
476 	/*
477 	 * Translate the partition-relative block number to an absolute.
478 	 */
479 	bn = bp->bio_offset / cs->sc_secsize;
480 
481 	/*
482 	 * Allocate component buffers and fire off the requests
483 	 */
484 	addr = bp->bio_data;
485 	for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
486 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
487 		if (err) {
488 			bp->bio_completed += bcount;
489 			if (bp->bio_error == 0)
490 				bp->bio_error = err;
491 			if (bp->bio_completed == bp->bio_length)
492 				g_io_deliver(bp, bp->bio_error);
493 			return;
494 		}
495 		rcount = cbp[0]->bio_length;
496 
497 		if (cs->sc_flags & CCDF_MIRROR) {
498 			/*
499 			 * Mirroring.  Writes go to both disks, reads are
500 			 * taken from whichever disk seems most appropriate.
501 			 *
502 			 * We attempt to localize reads to the disk whos arm
503 			 * is nearest the read request.  We ignore seeks due
504 			 * to writes when making this determination and we
505 			 * also try to avoid hogging.
506 			 */
507 			if (cbp[0]->bio_cmd != BIO_READ) {
508 				g_io_request(cbp[0], cbp[0]->bio_from);
509 				g_io_request(cbp[1], cbp[1]->bio_from);
510 			} else {
511 				int pick = cs->sc_pick;
512 				daddr_t range = cs->sc_size / 16;
513 
514 				if (bn < cs->sc_blk[pick] - range ||
515 				    bn > cs->sc_blk[pick] + range
516 				) {
517 					cs->sc_pick = pick = 1 - pick;
518 				}
519 				cs->sc_blk[pick] = bn + btodb(rcount);
520 				g_io_request(cbp[pick], cbp[pick]->bio_from);
521 			}
522 		} else {
523 			/*
524 			 * Not mirroring
525 			 */
526 			g_io_request(cbp[0], cbp[0]->bio_from);
527 		}
528 		bn += btodb(rcount);
529 		addr += rcount;
530 	}
531 }
532 
533 /*
534  * Build a component buffer header.
535  */
536 static int
537 ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
538 {
539 	struct ccdcinfo *ci, *ci2 = NULL;
540 	struct bio *cbp;
541 	daddr_t cbn, cboff;
542 	off_t cbc;
543 
544 	/*
545 	 * Determine which component bn falls in.
546 	 */
547 	cbn = bn;
548 	cboff = 0;
549 
550 	if (cs->sc_ileave == 0) {
551 		/*
552 		 * Serially concatenated and neither a mirror nor a parity
553 		 * config.  This is a special case.
554 		 */
555 		daddr_t sblk;
556 
557 		sblk = 0;
558 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
559 			sblk += ci->ci_size;
560 		cbn -= sblk;
561 	} else {
562 		struct ccdiinfo *ii;
563 		int ccdisk, off;
564 
565 		/*
566 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
567 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
568 		 * to cbn.
569 		 */
570 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
571 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
572 
573 		/*
574 		 * Figure out which interleave table to use.
575 		 */
576 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
577 			if (ii->ii_startblk > cbn)
578 				break;
579 		}
580 		ii--;
581 
582 		/*
583 		 * off is the logical superblock relative to the beginning
584 		 * of this interleave block.
585 		 */
586 		off = cbn - ii->ii_startblk;
587 
588 		/*
589 		 * We must calculate which disk component to use (ccdisk),
590 		 * and recalculate cbn to be the superblock relative to
591 		 * the beginning of the component.  This is typically done by
592 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
593 		 * must typically be divided by the number of components in
594 		 * this interleave array to be properly convert it from a
595 		 * CCD-relative logical superblock number to a
596 		 * component-relative superblock number.
597 		 */
598 		if (ii->ii_ndisk == 1) {
599 			/*
600 			 * When we have just one disk, it can't be a mirror
601 			 * or a parity config.
602 			 */
603 			ccdisk = ii->ii_index[0];
604 			cbn = ii->ii_startoff + off;
605 		} else {
606 			if (cs->sc_flags & CCDF_MIRROR) {
607 				/*
608 				 * We have forced a uniform mapping, resulting
609 				 * in a single interleave array.  We double
610 				 * up on the first half of the available
611 				 * components and our mirror is in the second
612 				 * half.  This only works with a single
613 				 * interleave array because doubling up
614 				 * doubles the number of sectors, so there
615 				 * cannot be another interleave array because
616 				 * the next interleave array's calculations
617 				 * would be off.
618 				 */
619 				int ndisk2 = ii->ii_ndisk / 2;
620 				ccdisk = ii->ii_index[off % ndisk2];
621 				cbn = ii->ii_startoff + off / ndisk2;
622 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
623 			} else {
624 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
625 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
626 			}
627 		}
628 
629 		ci = &cs->sc_cinfo[ccdisk];
630 
631 		/*
632 		 * Convert cbn from a superblock to a normal block so it
633 		 * can be used to calculate (along with cboff) the normal
634 		 * block index into this particular disk.
635 		 */
636 		cbn *= cs->sc_ileave;
637 	}
638 
639 	/*
640 	 * Fill in the component buf structure.
641 	 */
642 	cbp = g_clone_bio(bp);
643 	if (cbp == NULL)
644 		return (ENOMEM);
645 	cbp->bio_done = g_std_done;
646 	cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset);
647 	cbp->bio_data = addr;
648 	if (cs->sc_ileave == 0)
649               cbc = dbtob((off_t)(ci->ci_size - cbn));
650 	else
651               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
652 	cbp->bio_length = (cbc < bcount) ? cbc : bcount;
653 
654 	cbp->bio_from = ci->ci_consumer;
655 	cb[0] = cbp;
656 
657 	if (cs->sc_flags & CCDF_MIRROR) {
658 		cbp = g_clone_bio(bp);
659 		if (cbp == NULL)
660 			return (ENOMEM);
661 		cbp->bio_done = cb[0]->bio_done = ccdiodone;
662 		cbp->bio_offset = cb[0]->bio_offset;
663 		cbp->bio_data = cb[0]->bio_data;
664 		cbp->bio_length = cb[0]->bio_length;
665 		cbp->bio_from = ci2->ci_consumer;
666 		cbp->bio_caller1 = cb[0];
667 		cb[0]->bio_caller1 = cbp;
668 		cb[1] = cbp;
669 	}
670 	return (0);
671 }
672 
673 /*
674  * Called only for mirrored operations.
675  */
676 static void
677 ccdiodone(struct bio *cbp)
678 {
679 	struct bio *mbp, *pbp;
680 
681 	mbp = cbp->bio_caller1;
682 	pbp = cbp->bio_parent;
683 
684 	if (pbp->bio_cmd == BIO_READ) {
685 		if (cbp->bio_error == 0) {
686 			/* We will not be needing the partner bio */
687 			if (mbp != NULL) {
688 				pbp->bio_inbed++;
689 				g_destroy_bio(mbp);
690 			}
691 			g_std_done(cbp);
692 			return;
693 		}
694 		if (mbp != NULL) {
695 			/* Try partner the bio instead */
696 			mbp->bio_caller1 = NULL;
697 			pbp->bio_inbed++;
698 			g_destroy_bio(cbp);
699 			g_io_request(mbp, mbp->bio_from);
700 			/*
701 			 * XXX: If this comes back OK, we should actually
702 			 * try to write the good data on the failed mirror
703 			 */
704 			return;
705 		}
706 		g_std_done(cbp);
707 		return;
708 	}
709 	if (mbp != NULL) {
710 		mbp->bio_caller1 = NULL;
711 		pbp->bio_inbed++;
712 		if (cbp->bio_error != 0 && pbp->bio_error == 0)
713 			pbp->bio_error = cbp->bio_error;
714 		g_destroy_bio(cbp);
715 		return;
716 	}
717 	g_std_done(cbp);
718 }
719 
720 static void
721 g_ccd_create(struct gctl_req *req, struct g_class *mp)
722 {
723 	int *unit, *ileave, *nprovider;
724 	struct g_geom *gp;
725 	struct g_consumer *cp;
726 	struct g_provider *pp;
727 	struct ccd_s *sc;
728 	struct sbuf *sb;
729 	char buf[20];
730 	int i, error;
731 
732 	g_topology_assert();
733 	unit = gctl_get_paraml(req, "unit", sizeof (*unit));
734 	if (unit == NULL) {
735 		gctl_error(req, "unit parameter not given");
736 		return;
737 	}
738 	ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave));
739 	if (ileave == NULL) {
740 		gctl_error(req, "ileave parameter not given");
741 		return;
742 	}
743 	nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider));
744 	if (nprovider == NULL) {
745 		gctl_error(req, "nprovider parameter not given");
746 		return;
747 	}
748 
749 	/* Check for duplicate unit */
750 	LIST_FOREACH(gp, &mp->geom, geom) {
751 		sc = gp->softc;
752 		if (sc != NULL && sc->sc_unit == *unit) {
753 			gctl_error(req, "Unit %d already configured", *unit);
754 			return;
755 		}
756 	}
757 
758 	if (*nprovider <= 0) {
759 		gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider);
760 		return;
761 	}
762 
763 	/* Check all providers are valid */
764 	for (i = 0; i < *nprovider; i++) {
765 		snprintf(buf, sizeof(buf), "provider%d", i);
766 		pp = gctl_get_provider(req, buf);
767 		if (pp == NULL)
768 			return;
769 	}
770 
771 	gp = g_new_geomf(mp, "ccd%d", *unit);
772 	sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO);
773 	gp->softc = sc;
774 	sc->sc_ndisks = *nprovider;
775 
776 	/* Allocate space for the component info. */
777 	sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo),
778 	    M_WAITOK | M_ZERO);
779 
780 	/* Create consumers and attach to all providers */
781 	for (i = 0; i < *nprovider; i++) {
782 		snprintf(buf, sizeof(buf), "provider%d", i);
783 		pp = gctl_get_provider(req, buf);
784 		cp = g_new_consumer(gp);
785 		error = g_attach(cp, pp);
786 		KASSERT(error == 0, ("attach to %s failed", pp->name));
787 		sc->sc_cinfo[i].ci_consumer = cp;
788 		sc->sc_cinfo[i].ci_provider = pp;
789 	}
790 
791 	sc->sc_unit = *unit;
792 	sc->sc_ileave = *ileave;
793 
794 	if (gctl_get_param(req, "no_offset", NULL))
795 		sc->sc_flags |= CCDF_NO_OFFSET;
796 	if (gctl_get_param(req, "linux", NULL))
797 		sc->sc_flags |= CCDF_LINUX;
798 
799 	if (gctl_get_param(req, "uniform", NULL))
800 		sc->sc_flags |= CCDF_UNIFORM;
801 	if (gctl_get_param(req, "mirror", NULL))
802 		sc->sc_flags |= CCDF_MIRROR;
803 
804 	if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) {
805 		printf("%s: disabling mirror, interleave is 0\n", gp->name);
806 		sc->sc_flags &= ~(CCDF_MIRROR);
807 	}
808 
809 	if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) {
810 		printf("%s: mirror/parity forces uniform flag\n", gp->name);
811 		sc->sc_flags |= CCDF_UNIFORM;
812 	}
813 
814 	error = ccdinit(req, sc);
815 	if (error != 0) {
816 		g_ccd_freesc(sc);
817 		gp->softc = NULL;
818 		g_wither_geom(gp, ENXIO);
819 		return;
820 	}
821 
822 	pp = g_new_providerf(gp, "%s", gp->name);
823 	pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize;
824 	pp->sectorsize = sc->sc_secsize;
825 	g_error_provider(pp, 0);
826 
827 	sb = sbuf_new_auto();
828 	sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider);
829 	for (i = 0; i < *nprovider; i++) {
830 		sbuf_printf(sb, "%s%s",
831 		    i == 0 ? "(" : ", ",
832 		    sc->sc_cinfo[i].ci_provider->name);
833 	}
834 	sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE);
835 	if (sc->sc_ileave != 0)
836 		sbuf_printf(sb, "interleaved at %d blocks\n",
837 			sc->sc_ileave);
838 	else
839 		sbuf_printf(sb, "concatenated\n");
840 	sbuf_finish(sb);
841 	gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
842 	sbuf_delete(sb);
843 }
844 
845 static int
846 g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
847 {
848 	struct g_provider *pp;
849 	struct ccd_s *sc;
850 
851 	g_topology_assert();
852 	sc = gp->softc;
853 	pp = LIST_FIRST(&gp->provider);
854 	if (sc == NULL || pp == NULL)
855 		return (EBUSY);
856 	if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
857 		gctl_error(req, "%s is open(r%dw%de%d)", gp->name,
858 		    pp->acr, pp->acw, pp->ace);
859 		return (EBUSY);
860 	}
861 	g_ccd_freesc(sc);
862 	gp->softc = NULL;
863 	g_wither_geom(gp, ENXIO);
864 	return (0);
865 }
866 
867 static void
868 g_ccd_list(struct gctl_req *req, struct g_class *mp)
869 {
870 	struct sbuf *sb;
871 	struct ccd_s *cs;
872 	struct g_geom *gp;
873 	int i, unit, *up;
874 
875 	up = gctl_get_paraml(req, "unit", sizeof (*up));
876 	if (up == NULL) {
877 		gctl_error(req, "unit parameter not given");
878 		return;
879 	}
880 	unit = *up;
881 	sb = sbuf_new_auto();
882 	LIST_FOREACH(gp, &mp->geom, geom) {
883 		cs = gp->softc;
884 		if (cs == NULL || (unit >= 0 && unit != cs->sc_unit))
885 			continue;
886 		sbuf_printf(sb, "ccd%d\t\t%d\t%d\t",
887 		    cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK);
888 
889 		for (i = 0; i < cs->sc_ndisks; ++i) {
890 			sbuf_printf(sb, "%s/dev/%s", i == 0 ? "" : " ",
891 			    cs->sc_cinfo[i].ci_provider->name);
892 		}
893 		sbuf_printf(sb, "\n");
894 	}
895 	sbuf_finish(sb);
896 	gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
897 	sbuf_delete(sb);
898 }
899 
900 static void
901 g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb)
902 {
903 	struct g_geom *gp;
904 
905 	g_topology_assert();
906 	if (!strcmp(verb, "create geom")) {
907 		g_ccd_create(req, mp);
908 	} else if (!strcmp(verb, "destroy geom")) {
909 		gp = gctl_get_geom(req, mp, "geom");
910 		if (gp != NULL)
911 			g_ccd_destroy_geom(req, mp, gp);
912 	} else if (!strcmp(verb, "list")) {
913 		g_ccd_list(req, mp);
914 	} else {
915 		gctl_error(req, "unknown verb");
916 	}
917 }
918 
919 static struct g_class g_ccd_class = {
920 	.name = "CCD",
921 	.version = G_VERSION,
922 	.ctlreq = g_ccd_config,
923 	.destroy_geom = g_ccd_destroy_geom,
924 	.start = g_ccd_start,
925 	.orphan = g_ccd_orphan,
926 	.access = g_ccd_access,
927 };
928 
929 DECLARE_GEOM_CLASS(g_ccd_class, g_ccd);
930 MODULE_VERSION(geom_ccd, 0);
931