/*
 * Copyright (c) 2003 Poul-Henning Kamp.
 * Copyright (c) 1995 Jason R. Thorpe.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * All rights reserved.
 * Copyright (c) 1988 University of Utah.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project
 *	by Jason R. Thorpe.
 * 4. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Dynamic configuration and disklabel support by:
 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
 *	Numerical Aerodynamic Simulation Facility
 *	Mail Stop 258-6
 *	NASA Ames Research Center
 *	Moffett Field, CA 94035
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 *
 *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>

#include <sys/ccdvar.h>

/*
 * Component info table.
 * Describes a single component of a concatenated disk.
 * One entry per component; the array is allocated and filled in by
 * ccdinit() and released when the ccd is unconfigured (CCDIOCCLR).
 */
struct ccdcinfo {
	struct vnode	*ci_vp;			/* device's vnode */
	dev_t		ci_dev;			/* XXX: device's dev_t */
	size_t		ci_size;		/* usable size in DEV_BSIZE
						 * blocks: media size minus
						 * CCD_OFFSET, truncated to an
						 * interleave boundary */
	char		*ci_path;		/* path to component (M_CCD
						 * copy of the user string) */
	size_t		ci_pathlen;		/* length of component path as
						 * reported by copyinstr() */
};

/*
 * Interleave description table.
 * Computed when the ccd is configured (see ccdinterleave()) to speed
 * irregular-interleave lookups.
 * The idea is that we interleave in "groups".
* First we interleave
 * evenly over all component disks up to the size of the smallest
 * component (the first group), then we interleave evenly over all
 * remaining disks up to the size of the next-smallest (second group),
 * and so on.
 *
 * Each table entry describes the interleave characteristics of one
 * of these groups.  For example if a concatenated disk consisted of
 * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
 * DEV_BSIZE (1), the table would have three entries:
 *
 *	ndisk	startblk	startoff	dev
 *	3	0		0		0, 1, 2
 *	2	9		3		0, 2
 *	1	13		5		2
 *	0	-		-		-
 *
 * which says that the first nine blocks (0-8) are interleaved over
 * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
 * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
 * at component block 3, and the remaining blocks (13-14) are on disk
 * 2 starting at offset 5.
 *
 * The table is terminated by an entry whose ii_ndisk is 0.
 */
struct ccdiinfo {
	int	ii_ndisk;	/* # of disks range is interleaved over */
	daddr_t	ii_startblk;	/* starting scaled block # for range */
	daddr_t	ii_startoff;	/* starting component offset (block #) */
	int	*ii_index;	/* ordered list of components in range */
};

/*
 * Concatenated disk pseudo-geometry information.
 * Filled in by ccdinit() and exported through the struct disk that
 * is created for the configured unit.
 */
struct ccdgeom {
	u_int32_t	ccg_secsize;	/* # bytes per sector */
	u_int32_t	ccg_nsectors;	/* # data sectors per track */
	u_int32_t	ccg_ntracks;	/* # tracks per cylinder */
	u_int32_t	ccg_ncylinders;	/* # cylinders per unit */
};


/*
 * A concatenated disk is described by this structure.
*/
struct ccd_s {
	LIST_ENTRY(ccd_s) list;			/* linkage on ccd_softc_list */

	int		 sc_unit;		/* logical unit number */
	struct vnode	 **sc_vpp;		/* array of component vnodes */
	int		 sc_flags;		/* flags */
	int		 sc_cflags;		/* configuration flags (copy of
						 * sc_flags taken by ccdinit) */
	size_t		 sc_size;		/* size of ccd */
	int		 sc_ileave;		/* interleave (0 means serial
						 * concatenation) */
	u_int		 sc_nccdisks;		/* number of components */
#define	CCD_MAXNDISKS	65536
	struct ccdcinfo	 *sc_cinfo;		/* component info */
	struct ccdiinfo	 *sc_itable;		/* interleave table */
	struct ccdgeom   sc_geom;		/* pseudo geometry info */
	int		 sc_pick;		/* side of mirror picked */
	daddr_t		 sc_blk[2];		/* mirror localization */
	struct disk	 *sc_disk;		/* disk(9) instance, allocated
						 * at CCDIOCSET time */
	struct cdev	 *__remove00;		/* XXX: remove when convenient */
};

MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/*
   This is how mirroring works (only writes are special):

   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
   linked together by the cb_mirror field.  "cb_pflags &
   CCDPF_MIRROR_DONE" is set to 0 on both of them.

   When a component returns to ccdiodone(), it checks if "cb_pflags &
   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
   flag and returns.  If it is, it means its partner has already
   returned, so it will go to the regular cleanup.

 */

/*
 * Per-component I/O request.  cb_buf must stay the bio handed to the
 * component; ccdiodone() finds its ccdbuf again through bio_caller2.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* ccd this request belongs to */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* control device ("ccd.ctl"), created by ccdattach() */
static dev_t	ccdctldev;

static disk_strategy_t ccdstrategy;
static d_ioctl_t ccdctlioctl;

#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccdctl_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_ioctl =	ccdctlioctl,
	.d_name =	"ccdctl",
	.d_maj =	CDEV_MAJOR,
};

/* all allocated units, configured or not; searched by ccdfind() */
static LIST_HEAD(, ccd_s) ccd_softc_list =
	LIST_HEAD_INITIALIZER(&ccd_softc_list);

static	struct ccd_s *ccdfind(int);
static	struct ccd_s *ccdnew(int);
static	int ccddestroy(struct ccd_s *);

/* called during module initialization */
static	void ccdattach(void);
static	int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone(struct bio *bp);

static	void ccdstart(struct ccd_s *, struct bio *);
static	void ccdinterleave(struct ccd_s *, int);
static	int ccdinit(struct ccd_s *, char **, struct thread *);
static	int ccdlookup(char *, struct thread *p, struct vnode **);
static	int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		struct bio *, daddr_t, caddr_t, long);
static	int ccdlock(struct ccd_s *);
static	void ccdunlock(struct ccd_s *);


/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
231 */ 232 #if !defined(CCD_OFFSET) 233 #define CCD_OFFSET 16 234 #endif 235 236 static struct ccd_s * 237 ccdfind(int unit) 238 { 239 struct ccd_s *sc = NULL; 240 241 /* XXX: LOCK(unique unit numbers) */ 242 LIST_FOREACH(sc, &ccd_softc_list, list) { 243 if (sc->sc_unit == unit) 244 break; 245 } 246 /* XXX: UNLOCK(unique unit numbers) */ 247 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc); 248 } 249 250 static struct ccd_s * 251 ccdnew(int unit) 252 { 253 struct ccd_s *sc; 254 255 /* XXX: LOCK(unique unit numbers) */ 256 if (IS_ALLOCATED(unit) || unit > 32) 257 return (NULL); 258 259 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO); 260 sc->sc_unit = unit; 261 LIST_INSERT_HEAD(&ccd_softc_list, sc, list); 262 /* XXX: UNLOCK(unique unit numbers) */ 263 return (sc); 264 } 265 266 static int 267 ccddestroy(struct ccd_s *sc) 268 { 269 270 /* XXX: LOCK(unique unit numbers) */ 271 LIST_REMOVE(sc, list); 272 /* XXX: UNLOCK(unique unit numbers) */ 273 FREE(sc, M_CCD); 274 return (0); 275 } 276 277 /* 278 * Called by main() during pseudo-device attachment. All we need 279 * to do is to add devsw entries. 
280 */ 281 static void 282 ccdattach() 283 { 284 285 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 286 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 287 ccdctldev->si_drv1 = ccdctldev; 288 } 289 290 static int 291 ccd_modevent(module_t mod, int type, void *data) 292 { 293 int error = 0; 294 295 switch (type) { 296 case MOD_LOAD: 297 ccdattach(); 298 break; 299 300 case MOD_UNLOAD: 301 printf("ccd0: Unload not supported!\n"); 302 error = EOPNOTSUPP; 303 break; 304 305 case MOD_SHUTDOWN: 306 break; 307 308 default: 309 error = EOPNOTSUPP; 310 } 311 return (error); 312 } 313 314 DEV_MODULE(ccd, ccd_modevent, NULL); 315 316 static int 317 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 318 { 319 struct ccdcinfo *ci = NULL; /* XXX */ 320 size_t size; 321 int ix; 322 struct vnode *vp; 323 size_t minsize; 324 int maxsecsize; 325 struct ccdgeom *ccg = &cs->sc_geom; 326 char *tmppath = NULL; 327 int error = 0; 328 off_t mediasize; 329 u_int sectorsize; 330 331 332 cs->sc_size = 0; 333 334 /* Allocate space for the component info. */ 335 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 336 M_CCD, M_WAITOK); 337 338 /* 339 * Verify that each component piece exists and record 340 * relevant information about it. 341 */ 342 maxsecsize = 0; 343 minsize = 0; 344 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK); 345 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 346 vp = cs->sc_vpp[ix]; 347 ci = &cs->sc_cinfo[ix]; 348 ci->ci_vp = vp; 349 350 /* 351 * Copy in the pathname of the component. 352 */ 353 if ((error = copyinstr(cpaths[ix], tmppath, 354 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 355 goto fail; 356 } 357 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK); 358 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 359 360 ci->ci_dev = vn_todev(vp); 361 362 /* 363 * Get partition information for the component. 
364 */ 365 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 366 FREAD, td->td_ucred, td); 367 if (error != 0) { 368 goto fail; 369 } 370 /* 371 * Get partition information for the component. 372 */ 373 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 374 FREAD, td->td_ucred, td); 375 if (error != 0) { 376 goto fail; 377 } 378 if (sectorsize > maxsecsize) 379 maxsecsize = sectorsize; 380 size = mediasize / DEV_BSIZE - CCD_OFFSET; 381 382 /* 383 * Calculate the size, truncating to an interleave 384 * boundary if necessary. 385 */ 386 387 if (cs->sc_ileave > 1) 388 size -= size % cs->sc_ileave; 389 390 if (size == 0) { 391 error = ENODEV; 392 goto fail; 393 } 394 395 if (minsize == 0 || size < minsize) 396 minsize = size; 397 ci->ci_size = size; 398 cs->sc_size += size; 399 } 400 401 free(tmppath, M_CCD); 402 tmppath = NULL; 403 404 /* 405 * Don't allow the interleave to be smaller than 406 * the biggest component sector. 407 */ 408 if ((cs->sc_ileave > 0) && 409 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 410 error = EINVAL; 411 goto fail; 412 } 413 414 /* 415 * If uniform interleave is desired set all sizes to that of 416 * the smallest component. This will guarentee that a single 417 * interleave table is generated. 418 * 419 * Lost space must be taken into account when calculating the 420 * overall size. Half the space is lost when CCDF_MIRROR is 421 * specified. 422 */ 423 if (cs->sc_flags & CCDF_UNIFORM) { 424 for (ci = cs->sc_cinfo; 425 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 426 ci->ci_size = minsize; 427 } 428 if (cs->sc_flags & CCDF_MIRROR) { 429 /* 430 * Check to see if an even number of components 431 * have been specified. The interleave must also 432 * be non-zero in order for us to be able to 433 * guarentee the topology. 
434 */ 435 if (cs->sc_nccdisks % 2) { 436 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 437 error = EINVAL; 438 goto fail; 439 } 440 if (cs->sc_ileave == 0) { 441 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 442 error = EINVAL; 443 goto fail; 444 } 445 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 446 } else { 447 if (cs->sc_ileave == 0) { 448 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 449 error = EINVAL; 450 goto fail; 451 } 452 cs->sc_size = cs->sc_nccdisks * minsize; 453 } 454 } 455 456 /* 457 * Construct the interleave table. 458 */ 459 ccdinterleave(cs, cs->sc_unit); 460 461 /* 462 * Create pseudo-geometry based on 1MB cylinders. It's 463 * pretty close. 464 */ 465 ccg->ccg_secsize = maxsecsize; 466 ccg->ccg_ntracks = 1; 467 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 468 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 469 470 cs->sc_flags |= CCDF_INITED; 471 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 472 return (0); 473 fail: 474 while (ci > cs->sc_cinfo) { 475 ci--; 476 free(ci->ci_path, M_CCD); 477 } 478 if (tmppath != NULL) 479 free(tmppath, M_CCD); 480 free(cs->sc_cinfo, M_CCD); 481 ccddestroy(cs); 482 return (error); 483 } 484 485 static void 486 ccdinterleave(struct ccd_s *cs, int unit) 487 { 488 struct ccdcinfo *ci, *smallci; 489 struct ccdiinfo *ii; 490 daddr_t bn, lbn; 491 int ix; 492 u_long size; 493 494 495 /* 496 * Allocate an interleave table. The worst case occurs when each 497 * of N disks is of a different size, resulting in N interleave 498 * tables. 499 * 500 * Chances are this is too big, but we don't care. 501 */ 502 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 503 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 504 M_WAITOK | M_ZERO); 505 506 /* 507 * Trivial case: no interleave (actually interleave of disk size). 508 * Each table entry represents a single component in its entirety. 
509 * 510 * An interleave of 0 may not be used with a mirror setup. 511 */ 512 if (cs->sc_ileave == 0) { 513 bn = 0; 514 ii = cs->sc_itable; 515 516 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 517 /* Allocate space for ii_index. */ 518 ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK); 519 ii->ii_ndisk = 1; 520 ii->ii_startblk = bn; 521 ii->ii_startoff = 0; 522 ii->ii_index[0] = ix; 523 bn += cs->sc_cinfo[ix].ci_size; 524 ii++; 525 } 526 ii->ii_ndisk = 0; 527 return; 528 } 529 530 /* 531 * The following isn't fast or pretty; it doesn't have to be. 532 */ 533 size = 0; 534 bn = lbn = 0; 535 for (ii = cs->sc_itable; ; ii++) { 536 /* 537 * Allocate space for ii_index. We might allocate more then 538 * we use. 539 */ 540 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 541 M_CCD, M_WAITOK); 542 543 /* 544 * Locate the smallest of the remaining components 545 */ 546 smallci = NULL; 547 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 548 ci++) { 549 if (ci->ci_size > size && 550 (smallci == NULL || 551 ci->ci_size < smallci->ci_size)) { 552 smallci = ci; 553 } 554 } 555 556 /* 557 * Nobody left, all done 558 */ 559 if (smallci == NULL) { 560 ii->ii_ndisk = 0; 561 free(ii->ii_index, M_CCD); 562 break; 563 } 564 565 /* 566 * Record starting logical block using an sc_ileave blocksize. 567 */ 568 ii->ii_startblk = bn / cs->sc_ileave; 569 570 /* 571 * Record starting comopnent block using an sc_ileave 572 * blocksize. This value is relative to the beginning of 573 * a component disk. 574 */ 575 ii->ii_startoff = lbn; 576 577 /* 578 * Determine how many disks take part in this interleave 579 * and record their indices. 
580 */ 581 ix = 0; 582 for (ci = cs->sc_cinfo; 583 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 584 if (ci->ci_size >= smallci->ci_size) { 585 ii->ii_index[ix++] = ci - cs->sc_cinfo; 586 } 587 } 588 ii->ii_ndisk = ix; 589 bn += ix * (smallci->ci_size - size); 590 lbn = smallci->ci_size / cs->sc_ileave; 591 size = smallci->ci_size; 592 } 593 } 594 595 static void 596 ccdstrategy(struct bio *bp) 597 { 598 struct ccd_s *cs; 599 int pbn; /* in sc_secsize chunks */ 600 long sz; /* in sc_secsize chunks */ 601 602 cs = bp->bio_disk->d_drv1; 603 604 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 605 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 606 607 /* 608 * If out of bounds return an error. If at the EOF point, 609 * simply read or write less. 610 */ 611 612 if (pbn < 0 || pbn >= cs->sc_size) { 613 bp->bio_resid = bp->bio_bcount; 614 if (pbn != cs->sc_size) 615 biofinish(bp, NULL, EINVAL); 616 else 617 biodone(bp); 618 return; 619 } 620 621 /* 622 * If the request crosses EOF, truncate the request. 623 */ 624 if (pbn + sz > cs->sc_size) { 625 bp->bio_bcount = (cs->sc_size - pbn) * 626 cs->sc_geom.ccg_secsize; 627 } 628 629 bp->bio_resid = bp->bio_bcount; 630 631 /* 632 * "Start" the unit. 633 */ 634 ccdstart(cs, bp); 635 return; 636 } 637 638 static void 639 ccdstart(struct ccd_s *cs, struct bio *bp) 640 { 641 long bcount, rcount; 642 struct ccdbuf *cbp[2]; 643 caddr_t addr; 644 daddr_t bn; 645 int err; 646 int sent; 647 648 /* 649 * Translate the partition-relative block number to an absolute. 
650 */ 651 bn = bp->bio_blkno; 652 653 /* 654 * Allocate component buffers and fire off the requests 655 */ 656 addr = bp->bio_data; 657 sent = 0; 658 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 659 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 660 if (err) { 661 printf("ccdbuffer error %d\n", err); 662 if (!sent) 663 biofinish(bp, NULL, err); 664 else { 665 /* 666 * XXX: maybe a race where the partners 667 * XXX: we sent already have been in 668 * XXX: ccdiodone(). Single-threaded g_down 669 * XXX: may protect against this. 670 */ 671 bp->bio_resid -= bcount; 672 bp->bio_error = err; 673 bp->bio_flags |= BIO_ERROR; 674 } 675 return; 676 } 677 rcount = cbp[0]->cb_buf.bio_bcount; 678 679 if (cs->sc_cflags & CCDF_MIRROR) { 680 /* 681 * Mirroring. Writes go to both disks, reads are 682 * taken from whichever disk seems most appropriate. 683 * 684 * We attempt to localize reads to the disk whos arm 685 * is nearest the read request. We ignore seeks due 686 * to writes when making this determination and we 687 * also try to avoid hogging. 688 */ 689 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 690 BIO_STRATEGY(&cbp[0]->cb_buf); 691 BIO_STRATEGY(&cbp[1]->cb_buf); 692 sent++; 693 } else { 694 int pick = cs->sc_pick; 695 daddr_t range = cs->sc_size / 16; 696 697 if (bn < cs->sc_blk[pick] - range || 698 bn > cs->sc_blk[pick] + range 699 ) { 700 cs->sc_pick = pick = 1 - pick; 701 } 702 cs->sc_blk[pick] = bn + btodb(rcount); 703 BIO_STRATEGY(&cbp[pick]->cb_buf); 704 sent++; 705 } 706 } else { 707 /* 708 * Not mirroring 709 */ 710 BIO_STRATEGY(&cbp[0]->cb_buf); 711 sent++; 712 } 713 bn += btodb(rcount); 714 addr += rcount; 715 } 716 } 717 718 /* 719 * Build a component buffer header. 
720 */ 721 static int 722 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 723 { 724 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 725 struct ccdbuf *cbp; 726 daddr_t cbn, cboff; 727 off_t cbc; 728 729 /* 730 * Determine which component bn falls in. 731 */ 732 cbn = bn; 733 cboff = 0; 734 735 if (cs->sc_ileave == 0) { 736 /* 737 * Serially concatenated and neither a mirror nor a parity 738 * config. This is a special case. 739 */ 740 daddr_t sblk; 741 742 sblk = 0; 743 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 744 sblk += ci->ci_size; 745 cbn -= sblk; 746 } else { 747 struct ccdiinfo *ii; 748 int ccdisk, off; 749 750 /* 751 * Calculate cbn, the logical superblock (sc_ileave chunks), 752 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 753 * to cbn. 754 */ 755 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 756 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 757 758 /* 759 * Figure out which interleave table to use. 760 */ 761 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 762 if (ii->ii_startblk > cbn) 763 break; 764 } 765 ii--; 766 767 /* 768 * off is the logical superblock relative to the beginning 769 * of this interleave block. 770 */ 771 off = cbn - ii->ii_startblk; 772 773 /* 774 * We must calculate which disk component to use (ccdisk), 775 * and recalculate cbn to be the superblock relative to 776 * the beginning of the component. This is typically done by 777 * adding 'off' and ii->ii_startoff together. However, 'off' 778 * must typically be divided by the number of components in 779 * this interleave array to be properly convert it from a 780 * CCD-relative logical superblock number to a 781 * component-relative superblock number. 782 */ 783 if (ii->ii_ndisk == 1) { 784 /* 785 * When we have just one disk, it can't be a mirror 786 * or a parity config. 
787 */ 788 ccdisk = ii->ii_index[0]; 789 cbn = ii->ii_startoff + off; 790 } else { 791 if (cs->sc_cflags & CCDF_MIRROR) { 792 /* 793 * We have forced a uniform mapping, resulting 794 * in a single interleave array. We double 795 * up on the first half of the available 796 * components and our mirror is in the second 797 * half. This only works with a single 798 * interleave array because doubling up 799 * doubles the number of sectors, so there 800 * cannot be another interleave array because 801 * the next interleave array's calculations 802 * would be off. 803 */ 804 int ndisk2 = ii->ii_ndisk / 2; 805 ccdisk = ii->ii_index[off % ndisk2]; 806 cbn = ii->ii_startoff + off / ndisk2; 807 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 808 } else { 809 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 810 cbn = ii->ii_startoff + off / ii->ii_ndisk; 811 } 812 } 813 814 ci = &cs->sc_cinfo[ccdisk]; 815 816 /* 817 * Convert cbn from a superblock to a normal block so it 818 * can be used to calculate (along with cboff) the normal 819 * block index into this particular disk. 820 */ 821 cbn *= cs->sc_ileave; 822 } 823 824 /* 825 * Fill in the component buf structure. 826 */ 827 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 828 if (cbp == NULL) 829 return (ENOMEM); 830 cbp->cb_buf.bio_cmd = bp->bio_cmd; 831 cbp->cb_buf.bio_done = ccdiodone; 832 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 833 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 834 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 835 cbp->cb_buf.bio_data = addr; 836 cbp->cb_buf.bio_caller2 = cbp; 837 if (cs->sc_ileave == 0) 838 cbc = dbtob((off_t)(ci->ci_size - cbn)); 839 else 840 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 841 cbp->cb_buf.bio_bcount = (cbc < bcount) ? 
cbc : bcount; 842 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 843 844 /* 845 * context for ccdiodone 846 */ 847 cbp->cb_obp = bp; 848 cbp->cb_softc = cs; 849 cbp->cb_comp = ci - cs->sc_cinfo; 850 851 cb[0] = cbp; 852 853 /* 854 * Note: both I/O's setup when reading from mirror, but only one 855 * will be executed. 856 */ 857 if (cs->sc_cflags & CCDF_MIRROR) { 858 /* mirror, setup second I/O */ 859 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 860 if (cbp == NULL) { 861 free(cb[0], M_CCD); 862 cb[0] = NULL; 863 return (ENOMEM); 864 } 865 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 866 cbp->cb_buf.bio_caller2 = cbp; 867 cbp->cb_buf.bio_dev = ci2->ci_dev; 868 cbp->cb_comp = ci2 - cs->sc_cinfo; 869 cb[1] = cbp; 870 /* link together the ccdbuf's and clear "mirror done" flag */ 871 cb[0]->cb_mirror = cb[1]; 872 cb[1]->cb_mirror = cb[0]; 873 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 874 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 875 } 876 return (0); 877 } 878 879 /* 880 * Called at interrupt time. 881 * Mark the component as done and if all components are done, 882 * take a ccd interrupt. 883 */ 884 static void 885 ccdiodone(struct bio *ibp) 886 { 887 struct ccdbuf *cbp; 888 struct bio *bp; 889 struct ccd_s *cs; 890 int count; 891 892 cbp = ibp->bio_caller2; 893 cs = cbp->cb_softc; 894 bp = cbp->cb_obp; 895 /* 896 * If an error occured, report it. If this is a mirrored 897 * configuration and the first of two possible reads, do not 898 * set the error in the bp yet because the second read may 899 * succeed. 900 */ 901 902 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 903 const char *msg = ""; 904 905 if ((cs->sc_cflags & CCDF_MIRROR) && 906 (cbp->cb_buf.bio_cmd == BIO_READ) && 907 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 908 /* 909 * We will try our read on the other disk down 910 * below, also reverse the default pick so if we 911 * are doing a scan we do not keep hitting the 912 * bad disk first. 
913 */ 914 915 msg = ", trying other disk"; 916 cs->sc_pick = 1 - cs->sc_pick; 917 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 918 } else { 919 bp->bio_flags |= BIO_ERROR; 920 bp->bio_error = cbp->cb_buf.bio_error ? 921 cbp->cb_buf.bio_error : EIO; 922 } 923 printf("ccd%d: error %d on component %d block %jd " 924 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 925 cbp->cb_comp, 926 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 927 msg); 928 } 929 930 /* 931 * Process mirror. If we are writing, I/O has been initiated on both 932 * buffers and we fall through only after both are finished. 933 * 934 * If we are reading only one I/O is initiated at a time. If an 935 * error occurs we initiate the second I/O and return, otherwise 936 * we free the second I/O without initiating it. 937 */ 938 939 if (cs->sc_cflags & CCDF_MIRROR) { 940 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 941 /* 942 * When writing, handshake with the second buffer 943 * to determine when both are done. If both are not 944 * done, return here. 945 */ 946 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 947 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 948 free(cbp, M_CCD); 949 return; 950 } 951 } else { 952 /* 953 * When reading, either dispose of the second buffer 954 * or initiate I/O on the second buffer if an error 955 * occured with this one. 956 */ 957 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 958 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 959 cbp->cb_mirror->cb_pflags |= 960 CCDPF_MIRROR_DONE; 961 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 962 free(cbp, M_CCD); 963 return; 964 } else { 965 free(cbp->cb_mirror, M_CCD); 966 } 967 } 968 } 969 } 970 971 /* 972 * use bio_caller1 to determine how big the original request was rather 973 * then bio_bcount, because bio_bcount may have been truncated for EOF. 974 * 975 * XXX We check for an error, but we do not test the resid for an 976 * aligned EOF condition. 
This may result in character & block 977 * device access not recognizing EOF properly when read or written 978 * sequentially, but will not effect filesystems. 979 */ 980 count = (long)cbp->cb_buf.bio_caller1; 981 free(cbp, M_CCD); 982 983 /* 984 * If all done, "interrupt". 985 */ 986 bp->bio_resid -= count; 987 if (bp->bio_resid < 0) 988 panic("ccdiodone: count"); 989 if (bp->bio_resid == 0) { 990 if (bp->bio_flags & BIO_ERROR) 991 bp->bio_resid = bp->bio_bcount; 992 biodone(bp); 993 } 994 } 995 996 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 997 998 static int 999 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 1000 { 1001 struct ccd_ioctl *ccio; 1002 u_int unit; 1003 1004 switch (cmd) { 1005 case CCDIOCSET: 1006 case CCDIOCCLR: 1007 ccio = (struct ccd_ioctl *)data; 1008 unit = ccio->ccio_size; 1009 return (ccdioctltoo(unit, cmd, data, flag, td)); 1010 default: 1011 return (ENOIOCTL); 1012 } 1013 } 1014 1015 static int 1016 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1017 { 1018 int i, j, lookedup = 0, error = 0; 1019 struct ccd_s *cs; 1020 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1021 struct ccdgeom *ccg; 1022 char **cpp; 1023 struct vnode **vpp; 1024 1025 cs = ccdfind(unit); 1026 switch (cmd) { 1027 case CCDIOCSET: 1028 if (cs == NULL) 1029 cs = ccdnew(unit); 1030 if (IS_INITED(cs)) 1031 return (EBUSY); 1032 1033 if ((flag & FWRITE) == 0) 1034 return (EBADF); 1035 1036 if ((error = ccdlock(cs)) != 0) 1037 return (error); 1038 1039 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1040 return (EINVAL); 1041 1042 /* Fill in some important bits. 
*/ 1043 cs->sc_ileave = ccio->ccio_ileave; 1044 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1045 printf("ccd%d: disabling mirror, interleave is 0\n", 1046 unit); 1047 ccio->ccio_flags &= ~(CCDF_MIRROR); 1048 } 1049 if ((ccio->ccio_flags & CCDF_MIRROR) && 1050 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1051 printf("ccd%d: mirror/parity forces uniform flag\n", 1052 unit); 1053 ccio->ccio_flags |= CCDF_UNIFORM; 1054 } 1055 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1056 1057 /* 1058 * Allocate space for and copy in the array of 1059 * componet pathnames and device numbers. 1060 */ 1061 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1062 M_CCD, M_WAITOK); 1063 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1064 M_CCD, M_WAITOK); 1065 1066 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1067 ccio->ccio_ndisks * sizeof(char **)); 1068 if (error) { 1069 free(vpp, M_CCD); 1070 free(cpp, M_CCD); 1071 ccdunlock(cs); 1072 return (error); 1073 } 1074 1075 1076 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1077 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1078 for (j = 0; j < lookedup; ++j) 1079 (void)vn_close(vpp[j], FREAD|FWRITE, 1080 td->td_ucred, td); 1081 free(vpp, M_CCD); 1082 free(cpp, M_CCD); 1083 ccdunlock(cs); 1084 return (error); 1085 } 1086 ++lookedup; 1087 } 1088 cs->sc_vpp = vpp; 1089 cs->sc_nccdisks = ccio->ccio_ndisks; 1090 1091 /* 1092 * Initialize the ccd. Fills in the softc for us. 1093 */ 1094 if ((error = ccdinit(cs, cpp, td)) != 0) { 1095 for (j = 0; j < lookedup; ++j) 1096 (void)vn_close(vpp[j], FREAD|FWRITE, 1097 td->td_ucred, td); 1098 /* 1099 * We can't ccddestroy() cs just yet, because nothing 1100 * prevents user-level app to do another ioctl() 1101 * without closing the device first, therefore 1102 * declare unit null and void and let ccdclose() 1103 * destroy it when it is safe to do so. 
1104 */ 1105 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1106 free(vpp, M_CCD); 1107 free(cpp, M_CCD); 1108 ccdunlock(cs); 1109 return (error); 1110 } 1111 free(cpp, M_CCD); 1112 1113 /* 1114 * The ccd has been successfully initialized, so 1115 * we can place it into the array and read the disklabel. 1116 */ 1117 ccio->ccio_unit = unit; 1118 ccio->ccio_size = cs->sc_size; 1119 ccg = &cs->sc_geom; 1120 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, 1121 M_ZERO | M_WAITOK); 1122 cs->sc_disk->d_strategy = ccdstrategy; 1123 cs->sc_disk->d_name = "ccd"; 1124 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1125 cs->sc_disk->d_mediasize = 1126 cs->sc_size * (off_t)ccg->ccg_secsize; 1127 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1128 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1129 cs->sc_disk->d_drv1 = cs; 1130 cs->sc_disk->d_maxsize = MAXPHYS; 1131 disk_create(unit, cs->sc_disk, 0, NULL, NULL); 1132 1133 ccdunlock(cs); 1134 1135 break; 1136 1137 case CCDIOCCLR: 1138 if (cs == NULL) 1139 return (ENXIO); 1140 1141 if (!IS_INITED(cs)) 1142 return (ENXIO); 1143 1144 if ((flag & FWRITE) == 0) 1145 return (EBADF); 1146 1147 if ((error = ccdlock(cs)) != 0) 1148 return (error); 1149 1150 /* Don't unconfigure if any other partitions are open */ 1151 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) { 1152 ccdunlock(cs); 1153 return (EBUSY); 1154 } 1155 1156 disk_destroy(cs->sc_disk); 1157 free(cs->sc_disk, M_CCD); 1158 cs->sc_disk = NULL; 1159 /* Declare unit null and void (reset all flags) */ 1160 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1161 1162 /* Close the components and free their pathnames. */ 1163 for (i = 0; i < cs->sc_nccdisks; ++i) { 1164 /* 1165 * XXX: this close could potentially fail and 1166 * cause Bad Things. Maybe we need to force 1167 * the close to happen? 1168 */ 1169 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1170 td->td_ucred, td); 1171 free(cs->sc_cinfo[i].ci_path, M_CCD); 1172 } 1173 1174 /* Free interleave index. 
*/ 1175 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1176 free(cs->sc_itable[i].ii_index, M_CCD); 1177 1178 /* Free component info and interleave table. */ 1179 free(cs->sc_cinfo, M_CCD); 1180 free(cs->sc_itable, M_CCD); 1181 free(cs->sc_vpp, M_CCD); 1182 1183 /* This must be atomic. */ 1184 ccdunlock(cs); 1185 ccddestroy(cs); 1186 1187 break; 1188 } 1189 1190 return (0); 1191 } 1192 1193 1194 /* 1195 * Lookup the provided name in the filesystem. If the file exists, 1196 * is a valid block device, and isn't being used by anyone else, 1197 * set *vpp to the file's vnode. 1198 */ 1199 static int 1200 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1201 { 1202 struct nameidata nd; 1203 struct vnode *vp; 1204 int error, flags; 1205 1206 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1207 flags = FREAD | FWRITE; 1208 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1209 return (error); 1210 } 1211 vp = nd.ni_vp; 1212 1213 if (vrefcnt(vp) > 1) { 1214 error = EBUSY; 1215 goto bad; 1216 } 1217 1218 if (!vn_isdisk(vp, &error)) 1219 goto bad; 1220 1221 1222 VOP_UNLOCK(vp, 0, td); 1223 NDFREE(&nd, NDF_ONLY_PNBUF); 1224 *vpp = vp; 1225 return (0); 1226 bad: 1227 VOP_UNLOCK(vp, 0, td); 1228 NDFREE(&nd, NDF_ONLY_PNBUF); 1229 /* vn_close does vrele() for vp */ 1230 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1231 return (error); 1232 } 1233 1234 /* 1235 1236 * Wait interruptibly for an exclusive lock. 1237 * 1238 * XXX 1239 * Several drivers do this; it should be abstracted and made MP-safe. 1240 */ 1241 static int 1242 ccdlock(struct ccd_s *cs) 1243 { 1244 int error; 1245 1246 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1247 cs->sc_flags |= CCDF_WANTED; 1248 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1249 return (error); 1250 } 1251 cs->sc_flags |= CCDF_LOCKED; 1252 return (0); 1253 } 1254 1255 /* 1256 * Unlock and wake up any waiters. 
1257 */ 1258 static void 1259 ccdunlock(struct ccd_s *cs) 1260 { 1261 1262 cs->sc_flags &= ~CCDF_LOCKED; 1263 if ((cs->sc_flags & CCDF_WANTED) != 0) { 1264 cs->sc_flags &= ~CCDF_WANTED; 1265 wakeup(cs); 1266 } 1267 } 1268 1269 static struct sbuf * 1270 g_ccd_list(int unit) 1271 { 1272 struct sbuf *sb; 1273 struct ccd_s *cs; 1274 int i; 1275 1276 sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); 1277 sbuf_clear(sb); 1278 LIST_FOREACH(cs, &ccd_softc_list, list) { 1279 if (!IS_INITED(cs)) 1280 continue; 1281 if (unit >= 0 && unit != cs->sc_unit) 1282 continue; 1283 sbuf_printf(sb, "ccd%d\t\t%d\t%d\t", 1284 cs->sc_unit, cs->sc_ileave, cs->sc_cflags & CCDF_USERMASK); 1285 1286 for (i = 0; i < cs->sc_nccdisks; ++i) { 1287 sbuf_printf(sb, "%s%s", i == 0 ? "" : " ", 1288 cs->sc_cinfo[i].ci_path); 1289 } 1290 sbuf_printf(sb, "\n"); 1291 } 1292 sbuf_finish(sb); 1293 return (sb); 1294 } 1295 1296 static void 1297 g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb) 1298 { 1299 struct sbuf *sb; 1300 int u, *up; 1301 1302 g_topology_assert(); 1303 if (!strcmp(verb, "create geom")) { 1304 gctl_error(req, "TBD"); 1305 } else if (!strcmp(verb, "destroy geom")) { 1306 gctl_error(req, "TBD"); 1307 } else if (!strcmp(verb, "list")) { 1308 up = gctl_get_paraml(req, "unit", sizeof (int)); 1309 u = *up; 1310 sb = g_ccd_list(u); 1311 gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); 1312 } else { 1313 gctl_error(req, "unknown verb"); 1314 } 1315 } 1316 1317 static struct g_class g_ccd_class = { 1318 .name = "CCD", 1319 .ctlreq = g_ccd_config, 1320 }; 1321 1322 DECLARE_GEOM_CLASS(g_ccd_class, g_ccd); 1323