1 /* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * Dynamic configuration and disklabel support by: 41 * Jason R. Thorpe <thorpej@nas.nasa.gov> 42 * Numerical Aerodynamic Simulation Facility 43 * Mail Stop 258-6 44 * NASA Ames Research Center 45 * Moffett Field, CA 94035 46 * 47 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 48 * 49 * @(#)cd.c 8.2 (Berkeley) 11/16/93 50 * 51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ 52 * 53 * $FreeBSD$ 54 */ 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/kernel.h> 59 #include <sys/module.h> 60 #include <sys/proc.h> 61 #include <sys/bio.h> 62 #include <sys/malloc.h> 63 #include <sys/namei.h> 64 #include <sys/conf.h> 65 #include <sys/stat.h> 66 #include <sys/stdint.h> 67 #include <sys/sysctl.h> 68 #include <sys/disk.h> 69 #include <sys/devicestat.h> 70 #include <sys/fcntl.h> 71 #include <sys/vnode.h> 72 73 #include <sys/ccdvar.h> 74 75 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver"); 76 77 /* 78 This is how mirroring works (only writes are special): 79 80 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s 81 linked together by the cb_mirror field. "cb_pflags & 82 CCDPF_MIRROR_DONE" is set to 0 on both of them. 83 84 When a component returns to ccdiodone(), it checks if "cb_pflags & 85 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's 86 flag and returns. If it is, it means its partner has already 87 returned, so it will go to the regular cleanup. 88 89 */ 90 91 struct ccdbuf { 92 struct bio cb_buf; /* new I/O buf */ 93 struct bio *cb_obp; /* ptr. to original I/O buf */ 94 struct ccdbuf *cb_freenext; /* free list link */ 95 struct ccd_s *cb_softc; 96 int cb_comp; /* target component */ 97 int cb_pflags; /* mirror/parity status flag */ 98 struct ccdbuf *cb_mirror; /* mirror counterpart */ 99 }; 100 101 /* bits in cb_pflags */ 102 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */ 103 104 /* convinient macros for often-used statements */ 105 #define IS_ALLOCATED(unit) (ccdfind(unit) != NULL) 106 #define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0) 107 108 static dev_t ccdctldev; 109 110 static d_open_t ccdopen; 111 static d_close_t ccdclose; 112 static d_strategy_t ccdstrategy; 113 static d_ioctl_t ccdctlioctl; 114 115 #define NCCDFREEHIWAT 16 116 117 #define CDEV_MAJOR 74 118 119 static struct cdevsw ccdctl_cdevsw = { 120 /* open */ nullopen, 121 /* close */ nullclose, 122 /* read */ noread, 123 /* write */ nowrite, 124 /* ioctl */ ccdctlioctl, 125 /* poll */ nopoll, 126 /* mmap */ nommap, 127 /* strategy */ nostrategy, 128 /* name */ "ccdctl", 129 /* maj */ CDEV_MAJOR, 130 /* dump */ nodump, 131 /* psize */ nopsize, 132 /* flags */ 0 133 }; 134 135 static struct cdevsw ccd_cdevsw = { 136 /* open */ ccdopen, 137 /* close */ ccdclose, 138 /* read */ physread, 139 /* write */ physwrite, 140 /* ioctl */ noioctl, 141 /* poll */ nopoll, 142 /* mmap */ nommap, 143 /* strategy */ ccdstrategy, 144 /* name */ "ccd", 145 /* maj */ CDEV_MAJOR, 146 /* dump */ nodump, 147 /* psize */ nopsize, 148 /* flags */ D_DISK, 149 }; 150 151 static struct cdevsw ccddisk_cdevsw; 152 153 static LIST_HEAD(, ccd_s) ccd_softc_list = 154 LIST_HEAD_INITIALIZER(&ccd_softc_list); 155 156 static struct ccd_s *ccdfind(int); 157 static struct ccd_s *ccdnew(int); 158 static int ccddestroy(struct ccd_s *); 159 160 /* called during module initialization */ 161 static void ccdattach(void); 162 static int ccd_modevent(module_t, int, void *); 163 164 /* called by biodone() at interrupt time */ 165 static void ccdiodone(struct bio *bp); 166 167 static void ccdstart(struct ccd_s *, struct bio *); 168 static void ccdinterleave(struct ccd_s *, int); 169 static int ccdinit(struct ccd_s *, char **, struct thread *); 170 static int ccdlookup(char *, struct thread *p, struct vnode **); 171 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *, 172 struct bio *, daddr_t, caddr_t, long); 173 static int ccdlock(struct ccd_s *); 174 static void ccdunlock(struct ccd_s *); 175 176 177 /* 178 * Number of blocks to untouched in front of a component partition. 179 * This is to avoid violating its disklabel area when it starts at the 180 * beginning of the slice. 181 */ 182 #if !defined(CCD_OFFSET) 183 #define CCD_OFFSET 16 184 #endif 185 186 static struct ccd_s * 187 ccdfind(int unit) 188 { 189 struct ccd_s *sc = NULL; 190 191 /* XXX: LOCK(unique unit numbers) */ 192 LIST_FOREACH(sc, &ccd_softc_list, list) { 193 if (sc->sc_unit == unit) 194 break; 195 } 196 /* XXX: UNLOCK(unique unit numbers) */ 197 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc); 198 } 199 200 static struct ccd_s * 201 ccdnew(int unit) 202 { 203 struct ccd_s *sc; 204 205 /* XXX: LOCK(unique unit numbers) */ 206 if (IS_ALLOCATED(unit) || unit > 32) 207 return (NULL); 208 209 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_ZERO); 210 sc->sc_unit = unit; 211 LIST_INSERT_HEAD(&ccd_softc_list, sc, list); 212 /* XXX: UNLOCK(unique unit numbers) */ 213 return (sc); 214 } 215 216 static int 217 ccddestroy(struct ccd_s *sc) 218 { 219 220 /* XXX: LOCK(unique unit numbers) */ 221 LIST_REMOVE(sc, list); 222 /* XXX: UNLOCK(unique unit numbers) */ 223 FREE(sc, M_CCD); 224 return (0); 225 } 226 227 /* 228 * Called by main() during pseudo-device attachment. All we need 229 * to do is to add devsw entries. 230 */ 231 static void 232 ccdattach() 233 { 234 235 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 236 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 237 ccdctldev->si_drv1 = ccdctldev; 238 } 239 240 static int 241 ccd_modevent(module_t mod, int type, void *data) 242 { 243 int error = 0; 244 245 switch (type) { 246 case MOD_LOAD: 247 ccdattach(); 248 break; 249 250 case MOD_UNLOAD: 251 printf("ccd0: Unload not supported!\n"); 252 error = EOPNOTSUPP; 253 break; 254 255 case MOD_SHUTDOWN: 256 break; 257 258 default: 259 error = EOPNOTSUPP; 260 } 261 return (error); 262 } 263 264 DEV_MODULE(ccd, ccd_modevent, NULL); 265 266 static int 267 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 268 { 269 struct ccdcinfo *ci = NULL; /* XXX */ 270 size_t size; 271 int ix; 272 struct vnode *vp; 273 size_t minsize; 274 int maxsecsize; 275 struct ccdgeom *ccg = &cs->sc_geom; 276 char *tmppath = NULL; 277 int error = 0; 278 off_t mediasize; 279 u_int sectorsize; 280 281 282 cs->sc_size = 0; 283 284 /* Allocate space for the component info. */ 285 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 286 M_CCD, 0); 287 288 /* 289 * Verify that each component piece exists and record 290 * relevant information about it. 291 */ 292 maxsecsize = 0; 293 minsize = 0; 294 tmppath = malloc(MAXPATHLEN, M_CCD, 0); 295 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 296 vp = cs->sc_vpp[ix]; 297 ci = &cs->sc_cinfo[ix]; 298 ci->ci_vp = vp; 299 300 /* 301 * Copy in the pathname of the component. 302 */ 303 if ((error = copyinstr(cpaths[ix], tmppath, 304 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 305 goto fail; 306 } 307 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, 0); 308 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 309 310 ci->ci_dev = vn_todev(vp); 311 312 /* 313 * Get partition information for the component. 314 */ 315 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 316 FREAD, td->td_ucred, td); 317 if (error != 0) { 318 goto fail; 319 } 320 /* 321 * Get partition information for the component. 322 */ 323 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 324 FREAD, td->td_ucred, td); 325 if (error != 0) { 326 goto fail; 327 } 328 if (sectorsize > maxsecsize) 329 maxsecsize = sectorsize; 330 size = mediasize / DEV_BSIZE - CCD_OFFSET; 331 332 /* 333 * Calculate the size, truncating to an interleave 334 * boundary if necessary. 335 */ 336 337 if (cs->sc_ileave > 1) 338 size -= size % cs->sc_ileave; 339 340 if (size == 0) { 341 error = ENODEV; 342 goto fail; 343 } 344 345 if (minsize == 0 || size < minsize) 346 minsize = size; 347 ci->ci_size = size; 348 cs->sc_size += size; 349 } 350 351 free(tmppath, M_CCD); 352 tmppath = NULL; 353 354 /* 355 * Don't allow the interleave to be smaller than 356 * the biggest component sector. 357 */ 358 if ((cs->sc_ileave > 0) && 359 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 360 error = EINVAL; 361 goto fail; 362 } 363 364 /* 365 * If uniform interleave is desired set all sizes to that of 366 * the smallest component. This will guarentee that a single 367 * interleave table is generated. 368 * 369 * Lost space must be taken into account when calculating the 370 * overall size. Half the space is lost when CCDF_MIRROR is 371 * specified. 372 */ 373 if (cs->sc_flags & CCDF_UNIFORM) { 374 for (ci = cs->sc_cinfo; 375 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 376 ci->ci_size = minsize; 377 } 378 if (cs->sc_flags & CCDF_MIRROR) { 379 /* 380 * Check to see if an even number of components 381 * have been specified. The interleave must also 382 * be non-zero in order for us to be able to 383 * guarentee the topology. 384 */ 385 if (cs->sc_nccdisks % 2) { 386 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 387 error = EINVAL; 388 goto fail; 389 } 390 if (cs->sc_ileave == 0) { 391 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 392 error = EINVAL; 393 goto fail; 394 } 395 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 396 } else { 397 if (cs->sc_ileave == 0) { 398 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 399 error = EINVAL; 400 goto fail; 401 } 402 cs->sc_size = cs->sc_nccdisks * minsize; 403 } 404 } 405 406 /* 407 * Construct the interleave table. 408 */ 409 ccdinterleave(cs, cs->sc_unit); 410 411 /* 412 * Create pseudo-geometry based on 1MB cylinders. It's 413 * pretty close. 414 */ 415 ccg->ccg_secsize = maxsecsize; 416 ccg->ccg_ntracks = 1; 417 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 418 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 419 420 /* 421 * Add a devstat entry for this device. 422 */ 423 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit, 424 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED, 425 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER, 426 DEVSTAT_PRIORITY_ARRAY); 427 428 cs->sc_flags |= CCDF_INITED; 429 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 430 return (0); 431 fail: 432 while (ci > cs->sc_cinfo) { 433 ci--; 434 free(ci->ci_path, M_CCD); 435 } 436 if (tmppath != NULL) 437 free(tmppath, M_CCD); 438 free(cs->sc_cinfo, M_CCD); 439 ccddestroy(cs); 440 return (error); 441 } 442 443 static void 444 ccdinterleave(struct ccd_s *cs, int unit) 445 { 446 struct ccdcinfo *ci, *smallci; 447 struct ccdiinfo *ii; 448 daddr_t bn, lbn; 449 int ix; 450 u_long size; 451 452 453 /* 454 * Allocate an interleave table. The worst case occurs when each 455 * of N disks is of a different size, resulting in N interleave 456 * tables. 457 * 458 * Chances are this is too big, but we don't care. 459 */ 460 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 461 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 462 M_ZERO); 463 464 /* 465 * Trivial case: no interleave (actually interleave of disk size). 466 * Each table entry represents a single component in its entirety. 467 * 468 * An interleave of 0 may not be used with a mirror setup. 469 */ 470 if (cs->sc_ileave == 0) { 471 bn = 0; 472 ii = cs->sc_itable; 473 474 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 475 /* Allocate space for ii_index. */ 476 ii->ii_index = malloc(sizeof(int), M_CCD, 0); 477 ii->ii_ndisk = 1; 478 ii->ii_startblk = bn; 479 ii->ii_startoff = 0; 480 ii->ii_index[0] = ix; 481 bn += cs->sc_cinfo[ix].ci_size; 482 ii++; 483 } 484 ii->ii_ndisk = 0; 485 return; 486 } 487 488 /* 489 * The following isn't fast or pretty; it doesn't have to be. 490 */ 491 size = 0; 492 bn = lbn = 0; 493 for (ii = cs->sc_itable; ; ii++) { 494 /* 495 * Allocate space for ii_index. We might allocate more then 496 * we use. 497 */ 498 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 499 M_CCD, 0); 500 501 /* 502 * Locate the smallest of the remaining components 503 */ 504 smallci = NULL; 505 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 506 ci++) { 507 if (ci->ci_size > size && 508 (smallci == NULL || 509 ci->ci_size < smallci->ci_size)) { 510 smallci = ci; 511 } 512 } 513 514 /* 515 * Nobody left, all done 516 */ 517 if (smallci == NULL) { 518 ii->ii_ndisk = 0; 519 free(ii->ii_index, M_CCD); 520 break; 521 } 522 523 /* 524 * Record starting logical block using an sc_ileave blocksize. 525 */ 526 ii->ii_startblk = bn / cs->sc_ileave; 527 528 /* 529 * Record starting comopnent block using an sc_ileave 530 * blocksize. This value is relative to the beginning of 531 * a component disk. 532 */ 533 ii->ii_startoff = lbn; 534 535 /* 536 * Determine how many disks take part in this interleave 537 * and record their indices. 538 */ 539 ix = 0; 540 for (ci = cs->sc_cinfo; 541 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 542 if (ci->ci_size >= smallci->ci_size) { 543 ii->ii_index[ix++] = ci - cs->sc_cinfo; 544 } 545 } 546 ii->ii_ndisk = ix; 547 bn += ix * (smallci->ci_size - size); 548 lbn = smallci->ci_size / cs->sc_ileave; 549 size = smallci->ci_size; 550 } 551 } 552 553 static int 554 ccdopen(dev_t dev, int flags, int fmt, struct thread *td) 555 { 556 struct ccd_s *cs; 557 558 cs = dev->si_drv1; 559 cs->sc_openmask = 1; 560 return (0); 561 } 562 563 /* ARGSUSED */ 564 static int 565 ccdclose(dev_t dev, int flags, int fmt, struct thread *td) 566 { 567 struct ccd_s *cs; 568 569 cs = dev->si_drv1; 570 cs->sc_openmask = 0; 571 return (0); 572 } 573 574 static void 575 ccdstrategy(struct bio *bp) 576 { 577 struct ccd_s *cs; 578 int pbn; /* in sc_secsize chunks */ 579 long sz; /* in sc_secsize chunks */ 580 581 cs = bp->bio_dev->si_drv1; 582 583 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 584 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 585 586 /* 587 * If out of bounds return an error. If at the EOF point, 588 * simply read or write less. 589 */ 590 591 if (pbn < 0 || pbn >= cs->sc_size) { 592 bp->bio_resid = bp->bio_bcount; 593 if (pbn != cs->sc_size) 594 biofinish(bp, NULL, EINVAL); 595 else 596 biodone(bp); 597 return; 598 } 599 600 /* 601 * If the request crosses EOF, truncate the request. 602 */ 603 if (pbn + sz > cs->sc_size) { 604 bp->bio_bcount = (cs->sc_size - pbn) * 605 cs->sc_geom.ccg_secsize; 606 } 607 608 bp->bio_resid = bp->bio_bcount; 609 610 /* 611 * "Start" the unit. 612 */ 613 ccdstart(cs, bp); 614 return; 615 } 616 617 static void 618 ccdstart(struct ccd_s *cs, struct bio *bp) 619 { 620 long bcount, rcount; 621 struct ccdbuf *cbp[2]; 622 caddr_t addr; 623 daddr_t bn; 624 int err; 625 626 627 /* Record the transaction start */ 628 devstat_start_transaction(&cs->device_stats); 629 630 /* 631 * Translate the partition-relative block number to an absolute. 632 */ 633 bn = bp->bio_blkno; 634 635 /* 636 * Allocate component buffers and fire off the requests 637 */ 638 addr = bp->bio_data; 639 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 640 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 641 if (err) { 642 printf("ccdbuffer error %d\n", err); 643 /* We're screwed */ 644 bp->bio_resid -= bcount; 645 bp->bio_error = ENOMEM; 646 bp->bio_flags |= BIO_ERROR; 647 return; 648 } 649 rcount = cbp[0]->cb_buf.bio_bcount; 650 651 if (cs->sc_cflags & CCDF_MIRROR) { 652 /* 653 * Mirroring. Writes go to both disks, reads are 654 * taken from whichever disk seems most appropriate. 655 * 656 * We attempt to localize reads to the disk whos arm 657 * is nearest the read request. We ignore seeks due 658 * to writes when making this determination and we 659 * also try to avoid hogging. 660 */ 661 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 662 BIO_STRATEGY(&cbp[0]->cb_buf); 663 BIO_STRATEGY(&cbp[1]->cb_buf); 664 } else { 665 int pick = cs->sc_pick; 666 daddr_t range = cs->sc_size / 16; 667 668 if (bn < cs->sc_blk[pick] - range || 669 bn > cs->sc_blk[pick] + range 670 ) { 671 cs->sc_pick = pick = 1 - pick; 672 } 673 cs->sc_blk[pick] = bn + btodb(rcount); 674 BIO_STRATEGY(&cbp[pick]->cb_buf); 675 } 676 } else { 677 /* 678 * Not mirroring 679 */ 680 BIO_STRATEGY(&cbp[0]->cb_buf); 681 } 682 bn += btodb(rcount); 683 addr += rcount; 684 } 685 } 686 687 /* 688 * Build a component buffer header. 689 */ 690 static int 691 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 692 { 693 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 694 struct ccdbuf *cbp; 695 daddr_t cbn, cboff; 696 off_t cbc; 697 698 /* 699 * Determine which component bn falls in. 700 */ 701 cbn = bn; 702 cboff = 0; 703 704 if (cs->sc_ileave == 0) { 705 /* 706 * Serially concatenated and neither a mirror nor a parity 707 * config. This is a special case. 708 */ 709 daddr_t sblk; 710 711 sblk = 0; 712 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 713 sblk += ci->ci_size; 714 cbn -= sblk; 715 } else { 716 struct ccdiinfo *ii; 717 int ccdisk, off; 718 719 /* 720 * Calculate cbn, the logical superblock (sc_ileave chunks), 721 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 722 * to cbn. 723 */ 724 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 725 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 726 727 /* 728 * Figure out which interleave table to use. 729 */ 730 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 731 if (ii->ii_startblk > cbn) 732 break; 733 } 734 ii--; 735 736 /* 737 * off is the logical superblock relative to the beginning 738 * of this interleave block. 739 */ 740 off = cbn - ii->ii_startblk; 741 742 /* 743 * We must calculate which disk component to use (ccdisk), 744 * and recalculate cbn to be the superblock relative to 745 * the beginning of the component. This is typically done by 746 * adding 'off' and ii->ii_startoff together. However, 'off' 747 * must typically be divided by the number of components in 748 * this interleave array to be properly convert it from a 749 * CCD-relative logical superblock number to a 750 * component-relative superblock number. 751 */ 752 if (ii->ii_ndisk == 1) { 753 /* 754 * When we have just one disk, it can't be a mirror 755 * or a parity config. 756 */ 757 ccdisk = ii->ii_index[0]; 758 cbn = ii->ii_startoff + off; 759 } else { 760 if (cs->sc_cflags & CCDF_MIRROR) { 761 /* 762 * We have forced a uniform mapping, resulting 763 * in a single interleave array. We double 764 * up on the first half of the available 765 * components and our mirror is in the second 766 * half. This only works with a single 767 * interleave array because doubling up 768 * doubles the number of sectors, so there 769 * cannot be another interleave array because 770 * the next interleave array's calculations 771 * would be off. 772 */ 773 int ndisk2 = ii->ii_ndisk / 2; 774 ccdisk = ii->ii_index[off % ndisk2]; 775 cbn = ii->ii_startoff + off / ndisk2; 776 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 777 } else { 778 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 779 cbn = ii->ii_startoff + off / ii->ii_ndisk; 780 } 781 } 782 783 ci = &cs->sc_cinfo[ccdisk]; 784 785 /* 786 * Convert cbn from a superblock to a normal block so it 787 * can be used to calculate (along with cboff) the normal 788 * block index into this particular disk. 789 */ 790 cbn *= cs->sc_ileave; 791 } 792 793 /* 794 * Fill in the component buf structure. 795 */ 796 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 797 if (cbp == NULL) 798 return (ENOMEM); 799 cbp->cb_buf.bio_cmd = bp->bio_cmd; 800 cbp->cb_buf.bio_done = ccdiodone; 801 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 802 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 803 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 804 cbp->cb_buf.bio_data = addr; 805 cbp->cb_buf.bio_caller2 = cbp; 806 if (cs->sc_ileave == 0) 807 cbc = dbtob((off_t)(ci->ci_size - cbn)); 808 else 809 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 810 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount; 811 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 812 813 /* 814 * context for ccdiodone 815 */ 816 cbp->cb_obp = bp; 817 cbp->cb_softc = cs; 818 cbp->cb_comp = ci - cs->sc_cinfo; 819 820 cb[0] = cbp; 821 822 /* 823 * Note: both I/O's setup when reading from mirror, but only one 824 * will be executed. 825 */ 826 if (cs->sc_cflags & CCDF_MIRROR) { 827 /* mirror, setup second I/O */ 828 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 829 if (cbp == NULL) { 830 free(cb[0], M_CCD); 831 cb[0] = NULL; 832 return (ENOMEM); 833 } 834 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 835 cbp->cb_buf.bio_dev = ci2->ci_dev; 836 cbp->cb_comp = ci2 - cs->sc_cinfo; 837 cb[1] = cbp; 838 /* link together the ccdbuf's and clear "mirror done" flag */ 839 cb[0]->cb_mirror = cb[1]; 840 cb[1]->cb_mirror = cb[0]; 841 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 842 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 843 } 844 return (0); 845 } 846 847 /* 848 * Called at interrupt time. 849 * Mark the component as done and if all components are done, 850 * take a ccd interrupt. 851 */ 852 static void 853 ccdiodone(struct bio *ibp) 854 { 855 struct ccdbuf *cbp; 856 struct bio *bp; 857 struct ccd_s *cs; 858 int count; 859 860 cbp = ibp->bio_caller2; 861 cs = cbp->cb_softc; 862 bp = cbp->cb_obp; 863 /* 864 * If an error occured, report it. If this is a mirrored 865 * configuration and the first of two possible reads, do not 866 * set the error in the bp yet because the second read may 867 * succeed. 868 */ 869 870 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 871 const char *msg = ""; 872 873 if ((cs->sc_cflags & CCDF_MIRROR) && 874 (cbp->cb_buf.bio_cmd == BIO_READ) && 875 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 876 /* 877 * We will try our read on the other disk down 878 * below, also reverse the default pick so if we 879 * are doing a scan we do not keep hitting the 880 * bad disk first. 881 */ 882 883 msg = ", trying other disk"; 884 cs->sc_pick = 1 - cs->sc_pick; 885 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 886 } else { 887 bp->bio_flags |= BIO_ERROR; 888 bp->bio_error = cbp->cb_buf.bio_error ? 889 cbp->cb_buf.bio_error : EIO; 890 } 891 printf("ccd%d: error %d on component %d block %jd " 892 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 893 cbp->cb_comp, 894 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 895 msg); 896 } 897 898 /* 899 * Process mirror. If we are writing, I/O has been initiated on both 900 * buffers and we fall through only after both are finished. 901 * 902 * If we are reading only one I/O is initiated at a time. If an 903 * error occurs we initiate the second I/O and return, otherwise 904 * we free the second I/O without initiating it. 905 */ 906 907 if (cs->sc_cflags & CCDF_MIRROR) { 908 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 909 /* 910 * When writing, handshake with the second buffer 911 * to determine when both are done. If both are not 912 * done, return here. 913 */ 914 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 915 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 916 free(cbp, M_CCD); 917 return; 918 } 919 } else { 920 /* 921 * When reading, either dispose of the second buffer 922 * or initiate I/O on the second buffer if an error 923 * occured with this one. 924 */ 925 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 926 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 927 cbp->cb_mirror->cb_pflags |= 928 CCDPF_MIRROR_DONE; 929 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 930 free(cbp, M_CCD); 931 return; 932 } else { 933 free(cbp->cb_mirror, M_CCD); 934 } 935 } 936 } 937 } 938 939 /* 940 * use bio_caller1 to determine how big the original request was rather 941 * then bio_bcount, because bio_bcount may have been truncated for EOF. 942 * 943 * XXX We check for an error, but we do not test the resid for an 944 * aligned EOF condition. This may result in character & block 945 * device access not recognizing EOF properly when read or written 946 * sequentially, but will not effect filesystems. 947 */ 948 count = (long)cbp->cb_buf.bio_caller1; 949 free(cbp, M_CCD); 950 951 /* 952 * If all done, "interrupt". 953 */ 954 bp->bio_resid -= count; 955 if (bp->bio_resid < 0) 956 panic("ccdiodone: count"); 957 if (bp->bio_resid == 0) { 958 if (bp->bio_flags & BIO_ERROR) 959 bp->bio_resid = bp->bio_bcount; 960 biofinish(bp, &cs->device_stats, 0); 961 } 962 } 963 964 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 965 966 static int 967 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 968 { 969 struct ccd_ioctl *ccio; 970 u_int unit; 971 dev_t dev2; 972 int error; 973 974 switch (cmd) { 975 case CCDIOCSET: 976 case CCDIOCCLR: 977 ccio = (struct ccd_ioctl *)data; 978 unit = ccio->ccio_size; 979 return (ccdioctltoo(unit, cmd, data, flag, td)); 980 case CCDCONFINFO: 981 { 982 int ninit = 0; 983 struct ccdconf *conf = (struct ccdconf *)data; 984 struct ccd_s *tmpcs; 985 struct ccd_s *ubuf = conf->buffer; 986 987 /* XXX: LOCK(unique unit numbers) */ 988 LIST_FOREACH(tmpcs, &ccd_softc_list, list) 989 if (IS_INITED(tmpcs)) 990 ninit++; 991 992 if (conf->size == 0) { 993 conf->size = sizeof(struct ccd_s) * ninit; 994 return (0); 995 } else if ((conf->size / sizeof(struct ccd_s) != ninit) || 996 (conf->size % sizeof(struct ccd_s) != 0)) { 997 /* XXX: UNLOCK(unique unit numbers) */ 998 return (EINVAL); 999 } 1000 1001 ubuf += ninit; 1002 LIST_FOREACH(tmpcs, &ccd_softc_list, list) { 1003 if (!IS_INITED(tmpcs)) 1004 continue; 1005 error = copyout(tmpcs, --ubuf, 1006 sizeof(struct ccd_s)); 1007 if (error != 0) 1008 /* XXX: UNLOCK(unique unit numbers) */ 1009 return (error); 1010 } 1011 /* XXX: UNLOCK(unique unit numbers) */ 1012 return (0); 1013 } 1014 1015 case CCDCPPINFO: 1016 { 1017 struct ccdcpps *cpps = (struct ccdcpps *)data; 1018 char *ubuf = cpps->buffer; 1019 struct ccd_s *cs; 1020 1021 1022 error = copyin(ubuf, &unit, sizeof (unit)); 1023 if (error) 1024 return (error); 1025 1026 if (!IS_ALLOCATED(unit)) 1027 return (ENXIO); 1028 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2); 1029 cs = ccdfind(unit); 1030 if (!IS_INITED(cs)) 1031 return (ENXIO); 1032 1033 { 1034 int len = 0, i; 1035 struct ccdcpps *cpps = (struct ccdcpps *)data; 1036 char *ubuf = cpps->buffer; 1037 1038 1039 for (i = 0; i < cs->sc_nccdisks; ++i) 1040 len += cs->sc_cinfo[i].ci_pathlen; 1041 1042 if (cpps->size < len) 1043 return (ENOMEM); 1044 1045 for (i = 0; i < cs->sc_nccdisks; ++i) { 1046 len = cs->sc_cinfo[i].ci_pathlen; 1047 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 1048 len); 1049 if (error != 0) 1050 return (error); 1051 ubuf += len; 1052 } 1053 return(copyout("", ubuf, 1)); 1054 } 1055 break; 1056 } 1057 1058 default: 1059 return (ENXIO); 1060 } 1061 } 1062 1063 static int 1064 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1065 { 1066 int i, j, lookedup = 0, error = 0; 1067 struct ccd_s *cs; 1068 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1069 struct ccdgeom *ccg; 1070 char **cpp; 1071 struct vnode **vpp; 1072 1073 cs = ccdfind(unit); 1074 switch (cmd) { 1075 case CCDIOCSET: 1076 if (cs == NULL) 1077 cs = ccdnew(unit); 1078 if (IS_INITED(cs)) 1079 return (EBUSY); 1080 1081 if ((flag & FWRITE) == 0) 1082 return (EBADF); 1083 1084 if ((error = ccdlock(cs)) != 0) 1085 return (error); 1086 1087 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1088 return (EINVAL); 1089 1090 /* Fill in some important bits. */ 1091 cs->sc_ileave = ccio->ccio_ileave; 1092 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1093 printf("ccd%d: disabling mirror, interleave is 0\n", 1094 unit); 1095 ccio->ccio_flags &= ~(CCDF_MIRROR); 1096 } 1097 if ((ccio->ccio_flags & CCDF_MIRROR) && 1098 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1099 printf("ccd%d: mirror/parity forces uniform flag\n", 1100 unit); 1101 ccio->ccio_flags |= CCDF_UNIFORM; 1102 } 1103 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1104 1105 /* 1106 * Allocate space for and copy in the array of 1107 * componet pathnames and device numbers. 1108 */ 1109 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1110 M_CCD, 0); 1111 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1112 M_CCD, 0); 1113 1114 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1115 ccio->ccio_ndisks * sizeof(char **)); 1116 if (error) { 1117 free(vpp, M_CCD); 1118 free(cpp, M_CCD); 1119 ccdunlock(cs); 1120 return (error); 1121 } 1122 1123 1124 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1125 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1126 for (j = 0; j < lookedup; ++j) 1127 (void)vn_close(vpp[j], FREAD|FWRITE, 1128 td->td_ucred, td); 1129 free(vpp, M_CCD); 1130 free(cpp, M_CCD); 1131 ccdunlock(cs); 1132 return (error); 1133 } 1134 ++lookedup; 1135 } 1136 cs->sc_vpp = vpp; 1137 cs->sc_nccdisks = ccio->ccio_ndisks; 1138 1139 /* 1140 * Initialize the ccd. Fills in the softc for us. 1141 */ 1142 if ((error = ccdinit(cs, cpp, td)) != 0) { 1143 for (j = 0; j < lookedup; ++j) 1144 (void)vn_close(vpp[j], FREAD|FWRITE, 1145 td->td_ucred, td); 1146 /* 1147 * We can't ccddestroy() cs just yet, because nothing 1148 * prevents user-level app to do another ioctl() 1149 * without closing the device first, therefore 1150 * declare unit null and void and let ccdclose() 1151 * destroy it when it is safe to do so. 1152 */ 1153 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1154 free(vpp, M_CCD); 1155 free(cpp, M_CCD); 1156 ccdunlock(cs); 1157 return (error); 1158 } 1159 free(cpp, M_CCD); 1160 1161 /* 1162 * The ccd has been successfully initialized, so 1163 * we can place it into the array and read the disklabel. 1164 */ 1165 ccio->ccio_unit = unit; 1166 ccio->ccio_size = cs->sc_size; 1167 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, 0); 1168 cs->sc_dev = disk_create(unit, cs->sc_disk, 0, 1169 &ccd_cdevsw, &ccddisk_cdevsw); 1170 cs->sc_dev->si_drv1 = cs; 1171 ccg = &cs->sc_geom; 1172 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1173 cs->sc_disk->d_mediasize = 1174 cs->sc_size * (off_t)ccg->ccg_secsize; 1175 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1176 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1177 1178 ccdunlock(cs); 1179 1180 break; 1181 1182 case CCDIOCCLR: 1183 if (cs == NULL) 1184 return (ENXIO); 1185 1186 if (!IS_INITED(cs)) 1187 return (ENXIO); 1188 1189 if ((flag & FWRITE) == 0) 1190 return (EBADF); 1191 1192 if ((error = ccdlock(cs)) != 0) 1193 return (error); 1194 1195 /* Don't unconfigure if any other partitions are open */ 1196 if (cs->sc_openmask) { 1197 ccdunlock(cs); 1198 return (EBUSY); 1199 } 1200 1201 disk_destroy(cs->sc_dev); 1202 free(cs->sc_disk, M_CCD); 1203 cs->sc_disk = NULL; 1204 /* Declare unit null and void (reset all flags) */ 1205 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1206 1207 /* Close the components and free their pathnames. */ 1208 for (i = 0; i < cs->sc_nccdisks; ++i) { 1209 /* 1210 * XXX: this close could potentially fail and 1211 * cause Bad Things. Maybe we need to force 1212 * the close to happen? 1213 */ 1214 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1215 td->td_ucred, td); 1216 free(cs->sc_cinfo[i].ci_path, M_CCD); 1217 } 1218 1219 /* Free interleave index. */ 1220 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1221 free(cs->sc_itable[i].ii_index, M_CCD); 1222 1223 /* Free component info and interleave table. */ 1224 free(cs->sc_cinfo, M_CCD); 1225 free(cs->sc_itable, M_CCD); 1226 free(cs->sc_vpp, M_CCD); 1227 1228 /* And remove the devstat entry. */ 1229 devstat_remove_entry(&cs->device_stats); 1230 1231 /* This must be atomic. */ 1232 ccdunlock(cs); 1233 ccddestroy(cs); 1234 1235 break; 1236 } 1237 1238 return (0); 1239 } 1240 1241 1242 /* 1243 * Lookup the provided name in the filesystem. If the file exists, 1244 * is a valid block device, and isn't being used by anyone else, 1245 * set *vpp to the file's vnode. 1246 */ 1247 static int 1248 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1249 { 1250 struct nameidata nd; 1251 struct vnode *vp; 1252 int error, flags; 1253 1254 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1255 flags = FREAD | FWRITE; 1256 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1257 return (error); 1258 } 1259 vp = nd.ni_vp; 1260 1261 if (vrefcnt(vp) > 1) { 1262 error = EBUSY; 1263 goto bad; 1264 } 1265 1266 if (!vn_isdisk(vp, &error)) 1267 goto bad; 1268 1269 1270 VOP_UNLOCK(vp, 0, td); 1271 NDFREE(&nd, NDF_ONLY_PNBUF); 1272 *vpp = vp; 1273 return (0); 1274 bad: 1275 VOP_UNLOCK(vp, 0, td); 1276 NDFREE(&nd, NDF_ONLY_PNBUF); 1277 /* vn_close does vrele() for vp */ 1278 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1279 return (error); 1280 } 1281 1282 /* 1283 1284 * Wait interruptibly for an exclusive lock. 1285 * 1286 * XXX 1287 * Several drivers do this; it should be abstracted and made MP-safe. 1288 */ 1289 static int 1290 ccdlock(struct ccd_s *cs) 1291 { 1292 int error; 1293 1294 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1295 cs->sc_flags |= CCDF_WANTED; 1296 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1297 return (error); 1298 } 1299 cs->sc_flags |= CCDF_LOCKED; 1300 return (0); 1301 } 1302 1303 /* 1304 * Unlock and wake up any waiters. 1305 */ 1306 static void 1307 ccdunlock(struct ccd_s *cs) 1308 { 1309 1310 cs->sc_flags &= ~CCDF_LOCKED; 1311 if ((cs->sc_flags & CCDF_WANTED) != 0) { 1312 cs->sc_flags &= ~CCDF_WANTED; 1313 wakeup(cs); 1314 } 1315 } 1316