1 /* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * Dynamic configuration and disklabel support by: 41 * Jason R. Thorpe <thorpej@nas.nasa.gov> 42 * Numerical Aerodynamic Simulation Facility 43 * Mail Stop 258-6 44 * NASA Ames Research Center 45 * Moffett Field, CA 94035 46 * 47 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 48 * 49 * @(#)cd.c 8.2 (Berkeley) 11/16/93 50 * 51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ 52 * 53 * $FreeBSD$ 54 */ 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/kernel.h> 59 #include <sys/module.h> 60 #include <sys/proc.h> 61 #include <sys/bio.h> 62 #include <sys/malloc.h> 63 #include <sys/namei.h> 64 #include <sys/conf.h> 65 #include <sys/stat.h> 66 #include <sys/sysctl.h> 67 #include <sys/disk.h> 68 #include <sys/fcntl.h> 69 #include <sys/vnode.h> 70 71 #include <sys/ccdvar.h> 72 73 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver"); 74 75 /* 76 This is how mirroring works (only writes are special): 77 78 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s 79 linked together by the cb_mirror field. "cb_pflags & 80 CCDPF_MIRROR_DONE" is set to 0 on both of them. 81 82 When a component returns to ccdiodone(), it checks if "cb_pflags & 83 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's 84 flag and returns. If it is, it means its partner has already 85 returned, so it will go to the regular cleanup. 
 */

/*
 * Per-component I/O descriptor.  One (two when mirroring a write) is
 * allocated for each chunk of an incoming request handed to a component.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* back pointer to owning ccd softc */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* control device node created at module load (see ccdattach()) */
static dev_t ccdctldev;

static disk_strategy_t ccdstrategy;
static d_ioctl_t ccdctlioctl;

#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccdctl_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_ioctl =	ccdctlioctl,
	.d_name =	"ccdctl",
	.d_maj =	CDEV_MAJOR,
};

/* list of all allocated softcs, keyed by unit number (see ccdfind()) */
static LIST_HEAD(, ccd_s) ccd_softc_list =
	LIST_HEAD_INITIALIZER(&ccd_softc_list);

static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		      struct bio *, daddr_t, caddr_t, long);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);


/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif

/*
 * Look up the softc for the given unit number, or NULL if the unit has
 * never been allocated.
 */
static struct ccd_s *
ccdfind(int unit)
{
	struct ccd_s *sc = NULL;

	/* XXX: LOCK(unique unit numbers) */
	LIST_FOREACH(sc, &ccd_softc_list, list) {
		if (sc->sc_unit == unit)
			break;
	}
	/* XXX: UNLOCK(unique unit numbers) */
	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
}

/*
 * Allocate and enlist a zeroed softc for a new unit.  Returns NULL if
 * the unit is already allocated or the unit number is out of range.
 */
static struct ccd_s *
ccdnew(int unit)
{
	struct ccd_s *sc;

	/* XXX: LOCK(unique unit numbers) */
	if (IS_ALLOCATED(unit) || unit > 32)
		return (NULL);

	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
	sc->sc_unit = unit;
	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	return (sc);
}

/*
 * Unlist and free a softc.  The caller is responsible for having
 * released all component resources first.
 */
static int
ccddestroy(struct ccd_s *sc)
{

	/* XXX: LOCK(unique unit numbers) */
	LIST_REMOVE(sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	FREE(sc, M_CCD);
	return (0);
}

/*
 * Called by main() during pseudo-device attachment.  All we need
 * to do is to add devsw entries.
200 */ 201 static void 202 ccdattach() 203 { 204 205 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 206 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 207 ccdctldev->si_drv1 = ccdctldev; 208 } 209 210 static int 211 ccd_modevent(module_t mod, int type, void *data) 212 { 213 int error = 0; 214 215 switch (type) { 216 case MOD_LOAD: 217 ccdattach(); 218 break; 219 220 case MOD_UNLOAD: 221 printf("ccd0: Unload not supported!\n"); 222 error = EOPNOTSUPP; 223 break; 224 225 case MOD_SHUTDOWN: 226 break; 227 228 default: 229 error = EOPNOTSUPP; 230 } 231 return (error); 232 } 233 234 DEV_MODULE(ccd, ccd_modevent, NULL); 235 236 static int 237 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 238 { 239 struct ccdcinfo *ci = NULL; /* XXX */ 240 size_t size; 241 int ix; 242 struct vnode *vp; 243 size_t minsize; 244 int maxsecsize; 245 struct ccdgeom *ccg = &cs->sc_geom; 246 char *tmppath = NULL; 247 int error = 0; 248 off_t mediasize; 249 u_int sectorsize; 250 251 252 cs->sc_size = 0; 253 254 /* Allocate space for the component info. */ 255 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 256 M_CCD, M_WAITOK); 257 258 /* 259 * Verify that each component piece exists and record 260 * relevant information about it. 261 */ 262 maxsecsize = 0; 263 minsize = 0; 264 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK); 265 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 266 vp = cs->sc_vpp[ix]; 267 ci = &cs->sc_cinfo[ix]; 268 ci->ci_vp = vp; 269 270 /* 271 * Copy in the pathname of the component. 272 */ 273 if ((error = copyinstr(cpaths[ix], tmppath, 274 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 275 goto fail; 276 } 277 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK); 278 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 279 280 ci->ci_dev = vn_todev(vp); 281 282 /* 283 * Get partition information for the component. 
284 */ 285 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 286 FREAD, td->td_ucred, td); 287 if (error != 0) { 288 goto fail; 289 } 290 /* 291 * Get partition information for the component. 292 */ 293 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 294 FREAD, td->td_ucred, td); 295 if (error != 0) { 296 goto fail; 297 } 298 if (sectorsize > maxsecsize) 299 maxsecsize = sectorsize; 300 size = mediasize / DEV_BSIZE - CCD_OFFSET; 301 302 /* 303 * Calculate the size, truncating to an interleave 304 * boundary if necessary. 305 */ 306 307 if (cs->sc_ileave > 1) 308 size -= size % cs->sc_ileave; 309 310 if (size == 0) { 311 error = ENODEV; 312 goto fail; 313 } 314 315 if (minsize == 0 || size < minsize) 316 minsize = size; 317 ci->ci_size = size; 318 cs->sc_size += size; 319 } 320 321 free(tmppath, M_CCD); 322 tmppath = NULL; 323 324 /* 325 * Don't allow the interleave to be smaller than 326 * the biggest component sector. 327 */ 328 if ((cs->sc_ileave > 0) && 329 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 330 error = EINVAL; 331 goto fail; 332 } 333 334 /* 335 * If uniform interleave is desired set all sizes to that of 336 * the smallest component. This will guarentee that a single 337 * interleave table is generated. 338 * 339 * Lost space must be taken into account when calculating the 340 * overall size. Half the space is lost when CCDF_MIRROR is 341 * specified. 342 */ 343 if (cs->sc_flags & CCDF_UNIFORM) { 344 for (ci = cs->sc_cinfo; 345 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 346 ci->ci_size = minsize; 347 } 348 if (cs->sc_flags & CCDF_MIRROR) { 349 /* 350 * Check to see if an even number of components 351 * have been specified. The interleave must also 352 * be non-zero in order for us to be able to 353 * guarentee the topology. 
354 */ 355 if (cs->sc_nccdisks % 2) { 356 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 357 error = EINVAL; 358 goto fail; 359 } 360 if (cs->sc_ileave == 0) { 361 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 362 error = EINVAL; 363 goto fail; 364 } 365 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 366 } else { 367 if (cs->sc_ileave == 0) { 368 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 369 error = EINVAL; 370 goto fail; 371 } 372 cs->sc_size = cs->sc_nccdisks * minsize; 373 } 374 } 375 376 /* 377 * Construct the interleave table. 378 */ 379 ccdinterleave(cs, cs->sc_unit); 380 381 /* 382 * Create pseudo-geometry based on 1MB cylinders. It's 383 * pretty close. 384 */ 385 ccg->ccg_secsize = maxsecsize; 386 ccg->ccg_ntracks = 1; 387 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 388 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 389 390 cs->sc_flags |= CCDF_INITED; 391 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 392 return (0); 393 fail: 394 while (ci > cs->sc_cinfo) { 395 ci--; 396 free(ci->ci_path, M_CCD); 397 } 398 if (tmppath != NULL) 399 free(tmppath, M_CCD); 400 free(cs->sc_cinfo, M_CCD); 401 ccddestroy(cs); 402 return (error); 403 } 404 405 static void 406 ccdinterleave(struct ccd_s *cs, int unit) 407 { 408 struct ccdcinfo *ci, *smallci; 409 struct ccdiinfo *ii; 410 daddr_t bn, lbn; 411 int ix; 412 u_long size; 413 414 415 /* 416 * Allocate an interleave table. The worst case occurs when each 417 * of N disks is of a different size, resulting in N interleave 418 * tables. 419 * 420 * Chances are this is too big, but we don't care. 421 */ 422 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 423 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 424 M_WAITOK | M_ZERO); 425 426 /* 427 * Trivial case: no interleave (actually interleave of disk size). 428 * Each table entry represents a single component in its entirety. 
429 * 430 * An interleave of 0 may not be used with a mirror setup. 431 */ 432 if (cs->sc_ileave == 0) { 433 bn = 0; 434 ii = cs->sc_itable; 435 436 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 437 /* Allocate space for ii_index. */ 438 ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK); 439 ii->ii_ndisk = 1; 440 ii->ii_startblk = bn; 441 ii->ii_startoff = 0; 442 ii->ii_index[0] = ix; 443 bn += cs->sc_cinfo[ix].ci_size; 444 ii++; 445 } 446 ii->ii_ndisk = 0; 447 return; 448 } 449 450 /* 451 * The following isn't fast or pretty; it doesn't have to be. 452 */ 453 size = 0; 454 bn = lbn = 0; 455 for (ii = cs->sc_itable; ; ii++) { 456 /* 457 * Allocate space for ii_index. We might allocate more then 458 * we use. 459 */ 460 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 461 M_CCD, M_WAITOK); 462 463 /* 464 * Locate the smallest of the remaining components 465 */ 466 smallci = NULL; 467 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 468 ci++) { 469 if (ci->ci_size > size && 470 (smallci == NULL || 471 ci->ci_size < smallci->ci_size)) { 472 smallci = ci; 473 } 474 } 475 476 /* 477 * Nobody left, all done 478 */ 479 if (smallci == NULL) { 480 ii->ii_ndisk = 0; 481 free(ii->ii_index, M_CCD); 482 break; 483 } 484 485 /* 486 * Record starting logical block using an sc_ileave blocksize. 487 */ 488 ii->ii_startblk = bn / cs->sc_ileave; 489 490 /* 491 * Record starting comopnent block using an sc_ileave 492 * blocksize. This value is relative to the beginning of 493 * a component disk. 494 */ 495 ii->ii_startoff = lbn; 496 497 /* 498 * Determine how many disks take part in this interleave 499 * and record their indices. 
500 */ 501 ix = 0; 502 for (ci = cs->sc_cinfo; 503 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 504 if (ci->ci_size >= smallci->ci_size) { 505 ii->ii_index[ix++] = ci - cs->sc_cinfo; 506 } 507 } 508 ii->ii_ndisk = ix; 509 bn += ix * (smallci->ci_size - size); 510 lbn = smallci->ci_size / cs->sc_ileave; 511 size = smallci->ci_size; 512 } 513 } 514 515 static void 516 ccdstrategy(struct bio *bp) 517 { 518 struct ccd_s *cs; 519 int pbn; /* in sc_secsize chunks */ 520 long sz; /* in sc_secsize chunks */ 521 522 cs = bp->bio_disk->d_drv1; 523 524 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 525 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 526 527 /* 528 * If out of bounds return an error. If at the EOF point, 529 * simply read or write less. 530 */ 531 532 if (pbn < 0 || pbn >= cs->sc_size) { 533 bp->bio_resid = bp->bio_bcount; 534 if (pbn != cs->sc_size) 535 biofinish(bp, NULL, EINVAL); 536 else 537 biodone(bp); 538 return; 539 } 540 541 /* 542 * If the request crosses EOF, truncate the request. 543 */ 544 if (pbn + sz > cs->sc_size) { 545 bp->bio_bcount = (cs->sc_size - pbn) * 546 cs->sc_geom.ccg_secsize; 547 } 548 549 bp->bio_resid = bp->bio_bcount; 550 551 /* 552 * "Start" the unit. 553 */ 554 ccdstart(cs, bp); 555 return; 556 } 557 558 static void 559 ccdstart(struct ccd_s *cs, struct bio *bp) 560 { 561 long bcount, rcount; 562 struct ccdbuf *cbp[2]; 563 caddr_t addr; 564 daddr_t bn; 565 int err; 566 567 /* 568 * Translate the partition-relative block number to an absolute. 
569 */ 570 bn = bp->bio_blkno; 571 572 /* 573 * Allocate component buffers and fire off the requests 574 */ 575 addr = bp->bio_data; 576 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 577 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 578 if (err) { 579 printf("ccdbuffer error %d\n", err); 580 /* We're screwed */ 581 bp->bio_resid -= bcount; 582 bp->bio_error = ENOMEM; 583 bp->bio_flags |= BIO_ERROR; 584 return; 585 } 586 rcount = cbp[0]->cb_buf.bio_bcount; 587 588 if (cs->sc_cflags & CCDF_MIRROR) { 589 /* 590 * Mirroring. Writes go to both disks, reads are 591 * taken from whichever disk seems most appropriate. 592 * 593 * We attempt to localize reads to the disk whos arm 594 * is nearest the read request. We ignore seeks due 595 * to writes when making this determination and we 596 * also try to avoid hogging. 597 */ 598 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 599 BIO_STRATEGY(&cbp[0]->cb_buf); 600 BIO_STRATEGY(&cbp[1]->cb_buf); 601 } else { 602 int pick = cs->sc_pick; 603 daddr_t range = cs->sc_size / 16; 604 605 if (bn < cs->sc_blk[pick] - range || 606 bn > cs->sc_blk[pick] + range 607 ) { 608 cs->sc_pick = pick = 1 - pick; 609 } 610 cs->sc_blk[pick] = bn + btodb(rcount); 611 BIO_STRATEGY(&cbp[pick]->cb_buf); 612 } 613 } else { 614 /* 615 * Not mirroring 616 */ 617 BIO_STRATEGY(&cbp[0]->cb_buf); 618 } 619 bn += btodb(rcount); 620 addr += rcount; 621 } 622 } 623 624 /* 625 * Build a component buffer header. 626 */ 627 static int 628 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 629 { 630 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 631 struct ccdbuf *cbp; 632 daddr_t cbn, cboff; 633 off_t cbc; 634 635 /* 636 * Determine which component bn falls in. 637 */ 638 cbn = bn; 639 cboff = 0; 640 641 if (cs->sc_ileave == 0) { 642 /* 643 * Serially concatenated and neither a mirror nor a parity 644 * config. This is a special case. 
645 */ 646 daddr_t sblk; 647 648 sblk = 0; 649 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 650 sblk += ci->ci_size; 651 cbn -= sblk; 652 } else { 653 struct ccdiinfo *ii; 654 int ccdisk, off; 655 656 /* 657 * Calculate cbn, the logical superblock (sc_ileave chunks), 658 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 659 * to cbn. 660 */ 661 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 662 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 663 664 /* 665 * Figure out which interleave table to use. 666 */ 667 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 668 if (ii->ii_startblk > cbn) 669 break; 670 } 671 ii--; 672 673 /* 674 * off is the logical superblock relative to the beginning 675 * of this interleave block. 676 */ 677 off = cbn - ii->ii_startblk; 678 679 /* 680 * We must calculate which disk component to use (ccdisk), 681 * and recalculate cbn to be the superblock relative to 682 * the beginning of the component. This is typically done by 683 * adding 'off' and ii->ii_startoff together. However, 'off' 684 * must typically be divided by the number of components in 685 * this interleave array to be properly convert it from a 686 * CCD-relative logical superblock number to a 687 * component-relative superblock number. 688 */ 689 if (ii->ii_ndisk == 1) { 690 /* 691 * When we have just one disk, it can't be a mirror 692 * or a parity config. 693 */ 694 ccdisk = ii->ii_index[0]; 695 cbn = ii->ii_startoff + off; 696 } else { 697 if (cs->sc_cflags & CCDF_MIRROR) { 698 /* 699 * We have forced a uniform mapping, resulting 700 * in a single interleave array. We double 701 * up on the first half of the available 702 * components and our mirror is in the second 703 * half. This only works with a single 704 * interleave array because doubling up 705 * doubles the number of sectors, so there 706 * cannot be another interleave array because 707 * the next interleave array's calculations 708 * would be off. 
709 */ 710 int ndisk2 = ii->ii_ndisk / 2; 711 ccdisk = ii->ii_index[off % ndisk2]; 712 cbn = ii->ii_startoff + off / ndisk2; 713 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 714 } else { 715 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 716 cbn = ii->ii_startoff + off / ii->ii_ndisk; 717 } 718 } 719 720 ci = &cs->sc_cinfo[ccdisk]; 721 722 /* 723 * Convert cbn from a superblock to a normal block so it 724 * can be used to calculate (along with cboff) the normal 725 * block index into this particular disk. 726 */ 727 cbn *= cs->sc_ileave; 728 } 729 730 /* 731 * Fill in the component buf structure. 732 */ 733 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 734 if (cbp == NULL) 735 return (ENOMEM); 736 cbp->cb_buf.bio_cmd = bp->bio_cmd; 737 cbp->cb_buf.bio_done = ccdiodone; 738 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 739 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 740 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 741 cbp->cb_buf.bio_data = addr; 742 cbp->cb_buf.bio_caller2 = cbp; 743 if (cs->sc_ileave == 0) 744 cbc = dbtob((off_t)(ci->ci_size - cbn)); 745 else 746 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 747 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount; 748 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 749 750 /* 751 * context for ccdiodone 752 */ 753 cbp->cb_obp = bp; 754 cbp->cb_softc = cs; 755 cbp->cb_comp = ci - cs->sc_cinfo; 756 757 cb[0] = cbp; 758 759 /* 760 * Note: both I/O's setup when reading from mirror, but only one 761 * will be executed. 
762 */ 763 if (cs->sc_cflags & CCDF_MIRROR) { 764 /* mirror, setup second I/O */ 765 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 766 if (cbp == NULL) { 767 free(cb[0], M_CCD); 768 cb[0] = NULL; 769 return (ENOMEM); 770 } 771 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 772 cbp->cb_buf.bio_caller2 = cbp; 773 cbp->cb_buf.bio_dev = ci2->ci_dev; 774 cbp->cb_comp = ci2 - cs->sc_cinfo; 775 cb[1] = cbp; 776 /* link together the ccdbuf's and clear "mirror done" flag */ 777 cb[0]->cb_mirror = cb[1]; 778 cb[1]->cb_mirror = cb[0]; 779 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 780 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 781 } 782 return (0); 783 } 784 785 /* 786 * Called at interrupt time. 787 * Mark the component as done and if all components are done, 788 * take a ccd interrupt. 789 */ 790 static void 791 ccdiodone(struct bio *ibp) 792 { 793 struct ccdbuf *cbp; 794 struct bio *bp; 795 struct ccd_s *cs; 796 int count; 797 798 cbp = ibp->bio_caller2; 799 cs = cbp->cb_softc; 800 bp = cbp->cb_obp; 801 /* 802 * If an error occured, report it. If this is a mirrored 803 * configuration and the first of two possible reads, do not 804 * set the error in the bp yet because the second read may 805 * succeed. 806 */ 807 808 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 809 const char *msg = ""; 810 811 if ((cs->sc_cflags & CCDF_MIRROR) && 812 (cbp->cb_buf.bio_cmd == BIO_READ) && 813 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 814 /* 815 * We will try our read on the other disk down 816 * below, also reverse the default pick so if we 817 * are doing a scan we do not keep hitting the 818 * bad disk first. 819 */ 820 821 msg = ", trying other disk"; 822 cs->sc_pick = 1 - cs->sc_pick; 823 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 824 } else { 825 bp->bio_flags |= BIO_ERROR; 826 bp->bio_error = cbp->cb_buf.bio_error ? 
827 cbp->cb_buf.bio_error : EIO; 828 } 829 printf("ccd%d: error %d on component %d block %jd " 830 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 831 cbp->cb_comp, 832 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 833 msg); 834 } 835 836 /* 837 * Process mirror. If we are writing, I/O has been initiated on both 838 * buffers and we fall through only after both are finished. 839 * 840 * If we are reading only one I/O is initiated at a time. If an 841 * error occurs we initiate the second I/O and return, otherwise 842 * we free the second I/O without initiating it. 843 */ 844 845 if (cs->sc_cflags & CCDF_MIRROR) { 846 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 847 /* 848 * When writing, handshake with the second buffer 849 * to determine when both are done. If both are not 850 * done, return here. 851 */ 852 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 853 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 854 free(cbp, M_CCD); 855 return; 856 } 857 } else { 858 /* 859 * When reading, either dispose of the second buffer 860 * or initiate I/O on the second buffer if an error 861 * occured with this one. 862 */ 863 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 864 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 865 cbp->cb_mirror->cb_pflags |= 866 CCDPF_MIRROR_DONE; 867 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 868 free(cbp, M_CCD); 869 return; 870 } else { 871 free(cbp->cb_mirror, M_CCD); 872 } 873 } 874 } 875 } 876 877 /* 878 * use bio_caller1 to determine how big the original request was rather 879 * then bio_bcount, because bio_bcount may have been truncated for EOF. 880 * 881 * XXX We check for an error, but we do not test the resid for an 882 * aligned EOF condition. This may result in character & block 883 * device access not recognizing EOF properly when read or written 884 * sequentially, but will not effect filesystems. 885 */ 886 count = (long)cbp->cb_buf.bio_caller1; 887 free(cbp, M_CCD); 888 889 /* 890 * If all done, "interrupt". 
891 */ 892 bp->bio_resid -= count; 893 if (bp->bio_resid < 0) 894 panic("ccdiodone: count"); 895 if (bp->bio_resid == 0) { 896 if (bp->bio_flags & BIO_ERROR) 897 bp->bio_resid = bp->bio_bcount; 898 biodone(bp); 899 } 900 } 901 902 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 903 904 static int 905 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 906 { 907 struct ccd_ioctl *ccio; 908 u_int unit; 909 dev_t dev2; 910 int error; 911 912 switch (cmd) { 913 case CCDIOCSET: 914 case CCDIOCCLR: 915 ccio = (struct ccd_ioctl *)data; 916 unit = ccio->ccio_size; 917 return (ccdioctltoo(unit, cmd, data, flag, td)); 918 case CCDCONFINFO: 919 { 920 int ninit = 0; 921 struct ccdconf *conf = (struct ccdconf *)data; 922 struct ccd_s *tmpcs; 923 struct ccd_s *ubuf = conf->buffer; 924 925 /* XXX: LOCK(unique unit numbers) */ 926 LIST_FOREACH(tmpcs, &ccd_softc_list, list) 927 if (IS_INITED(tmpcs)) 928 ninit++; 929 930 if (conf->size == 0) { 931 conf->size = sizeof(struct ccd_s) * ninit; 932 return (0); 933 } else if ((conf->size / sizeof(struct ccd_s) != ninit) || 934 (conf->size % sizeof(struct ccd_s) != 0)) { 935 /* XXX: UNLOCK(unique unit numbers) */ 936 return (EINVAL); 937 } 938 939 ubuf += ninit; 940 LIST_FOREACH(tmpcs, &ccd_softc_list, list) { 941 if (!IS_INITED(tmpcs)) 942 continue; 943 error = copyout(tmpcs, --ubuf, 944 sizeof(struct ccd_s)); 945 if (error != 0) 946 /* XXX: UNLOCK(unique unit numbers) */ 947 return (error); 948 } 949 /* XXX: UNLOCK(unique unit numbers) */ 950 return (0); 951 } 952 953 case CCDCPPINFO: 954 { 955 struct ccdcpps *cpps = (struct ccdcpps *)data; 956 char *ubuf = cpps->buffer; 957 struct ccd_s *cs; 958 959 960 error = copyin(ubuf, &unit, sizeof (unit)); 961 if (error) 962 return (error); 963 964 if (!IS_ALLOCATED(unit)) 965 return (ENXIO); 966 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2); 967 cs = ccdfind(unit); 968 if (!IS_INITED(cs)) 969 return (ENXIO); 970 971 { 972 int len = 0, 
i; 973 struct ccdcpps *cpps = (struct ccdcpps *)data; 974 char *ubuf = cpps->buffer; 975 976 977 for (i = 0; i < cs->sc_nccdisks; ++i) 978 len += cs->sc_cinfo[i].ci_pathlen; 979 980 if (cpps->size < len) 981 return (ENOMEM); 982 983 for (i = 0; i < cs->sc_nccdisks; ++i) { 984 len = cs->sc_cinfo[i].ci_pathlen; 985 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 986 len); 987 if (error != 0) 988 return (error); 989 ubuf += len; 990 } 991 return(copyout("", ubuf, 1)); 992 } 993 break; 994 } 995 996 default: 997 return (ENXIO); 998 } 999 } 1000 1001 static int 1002 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1003 { 1004 int i, j, lookedup = 0, error = 0; 1005 struct ccd_s *cs; 1006 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1007 struct ccdgeom *ccg; 1008 char **cpp; 1009 struct vnode **vpp; 1010 1011 cs = ccdfind(unit); 1012 switch (cmd) { 1013 case CCDIOCSET: 1014 if (cs == NULL) 1015 cs = ccdnew(unit); 1016 if (IS_INITED(cs)) 1017 return (EBUSY); 1018 1019 if ((flag & FWRITE) == 0) 1020 return (EBADF); 1021 1022 if ((error = ccdlock(cs)) != 0) 1023 return (error); 1024 1025 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1026 return (EINVAL); 1027 1028 /* Fill in some important bits. */ 1029 cs->sc_ileave = ccio->ccio_ileave; 1030 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1031 printf("ccd%d: disabling mirror, interleave is 0\n", 1032 unit); 1033 ccio->ccio_flags &= ~(CCDF_MIRROR); 1034 } 1035 if ((ccio->ccio_flags & CCDF_MIRROR) && 1036 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1037 printf("ccd%d: mirror/parity forces uniform flag\n", 1038 unit); 1039 ccio->ccio_flags |= CCDF_UNIFORM; 1040 } 1041 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1042 1043 /* 1044 * Allocate space for and copy in the array of 1045 * componet pathnames and device numbers. 
1046 */ 1047 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1048 M_CCD, M_WAITOK); 1049 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1050 M_CCD, M_WAITOK); 1051 1052 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1053 ccio->ccio_ndisks * sizeof(char **)); 1054 if (error) { 1055 free(vpp, M_CCD); 1056 free(cpp, M_CCD); 1057 ccdunlock(cs); 1058 return (error); 1059 } 1060 1061 1062 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1063 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1064 for (j = 0; j < lookedup; ++j) 1065 (void)vn_close(vpp[j], FREAD|FWRITE, 1066 td->td_ucred, td); 1067 free(vpp, M_CCD); 1068 free(cpp, M_CCD); 1069 ccdunlock(cs); 1070 return (error); 1071 } 1072 ++lookedup; 1073 } 1074 cs->sc_vpp = vpp; 1075 cs->sc_nccdisks = ccio->ccio_ndisks; 1076 1077 /* 1078 * Initialize the ccd. Fills in the softc for us. 1079 */ 1080 if ((error = ccdinit(cs, cpp, td)) != 0) { 1081 for (j = 0; j < lookedup; ++j) 1082 (void)vn_close(vpp[j], FREAD|FWRITE, 1083 td->td_ucred, td); 1084 /* 1085 * We can't ccddestroy() cs just yet, because nothing 1086 * prevents user-level app to do another ioctl() 1087 * without closing the device first, therefore 1088 * declare unit null and void and let ccdclose() 1089 * destroy it when it is safe to do so. 1090 */ 1091 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1092 free(vpp, M_CCD); 1093 free(cpp, M_CCD); 1094 ccdunlock(cs); 1095 return (error); 1096 } 1097 free(cpp, M_CCD); 1098 1099 /* 1100 * The ccd has been successfully initialized, so 1101 * we can place it into the array and read the disklabel. 
1102 */ 1103 ccio->ccio_unit = unit; 1104 ccio->ccio_size = cs->sc_size; 1105 ccg = &cs->sc_geom; 1106 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, 1107 M_ZERO | M_WAITOK); 1108 cs->sc_disk->d_strategy = ccdstrategy; 1109 cs->sc_disk->d_name = "ccd"; 1110 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1111 cs->sc_disk->d_mediasize = 1112 cs->sc_size * (off_t)ccg->ccg_secsize; 1113 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1114 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1115 cs->sc_disk->d_drv1 = cs; 1116 cs->sc_disk->d_maxsize = MAXPHYS; 1117 disk_create(unit, cs->sc_disk, 0, NULL, NULL); 1118 1119 ccdunlock(cs); 1120 1121 break; 1122 1123 case CCDIOCCLR: 1124 if (cs == NULL) 1125 return (ENXIO); 1126 1127 if (!IS_INITED(cs)) 1128 return (ENXIO); 1129 1130 if ((flag & FWRITE) == 0) 1131 return (EBADF); 1132 1133 if ((error = ccdlock(cs)) != 0) 1134 return (error); 1135 1136 /* Don't unconfigure if any other partitions are open */ 1137 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) { 1138 ccdunlock(cs); 1139 return (EBUSY); 1140 } 1141 1142 disk_destroy(cs->sc_disk); 1143 free(cs->sc_disk, M_CCD); 1144 cs->sc_disk = NULL; 1145 /* Declare unit null and void (reset all flags) */ 1146 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1147 1148 /* Close the components and free their pathnames. */ 1149 for (i = 0; i < cs->sc_nccdisks; ++i) { 1150 /* 1151 * XXX: this close could potentially fail and 1152 * cause Bad Things. Maybe we need to force 1153 * the close to happen? 1154 */ 1155 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1156 td->td_ucred, td); 1157 free(cs->sc_cinfo[i].ci_path, M_CCD); 1158 } 1159 1160 /* Free interleave index. */ 1161 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1162 free(cs->sc_itable[i].ii_index, M_CCD); 1163 1164 /* Free component info and interleave table. */ 1165 free(cs->sc_cinfo, M_CCD); 1166 free(cs->sc_itable, M_CCD); 1167 free(cs->sc_vpp, M_CCD); 1168 1169 /* This must be atomic. 
*/ 1170 ccdunlock(cs); 1171 ccddestroy(cs); 1172 1173 break; 1174 } 1175 1176 return (0); 1177 } 1178 1179 1180 /* 1181 * Lookup the provided name in the filesystem. If the file exists, 1182 * is a valid block device, and isn't being used by anyone else, 1183 * set *vpp to the file's vnode. 1184 */ 1185 static int 1186 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1187 { 1188 struct nameidata nd; 1189 struct vnode *vp; 1190 int error, flags; 1191 1192 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1193 flags = FREAD | FWRITE; 1194 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1195 return (error); 1196 } 1197 vp = nd.ni_vp; 1198 1199 if (vrefcnt(vp) > 1) { 1200 error = EBUSY; 1201 goto bad; 1202 } 1203 1204 if (!vn_isdisk(vp, &error)) 1205 goto bad; 1206 1207 1208 VOP_UNLOCK(vp, 0, td); 1209 NDFREE(&nd, NDF_ONLY_PNBUF); 1210 *vpp = vp; 1211 return (0); 1212 bad: 1213 VOP_UNLOCK(vp, 0, td); 1214 NDFREE(&nd, NDF_ONLY_PNBUF); 1215 /* vn_close does vrele() for vp */ 1216 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1217 return (error); 1218 } 1219 1220 /* 1221 1222 * Wait interruptibly for an exclusive lock. 1223 * 1224 * XXX 1225 * Several drivers do this; it should be abstracted and made MP-safe. 1226 */ 1227 static int 1228 ccdlock(struct ccd_s *cs) 1229 { 1230 int error; 1231 1232 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1233 cs->sc_flags |= CCDF_WANTED; 1234 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1235 return (error); 1236 } 1237 cs->sc_flags |= CCDF_LOCKED; 1238 return (0); 1239 } 1240 1241 /* 1242 * Unlock and wake up any waiters. 1243 */ 1244 static void 1245 ccdunlock(struct ccd_s *cs) 1246 { 1247 1248 cs->sc_flags &= ~CCDF_LOCKED; 1249 if ((cs->sc_flags & CCDF_WANTED) != 0) { 1250 cs->sc_flags &= ~CCDF_WANTED; 1251 wakeup(cs); 1252 } 1253 } 1254