1 /* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Dynamic configuration and disklabel support by:
 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
 *	Numerical Aerodynamic Simulation Facility
 *	Mail Stop 258-6
 *	NASA Ames Research Center
 *	Moffett Field, CA 94035
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 *
 *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <geom/geom_disk.h>

#include <sys/ccdvar.h>

MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");

/*
   This is how mirroring works (only writes are special):

   When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
   linked together by the cb_mirror field.  "cb_pflags &
   CCDPF_MIRROR_DONE" is set to 0 on both of them.

   When a component returns to ccdiodone(), it checks if "cb_pflags &
   CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
   flag and returns.  If it is, it means its partner has already
   returned, so it will go to the regular cleanup.
 */

/*
 * Per-component I/O state.  One ccdbuf wraps each struct bio issued to
 * a component disk; cb_obp points back at the original request so
 * ccdiodone() can account completed bytes against it.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* unit this buffer belongs to */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* Control device created at module load; all configuration goes through it. */
static dev_t ccdctldev;

static disk_strategy_t ccdstrategy;
static d_ioctl_t ccdctlioctl;

#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccdctl_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_ioctl =	ccdctlioctl,
	.d_name =	"ccdctl",
	.d_maj =	CDEV_MAJOR,
};

/* All configured (not necessarily initialized) units. */
static LIST_HEAD(, ccd_s) ccd_softc_list =
	LIST_HEAD_INITIALIZER(&ccd_softc_list);

static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		     struct bio *, daddr_t, caddr_t, long);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);

/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif

/*
 * Look up the softc for a unit number; returns NULL when the unit has
 * not been allocated.  NOTE(review): the unit-number list is not
 * locked yet (see XXX markers).
 */
static struct ccd_s *
ccdfind(int unit)
{
	struct ccd_s *sc = NULL;

	/* XXX: LOCK(unique unit numbers) */
	LIST_FOREACH(sc, &ccd_softc_list, list) {
		if (sc->sc_unit == unit)
			break;
	}
	/* XXX: UNLOCK(unique unit numbers) */
	/* sc is non-NULL but stale when the loop ran off the end. */
	return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
}

/*
 * Allocate and enlist a zeroed softc for the given unit.  Returns NULL
 * when the unit already exists or the unit number is out of range.
 */
static struct ccd_s *
ccdnew(int unit)
{
	struct ccd_s *sc;

	/* XXX: LOCK(unique unit numbers) */
	if (IS_ALLOCATED(unit) || unit > 32)
		return (NULL);

	MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
	sc->sc_unit = unit;
	LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	return (sc);
}

/*
 * Unlink a softc from the unit list and free it.  Caller must ensure
 * the unit is no longer referenced.
 */
static int
ccddestroy(struct ccd_s *sc)
{

	/* XXX: LOCK(unique unit numbers) */
	LIST_REMOVE(sc, list);
	/* XXX: UNLOCK(unique unit numbers) */
	FREE(sc, M_CCD);
	return (0);
}

/*
 * Called by main() during pseudo-device attachment.  All we need
 * to do is to add devsw entries.
201 */ 202 static void 203 ccdattach() 204 { 205 206 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 207 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 208 ccdctldev->si_drv1 = ccdctldev; 209 } 210 211 static int 212 ccd_modevent(module_t mod, int type, void *data) 213 { 214 int error = 0; 215 216 switch (type) { 217 case MOD_LOAD: 218 ccdattach(); 219 break; 220 221 case MOD_UNLOAD: 222 printf("ccd0: Unload not supported!\n"); 223 error = EOPNOTSUPP; 224 break; 225 226 case MOD_SHUTDOWN: 227 break; 228 229 default: 230 error = EOPNOTSUPP; 231 } 232 return (error); 233 } 234 235 DEV_MODULE(ccd, ccd_modevent, NULL); 236 237 static int 238 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 239 { 240 struct ccdcinfo *ci = NULL; /* XXX */ 241 size_t size; 242 int ix; 243 struct vnode *vp; 244 size_t minsize; 245 int maxsecsize; 246 struct ccdgeom *ccg = &cs->sc_geom; 247 char *tmppath = NULL; 248 int error = 0; 249 off_t mediasize; 250 u_int sectorsize; 251 252 253 cs->sc_size = 0; 254 255 /* Allocate space for the component info. */ 256 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 257 M_CCD, M_WAITOK); 258 259 /* 260 * Verify that each component piece exists and record 261 * relevant information about it. 262 */ 263 maxsecsize = 0; 264 minsize = 0; 265 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK); 266 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 267 vp = cs->sc_vpp[ix]; 268 ci = &cs->sc_cinfo[ix]; 269 ci->ci_vp = vp; 270 271 /* 272 * Copy in the pathname of the component. 273 */ 274 if ((error = copyinstr(cpaths[ix], tmppath, 275 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 276 goto fail; 277 } 278 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK); 279 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 280 281 ci->ci_dev = vn_todev(vp); 282 283 /* 284 * Get partition information for the component. 
285 */ 286 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 287 FREAD, td->td_ucred, td); 288 if (error != 0) { 289 goto fail; 290 } 291 /* 292 * Get partition information for the component. 293 */ 294 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 295 FREAD, td->td_ucred, td); 296 if (error != 0) { 297 goto fail; 298 } 299 if (sectorsize > maxsecsize) 300 maxsecsize = sectorsize; 301 size = mediasize / DEV_BSIZE - CCD_OFFSET; 302 303 /* 304 * Calculate the size, truncating to an interleave 305 * boundary if necessary. 306 */ 307 308 if (cs->sc_ileave > 1) 309 size -= size % cs->sc_ileave; 310 311 if (size == 0) { 312 error = ENODEV; 313 goto fail; 314 } 315 316 if (minsize == 0 || size < minsize) 317 minsize = size; 318 ci->ci_size = size; 319 cs->sc_size += size; 320 } 321 322 free(tmppath, M_CCD); 323 tmppath = NULL; 324 325 /* 326 * Don't allow the interleave to be smaller than 327 * the biggest component sector. 328 */ 329 if ((cs->sc_ileave > 0) && 330 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 331 error = EINVAL; 332 goto fail; 333 } 334 335 /* 336 * If uniform interleave is desired set all sizes to that of 337 * the smallest component. This will guarentee that a single 338 * interleave table is generated. 339 * 340 * Lost space must be taken into account when calculating the 341 * overall size. Half the space is lost when CCDF_MIRROR is 342 * specified. 343 */ 344 if (cs->sc_flags & CCDF_UNIFORM) { 345 for (ci = cs->sc_cinfo; 346 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 347 ci->ci_size = minsize; 348 } 349 if (cs->sc_flags & CCDF_MIRROR) { 350 /* 351 * Check to see if an even number of components 352 * have been specified. The interleave must also 353 * be non-zero in order for us to be able to 354 * guarentee the topology. 
355 */ 356 if (cs->sc_nccdisks % 2) { 357 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 358 error = EINVAL; 359 goto fail; 360 } 361 if (cs->sc_ileave == 0) { 362 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 363 error = EINVAL; 364 goto fail; 365 } 366 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 367 } else { 368 if (cs->sc_ileave == 0) { 369 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 370 error = EINVAL; 371 goto fail; 372 } 373 cs->sc_size = cs->sc_nccdisks * minsize; 374 } 375 } 376 377 /* 378 * Construct the interleave table. 379 */ 380 ccdinterleave(cs, cs->sc_unit); 381 382 /* 383 * Create pseudo-geometry based on 1MB cylinders. It's 384 * pretty close. 385 */ 386 ccg->ccg_secsize = maxsecsize; 387 ccg->ccg_ntracks = 1; 388 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 389 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 390 391 cs->sc_flags |= CCDF_INITED; 392 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 393 return (0); 394 fail: 395 while (ci > cs->sc_cinfo) { 396 ci--; 397 free(ci->ci_path, M_CCD); 398 } 399 if (tmppath != NULL) 400 free(tmppath, M_CCD); 401 free(cs->sc_cinfo, M_CCD); 402 ccddestroy(cs); 403 return (error); 404 } 405 406 static void 407 ccdinterleave(struct ccd_s *cs, int unit) 408 { 409 struct ccdcinfo *ci, *smallci; 410 struct ccdiinfo *ii; 411 daddr_t bn, lbn; 412 int ix; 413 u_long size; 414 415 416 /* 417 * Allocate an interleave table. The worst case occurs when each 418 * of N disks is of a different size, resulting in N interleave 419 * tables. 420 * 421 * Chances are this is too big, but we don't care. 422 */ 423 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 424 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 425 M_WAITOK | M_ZERO); 426 427 /* 428 * Trivial case: no interleave (actually interleave of disk size). 429 * Each table entry represents a single component in its entirety. 
430 * 431 * An interleave of 0 may not be used with a mirror setup. 432 */ 433 if (cs->sc_ileave == 0) { 434 bn = 0; 435 ii = cs->sc_itable; 436 437 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 438 /* Allocate space for ii_index. */ 439 ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK); 440 ii->ii_ndisk = 1; 441 ii->ii_startblk = bn; 442 ii->ii_startoff = 0; 443 ii->ii_index[0] = ix; 444 bn += cs->sc_cinfo[ix].ci_size; 445 ii++; 446 } 447 ii->ii_ndisk = 0; 448 return; 449 } 450 451 /* 452 * The following isn't fast or pretty; it doesn't have to be. 453 */ 454 size = 0; 455 bn = lbn = 0; 456 for (ii = cs->sc_itable; ; ii++) { 457 /* 458 * Allocate space for ii_index. We might allocate more then 459 * we use. 460 */ 461 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 462 M_CCD, M_WAITOK); 463 464 /* 465 * Locate the smallest of the remaining components 466 */ 467 smallci = NULL; 468 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 469 ci++) { 470 if (ci->ci_size > size && 471 (smallci == NULL || 472 ci->ci_size < smallci->ci_size)) { 473 smallci = ci; 474 } 475 } 476 477 /* 478 * Nobody left, all done 479 */ 480 if (smallci == NULL) { 481 ii->ii_ndisk = 0; 482 free(ii->ii_index, M_CCD); 483 break; 484 } 485 486 /* 487 * Record starting logical block using an sc_ileave blocksize. 488 */ 489 ii->ii_startblk = bn / cs->sc_ileave; 490 491 /* 492 * Record starting comopnent block using an sc_ileave 493 * blocksize. This value is relative to the beginning of 494 * a component disk. 495 */ 496 ii->ii_startoff = lbn; 497 498 /* 499 * Determine how many disks take part in this interleave 500 * and record their indices. 
501 */ 502 ix = 0; 503 for (ci = cs->sc_cinfo; 504 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 505 if (ci->ci_size >= smallci->ci_size) { 506 ii->ii_index[ix++] = ci - cs->sc_cinfo; 507 } 508 } 509 ii->ii_ndisk = ix; 510 bn += ix * (smallci->ci_size - size); 511 lbn = smallci->ci_size / cs->sc_ileave; 512 size = smallci->ci_size; 513 } 514 } 515 516 static void 517 ccdstrategy(struct bio *bp) 518 { 519 struct ccd_s *cs; 520 int pbn; /* in sc_secsize chunks */ 521 long sz; /* in sc_secsize chunks */ 522 523 cs = bp->bio_disk->d_drv1; 524 525 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 526 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 527 528 /* 529 * If out of bounds return an error. If at the EOF point, 530 * simply read or write less. 531 */ 532 533 if (pbn < 0 || pbn >= cs->sc_size) { 534 bp->bio_resid = bp->bio_bcount; 535 if (pbn != cs->sc_size) 536 biofinish(bp, NULL, EINVAL); 537 else 538 biodone(bp); 539 return; 540 } 541 542 /* 543 * If the request crosses EOF, truncate the request. 544 */ 545 if (pbn + sz > cs->sc_size) { 546 bp->bio_bcount = (cs->sc_size - pbn) * 547 cs->sc_geom.ccg_secsize; 548 } 549 550 bp->bio_resid = bp->bio_bcount; 551 552 /* 553 * "Start" the unit. 554 */ 555 ccdstart(cs, bp); 556 return; 557 } 558 559 static void 560 ccdstart(struct ccd_s *cs, struct bio *bp) 561 { 562 long bcount, rcount; 563 struct ccdbuf *cbp[2]; 564 caddr_t addr; 565 daddr_t bn; 566 int err; 567 int sent; 568 569 /* 570 * Translate the partition-relative block number to an absolute. 
571 */ 572 bn = bp->bio_blkno; 573 574 /* 575 * Allocate component buffers and fire off the requests 576 */ 577 addr = bp->bio_data; 578 sent = 0; 579 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 580 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 581 if (err) { 582 printf("ccdbuffer error %d\n", err); 583 if (!sent) 584 biofinish(bp, NULL, err); 585 else { 586 /* 587 * XXX: maybe a race where the partners 588 * XXX: we sent already have been in 589 * XXX: ccdiodone(). Single-threaded g_down 590 * XXX: may protect against this. 591 */ 592 bp->bio_resid -= bcount; 593 bp->bio_error = err; 594 bp->bio_flags |= BIO_ERROR; 595 } 596 return; 597 } 598 rcount = cbp[0]->cb_buf.bio_bcount; 599 600 if (cs->sc_cflags & CCDF_MIRROR) { 601 /* 602 * Mirroring. Writes go to both disks, reads are 603 * taken from whichever disk seems most appropriate. 604 * 605 * We attempt to localize reads to the disk whos arm 606 * is nearest the read request. We ignore seeks due 607 * to writes when making this determination and we 608 * also try to avoid hogging. 609 */ 610 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 611 BIO_STRATEGY(&cbp[0]->cb_buf); 612 BIO_STRATEGY(&cbp[1]->cb_buf); 613 sent++; 614 } else { 615 int pick = cs->sc_pick; 616 daddr_t range = cs->sc_size / 16; 617 618 if (bn < cs->sc_blk[pick] - range || 619 bn > cs->sc_blk[pick] + range 620 ) { 621 cs->sc_pick = pick = 1 - pick; 622 } 623 cs->sc_blk[pick] = bn + btodb(rcount); 624 BIO_STRATEGY(&cbp[pick]->cb_buf); 625 sent++; 626 } 627 } else { 628 /* 629 * Not mirroring 630 */ 631 BIO_STRATEGY(&cbp[0]->cb_buf); 632 sent++; 633 } 634 bn += btodb(rcount); 635 addr += rcount; 636 } 637 } 638 639 /* 640 * Build a component buffer header. 
641 */ 642 static int 643 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 644 { 645 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 646 struct ccdbuf *cbp; 647 daddr_t cbn, cboff; 648 off_t cbc; 649 650 /* 651 * Determine which component bn falls in. 652 */ 653 cbn = bn; 654 cboff = 0; 655 656 if (cs->sc_ileave == 0) { 657 /* 658 * Serially concatenated and neither a mirror nor a parity 659 * config. This is a special case. 660 */ 661 daddr_t sblk; 662 663 sblk = 0; 664 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 665 sblk += ci->ci_size; 666 cbn -= sblk; 667 } else { 668 struct ccdiinfo *ii; 669 int ccdisk, off; 670 671 /* 672 * Calculate cbn, the logical superblock (sc_ileave chunks), 673 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 674 * to cbn. 675 */ 676 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 677 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 678 679 /* 680 * Figure out which interleave table to use. 681 */ 682 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 683 if (ii->ii_startblk > cbn) 684 break; 685 } 686 ii--; 687 688 /* 689 * off is the logical superblock relative to the beginning 690 * of this interleave block. 691 */ 692 off = cbn - ii->ii_startblk; 693 694 /* 695 * We must calculate which disk component to use (ccdisk), 696 * and recalculate cbn to be the superblock relative to 697 * the beginning of the component. This is typically done by 698 * adding 'off' and ii->ii_startoff together. However, 'off' 699 * must typically be divided by the number of components in 700 * this interleave array to be properly convert it from a 701 * CCD-relative logical superblock number to a 702 * component-relative superblock number. 703 */ 704 if (ii->ii_ndisk == 1) { 705 /* 706 * When we have just one disk, it can't be a mirror 707 * or a parity config. 
708 */ 709 ccdisk = ii->ii_index[0]; 710 cbn = ii->ii_startoff + off; 711 } else { 712 if (cs->sc_cflags & CCDF_MIRROR) { 713 /* 714 * We have forced a uniform mapping, resulting 715 * in a single interleave array. We double 716 * up on the first half of the available 717 * components and our mirror is in the second 718 * half. This only works with a single 719 * interleave array because doubling up 720 * doubles the number of sectors, so there 721 * cannot be another interleave array because 722 * the next interleave array's calculations 723 * would be off. 724 */ 725 int ndisk2 = ii->ii_ndisk / 2; 726 ccdisk = ii->ii_index[off % ndisk2]; 727 cbn = ii->ii_startoff + off / ndisk2; 728 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 729 } else { 730 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 731 cbn = ii->ii_startoff + off / ii->ii_ndisk; 732 } 733 } 734 735 ci = &cs->sc_cinfo[ccdisk]; 736 737 /* 738 * Convert cbn from a superblock to a normal block so it 739 * can be used to calculate (along with cboff) the normal 740 * block index into this particular disk. 741 */ 742 cbn *= cs->sc_ileave; 743 } 744 745 /* 746 * Fill in the component buf structure. 747 */ 748 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 749 if (cbp == NULL) 750 return (ENOMEM); 751 cbp->cb_buf.bio_cmd = bp->bio_cmd; 752 cbp->cb_buf.bio_done = ccdiodone; 753 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 754 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 755 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 756 cbp->cb_buf.bio_data = addr; 757 cbp->cb_buf.bio_caller2 = cbp; 758 if (cs->sc_ileave == 0) 759 cbc = dbtob((off_t)(ci->ci_size - cbn)); 760 else 761 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 762 cbp->cb_buf.bio_bcount = (cbc < bcount) ? 
cbc : bcount; 763 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 764 765 /* 766 * context for ccdiodone 767 */ 768 cbp->cb_obp = bp; 769 cbp->cb_softc = cs; 770 cbp->cb_comp = ci - cs->sc_cinfo; 771 772 cb[0] = cbp; 773 774 /* 775 * Note: both I/O's setup when reading from mirror, but only one 776 * will be executed. 777 */ 778 if (cs->sc_cflags & CCDF_MIRROR) { 779 /* mirror, setup second I/O */ 780 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 781 if (cbp == NULL) { 782 free(cb[0], M_CCD); 783 cb[0] = NULL; 784 return (ENOMEM); 785 } 786 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 787 cbp->cb_buf.bio_caller2 = cbp; 788 cbp->cb_buf.bio_dev = ci2->ci_dev; 789 cbp->cb_comp = ci2 - cs->sc_cinfo; 790 cb[1] = cbp; 791 /* link together the ccdbuf's and clear "mirror done" flag */ 792 cb[0]->cb_mirror = cb[1]; 793 cb[1]->cb_mirror = cb[0]; 794 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 795 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 796 } 797 return (0); 798 } 799 800 /* 801 * Called at interrupt time. 802 * Mark the component as done and if all components are done, 803 * take a ccd interrupt. 804 */ 805 static void 806 ccdiodone(struct bio *ibp) 807 { 808 struct ccdbuf *cbp; 809 struct bio *bp; 810 struct ccd_s *cs; 811 int count; 812 813 cbp = ibp->bio_caller2; 814 cs = cbp->cb_softc; 815 bp = cbp->cb_obp; 816 /* 817 * If an error occured, report it. If this is a mirrored 818 * configuration and the first of two possible reads, do not 819 * set the error in the bp yet because the second read may 820 * succeed. 821 */ 822 823 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 824 const char *msg = ""; 825 826 if ((cs->sc_cflags & CCDF_MIRROR) && 827 (cbp->cb_buf.bio_cmd == BIO_READ) && 828 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 829 /* 830 * We will try our read on the other disk down 831 * below, also reverse the default pick so if we 832 * are doing a scan we do not keep hitting the 833 * bad disk first. 
834 */ 835 836 msg = ", trying other disk"; 837 cs->sc_pick = 1 - cs->sc_pick; 838 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 839 } else { 840 bp->bio_flags |= BIO_ERROR; 841 bp->bio_error = cbp->cb_buf.bio_error ? 842 cbp->cb_buf.bio_error : EIO; 843 } 844 printf("ccd%d: error %d on component %d block %jd " 845 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 846 cbp->cb_comp, 847 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 848 msg); 849 } 850 851 /* 852 * Process mirror. If we are writing, I/O has been initiated on both 853 * buffers and we fall through only after both are finished. 854 * 855 * If we are reading only one I/O is initiated at a time. If an 856 * error occurs we initiate the second I/O and return, otherwise 857 * we free the second I/O without initiating it. 858 */ 859 860 if (cs->sc_cflags & CCDF_MIRROR) { 861 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 862 /* 863 * When writing, handshake with the second buffer 864 * to determine when both are done. If both are not 865 * done, return here. 866 */ 867 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 868 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 869 free(cbp, M_CCD); 870 return; 871 } 872 } else { 873 /* 874 * When reading, either dispose of the second buffer 875 * or initiate I/O on the second buffer if an error 876 * occured with this one. 877 */ 878 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 879 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 880 cbp->cb_mirror->cb_pflags |= 881 CCDPF_MIRROR_DONE; 882 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 883 free(cbp, M_CCD); 884 return; 885 } else { 886 free(cbp->cb_mirror, M_CCD); 887 } 888 } 889 } 890 } 891 892 /* 893 * use bio_caller1 to determine how big the original request was rather 894 * then bio_bcount, because bio_bcount may have been truncated for EOF. 895 * 896 * XXX We check for an error, but we do not test the resid for an 897 * aligned EOF condition. 
This may result in character & block 898 * device access not recognizing EOF properly when read or written 899 * sequentially, but will not effect filesystems. 900 */ 901 count = (long)cbp->cb_buf.bio_caller1; 902 free(cbp, M_CCD); 903 904 /* 905 * If all done, "interrupt". 906 */ 907 bp->bio_resid -= count; 908 if (bp->bio_resid < 0) 909 panic("ccdiodone: count"); 910 if (bp->bio_resid == 0) { 911 if (bp->bio_flags & BIO_ERROR) 912 bp->bio_resid = bp->bio_bcount; 913 biodone(bp); 914 } 915 } 916 917 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 918 919 static int 920 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 921 { 922 struct ccd_ioctl *ccio; 923 u_int unit; 924 dev_t dev2; 925 int error; 926 927 switch (cmd) { 928 case CCDIOCSET: 929 case CCDIOCCLR: 930 ccio = (struct ccd_ioctl *)data; 931 unit = ccio->ccio_size; 932 return (ccdioctltoo(unit, cmd, data, flag, td)); 933 case CCDCONFINFO: 934 { 935 int ninit = 0; 936 struct ccdconf *conf = (struct ccdconf *)data; 937 struct ccd_s *tmpcs; 938 struct ccd_s *ubuf = conf->buffer; 939 940 /* XXX: LOCK(unique unit numbers) */ 941 LIST_FOREACH(tmpcs, &ccd_softc_list, list) 942 if (IS_INITED(tmpcs)) 943 ninit++; 944 945 if (conf->size == 0) { 946 conf->size = sizeof(struct ccd_s) * ninit; 947 return (0); 948 } else if ((conf->size / sizeof(struct ccd_s) != ninit) || 949 (conf->size % sizeof(struct ccd_s) != 0)) { 950 /* XXX: UNLOCK(unique unit numbers) */ 951 return (EINVAL); 952 } 953 954 ubuf += ninit; 955 LIST_FOREACH(tmpcs, &ccd_softc_list, list) { 956 if (!IS_INITED(tmpcs)) 957 continue; 958 error = copyout(tmpcs, --ubuf, 959 sizeof(struct ccd_s)); 960 if (error != 0) 961 /* XXX: UNLOCK(unique unit numbers) */ 962 return (error); 963 } 964 /* XXX: UNLOCK(unique unit numbers) */ 965 return (0); 966 } 967 968 case CCDCPPINFO: 969 { 970 struct ccdcpps *cpps = (struct ccdcpps *)data; 971 char *ubuf = cpps->buffer; 972 struct ccd_s *cs; 973 974 
975 error = copyin(ubuf, &unit, sizeof (unit)); 976 if (error) 977 return (error); 978 979 if (!IS_ALLOCATED(unit)) 980 return (ENXIO); 981 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2); 982 cs = ccdfind(unit); 983 if (!IS_INITED(cs)) 984 return (ENXIO); 985 986 { 987 int len = 0, i; 988 struct ccdcpps *cpps = (struct ccdcpps *)data; 989 char *ubuf = cpps->buffer; 990 991 992 for (i = 0; i < cs->sc_nccdisks; ++i) 993 len += cs->sc_cinfo[i].ci_pathlen; 994 995 if (cpps->size < len) 996 return (ENOMEM); 997 998 for (i = 0; i < cs->sc_nccdisks; ++i) { 999 len = cs->sc_cinfo[i].ci_pathlen; 1000 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 1001 len); 1002 if (error != 0) 1003 return (error); 1004 ubuf += len; 1005 } 1006 return(copyout("", ubuf, 1)); 1007 } 1008 break; 1009 } 1010 1011 default: 1012 return (ENXIO); 1013 } 1014 } 1015 1016 static int 1017 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1018 { 1019 int i, j, lookedup = 0, error = 0; 1020 struct ccd_s *cs; 1021 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1022 struct ccdgeom *ccg; 1023 char **cpp; 1024 struct vnode **vpp; 1025 1026 cs = ccdfind(unit); 1027 switch (cmd) { 1028 case CCDIOCSET: 1029 if (cs == NULL) 1030 cs = ccdnew(unit); 1031 if (IS_INITED(cs)) 1032 return (EBUSY); 1033 1034 if ((flag & FWRITE) == 0) 1035 return (EBADF); 1036 1037 if ((error = ccdlock(cs)) != 0) 1038 return (error); 1039 1040 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1041 return (EINVAL); 1042 1043 /* Fill in some important bits. 
*/ 1044 cs->sc_ileave = ccio->ccio_ileave; 1045 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1046 printf("ccd%d: disabling mirror, interleave is 0\n", 1047 unit); 1048 ccio->ccio_flags &= ~(CCDF_MIRROR); 1049 } 1050 if ((ccio->ccio_flags & CCDF_MIRROR) && 1051 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1052 printf("ccd%d: mirror/parity forces uniform flag\n", 1053 unit); 1054 ccio->ccio_flags |= CCDF_UNIFORM; 1055 } 1056 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1057 1058 /* 1059 * Allocate space for and copy in the array of 1060 * componet pathnames and device numbers. 1061 */ 1062 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1063 M_CCD, M_WAITOK); 1064 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1065 M_CCD, M_WAITOK); 1066 1067 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1068 ccio->ccio_ndisks * sizeof(char **)); 1069 if (error) { 1070 free(vpp, M_CCD); 1071 free(cpp, M_CCD); 1072 ccdunlock(cs); 1073 return (error); 1074 } 1075 1076 1077 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1078 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1079 for (j = 0; j < lookedup; ++j) 1080 (void)vn_close(vpp[j], FREAD|FWRITE, 1081 td->td_ucred, td); 1082 free(vpp, M_CCD); 1083 free(cpp, M_CCD); 1084 ccdunlock(cs); 1085 return (error); 1086 } 1087 ++lookedup; 1088 } 1089 cs->sc_vpp = vpp; 1090 cs->sc_nccdisks = ccio->ccio_ndisks; 1091 1092 /* 1093 * Initialize the ccd. Fills in the softc for us. 1094 */ 1095 if ((error = ccdinit(cs, cpp, td)) != 0) { 1096 for (j = 0; j < lookedup; ++j) 1097 (void)vn_close(vpp[j], FREAD|FWRITE, 1098 td->td_ucred, td); 1099 /* 1100 * We can't ccddestroy() cs just yet, because nothing 1101 * prevents user-level app to do another ioctl() 1102 * without closing the device first, therefore 1103 * declare unit null and void and let ccdclose() 1104 * destroy it when it is safe to do so. 
1105 */ 1106 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1107 free(vpp, M_CCD); 1108 free(cpp, M_CCD); 1109 ccdunlock(cs); 1110 return (error); 1111 } 1112 free(cpp, M_CCD); 1113 1114 /* 1115 * The ccd has been successfully initialized, so 1116 * we can place it into the array and read the disklabel. 1117 */ 1118 ccio->ccio_unit = unit; 1119 ccio->ccio_size = cs->sc_size; 1120 ccg = &cs->sc_geom; 1121 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, 1122 M_ZERO | M_WAITOK); 1123 cs->sc_disk->d_strategy = ccdstrategy; 1124 cs->sc_disk->d_name = "ccd"; 1125 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1126 cs->sc_disk->d_mediasize = 1127 cs->sc_size * (off_t)ccg->ccg_secsize; 1128 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1129 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1130 cs->sc_disk->d_drv1 = cs; 1131 cs->sc_disk->d_maxsize = MAXPHYS; 1132 disk_create(unit, cs->sc_disk, 0, NULL, NULL); 1133 1134 ccdunlock(cs); 1135 1136 break; 1137 1138 case CCDIOCCLR: 1139 if (cs == NULL) 1140 return (ENXIO); 1141 1142 if (!IS_INITED(cs)) 1143 return (ENXIO); 1144 1145 if ((flag & FWRITE) == 0) 1146 return (EBADF); 1147 1148 if ((error = ccdlock(cs)) != 0) 1149 return (error); 1150 1151 /* Don't unconfigure if any other partitions are open */ 1152 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) { 1153 ccdunlock(cs); 1154 return (EBUSY); 1155 } 1156 1157 disk_destroy(cs->sc_disk); 1158 free(cs->sc_disk, M_CCD); 1159 cs->sc_disk = NULL; 1160 /* Declare unit null and void (reset all flags) */ 1161 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1162 1163 /* Close the components and free their pathnames. */ 1164 for (i = 0; i < cs->sc_nccdisks; ++i) { 1165 /* 1166 * XXX: this close could potentially fail and 1167 * cause Bad Things. Maybe we need to force 1168 * the close to happen? 1169 */ 1170 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1171 td->td_ucred, td); 1172 free(cs->sc_cinfo[i].ci_path, M_CCD); 1173 } 1174 1175 /* Free interleave index. 
*/ 1176 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1177 free(cs->sc_itable[i].ii_index, M_CCD); 1178 1179 /* Free component info and interleave table. */ 1180 free(cs->sc_cinfo, M_CCD); 1181 free(cs->sc_itable, M_CCD); 1182 free(cs->sc_vpp, M_CCD); 1183 1184 /* This must be atomic. */ 1185 ccdunlock(cs); 1186 ccddestroy(cs); 1187 1188 break; 1189 } 1190 1191 return (0); 1192 } 1193 1194 1195 /* 1196 * Lookup the provided name in the filesystem. If the file exists, 1197 * is a valid block device, and isn't being used by anyone else, 1198 * set *vpp to the file's vnode. 1199 */ 1200 static int 1201 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1202 { 1203 struct nameidata nd; 1204 struct vnode *vp; 1205 int error, flags; 1206 1207 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1208 flags = FREAD | FWRITE; 1209 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1210 return (error); 1211 } 1212 vp = nd.ni_vp; 1213 1214 if (vrefcnt(vp) > 1) { 1215 error = EBUSY; 1216 goto bad; 1217 } 1218 1219 if (!vn_isdisk(vp, &error)) 1220 goto bad; 1221 1222 1223 VOP_UNLOCK(vp, 0, td); 1224 NDFREE(&nd, NDF_ONLY_PNBUF); 1225 *vpp = vp; 1226 return (0); 1227 bad: 1228 VOP_UNLOCK(vp, 0, td); 1229 NDFREE(&nd, NDF_ONLY_PNBUF); 1230 /* vn_close does vrele() for vp */ 1231 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1232 return (error); 1233 } 1234 1235 /* 1236 1237 * Wait interruptibly for an exclusive lock. 1238 * 1239 * XXX 1240 * Several drivers do this; it should be abstracted and made MP-safe. 1241 */ 1242 static int 1243 ccdlock(struct ccd_s *cs) 1244 { 1245 int error; 1246 1247 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1248 cs->sc_flags |= CCDF_WANTED; 1249 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1250 return (error); 1251 } 1252 cs->sc_flags |= CCDF_LOCKED; 1253 return (0); 1254 } 1255 1256 /* 1257 * Unlock and wake up any waiters. 
 */
static void
ccdunlock(struct ccd_s *cs)
{

	/* Release the exclusive unit lock taken by ccdlock(). */
	cs->sc_flags &= ~CCDF_LOCKED;
	/* Wake every thread sleeping in ccdlock(); they re-check CCDF_LOCKED. */
	if ((cs->sc_flags & CCDF_WANTED) != 0) {
		cs->sc_flags &= ~CCDF_WANTED;
		wakeup(cs);
	}
}