1 /* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * Dynamic configuration and disklabel support by: 41 * Jason R. Thorpe <thorpej@nas.nasa.gov> 42 * Numerical Aerodynamic Simulation Facility 43 * Mail Stop 258-6 44 * NASA Ames Research Center 45 * Moffett Field, CA 94035 46 * 47 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 48 * 49 * @(#)cd.c 8.2 (Berkeley) 11/16/93 50 * 51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ 52 * 53 * $FreeBSD$ 54 */ 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/kernel.h> 59 #include <sys/module.h> 60 #include <sys/proc.h> 61 #include <sys/bio.h> 62 #include <sys/malloc.h> 63 #include <sys/namei.h> 64 #include <sys/conf.h> 65 #include <sys/stat.h> 66 #include <sys/stdint.h> 67 #include <sys/sysctl.h> 68 #include <sys/disk.h> 69 #include <sys/devicestat.h> 70 #include <sys/fcntl.h> 71 #include <sys/vnode.h> 72 73 #include <sys/ccdvar.h> 74 75 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver"); 76 77 /* 78 This is how mirroring works (only writes are special): 79 80 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s 81 linked together by the cb_mirror field. "cb_pflags & 82 CCDPF_MIRROR_DONE" is set to 0 on both of them. 83 84 When a component returns to ccdiodone(), it checks if "cb_pflags & 85 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's 86 flag and returns. If it is, it means its partner has already 87 returned, so it will go to the regular cleanup. 88 89 */ 90 91 struct ccdbuf { 92 struct bio cb_buf; /* new I/O buf */ 93 struct bio *cb_obp; /* ptr. to original I/O buf */ 94 struct ccdbuf *cb_freenext; /* free list link */ 95 struct ccd_s *cb_softc; 96 int cb_comp; /* target component */ 97 int cb_pflags; /* mirror/parity status flag */ 98 struct ccdbuf *cb_mirror; /* mirror counterpart */ 99 }; 100 101 /* bits in cb_pflags */ 102 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */ 103 104 /* convinient macros for often-used statements */ 105 #define IS_ALLOCATED(unit) (ccdfind(unit) != NULL) 106 #define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0) 107 108 static dev_t ccdctldev; 109 110 static d_strategy_t ccdstrategy; 111 static d_ioctl_t ccdctlioctl; 112 113 #define NCCDFREEHIWAT 16 114 115 #define CDEV_MAJOR 74 116 117 static struct cdevsw ccdctl_cdevsw = { 118 /* open */ nullopen, 119 /* close */ nullclose, 120 /* read */ noread, 121 /* write */ nowrite, 122 /* ioctl */ ccdctlioctl, 123 /* poll */ nopoll, 124 /* mmap */ nommap, 125 /* strategy */ nostrategy, 126 /* name */ "ccdctl", 127 /* maj */ CDEV_MAJOR, 128 /* dump */ nodump, 129 /* psize */ nopsize, 130 /* flags */ 0 131 }; 132 133 static LIST_HEAD(, ccd_s) ccd_softc_list = 134 LIST_HEAD_INITIALIZER(&ccd_softc_list); 135 136 static struct ccd_s *ccdfind(int); 137 static struct ccd_s *ccdnew(int); 138 static int ccddestroy(struct ccd_s *); 139 140 /* called during module initialization */ 141 static void ccdattach(void); 142 static int ccd_modevent(module_t, int, void *); 143 144 /* called by biodone() at interrupt time */ 145 static void ccdiodone(struct bio *bp); 146 147 static void ccdstart(struct ccd_s *, struct bio *); 148 static void ccdinterleave(struct ccd_s *, int); 149 static int ccdinit(struct ccd_s *, char **, struct thread *); 150 static int ccdlookup(char *, struct thread *p, struct vnode **); 151 static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *, 152 struct bio *, daddr_t, caddr_t, long); 153 static int ccdlock(struct ccd_s *); 154 static void ccdunlock(struct ccd_s *); 155 156 157 /* 158 * Number of blocks to untouched in front of a component partition. 159 * This is to avoid violating its disklabel area when it starts at the 160 * beginning of the slice. 161 */ 162 #if !defined(CCD_OFFSET) 163 #define CCD_OFFSET 16 164 #endif 165 166 static struct ccd_s * 167 ccdfind(int unit) 168 { 169 struct ccd_s *sc = NULL; 170 171 /* XXX: LOCK(unique unit numbers) */ 172 LIST_FOREACH(sc, &ccd_softc_list, list) { 173 if (sc->sc_unit == unit) 174 break; 175 } 176 /* XXX: UNLOCK(unique unit numbers) */ 177 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc); 178 } 179 180 static struct ccd_s * 181 ccdnew(int unit) 182 { 183 struct ccd_s *sc; 184 185 /* XXX: LOCK(unique unit numbers) */ 186 if (IS_ALLOCATED(unit) || unit > 32) 187 return (NULL); 188 189 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_ZERO); 190 sc->sc_unit = unit; 191 LIST_INSERT_HEAD(&ccd_softc_list, sc, list); 192 /* XXX: UNLOCK(unique unit numbers) */ 193 return (sc); 194 } 195 196 static int 197 ccddestroy(struct ccd_s *sc) 198 { 199 200 /* XXX: LOCK(unique unit numbers) */ 201 LIST_REMOVE(sc, list); 202 /* XXX: UNLOCK(unique unit numbers) */ 203 FREE(sc, M_CCD); 204 return (0); 205 } 206 207 /* 208 * Called by main() during pseudo-device attachment. All we need 209 * to do is to add devsw entries. 210 */ 211 static void 212 ccdattach() 213 { 214 215 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 216 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 217 ccdctldev->si_drv1 = ccdctldev; 218 } 219 220 static int 221 ccd_modevent(module_t mod, int type, void *data) 222 { 223 int error = 0; 224 225 switch (type) { 226 case MOD_LOAD: 227 ccdattach(); 228 break; 229 230 case MOD_UNLOAD: 231 printf("ccd0: Unload not supported!\n"); 232 error = EOPNOTSUPP; 233 break; 234 235 case MOD_SHUTDOWN: 236 break; 237 238 default: 239 error = EOPNOTSUPP; 240 } 241 return (error); 242 } 243 244 DEV_MODULE(ccd, ccd_modevent, NULL); 245 246 static int 247 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 248 { 249 struct ccdcinfo *ci = NULL; /* XXX */ 250 size_t size; 251 int ix; 252 struct vnode *vp; 253 size_t minsize; 254 int maxsecsize; 255 struct ccdgeom *ccg = &cs->sc_geom; 256 char *tmppath = NULL; 257 int error = 0; 258 off_t mediasize; 259 u_int sectorsize; 260 261 262 cs->sc_size = 0; 263 264 /* Allocate space for the component info. */ 265 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 266 M_CCD, 0); 267 268 /* 269 * Verify that each component piece exists and record 270 * relevant information about it. 271 */ 272 maxsecsize = 0; 273 minsize = 0; 274 tmppath = malloc(MAXPATHLEN, M_CCD, 0); 275 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 276 vp = cs->sc_vpp[ix]; 277 ci = &cs->sc_cinfo[ix]; 278 ci->ci_vp = vp; 279 280 /* 281 * Copy in the pathname of the component. 282 */ 283 if ((error = copyinstr(cpaths[ix], tmppath, 284 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 285 goto fail; 286 } 287 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, 0); 288 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 289 290 ci->ci_dev = vn_todev(vp); 291 292 /* 293 * Get partition information for the component. 294 */ 295 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 296 FREAD, td->td_ucred, td); 297 if (error != 0) { 298 goto fail; 299 } 300 /* 301 * Get partition information for the component. 302 */ 303 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 304 FREAD, td->td_ucred, td); 305 if (error != 0) { 306 goto fail; 307 } 308 if (sectorsize > maxsecsize) 309 maxsecsize = sectorsize; 310 size = mediasize / DEV_BSIZE - CCD_OFFSET; 311 312 /* 313 * Calculate the size, truncating to an interleave 314 * boundary if necessary. 315 */ 316 317 if (cs->sc_ileave > 1) 318 size -= size % cs->sc_ileave; 319 320 if (size == 0) { 321 error = ENODEV; 322 goto fail; 323 } 324 325 if (minsize == 0 || size < minsize) 326 minsize = size; 327 ci->ci_size = size; 328 cs->sc_size += size; 329 } 330 331 free(tmppath, M_CCD); 332 tmppath = NULL; 333 334 /* 335 * Don't allow the interleave to be smaller than 336 * the biggest component sector. 337 */ 338 if ((cs->sc_ileave > 0) && 339 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 340 error = EINVAL; 341 goto fail; 342 } 343 344 /* 345 * If uniform interleave is desired set all sizes to that of 346 * the smallest component. This will guarentee that a single 347 * interleave table is generated. 348 * 349 * Lost space must be taken into account when calculating the 350 * overall size. Half the space is lost when CCDF_MIRROR is 351 * specified. 352 */ 353 if (cs->sc_flags & CCDF_UNIFORM) { 354 for (ci = cs->sc_cinfo; 355 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 356 ci->ci_size = minsize; 357 } 358 if (cs->sc_flags & CCDF_MIRROR) { 359 /* 360 * Check to see if an even number of components 361 * have been specified. The interleave must also 362 * be non-zero in order for us to be able to 363 * guarentee the topology. 364 */ 365 if (cs->sc_nccdisks % 2) { 366 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 367 error = EINVAL; 368 goto fail; 369 } 370 if (cs->sc_ileave == 0) { 371 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 372 error = EINVAL; 373 goto fail; 374 } 375 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 376 } else { 377 if (cs->sc_ileave == 0) { 378 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 379 error = EINVAL; 380 goto fail; 381 } 382 cs->sc_size = cs->sc_nccdisks * minsize; 383 } 384 } 385 386 /* 387 * Construct the interleave table. 388 */ 389 ccdinterleave(cs, cs->sc_unit); 390 391 /* 392 * Create pseudo-geometry based on 1MB cylinders. It's 393 * pretty close. 394 */ 395 ccg->ccg_secsize = maxsecsize; 396 ccg->ccg_ntracks = 1; 397 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 398 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 399 400 /* 401 * Add a devstat entry for this device. 402 */ 403 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit, 404 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED, 405 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER, 406 DEVSTAT_PRIORITY_ARRAY); 407 408 cs->sc_flags |= CCDF_INITED; 409 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 410 return (0); 411 fail: 412 while (ci > cs->sc_cinfo) { 413 ci--; 414 free(ci->ci_path, M_CCD); 415 } 416 if (tmppath != NULL) 417 free(tmppath, M_CCD); 418 free(cs->sc_cinfo, M_CCD); 419 ccddestroy(cs); 420 return (error); 421 } 422 423 static void 424 ccdinterleave(struct ccd_s *cs, int unit) 425 { 426 struct ccdcinfo *ci, *smallci; 427 struct ccdiinfo *ii; 428 daddr_t bn, lbn; 429 int ix; 430 u_long size; 431 432 433 /* 434 * Allocate an interleave table. The worst case occurs when each 435 * of N disks is of a different size, resulting in N interleave 436 * tables. 437 * 438 * Chances are this is too big, but we don't care. 439 */ 440 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 441 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 442 M_ZERO); 443 444 /* 445 * Trivial case: no interleave (actually interleave of disk size). 446 * Each table entry represents a single component in its entirety. 447 * 448 * An interleave of 0 may not be used with a mirror setup. 449 */ 450 if (cs->sc_ileave == 0) { 451 bn = 0; 452 ii = cs->sc_itable; 453 454 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 455 /* Allocate space for ii_index. */ 456 ii->ii_index = malloc(sizeof(int), M_CCD, 0); 457 ii->ii_ndisk = 1; 458 ii->ii_startblk = bn; 459 ii->ii_startoff = 0; 460 ii->ii_index[0] = ix; 461 bn += cs->sc_cinfo[ix].ci_size; 462 ii++; 463 } 464 ii->ii_ndisk = 0; 465 return; 466 } 467 468 /* 469 * The following isn't fast or pretty; it doesn't have to be. 470 */ 471 size = 0; 472 bn = lbn = 0; 473 for (ii = cs->sc_itable; ; ii++) { 474 /* 475 * Allocate space for ii_index. We might allocate more then 476 * we use. 477 */ 478 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 479 M_CCD, 0); 480 481 /* 482 * Locate the smallest of the remaining components 483 */ 484 smallci = NULL; 485 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 486 ci++) { 487 if (ci->ci_size > size && 488 (smallci == NULL || 489 ci->ci_size < smallci->ci_size)) { 490 smallci = ci; 491 } 492 } 493 494 /* 495 * Nobody left, all done 496 */ 497 if (smallci == NULL) { 498 ii->ii_ndisk = 0; 499 free(ii->ii_index, M_CCD); 500 break; 501 } 502 503 /* 504 * Record starting logical block using an sc_ileave blocksize. 505 */ 506 ii->ii_startblk = bn / cs->sc_ileave; 507 508 /* 509 * Record starting comopnent block using an sc_ileave 510 * blocksize. This value is relative to the beginning of 511 * a component disk. 512 */ 513 ii->ii_startoff = lbn; 514 515 /* 516 * Determine how many disks take part in this interleave 517 * and record their indices. 518 */ 519 ix = 0; 520 for (ci = cs->sc_cinfo; 521 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 522 if (ci->ci_size >= smallci->ci_size) { 523 ii->ii_index[ix++] = ci - cs->sc_cinfo; 524 } 525 } 526 ii->ii_ndisk = ix; 527 bn += ix * (smallci->ci_size - size); 528 lbn = smallci->ci_size / cs->sc_ileave; 529 size = smallci->ci_size; 530 } 531 } 532 533 static void 534 ccdstrategy(struct bio *bp) 535 { 536 struct ccd_s *cs; 537 int pbn; /* in sc_secsize chunks */ 538 long sz; /* in sc_secsize chunks */ 539 540 cs = bp->bio_dev->si_drv1; 541 542 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 543 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 544 545 /* 546 * If out of bounds return an error. If at the EOF point, 547 * simply read or write less. 548 */ 549 550 if (pbn < 0 || pbn >= cs->sc_size) { 551 bp->bio_resid = bp->bio_bcount; 552 if (pbn != cs->sc_size) 553 biofinish(bp, NULL, EINVAL); 554 else 555 biodone(bp); 556 return; 557 } 558 559 /* 560 * If the request crosses EOF, truncate the request. 561 */ 562 if (pbn + sz > cs->sc_size) { 563 bp->bio_bcount = (cs->sc_size - pbn) * 564 cs->sc_geom.ccg_secsize; 565 } 566 567 bp->bio_resid = bp->bio_bcount; 568 569 /* 570 * "Start" the unit. 571 */ 572 ccdstart(cs, bp); 573 return; 574 } 575 576 static void 577 ccdstart(struct ccd_s *cs, struct bio *bp) 578 { 579 long bcount, rcount; 580 struct ccdbuf *cbp[2]; 581 caddr_t addr; 582 daddr_t bn; 583 int err; 584 585 586 /* Record the transaction start */ 587 devstat_start_transaction(&cs->device_stats); 588 589 /* 590 * Translate the partition-relative block number to an absolute. 591 */ 592 bn = bp->bio_blkno; 593 594 /* 595 * Allocate component buffers and fire off the requests 596 */ 597 addr = bp->bio_data; 598 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 599 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 600 if (err) { 601 printf("ccdbuffer error %d\n", err); 602 /* We're screwed */ 603 bp->bio_resid -= bcount; 604 bp->bio_error = ENOMEM; 605 bp->bio_flags |= BIO_ERROR; 606 return; 607 } 608 rcount = cbp[0]->cb_buf.bio_bcount; 609 610 if (cs->sc_cflags & CCDF_MIRROR) { 611 /* 612 * Mirroring. Writes go to both disks, reads are 613 * taken from whichever disk seems most appropriate. 614 * 615 * We attempt to localize reads to the disk whos arm 616 * is nearest the read request. We ignore seeks due 617 * to writes when making this determination and we 618 * also try to avoid hogging. 619 */ 620 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 621 BIO_STRATEGY(&cbp[0]->cb_buf); 622 BIO_STRATEGY(&cbp[1]->cb_buf); 623 } else { 624 int pick = cs->sc_pick; 625 daddr_t range = cs->sc_size / 16; 626 627 if (bn < cs->sc_blk[pick] - range || 628 bn > cs->sc_blk[pick] + range 629 ) { 630 cs->sc_pick = pick = 1 - pick; 631 } 632 cs->sc_blk[pick] = bn + btodb(rcount); 633 BIO_STRATEGY(&cbp[pick]->cb_buf); 634 } 635 } else { 636 /* 637 * Not mirroring 638 */ 639 BIO_STRATEGY(&cbp[0]->cb_buf); 640 } 641 bn += btodb(rcount); 642 addr += rcount; 643 } 644 } 645 646 /* 647 * Build a component buffer header. 648 */ 649 static int 650 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 651 { 652 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 653 struct ccdbuf *cbp; 654 daddr_t cbn, cboff; 655 off_t cbc; 656 657 /* 658 * Determine which component bn falls in. 659 */ 660 cbn = bn; 661 cboff = 0; 662 663 if (cs->sc_ileave == 0) { 664 /* 665 * Serially concatenated and neither a mirror nor a parity 666 * config. This is a special case. 667 */ 668 daddr_t sblk; 669 670 sblk = 0; 671 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 672 sblk += ci->ci_size; 673 cbn -= sblk; 674 } else { 675 struct ccdiinfo *ii; 676 int ccdisk, off; 677 678 /* 679 * Calculate cbn, the logical superblock (sc_ileave chunks), 680 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 681 * to cbn. 682 */ 683 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 684 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 685 686 /* 687 * Figure out which interleave table to use. 688 */ 689 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 690 if (ii->ii_startblk > cbn) 691 break; 692 } 693 ii--; 694 695 /* 696 * off is the logical superblock relative to the beginning 697 * of this interleave block. 698 */ 699 off = cbn - ii->ii_startblk; 700 701 /* 702 * We must calculate which disk component to use (ccdisk), 703 * and recalculate cbn to be the superblock relative to 704 * the beginning of the component. This is typically done by 705 * adding 'off' and ii->ii_startoff together. However, 'off' 706 * must typically be divided by the number of components in 707 * this interleave array to be properly convert it from a 708 * CCD-relative logical superblock number to a 709 * component-relative superblock number. 710 */ 711 if (ii->ii_ndisk == 1) { 712 /* 713 * When we have just one disk, it can't be a mirror 714 * or a parity config. 715 */ 716 ccdisk = ii->ii_index[0]; 717 cbn = ii->ii_startoff + off; 718 } else { 719 if (cs->sc_cflags & CCDF_MIRROR) { 720 /* 721 * We have forced a uniform mapping, resulting 722 * in a single interleave array. We double 723 * up on the first half of the available 724 * components and our mirror is in the second 725 * half. This only works with a single 726 * interleave array because doubling up 727 * doubles the number of sectors, so there 728 * cannot be another interleave array because 729 * the next interleave array's calculations 730 * would be off. 731 */ 732 int ndisk2 = ii->ii_ndisk / 2; 733 ccdisk = ii->ii_index[off % ndisk2]; 734 cbn = ii->ii_startoff + off / ndisk2; 735 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 736 } else { 737 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 738 cbn = ii->ii_startoff + off / ii->ii_ndisk; 739 } 740 } 741 742 ci = &cs->sc_cinfo[ccdisk]; 743 744 /* 745 * Convert cbn from a superblock to a normal block so it 746 * can be used to calculate (along with cboff) the normal 747 * block index into this particular disk. 748 */ 749 cbn *= cs->sc_ileave; 750 } 751 752 /* 753 * Fill in the component buf structure. 754 */ 755 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 756 if (cbp == NULL) 757 return (ENOMEM); 758 cbp->cb_buf.bio_cmd = bp->bio_cmd; 759 cbp->cb_buf.bio_done = ccdiodone; 760 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 761 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 762 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 763 cbp->cb_buf.bio_data = addr; 764 cbp->cb_buf.bio_caller2 = cbp; 765 if (cs->sc_ileave == 0) 766 cbc = dbtob((off_t)(ci->ci_size - cbn)); 767 else 768 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 769 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount; 770 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 771 772 /* 773 * context for ccdiodone 774 */ 775 cbp->cb_obp = bp; 776 cbp->cb_softc = cs; 777 cbp->cb_comp = ci - cs->sc_cinfo; 778 779 cb[0] = cbp; 780 781 /* 782 * Note: both I/O's setup when reading from mirror, but only one 783 * will be executed. 784 */ 785 if (cs->sc_cflags & CCDF_MIRROR) { 786 /* mirror, setup second I/O */ 787 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 788 if (cbp == NULL) { 789 free(cb[0], M_CCD); 790 cb[0] = NULL; 791 return (ENOMEM); 792 } 793 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 794 cbp->cb_buf.bio_dev = ci2->ci_dev; 795 cbp->cb_comp = ci2 - cs->sc_cinfo; 796 cb[1] = cbp; 797 /* link together the ccdbuf's and clear "mirror done" flag */ 798 cb[0]->cb_mirror = cb[1]; 799 cb[1]->cb_mirror = cb[0]; 800 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 801 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 802 } 803 return (0); 804 } 805 806 /* 807 * Called at interrupt time. 808 * Mark the component as done and if all components are done, 809 * take a ccd interrupt. 810 */ 811 static void 812 ccdiodone(struct bio *ibp) 813 { 814 struct ccdbuf *cbp; 815 struct bio *bp; 816 struct ccd_s *cs; 817 int count; 818 819 cbp = ibp->bio_caller2; 820 cs = cbp->cb_softc; 821 bp = cbp->cb_obp; 822 /* 823 * If an error occured, report it. If this is a mirrored 824 * configuration and the first of two possible reads, do not 825 * set the error in the bp yet because the second read may 826 * succeed. 827 */ 828 829 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 830 const char *msg = ""; 831 832 if ((cs->sc_cflags & CCDF_MIRROR) && 833 (cbp->cb_buf.bio_cmd == BIO_READ) && 834 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 835 /* 836 * We will try our read on the other disk down 837 * below, also reverse the default pick so if we 838 * are doing a scan we do not keep hitting the 839 * bad disk first. 840 */ 841 842 msg = ", trying other disk"; 843 cs->sc_pick = 1 - cs->sc_pick; 844 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 845 } else { 846 bp->bio_flags |= BIO_ERROR; 847 bp->bio_error = cbp->cb_buf.bio_error ? 848 cbp->cb_buf.bio_error : EIO; 849 } 850 printf("ccd%d: error %d on component %d block %jd " 851 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 852 cbp->cb_comp, 853 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 854 msg); 855 } 856 857 /* 858 * Process mirror. If we are writing, I/O has been initiated on both 859 * buffers and we fall through only after both are finished. 860 * 861 * If we are reading only one I/O is initiated at a time. If an 862 * error occurs we initiate the second I/O and return, otherwise 863 * we free the second I/O without initiating it. 864 */ 865 866 if (cs->sc_cflags & CCDF_MIRROR) { 867 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 868 /* 869 * When writing, handshake with the second buffer 870 * to determine when both are done. If both are not 871 * done, return here. 872 */ 873 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 874 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 875 free(cbp, M_CCD); 876 return; 877 } 878 } else { 879 /* 880 * When reading, either dispose of the second buffer 881 * or initiate I/O on the second buffer if an error 882 * occured with this one. 883 */ 884 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 885 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 886 cbp->cb_mirror->cb_pflags |= 887 CCDPF_MIRROR_DONE; 888 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 889 free(cbp, M_CCD); 890 return; 891 } else { 892 free(cbp->cb_mirror, M_CCD); 893 } 894 } 895 } 896 } 897 898 /* 899 * use bio_caller1 to determine how big the original request was rather 900 * then bio_bcount, because bio_bcount may have been truncated for EOF. 901 * 902 * XXX We check for an error, but we do not test the resid for an 903 * aligned EOF condition. This may result in character & block 904 * device access not recognizing EOF properly when read or written 905 * sequentially, but will not effect filesystems. 906 */ 907 count = (long)cbp->cb_buf.bio_caller1; 908 free(cbp, M_CCD); 909 910 /* 911 * If all done, "interrupt". 912 */ 913 bp->bio_resid -= count; 914 if (bp->bio_resid < 0) 915 panic("ccdiodone: count"); 916 if (bp->bio_resid == 0) { 917 if (bp->bio_flags & BIO_ERROR) 918 bp->bio_resid = bp->bio_bcount; 919 biofinish(bp, &cs->device_stats, 0); 920 } 921 } 922 923 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 924 925 static int 926 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 927 { 928 struct ccd_ioctl *ccio; 929 u_int unit; 930 dev_t dev2; 931 int error; 932 933 switch (cmd) { 934 case CCDIOCSET: 935 case CCDIOCCLR: 936 ccio = (struct ccd_ioctl *)data; 937 unit = ccio->ccio_size; 938 return (ccdioctltoo(unit, cmd, data, flag, td)); 939 case CCDCONFINFO: 940 { 941 int ninit = 0; 942 struct ccdconf *conf = (struct ccdconf *)data; 943 struct ccd_s *tmpcs; 944 struct ccd_s *ubuf = conf->buffer; 945 946 /* XXX: LOCK(unique unit numbers) */ 947 LIST_FOREACH(tmpcs, &ccd_softc_list, list) 948 if (IS_INITED(tmpcs)) 949 ninit++; 950 951 if (conf->size == 0) { 952 conf->size = sizeof(struct ccd_s) * ninit; 953 return (0); 954 } else if ((conf->size / sizeof(struct ccd_s) != ninit) || 955 (conf->size % sizeof(struct ccd_s) != 0)) { 956 /* XXX: UNLOCK(unique unit numbers) */ 957 return (EINVAL); 958 } 959 960 ubuf += ninit; 961 LIST_FOREACH(tmpcs, &ccd_softc_list, list) { 962 if (!IS_INITED(tmpcs)) 963 continue; 964 error = copyout(tmpcs, --ubuf, 965 sizeof(struct ccd_s)); 966 if (error != 0) 967 /* XXX: UNLOCK(unique unit numbers) */ 968 return (error); 969 } 970 /* XXX: UNLOCK(unique unit numbers) */ 971 return (0); 972 } 973 974 case CCDCPPINFO: 975 { 976 struct ccdcpps *cpps = (struct ccdcpps *)data; 977 char *ubuf = cpps->buffer; 978 struct ccd_s *cs; 979 980 981 error = copyin(ubuf, &unit, sizeof (unit)); 982 if (error) 983 return (error); 984 985 if (!IS_ALLOCATED(unit)) 986 return (ENXIO); 987 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2); 988 cs = ccdfind(unit); 989 if (!IS_INITED(cs)) 990 return (ENXIO); 991 992 { 993 int len = 0, i; 994 struct ccdcpps *cpps = (struct ccdcpps *)data; 995 char *ubuf = cpps->buffer; 996 997 998 for (i = 0; i < cs->sc_nccdisks; ++i) 999 len += cs->sc_cinfo[i].ci_pathlen; 1000 1001 if (cpps->size < len) 1002 return (ENOMEM); 1003 1004 for (i = 0; i < cs->sc_nccdisks; ++i) { 1005 len = cs->sc_cinfo[i].ci_pathlen; 1006 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 1007 len); 1008 if (error != 0) 1009 return (error); 1010 ubuf += len; 1011 } 1012 return(copyout("", ubuf, 1)); 1013 } 1014 break; 1015 } 1016 1017 default: 1018 return (ENXIO); 1019 } 1020 } 1021 1022 static int 1023 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1024 { 1025 int i, j, lookedup = 0, error = 0; 1026 struct ccd_s *cs; 1027 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1028 struct ccdgeom *ccg; 1029 char **cpp; 1030 struct vnode **vpp; 1031 1032 cs = ccdfind(unit); 1033 switch (cmd) { 1034 case CCDIOCSET: 1035 if (cs == NULL) 1036 cs = ccdnew(unit); 1037 if (IS_INITED(cs)) 1038 return (EBUSY); 1039 1040 if ((flag & FWRITE) == 0) 1041 return (EBADF); 1042 1043 if ((error = ccdlock(cs)) != 0) 1044 return (error); 1045 1046 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1047 return (EINVAL); 1048 1049 /* Fill in some important bits. */ 1050 cs->sc_ileave = ccio->ccio_ileave; 1051 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1052 printf("ccd%d: disabling mirror, interleave is 0\n", 1053 unit); 1054 ccio->ccio_flags &= ~(CCDF_MIRROR); 1055 } 1056 if ((ccio->ccio_flags & CCDF_MIRROR) && 1057 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1058 printf("ccd%d: mirror/parity forces uniform flag\n", 1059 unit); 1060 ccio->ccio_flags |= CCDF_UNIFORM; 1061 } 1062 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1063 1064 /* 1065 * Allocate space for and copy in the array of 1066 * componet pathnames and device numbers. 1067 */ 1068 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1069 M_CCD, 0); 1070 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1071 M_CCD, 0); 1072 1073 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1074 ccio->ccio_ndisks * sizeof(char **)); 1075 if (error) { 1076 free(vpp, M_CCD); 1077 free(cpp, M_CCD); 1078 ccdunlock(cs); 1079 return (error); 1080 } 1081 1082 1083 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1084 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1085 for (j = 0; j < lookedup; ++j) 1086 (void)vn_close(vpp[j], FREAD|FWRITE, 1087 td->td_ucred, td); 1088 free(vpp, M_CCD); 1089 free(cpp, M_CCD); 1090 ccdunlock(cs); 1091 return (error); 1092 } 1093 ++lookedup; 1094 } 1095 cs->sc_vpp = vpp; 1096 cs->sc_nccdisks = ccio->ccio_ndisks; 1097 1098 /* 1099 * Initialize the ccd. Fills in the softc for us. 1100 */ 1101 if ((error = ccdinit(cs, cpp, td)) != 0) { 1102 for (j = 0; j < lookedup; ++j) 1103 (void)vn_close(vpp[j], FREAD|FWRITE, 1104 td->td_ucred, td); 1105 /* 1106 * We can't ccddestroy() cs just yet, because nothing 1107 * prevents user-level app to do another ioctl() 1108 * without closing the device first, therefore 1109 * declare unit null and void and let ccdclose() 1110 * destroy it when it is safe to do so. 1111 */ 1112 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1113 free(vpp, M_CCD); 1114 free(cpp, M_CCD); 1115 ccdunlock(cs); 1116 return (error); 1117 } 1118 free(cpp, M_CCD); 1119 1120 /* 1121 * The ccd has been successfully initialized, so 1122 * we can place it into the array and read the disklabel. 1123 */ 1124 ccio->ccio_unit = unit; 1125 ccio->ccio_size = cs->sc_size; 1126 ccg = &cs->sc_geom; 1127 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, M_ZERO); 1128 cs->sc_disk->d_strategy = ccdstrategy; 1129 cs->sc_disk->d_name = "ccd"; 1130 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1131 cs->sc_disk->d_mediasize = 1132 cs->sc_size * (off_t)ccg->ccg_secsize; 1133 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1134 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1135 cs->sc_dev = disk_create(unit, cs->sc_disk, 0, NULL, NULL); 1136 cs->sc_dev->si_drv1 = cs; 1137 1138 ccdunlock(cs); 1139 1140 break; 1141 1142 case CCDIOCCLR: 1143 if (cs == NULL) 1144 return (ENXIO); 1145 1146 if (!IS_INITED(cs)) 1147 return (ENXIO); 1148 1149 if ((flag & FWRITE) == 0) 1150 return (EBADF); 1151 1152 if ((error = ccdlock(cs)) != 0) 1153 return (error); 1154 1155 /* Don't unconfigure if any other partitions are open */ 1156 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) { 1157 ccdunlock(cs); 1158 return (EBUSY); 1159 } 1160 1161 disk_destroy(cs->sc_dev); 1162 free(cs->sc_disk, M_CCD); 1163 cs->sc_disk = NULL; 1164 /* Declare unit null and void (reset all flags) */ 1165 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1166 1167 /* Close the components and free their pathnames. */ 1168 for (i = 0; i < cs->sc_nccdisks; ++i) { 1169 /* 1170 * XXX: this close could potentially fail and 1171 * cause Bad Things. Maybe we need to force 1172 * the close to happen? 1173 */ 1174 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1175 td->td_ucred, td); 1176 free(cs->sc_cinfo[i].ci_path, M_CCD); 1177 } 1178 1179 /* Free interleave index. */ 1180 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1181 free(cs->sc_itable[i].ii_index, M_CCD); 1182 1183 /* Free component info and interleave table. */ 1184 free(cs->sc_cinfo, M_CCD); 1185 free(cs->sc_itable, M_CCD); 1186 free(cs->sc_vpp, M_CCD); 1187 1188 /* And remove the devstat entry. */ 1189 devstat_remove_entry(&cs->device_stats); 1190 1191 /* This must be atomic. */ 1192 ccdunlock(cs); 1193 ccddestroy(cs); 1194 1195 break; 1196 } 1197 1198 return (0); 1199 } 1200 1201 1202 /* 1203 * Lookup the provided name in the filesystem. If the file exists, 1204 * is a valid block device, and isn't being used by anyone else, 1205 * set *vpp to the file's vnode. 1206 */ 1207 static int 1208 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1209 { 1210 struct nameidata nd; 1211 struct vnode *vp; 1212 int error, flags; 1213 1214 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1215 flags = FREAD | FWRITE; 1216 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1217 return (error); 1218 } 1219 vp = nd.ni_vp; 1220 1221 if (vrefcnt(vp) > 1) { 1222 error = EBUSY; 1223 goto bad; 1224 } 1225 1226 if (!vn_isdisk(vp, &error)) 1227 goto bad; 1228 1229 1230 VOP_UNLOCK(vp, 0, td); 1231 NDFREE(&nd, NDF_ONLY_PNBUF); 1232 *vpp = vp; 1233 return (0); 1234 bad: 1235 VOP_UNLOCK(vp, 0, td); 1236 NDFREE(&nd, NDF_ONLY_PNBUF); 1237 /* vn_close does vrele() for vp */ 1238 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1239 return (error); 1240 } 1241 1242 /* 1243 1244 * Wait interruptibly for an exclusive lock. 1245 * 1246 * XXX 1247 * Several drivers do this; it should be abstracted and made MP-safe. 1248 */ 1249 static int 1250 ccdlock(struct ccd_s *cs) 1251 { 1252 int error; 1253 1254 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1255 cs->sc_flags |= CCDF_WANTED; 1256 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1257 return (error); 1258 } 1259 cs->sc_flags |= CCDF_LOCKED; 1260 return (0); 1261 } 1262 1263 /* 1264 * Unlock and wake up any waiters. 1265 */ 1266 static void 1267 ccdunlock(struct ccd_s *cs) 1268 { 1269 1270 cs->sc_flags &= ~CCDF_LOCKED; 1271 if ((cs->sc_flags & CCDF_WANTED) != 0) { 1272 cs->sc_flags &= ~CCDF_WANTED; 1273 wakeup(cs); 1274 } 1275 } 1276