1 /* 2 * Copyright (c) 2003 Poul-Henning Kamp. 3 * Copyright (c) 1995 Jason R. Thorpe. 4 * Copyright (c) 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * All rights reserved. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed for the NetBSD Project 24 * by Jason R. Thorpe. 25 * 4. The names of the authors may not be used to endorse or promote products 26 * derived from this software without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
31 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 35 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 36 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * Dynamic configuration and disklabel support by: 41 * Jason R. Thorpe <thorpej@nas.nasa.gov> 42 * Numerical Aerodynamic Simulation Facility 43 * Mail Stop 258-6 44 * NASA Ames Research Center 45 * Moffett Field, CA 94035 46 * 47 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 48 * 49 * @(#)cd.c 8.2 (Berkeley) 11/16/93 50 * 51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ 52 * 53 * $FreeBSD$ 54 */ 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/kernel.h> 59 #include <sys/module.h> 60 #include <sys/proc.h> 61 #include <sys/bio.h> 62 #include <sys/malloc.h> 63 #include <sys/namei.h> 64 #include <sys/conf.h> 65 #include <sys/stat.h> 66 #include <sys/stdint.h> 67 #include <sys/sysctl.h> 68 #include <sys/disk.h> 69 #include <sys/devicestat.h> 70 #include <sys/fcntl.h> 71 #include <sys/vnode.h> 72 73 #include <sys/ccdvar.h> 74 75 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver"); 76 77 /* 78 This is how mirroring works (only writes are special): 79 80 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s 81 linked together by the cb_mirror field. "cb_pflags & 82 CCDPF_MIRROR_DONE" is set to 0 on both of them. 83 84 When a component returns to ccdiodone(), it checks if "cb_pflags & 85 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's 86 flag and returns. If it is, it means its partner has already 87 returned, so it will go to the regular cleanup. 
 */

/*
 * Per-component request state: one ccdbuf wraps each struct bio issued
 * to a component disk on behalf of an original request.
 */
struct ccdbuf {
	struct bio	cb_buf;		/* new I/O buf */
	struct bio	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	struct ccd_s	*cb_softc;	/* controlling unit's softc */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/* convenient macros for often-used statements */
#define IS_ALLOCATED(unit)	(ccdfind(unit) != NULL)
#define IS_INITED(cs)		(((cs)->sc_flags & CCDF_INITED) != 0)

/* control device node, created at module load */
static dev_t ccdctldev;

static disk_strategy_t ccdstrategy;
static d_ioctl_t ccdctlioctl;

#define NCCDFREEHIWAT	16

#define CDEV_MAJOR 74

static struct cdevsw ccdctl_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_ioctl =	ccdctlioctl,
	.d_name =	"ccdctl",
	.d_maj =	CDEV_MAJOR,
};

/* list of all allocated units (configured or not) */
static LIST_HEAD(, ccd_s) ccd_softc_list =
	LIST_HEAD_INITIALIZER(&ccd_softc_list);

static struct ccd_s *ccdfind(int);
static struct ccd_s *ccdnew(int);
static int ccddestroy(struct ccd_s *);

/* called during module initialization */
static void ccdattach(void);
static int ccd_modevent(module_t, int, void *);

/* called by biodone() at interrupt time */
static void ccdiodone(struct bio *bp);

static void ccdstart(struct ccd_s *, struct bio *);
static void ccdinterleave(struct ccd_s *, int);
static int ccdinit(struct ccd_s *, char **, struct thread *);
static int ccdlookup(char *, struct thread *p, struct vnode **);
static int ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
		struct bio *, daddr_t, caddr_t, long);
static int ccdlock(struct ccd_s *);
static void ccdunlock(struct ccd_s *);


/*
 * Number of blocks to leave untouched in front of a component partition.
151 * This is to avoid violating its disklabel area when it starts at the 152 * beginning of the slice. 153 */ 154 #if !defined(CCD_OFFSET) 155 #define CCD_OFFSET 16 156 #endif 157 158 static struct ccd_s * 159 ccdfind(int unit) 160 { 161 struct ccd_s *sc = NULL; 162 163 /* XXX: LOCK(unique unit numbers) */ 164 LIST_FOREACH(sc, &ccd_softc_list, list) { 165 if (sc->sc_unit == unit) 166 break; 167 } 168 /* XXX: UNLOCK(unique unit numbers) */ 169 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc); 170 } 171 172 static struct ccd_s * 173 ccdnew(int unit) 174 { 175 struct ccd_s *sc; 176 177 /* XXX: LOCK(unique unit numbers) */ 178 if (IS_ALLOCATED(unit) || unit > 32) 179 return (NULL); 180 181 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO); 182 sc->sc_unit = unit; 183 LIST_INSERT_HEAD(&ccd_softc_list, sc, list); 184 /* XXX: UNLOCK(unique unit numbers) */ 185 return (sc); 186 } 187 188 static int 189 ccddestroy(struct ccd_s *sc) 190 { 191 192 /* XXX: LOCK(unique unit numbers) */ 193 LIST_REMOVE(sc, list); 194 /* XXX: UNLOCK(unique unit numbers) */ 195 FREE(sc, M_CCD); 196 return (0); 197 } 198 199 /* 200 * Called by main() during pseudo-device attachment. All we need 201 * to do is to add devsw entries. 
202 */ 203 static void 204 ccdattach() 205 { 206 207 ccdctldev = make_dev(&ccdctl_cdevsw, 0xffff00ff, 208 UID_ROOT, GID_OPERATOR, 0640, "ccd.ctl"); 209 ccdctldev->si_drv1 = ccdctldev; 210 } 211 212 static int 213 ccd_modevent(module_t mod, int type, void *data) 214 { 215 int error = 0; 216 217 switch (type) { 218 case MOD_LOAD: 219 ccdattach(); 220 break; 221 222 case MOD_UNLOAD: 223 printf("ccd0: Unload not supported!\n"); 224 error = EOPNOTSUPP; 225 break; 226 227 case MOD_SHUTDOWN: 228 break; 229 230 default: 231 error = EOPNOTSUPP; 232 } 233 return (error); 234 } 235 236 DEV_MODULE(ccd, ccd_modevent, NULL); 237 238 static int 239 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td) 240 { 241 struct ccdcinfo *ci = NULL; /* XXX */ 242 size_t size; 243 int ix; 244 struct vnode *vp; 245 size_t minsize; 246 int maxsecsize; 247 struct ccdgeom *ccg = &cs->sc_geom; 248 char *tmppath = NULL; 249 int error = 0; 250 off_t mediasize; 251 u_int sectorsize; 252 253 254 cs->sc_size = 0; 255 256 /* Allocate space for the component info. */ 257 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo), 258 M_CCD, M_WAITOK); 259 260 /* 261 * Verify that each component piece exists and record 262 * relevant information about it. 263 */ 264 maxsecsize = 0; 265 minsize = 0; 266 tmppath = malloc(MAXPATHLEN, M_CCD, M_WAITOK); 267 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 268 vp = cs->sc_vpp[ix]; 269 ci = &cs->sc_cinfo[ix]; 270 ci->ci_vp = vp; 271 272 /* 273 * Copy in the pathname of the component. 274 */ 275 if ((error = copyinstr(cpaths[ix], tmppath, 276 MAXPATHLEN, &ci->ci_pathlen)) != 0) { 277 goto fail; 278 } 279 ci->ci_path = malloc(ci->ci_pathlen, M_CCD, M_WAITOK); 280 bcopy(tmppath, ci->ci_path, ci->ci_pathlen); 281 282 ci->ci_dev = vn_todev(vp); 283 284 /* 285 * Get partition information for the component. 
286 */ 287 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize, 288 FREAD, td->td_ucred, td); 289 if (error != 0) { 290 goto fail; 291 } 292 /* 293 * Get partition information for the component. 294 */ 295 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize, 296 FREAD, td->td_ucred, td); 297 if (error != 0) { 298 goto fail; 299 } 300 if (sectorsize > maxsecsize) 301 maxsecsize = sectorsize; 302 size = mediasize / DEV_BSIZE - CCD_OFFSET; 303 304 /* 305 * Calculate the size, truncating to an interleave 306 * boundary if necessary. 307 */ 308 309 if (cs->sc_ileave > 1) 310 size -= size % cs->sc_ileave; 311 312 if (size == 0) { 313 error = ENODEV; 314 goto fail; 315 } 316 317 if (minsize == 0 || size < minsize) 318 minsize = size; 319 ci->ci_size = size; 320 cs->sc_size += size; 321 } 322 323 free(tmppath, M_CCD); 324 tmppath = NULL; 325 326 /* 327 * Don't allow the interleave to be smaller than 328 * the biggest component sector. 329 */ 330 if ((cs->sc_ileave > 0) && 331 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { 332 error = EINVAL; 333 goto fail; 334 } 335 336 /* 337 * If uniform interleave is desired set all sizes to that of 338 * the smallest component. This will guarentee that a single 339 * interleave table is generated. 340 * 341 * Lost space must be taken into account when calculating the 342 * overall size. Half the space is lost when CCDF_MIRROR is 343 * specified. 344 */ 345 if (cs->sc_flags & CCDF_UNIFORM) { 346 for (ci = cs->sc_cinfo; 347 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 348 ci->ci_size = minsize; 349 } 350 if (cs->sc_flags & CCDF_MIRROR) { 351 /* 352 * Check to see if an even number of components 353 * have been specified. The interleave must also 354 * be non-zero in order for us to be able to 355 * guarentee the topology. 
356 */ 357 if (cs->sc_nccdisks % 2) { 358 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit ); 359 error = EINVAL; 360 goto fail; 361 } 362 if (cs->sc_ileave == 0) { 363 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit); 364 error = EINVAL; 365 goto fail; 366 } 367 cs->sc_size = (cs->sc_nccdisks/2) * minsize; 368 } else { 369 if (cs->sc_ileave == 0) { 370 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit); 371 error = EINVAL; 372 goto fail; 373 } 374 cs->sc_size = cs->sc_nccdisks * minsize; 375 } 376 } 377 378 /* 379 * Construct the interleave table. 380 */ 381 ccdinterleave(cs, cs->sc_unit); 382 383 /* 384 * Create pseudo-geometry based on 1MB cylinders. It's 385 * pretty close. 386 */ 387 ccg->ccg_secsize = maxsecsize; 388 ccg->ccg_ntracks = 1; 389 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize; 390 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors; 391 392 /* 393 * Add a devstat entry for this device. 394 */ 395 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit, 396 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED, 397 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER, 398 DEVSTAT_PRIORITY_ARRAY); 399 400 cs->sc_flags |= CCDF_INITED; 401 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */ 402 return (0); 403 fail: 404 while (ci > cs->sc_cinfo) { 405 ci--; 406 free(ci->ci_path, M_CCD); 407 } 408 if (tmppath != NULL) 409 free(tmppath, M_CCD); 410 free(cs->sc_cinfo, M_CCD); 411 ccddestroy(cs); 412 return (error); 413 } 414 415 static void 416 ccdinterleave(struct ccd_s *cs, int unit) 417 { 418 struct ccdcinfo *ci, *smallci; 419 struct ccdiinfo *ii; 420 daddr_t bn, lbn; 421 int ix; 422 u_long size; 423 424 425 /* 426 * Allocate an interleave table. The worst case occurs when each 427 * of N disks is of a different size, resulting in N interleave 428 * tables. 429 * 430 * Chances are this is too big, but we don't care. 
431 */ 432 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo); 433 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_CCD, 434 M_WAITOK | M_ZERO); 435 436 /* 437 * Trivial case: no interleave (actually interleave of disk size). 438 * Each table entry represents a single component in its entirety. 439 * 440 * An interleave of 0 may not be used with a mirror setup. 441 */ 442 if (cs->sc_ileave == 0) { 443 bn = 0; 444 ii = cs->sc_itable; 445 446 for (ix = 0; ix < cs->sc_nccdisks; ix++) { 447 /* Allocate space for ii_index. */ 448 ii->ii_index = malloc(sizeof(int), M_CCD, M_WAITOK); 449 ii->ii_ndisk = 1; 450 ii->ii_startblk = bn; 451 ii->ii_startoff = 0; 452 ii->ii_index[0] = ix; 453 bn += cs->sc_cinfo[ix].ci_size; 454 ii++; 455 } 456 ii->ii_ndisk = 0; 457 return; 458 } 459 460 /* 461 * The following isn't fast or pretty; it doesn't have to be. 462 */ 463 size = 0; 464 bn = lbn = 0; 465 for (ii = cs->sc_itable; ; ii++) { 466 /* 467 * Allocate space for ii_index. We might allocate more then 468 * we use. 469 */ 470 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks), 471 M_CCD, M_WAITOK); 472 473 /* 474 * Locate the smallest of the remaining components 475 */ 476 smallci = NULL; 477 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks]; 478 ci++) { 479 if (ci->ci_size > size && 480 (smallci == NULL || 481 ci->ci_size < smallci->ci_size)) { 482 smallci = ci; 483 } 484 } 485 486 /* 487 * Nobody left, all done 488 */ 489 if (smallci == NULL) { 490 ii->ii_ndisk = 0; 491 free(ii->ii_index, M_CCD); 492 break; 493 } 494 495 /* 496 * Record starting logical block using an sc_ileave blocksize. 497 */ 498 ii->ii_startblk = bn / cs->sc_ileave; 499 500 /* 501 * Record starting comopnent block using an sc_ileave 502 * blocksize. This value is relative to the beginning of 503 * a component disk. 504 */ 505 ii->ii_startoff = lbn; 506 507 /* 508 * Determine how many disks take part in this interleave 509 * and record their indices. 
510 */ 511 ix = 0; 512 for (ci = cs->sc_cinfo; 513 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) { 514 if (ci->ci_size >= smallci->ci_size) { 515 ii->ii_index[ix++] = ci - cs->sc_cinfo; 516 } 517 } 518 ii->ii_ndisk = ix; 519 bn += ix * (smallci->ci_size - size); 520 lbn = smallci->ci_size / cs->sc_ileave; 521 size = smallci->ci_size; 522 } 523 } 524 525 static void 526 ccdstrategy(struct bio *bp) 527 { 528 struct ccd_s *cs; 529 int pbn; /* in sc_secsize chunks */ 530 long sz; /* in sc_secsize chunks */ 531 532 cs = bp->bio_disk->d_drv1; 533 534 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE); 535 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize); 536 537 /* 538 * If out of bounds return an error. If at the EOF point, 539 * simply read or write less. 540 */ 541 542 if (pbn < 0 || pbn >= cs->sc_size) { 543 bp->bio_resid = bp->bio_bcount; 544 if (pbn != cs->sc_size) 545 biofinish(bp, NULL, EINVAL); 546 else 547 biodone(bp); 548 return; 549 } 550 551 /* 552 * If the request crosses EOF, truncate the request. 553 */ 554 if (pbn + sz > cs->sc_size) { 555 bp->bio_bcount = (cs->sc_size - pbn) * 556 cs->sc_geom.ccg_secsize; 557 } 558 559 bp->bio_resid = bp->bio_bcount; 560 561 /* 562 * "Start" the unit. 563 */ 564 ccdstart(cs, bp); 565 return; 566 } 567 568 static void 569 ccdstart(struct ccd_s *cs, struct bio *bp) 570 { 571 long bcount, rcount; 572 struct ccdbuf *cbp[2]; 573 caddr_t addr; 574 daddr_t bn; 575 int err; 576 577 578 /* Record the transaction start */ 579 devstat_start_transaction(&cs->device_stats); 580 581 /* 582 * Translate the partition-relative block number to an absolute. 
583 */ 584 bn = bp->bio_blkno; 585 586 /* 587 * Allocate component buffers and fire off the requests 588 */ 589 addr = bp->bio_data; 590 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) { 591 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); 592 if (err) { 593 printf("ccdbuffer error %d\n", err); 594 /* We're screwed */ 595 bp->bio_resid -= bcount; 596 bp->bio_error = ENOMEM; 597 bp->bio_flags |= BIO_ERROR; 598 return; 599 } 600 rcount = cbp[0]->cb_buf.bio_bcount; 601 602 if (cs->sc_cflags & CCDF_MIRROR) { 603 /* 604 * Mirroring. Writes go to both disks, reads are 605 * taken from whichever disk seems most appropriate. 606 * 607 * We attempt to localize reads to the disk whos arm 608 * is nearest the read request. We ignore seeks due 609 * to writes when making this determination and we 610 * also try to avoid hogging. 611 */ 612 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) { 613 BIO_STRATEGY(&cbp[0]->cb_buf); 614 BIO_STRATEGY(&cbp[1]->cb_buf); 615 } else { 616 int pick = cs->sc_pick; 617 daddr_t range = cs->sc_size / 16; 618 619 if (bn < cs->sc_blk[pick] - range || 620 bn > cs->sc_blk[pick] + range 621 ) { 622 cs->sc_pick = pick = 1 - pick; 623 } 624 cs->sc_blk[pick] = bn + btodb(rcount); 625 BIO_STRATEGY(&cbp[pick]->cb_buf); 626 } 627 } else { 628 /* 629 * Not mirroring 630 */ 631 BIO_STRATEGY(&cbp[0]->cb_buf); 632 } 633 bn += btodb(rcount); 634 addr += rcount; 635 } 636 } 637 638 /* 639 * Build a component buffer header. 640 */ 641 static int 642 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) 643 { 644 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */ 645 struct ccdbuf *cbp; 646 daddr_t cbn, cboff; 647 off_t cbc; 648 649 /* 650 * Determine which component bn falls in. 651 */ 652 cbn = bn; 653 cboff = 0; 654 655 if (cs->sc_ileave == 0) { 656 /* 657 * Serially concatenated and neither a mirror nor a parity 658 * config. This is a special case. 
659 */ 660 daddr_t sblk; 661 662 sblk = 0; 663 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) 664 sblk += ci->ci_size; 665 cbn -= sblk; 666 } else { 667 struct ccdiinfo *ii; 668 int ccdisk, off; 669 670 /* 671 * Calculate cbn, the logical superblock (sc_ileave chunks), 672 * and cboff, a normal block offset (DEV_BSIZE chunks) relative 673 * to cbn. 674 */ 675 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ 676 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ 677 678 /* 679 * Figure out which interleave table to use. 680 */ 681 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { 682 if (ii->ii_startblk > cbn) 683 break; 684 } 685 ii--; 686 687 /* 688 * off is the logical superblock relative to the beginning 689 * of this interleave block. 690 */ 691 off = cbn - ii->ii_startblk; 692 693 /* 694 * We must calculate which disk component to use (ccdisk), 695 * and recalculate cbn to be the superblock relative to 696 * the beginning of the component. This is typically done by 697 * adding 'off' and ii->ii_startoff together. However, 'off' 698 * must typically be divided by the number of components in 699 * this interleave array to be properly convert it from a 700 * CCD-relative logical superblock number to a 701 * component-relative superblock number. 702 */ 703 if (ii->ii_ndisk == 1) { 704 /* 705 * When we have just one disk, it can't be a mirror 706 * or a parity config. 707 */ 708 ccdisk = ii->ii_index[0]; 709 cbn = ii->ii_startoff + off; 710 } else { 711 if (cs->sc_cflags & CCDF_MIRROR) { 712 /* 713 * We have forced a uniform mapping, resulting 714 * in a single interleave array. We double 715 * up on the first half of the available 716 * components and our mirror is in the second 717 * half. This only works with a single 718 * interleave array because doubling up 719 * doubles the number of sectors, so there 720 * cannot be another interleave array because 721 * the next interleave array's calculations 722 * would be off. 
723 */ 724 int ndisk2 = ii->ii_ndisk / 2; 725 ccdisk = ii->ii_index[off % ndisk2]; 726 cbn = ii->ii_startoff + off / ndisk2; 727 ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; 728 } else { 729 ccdisk = ii->ii_index[off % ii->ii_ndisk]; 730 cbn = ii->ii_startoff + off / ii->ii_ndisk; 731 } 732 } 733 734 ci = &cs->sc_cinfo[ccdisk]; 735 736 /* 737 * Convert cbn from a superblock to a normal block so it 738 * can be used to calculate (along with cboff) the normal 739 * block index into this particular disk. 740 */ 741 cbn *= cs->sc_ileave; 742 } 743 744 /* 745 * Fill in the component buf structure. 746 */ 747 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT | M_ZERO); 748 if (cbp == NULL) 749 return (ENOMEM); 750 cbp->cb_buf.bio_cmd = bp->bio_cmd; 751 cbp->cb_buf.bio_done = ccdiodone; 752 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */ 753 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET; 754 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET); 755 cbp->cb_buf.bio_data = addr; 756 cbp->cb_buf.bio_caller2 = cbp; 757 if (cs->sc_ileave == 0) 758 cbc = dbtob((off_t)(ci->ci_size - cbn)); 759 else 760 cbc = dbtob((off_t)(cs->sc_ileave - cboff)); 761 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount; 762 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount; 763 764 /* 765 * context for ccdiodone 766 */ 767 cbp->cb_obp = bp; 768 cbp->cb_softc = cs; 769 cbp->cb_comp = ci - cs->sc_cinfo; 770 771 cb[0] = cbp; 772 773 /* 774 * Note: both I/O's setup when reading from mirror, but only one 775 * will be executed. 
776 */ 777 if (cs->sc_cflags & CCDF_MIRROR) { 778 /* mirror, setup second I/O */ 779 cbp = malloc(sizeof(struct ccdbuf), M_CCD, M_NOWAIT); 780 if (cbp == NULL) { 781 free(cb[0], M_CCD); 782 cb[0] = NULL; 783 return (ENOMEM); 784 } 785 bcopy(cb[0], cbp, sizeof(struct ccdbuf)); 786 cbp->cb_buf.bio_dev = ci2->ci_dev; 787 cbp->cb_comp = ci2 - cs->sc_cinfo; 788 cb[1] = cbp; 789 /* link together the ccdbuf's and clear "mirror done" flag */ 790 cb[0]->cb_mirror = cb[1]; 791 cb[1]->cb_mirror = cb[0]; 792 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE; 793 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE; 794 } 795 return (0); 796 } 797 798 /* 799 * Called at interrupt time. 800 * Mark the component as done and if all components are done, 801 * take a ccd interrupt. 802 */ 803 static void 804 ccdiodone(struct bio *ibp) 805 { 806 struct ccdbuf *cbp; 807 struct bio *bp; 808 struct ccd_s *cs; 809 int count; 810 811 cbp = ibp->bio_caller2; 812 cs = cbp->cb_softc; 813 bp = cbp->cb_obp; 814 /* 815 * If an error occured, report it. If this is a mirrored 816 * configuration and the first of two possible reads, do not 817 * set the error in the bp yet because the second read may 818 * succeed. 819 */ 820 821 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 822 const char *msg = ""; 823 824 if ((cs->sc_cflags & CCDF_MIRROR) && 825 (cbp->cb_buf.bio_cmd == BIO_READ) && 826 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 827 /* 828 * We will try our read on the other disk down 829 * below, also reverse the default pick so if we 830 * are doing a scan we do not keep hitting the 831 * bad disk first. 832 */ 833 834 msg = ", trying other disk"; 835 cs->sc_pick = 1 - cs->sc_pick; 836 cs->sc_blk[cs->sc_pick] = bp->bio_blkno; 837 } else { 838 bp->bio_flags |= BIO_ERROR; 839 bp->bio_error = cbp->cb_buf.bio_error ? 
840 cbp->cb_buf.bio_error : EIO; 841 } 842 printf("ccd%d: error %d on component %d block %jd " 843 "(ccd block %jd)%s\n", cs->sc_unit, bp->bio_error, 844 cbp->cb_comp, 845 (intmax_t)cbp->cb_buf.bio_blkno, (intmax_t)bp->bio_blkno, 846 msg); 847 } 848 849 /* 850 * Process mirror. If we are writing, I/O has been initiated on both 851 * buffers and we fall through only after both are finished. 852 * 853 * If we are reading only one I/O is initiated at a time. If an 854 * error occurs we initiate the second I/O and return, otherwise 855 * we free the second I/O without initiating it. 856 */ 857 858 if (cs->sc_cflags & CCDF_MIRROR) { 859 if (cbp->cb_buf.bio_cmd == BIO_WRITE) { 860 /* 861 * When writing, handshake with the second buffer 862 * to determine when both are done. If both are not 863 * done, return here. 864 */ 865 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 866 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; 867 free(cbp, M_CCD); 868 return; 869 } 870 } else { 871 /* 872 * When reading, either dispose of the second buffer 873 * or initiate I/O on the second buffer if an error 874 * occured with this one. 875 */ 876 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) { 877 if (cbp->cb_buf.bio_flags & BIO_ERROR) { 878 cbp->cb_mirror->cb_pflags |= 879 CCDPF_MIRROR_DONE; 880 BIO_STRATEGY(&cbp->cb_mirror->cb_buf); 881 free(cbp, M_CCD); 882 return; 883 } else { 884 free(cbp->cb_mirror, M_CCD); 885 } 886 } 887 } 888 } 889 890 /* 891 * use bio_caller1 to determine how big the original request was rather 892 * then bio_bcount, because bio_bcount may have been truncated for EOF. 893 * 894 * XXX We check for an error, but we do not test the resid for an 895 * aligned EOF condition. This may result in character & block 896 * device access not recognizing EOF properly when read or written 897 * sequentially, but will not effect filesystems. 898 */ 899 count = (long)cbp->cb_buf.bio_caller1; 900 free(cbp, M_CCD); 901 902 /* 903 * If all done, "interrupt". 
904 */ 905 bp->bio_resid -= count; 906 if (bp->bio_resid < 0) 907 panic("ccdiodone: count"); 908 if (bp->bio_resid == 0) { 909 if (bp->bio_flags & BIO_ERROR) 910 bp->bio_resid = bp->bio_bcount; 911 biofinish(bp, &cs->device_stats, 0); 912 } 913 } 914 915 static int ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td); 916 917 static int 918 ccdctlioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) 919 { 920 struct ccd_ioctl *ccio; 921 u_int unit; 922 dev_t dev2; 923 int error; 924 925 switch (cmd) { 926 case CCDIOCSET: 927 case CCDIOCCLR: 928 ccio = (struct ccd_ioctl *)data; 929 unit = ccio->ccio_size; 930 return (ccdioctltoo(unit, cmd, data, flag, td)); 931 case CCDCONFINFO: 932 { 933 int ninit = 0; 934 struct ccdconf *conf = (struct ccdconf *)data; 935 struct ccd_s *tmpcs; 936 struct ccd_s *ubuf = conf->buffer; 937 938 /* XXX: LOCK(unique unit numbers) */ 939 LIST_FOREACH(tmpcs, &ccd_softc_list, list) 940 if (IS_INITED(tmpcs)) 941 ninit++; 942 943 if (conf->size == 0) { 944 conf->size = sizeof(struct ccd_s) * ninit; 945 return (0); 946 } else if ((conf->size / sizeof(struct ccd_s) != ninit) || 947 (conf->size % sizeof(struct ccd_s) != 0)) { 948 /* XXX: UNLOCK(unique unit numbers) */ 949 return (EINVAL); 950 } 951 952 ubuf += ninit; 953 LIST_FOREACH(tmpcs, &ccd_softc_list, list) { 954 if (!IS_INITED(tmpcs)) 955 continue; 956 error = copyout(tmpcs, --ubuf, 957 sizeof(struct ccd_s)); 958 if (error != 0) 959 /* XXX: UNLOCK(unique unit numbers) */ 960 return (error); 961 } 962 /* XXX: UNLOCK(unique unit numbers) */ 963 return (0); 964 } 965 966 case CCDCPPINFO: 967 { 968 struct ccdcpps *cpps = (struct ccdcpps *)data; 969 char *ubuf = cpps->buffer; 970 struct ccd_s *cs; 971 972 973 error = copyin(ubuf, &unit, sizeof (unit)); 974 if (error) 975 return (error); 976 977 if (!IS_ALLOCATED(unit)) 978 return (ENXIO); 979 dev2 = makedev(CDEV_MAJOR, unit * 8 + 2); 980 cs = ccdfind(unit); 981 if (!IS_INITED(cs)) 982 return (ENXIO); 
983 984 { 985 int len = 0, i; 986 struct ccdcpps *cpps = (struct ccdcpps *)data; 987 char *ubuf = cpps->buffer; 988 989 990 for (i = 0; i < cs->sc_nccdisks; ++i) 991 len += cs->sc_cinfo[i].ci_pathlen; 992 993 if (cpps->size < len) 994 return (ENOMEM); 995 996 for (i = 0; i < cs->sc_nccdisks; ++i) { 997 len = cs->sc_cinfo[i].ci_pathlen; 998 error = copyout(cs->sc_cinfo[i].ci_path, ubuf, 999 len); 1000 if (error != 0) 1001 return (error); 1002 ubuf += len; 1003 } 1004 return(copyout("", ubuf, 1)); 1005 } 1006 break; 1007 } 1008 1009 default: 1010 return (ENXIO); 1011 } 1012 } 1013 1014 static int 1015 ccdioctltoo(int unit, u_long cmd, caddr_t data, int flag, struct thread *td) 1016 { 1017 int i, j, lookedup = 0, error = 0; 1018 struct ccd_s *cs; 1019 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data; 1020 struct ccdgeom *ccg; 1021 char **cpp; 1022 struct vnode **vpp; 1023 1024 cs = ccdfind(unit); 1025 switch (cmd) { 1026 case CCDIOCSET: 1027 if (cs == NULL) 1028 cs = ccdnew(unit); 1029 if (IS_INITED(cs)) 1030 return (EBUSY); 1031 1032 if ((flag & FWRITE) == 0) 1033 return (EBADF); 1034 1035 if ((error = ccdlock(cs)) != 0) 1036 return (error); 1037 1038 if (ccio->ccio_ndisks > CCD_MAXNDISKS) 1039 return (EINVAL); 1040 1041 /* Fill in some important bits. */ 1042 cs->sc_ileave = ccio->ccio_ileave; 1043 if (cs->sc_ileave == 0 && (ccio->ccio_flags & CCDF_MIRROR)) { 1044 printf("ccd%d: disabling mirror, interleave is 0\n", 1045 unit); 1046 ccio->ccio_flags &= ~(CCDF_MIRROR); 1047 } 1048 if ((ccio->ccio_flags & CCDF_MIRROR) && 1049 !(ccio->ccio_flags & CCDF_UNIFORM)) { 1050 printf("ccd%d: mirror/parity forces uniform flag\n", 1051 unit); 1052 ccio->ccio_flags |= CCDF_UNIFORM; 1053 } 1054 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK; 1055 1056 /* 1057 * Allocate space for and copy in the array of 1058 * componet pathnames and device numbers. 
1059 */ 1060 cpp = malloc(ccio->ccio_ndisks * sizeof(char *), 1061 M_CCD, M_WAITOK); 1062 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *), 1063 M_CCD, M_WAITOK); 1064 1065 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp, 1066 ccio->ccio_ndisks * sizeof(char **)); 1067 if (error) { 1068 free(vpp, M_CCD); 1069 free(cpp, M_CCD); 1070 ccdunlock(cs); 1071 return (error); 1072 } 1073 1074 1075 for (i = 0; i < ccio->ccio_ndisks; ++i) { 1076 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) { 1077 for (j = 0; j < lookedup; ++j) 1078 (void)vn_close(vpp[j], FREAD|FWRITE, 1079 td->td_ucred, td); 1080 free(vpp, M_CCD); 1081 free(cpp, M_CCD); 1082 ccdunlock(cs); 1083 return (error); 1084 } 1085 ++lookedup; 1086 } 1087 cs->sc_vpp = vpp; 1088 cs->sc_nccdisks = ccio->ccio_ndisks; 1089 1090 /* 1091 * Initialize the ccd. Fills in the softc for us. 1092 */ 1093 if ((error = ccdinit(cs, cpp, td)) != 0) { 1094 for (j = 0; j < lookedup; ++j) 1095 (void)vn_close(vpp[j], FREAD|FWRITE, 1096 td->td_ucred, td); 1097 /* 1098 * We can't ccddestroy() cs just yet, because nothing 1099 * prevents user-level app to do another ioctl() 1100 * without closing the device first, therefore 1101 * declare unit null and void and let ccdclose() 1102 * destroy it when it is safe to do so. 1103 */ 1104 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1105 free(vpp, M_CCD); 1106 free(cpp, M_CCD); 1107 ccdunlock(cs); 1108 return (error); 1109 } 1110 free(cpp, M_CCD); 1111 1112 /* 1113 * The ccd has been successfully initialized, so 1114 * we can place it into the array and read the disklabel. 
1115 */ 1116 ccio->ccio_unit = unit; 1117 ccio->ccio_size = cs->sc_size; 1118 ccg = &cs->sc_geom; 1119 cs->sc_disk = malloc(sizeof(struct disk), M_CCD, 1120 M_ZERO | M_WAITOK); 1121 cs->sc_disk->d_strategy = ccdstrategy; 1122 cs->sc_disk->d_name = "ccd"; 1123 cs->sc_disk->d_sectorsize = ccg->ccg_secsize; 1124 cs->sc_disk->d_mediasize = 1125 cs->sc_size * (off_t)ccg->ccg_secsize; 1126 cs->sc_disk->d_fwsectors = ccg->ccg_nsectors; 1127 cs->sc_disk->d_fwheads = ccg->ccg_ntracks; 1128 cs->sc_disk->d_drv1 = cs; 1129 cs->sc_disk->d_maxsize = MAXPHYS; 1130 disk_create(unit, cs->sc_disk, 0, NULL, NULL); 1131 1132 ccdunlock(cs); 1133 1134 break; 1135 1136 case CCDIOCCLR: 1137 if (cs == NULL) 1138 return (ENXIO); 1139 1140 if (!IS_INITED(cs)) 1141 return (ENXIO); 1142 1143 if ((flag & FWRITE) == 0) 1144 return (EBADF); 1145 1146 if ((error = ccdlock(cs)) != 0) 1147 return (error); 1148 1149 /* Don't unconfigure if any other partitions are open */ 1150 if (cs->sc_disk->d_flags & DISKFLAG_OPEN) { 1151 ccdunlock(cs); 1152 return (EBUSY); 1153 } 1154 1155 disk_destroy(cs->sc_disk); 1156 free(cs->sc_disk, M_CCD); 1157 cs->sc_disk = NULL; 1158 /* Declare unit null and void (reset all flags) */ 1159 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED); 1160 1161 /* Close the components and free their pathnames. */ 1162 for (i = 0; i < cs->sc_nccdisks; ++i) { 1163 /* 1164 * XXX: this close could potentially fail and 1165 * cause Bad Things. Maybe we need to force 1166 * the close to happen? 1167 */ 1168 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, 1169 td->td_ucred, td); 1170 free(cs->sc_cinfo[i].ci_path, M_CCD); 1171 } 1172 1173 /* Free interleave index. */ 1174 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i) 1175 free(cs->sc_itable[i].ii_index, M_CCD); 1176 1177 /* Free component info and interleave table. */ 1178 free(cs->sc_cinfo, M_CCD); 1179 free(cs->sc_itable, M_CCD); 1180 free(cs->sc_vpp, M_CCD); 1181 1182 /* And remove the devstat entry. 
*/ 1183 devstat_remove_entry(&cs->device_stats); 1184 1185 /* This must be atomic. */ 1186 ccdunlock(cs); 1187 ccddestroy(cs); 1188 1189 break; 1190 } 1191 1192 return (0); 1193 } 1194 1195 1196 /* 1197 * Lookup the provided name in the filesystem. If the file exists, 1198 * is a valid block device, and isn't being used by anyone else, 1199 * set *vpp to the file's vnode. 1200 */ 1201 static int 1202 ccdlookup(char *path, struct thread *td, struct vnode **vpp) 1203 { 1204 struct nameidata nd; 1205 struct vnode *vp; 1206 int error, flags; 1207 1208 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td); 1209 flags = FREAD | FWRITE; 1210 if ((error = vn_open(&nd, &flags, 0)) != 0) { 1211 return (error); 1212 } 1213 vp = nd.ni_vp; 1214 1215 if (vrefcnt(vp) > 1) { 1216 error = EBUSY; 1217 goto bad; 1218 } 1219 1220 if (!vn_isdisk(vp, &error)) 1221 goto bad; 1222 1223 1224 VOP_UNLOCK(vp, 0, td); 1225 NDFREE(&nd, NDF_ONLY_PNBUF); 1226 *vpp = vp; 1227 return (0); 1228 bad: 1229 VOP_UNLOCK(vp, 0, td); 1230 NDFREE(&nd, NDF_ONLY_PNBUF); 1231 /* vn_close does vrele() for vp */ 1232 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); 1233 return (error); 1234 } 1235 1236 /* 1237 1238 * Wait interruptibly for an exclusive lock. 1239 * 1240 * XXX 1241 * Several drivers do this; it should be abstracted and made MP-safe. 1242 */ 1243 static int 1244 ccdlock(struct ccd_s *cs) 1245 { 1246 int error; 1247 1248 while ((cs->sc_flags & CCDF_LOCKED) != 0) { 1249 cs->sc_flags |= CCDF_WANTED; 1250 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0) 1251 return (error); 1252 } 1253 cs->sc_flags |= CCDF_LOCKED; 1254 return (0); 1255 } 1256 1257 /* 1258 * Unlock and wake up any waiters. 1259 */ 1260 static void 1261 ccdunlock(struct ccd_s *cs) 1262 { 1263 1264 cs->sc_flags &= ~CCDF_LOCKED; 1265 if ((cs->sc_flags & CCDF_WANTED) != 0) { 1266 cs->sc_flags &= ~CCDF_WANTED; 1267 wakeup(cs); 1268 } 1269 } 1270