1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * NAME: raid.c 29 * 30 * DESCRIPTION: Main RAID driver source file containing open, close and I/O 31 * operations. 32 * 33 * ROUTINES PROVIDED FOR EXTERNAL USE: 34 * raid_open() - open the RAID metadevice for access. 35 * raid_internal_open() - internal open routine of RAID metdevice. 36 * md_raid_strategy() - perform normal I/O operations, 37 * such as read and write. 38 * raid_close() - close the RAID metadevice. 39 * raid_internal_close() - internal close routine of RAID metadevice. 40 * raid_snarf() - initialize and clean up MDD records. 41 * raid_halt() - reset the RAID metadevice 42 * raid_line() - return the line # of this segment 43 * raid_dcolumn() - return the data column # of this segment 44 * raid_pcolumn() - return the parity column # of this segment 45 */ 46 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/conf.h> 50 #include <sys/file.h> 51 #include <sys/user.h> 52 #include <sys/uio.h> 53 #include <sys/t_lock.h> 54 #include <sys/buf.h> 55 #include <sys/dkio.h> 56 #include <sys/vtoc.h> 57 #include <sys/kmem.h> 58 #include <vm/page.h> 59 #include <sys/cmn_err.h> 60 #include <sys/sysmacros.h> 61 #include <sys/types.h> 62 #include <sys/mkdev.h> 63 #include <sys/stat.h> 64 #include <sys/open.h> 65 #include <sys/modctl.h> 66 #include <sys/ddi.h> 67 #include <sys/sunddi.h> 68 #include <sys/debug.h> 69 #include <sys/lvm/md_raid.h> 70 #include <sys/lvm/mdvar.h> 71 #include <sys/lvm/md_convert.h> 72 73 #include <sys/sysevent/eventdefs.h> 74 #include <sys/sysevent/svm.h> 75 76 md_ops_t raid_md_ops; 77 #ifndef lint 78 char _depends_on[] = "drv/md"; 79 md_ops_t *md_interface_ops = &raid_md_ops; 80 #endif /* lint */ 81 82 extern unit_t md_nunits; 83 extern unit_t md_nsets; 84 extern md_set_t md_set[]; 85 extern int md_status; 86 extern major_t md_major; 87 extern mdq_anchor_t md_done_daemon; 88 extern mdq_anchor_t md_mstr_daemon; 89 extern int md_sleep_for_test; 90 extern clock_t md_hz; 91 92 extern md_event_queue_t *md_event_queue; 93 94 95 int pchunks = 16; 96 int phigh = 1024; 97 int plow = 128; 98 int cchunks = 64; 99 int chigh = 1024; 100 int clow = 512; 101 int bchunks = 32; 102 int bhigh = 256; 103 int blow = 128; 104 105 int raid_total_io = 0; 106 int raid_reads = 0; 107 int raid_writes = 0; 108 int raid_no_bpmaps = 0; 109 int raid_512 = 0; 110 int raid_1024 = 0; 111 int raid_1024_8192 = 0; 112 int raid_8192 = 0; 113 int raid_8192_bigger = 0; 114 int raid_line_lock_wait = 0; 115 116 int data_buffer_waits = 0; 117 int parity_buffer_waits = 0; 118 119 /* writer line locks */ 120 int raid_writer_locks 
= 0; /* total writer locks */ 121 int raid_write_waits = 0; /* total writer locks that waited */ 122 int raid_full_line_writes = 0; /* total full line writes */ 123 int raid_write_queue_length = 0; /* wait queue length */ 124 int raid_max_write_q_length = 0; /* maximum queue length */ 125 int raid_write_locks_active = 0; /* writer locks at any time */ 126 int raid_max_write_locks = 0; /* maximum writer locks active */ 127 128 /* read line locks */ 129 int raid_reader_locks = 0; /* total reader locks held */ 130 int raid_reader_locks_active = 0; /* reader locks held */ 131 int raid_max_reader_locks = 0; /* maximum reader locks held in run */ 132 int raid_read_overlaps = 0; /* number of times 2 reads hit same line */ 133 int raid_read_waits = 0; /* times a reader waited on writer */ 134 135 /* prewrite stats */ 136 int raid_prewrite_waits = 0; /* number of waits for a pw slot */ 137 int raid_pw = 0; /* number of pw slots in use */ 138 int raid_prewrite_max = 0; /* maximum number of pw slots in use */ 139 int raid_pw_invalidates = 0; 140 141 static clock_t md_wr_wait = 0; 142 143 int nv_available = 0; /* presence of nv-ram support in device */ 144 int nv_prewrite = 1; /* mark prewrites with nv_available */ 145 int nv_parity = 1; /* mark parity with nv_available */ 146 147 kmem_cache_t *raid_parent_cache = NULL; 148 kmem_cache_t *raid_child_cache = NULL; 149 kmem_cache_t *raid_cbuf_cache = NULL; 150 151 int raid_internal_open(minor_t mnum, int flag, int otyp, 152 int md_oflags); 153 154 static void freebuffers(md_raidcs_t *cs); 155 static int raid_read(mr_unit_t *un, md_raidcs_t *cs); 156 static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); 157 static int raid_write(mr_unit_t *un, md_raidcs_t *cs); 158 static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); 159 static void raid_stage(md_raidcs_t *cs); 160 static void raid_enqueue(md_raidcs_t *cs); 161 static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); 162 uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); 163 static void getpbuffer(md_raidcs_t *cs); 164 static void getdbuffer(md_raidcs_t *cs); 165 static void raid_done(buf_t *bp); 166 static void raid_io_startup(mr_unit_t *un); 167 168 static rus_state_t 169 raid_col2unit(rcs_state_t state, rus_state_t unitstate) 170 { 171 switch (state) { 172 case RCS_INIT: 173 return (RUS_INIT); 174 case RCS_OKAY: 175 return (RUS_OKAY); 176 case RCS_RESYNC: 177 if (unitstate & RUS_LAST_ERRED) 178 return (RUS_LAST_ERRED); 179 else 180 return (RUS_ERRED); 181 case RCS_ERRED: 182 return (RUS_ERRED); 183 case RCS_LAST_ERRED: 184 return (RUS_ERRED); 185 default: 186 break; 187 } 188 panic("raid_col2unit"); 189 /*NOTREACHED*/ 190 } 191 192 void 193 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) 194 { 195 196 rus_state_t unitstate, origstate; 197 rcs_state_t colstate; 198 rcs_state_t orig_colstate; 199 int errcnt = 0, okaycnt = 0, resynccnt = 0; 200 int i; 201 char *devname; 202 203 ASSERT(un); 204 ASSERT(col < un->un_totalcolumncnt); 205 ASSERT(newstate & 206 (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 207 RCS_LAST_ERRED | RCS_REGEN)); 208 ASSERT((newstate & 209 ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 210 RCS_LAST_ERRED | RCS_REGEN)) 211 == 0); 212 213 ASSERT(MDI_UNIT(MD_SID(un)) ? 
UNIT_WRITER_HELD(un) : 1); 214 215 unitstate = un->un_state; 216 origstate = unitstate; 217 218 if (force) { 219 un->un_column[col].un_devstate = newstate; 220 un->un_state = raid_col2unit(newstate, unitstate); 221 uniqtime32(&un->un_column[col].un_devtimestamp); 222 uniqtime32(&un->un_timestamp); 223 return; 224 } 225 226 ASSERT(un->un_state & 227 (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | 228 RUS_REGEN)); 229 ASSERT((un->un_state & ~(RUS_INIT | 230 RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); 231 232 if (un->un_column[col].un_devstate == newstate) 233 return; 234 235 if (newstate == RCS_REGEN) { 236 if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) 237 return; 238 un->un_state = RUS_REGEN; 239 return; 240 } 241 242 orig_colstate = un->un_column[col].un_devstate; 243 244 /* 245 * if there is another column in the error state then this 246 * column should go to the last errored state 247 */ 248 for (i = 0; i < un->un_totalcolumncnt; i++) { 249 if (i == col) 250 colstate = newstate; 251 else 252 colstate = un->un_column[i].un_devstate; 253 if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) 254 errcnt++; 255 if (colstate & RCS_OKAY) 256 okaycnt++; 257 if (colstate & RCS_RESYNC) 258 resynccnt++; 259 } 260 ASSERT(resynccnt < 2); 261 262 if (okaycnt == un->un_totalcolumncnt) 263 unitstate = RUS_OKAY; 264 else if (errcnt > 1) { 265 unitstate = RUS_LAST_ERRED; 266 if (newstate & RCS_ERRED) 267 newstate = RCS_LAST_ERRED; 268 } else if (errcnt == 1) 269 if (!(unitstate & RUS_LAST_ERRED)) 270 unitstate = RUS_ERRED; 271 272 if (un->un_state == RUS_DOI) 273 unitstate = RUS_DOI; 274 275 un->un_column[col].un_devstate = newstate; 276 uniqtime32(&un->un_column[col].un_devtimestamp); 277 /* 278 * if there are last errored column being brought back online 279 * by open or snarf, then be sure to clear the RUS_LAST_ERRED 280 * bit to allow writes. If there is a real error then the 281 * column will go back into last erred. 282 */ 283 if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && 284 (raid_state_cnt(un, RCS_ERRED) == 1)) 285 unitstate = RUS_ERRED; 286 287 un->un_state = unitstate; 288 uniqtime32(&un->un_timestamp); 289 290 if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && 291 (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { 292 devname = md_devname(MD_UN2SET(un), 293 un->un_column[col].un_dev, NULL, 0); 294 295 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 296 md_shortname(MD_SID(un)), devname); 297 298 if (unitstate & RUS_LAST_ERRED) { 299 cmn_err(CE_WARN, "md: %s: %s last erred", 300 md_shortname(MD_SID(un)), devname); 301 302 } else if (un->un_column[col].un_devflags & 303 MD_RAID_DEV_ISOPEN) { 304 /* 305 * Close the broken device and clear the open flag on 306 * it. We have to check that the device is open, 307 * otherwise the first open on it has resulted in the 308 * error that is being processed and the actual un_dev 309 * will be NODEV64. 310 */ 311 md_layered_close(un->un_column[col].un_dev, 312 MD_OFLG_NULL); 313 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 314 } 315 } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && 316 un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { 317 /* 318 * Similar to logic above except no log messages since we 319 * are just transitioning from Last Erred to Erred. 
320 */ 321 md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL); 322 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 323 } 324 325 /* 326 * If a resync has completed, see if there is a Last Erred 327 * component that we can change to the Erred state. 328 */ 329 if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) { 330 for (i = 0; i < un->un_totalcolumncnt; i++) { 331 if (i != col && 332 (un->un_column[i].un_devstate & RCS_LAST_ERRED)) { 333 raid_set_state(un, i, RCS_ERRED, 0); 334 break; 335 } 336 } 337 } 338 } 339 340 /* 341 * NAME: erred_check_line 342 * 343 * DESCRIPTION: Return the type of write to perform on an erred column based 344 * upon any resync activity. 345 * 346 * if a column is being resynced and the write is above the 347 * resync point may have to write to the target being resynced. 348 * 349 * Column state may make it impossible to do the write 350 * in which case RCL_EIO or RCL_ENXIO is returned. 351 * 352 * If a column cannot be written directly, RCL_ERRED is 353 * returned and processing should proceed accordingly. 354 * 355 * PARAMETERS: minor_t mnum - minor number identity of metadevice 356 * md_raidcs_t *cs - child save structure 357 * mr_column_t *dcolumn - pointer to data column structure 358 * mr_column_t *pcolumn - pointer to parity column structure 359 * 360 * RETURNS: RCL_OKAY, RCL_ERRED 361 * 362 * LOCKS: Expects Line Writer Lock and Unit Resource Lock to be held 363 * across call. 364 */ 365 366 static int 367 erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column) 368 { 369 370 ASSERT(un != NULL); 371 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 372 373 if (column->un_devstate & RCS_OKAY) 374 return (RCL_OKAY); 375 376 if (column->un_devstate & RCS_ERRED) 377 return (RCL_ERRED); /* do not read from errored disk */ 378 379 /* 380 * for the last errored case their are two considerations. 381 * When the last errored column is the only errored column then 382 * do treat it like a maintenance column, not doing I/O from 383 * it. When it there are other failures then just attempt 384 * to use it. 385 */ 386 if (column->un_devstate & RCS_LAST_ERRED) 387 return (RCL_ERRED); 388 389 ASSERT(column->un_devstate & RCS_RESYNC); 390 391 /* 392 * When a resync from a hotspare is being done (copy resync) 393 * then always treat it as an OKAY column, since no regen 394 * is required. 
395 */ 396 if (column->un_devflags & MD_RAID_COPY_RESYNC) { 397 return (RCL_OKAY); 398 } 399 400 mutex_enter(&un->un_mx); 401 if (cs->cs_line < un->un_resync_line_index) { 402 mutex_exit(&un->un_mx); 403 return (RCL_OKAY); 404 } 405 mutex_exit(&un->un_mx); 406 return (RCL_ERRED); 407 408 } 409 410 /* 411 * NAMES: raid_state_cnt 412 * 413 * DESCRIPTION: counts number of column in a specific state 414 * 415 * PARAMETERS: md_raid_t *un 416 * rcs_state state 417 */ 418 int 419 raid_state_cnt(mr_unit_t *un, rcs_state_t state) 420 { 421 int i, retval = 0; 422 423 for (i = 0; i < un->un_totalcolumncnt; i++) 424 if (un->un_column[i].un_devstate & state) 425 retval++; 426 return (retval); 427 } 428 429 /* 430 * NAMES: raid_io_overlaps 431 * 432 * DESCRIPTION: checkst for overlap of 2 child save structures 433 * 434 * PARAMETERS: md_raidcs_t cs1 435 * md_raidcs_t cs2 436 * 437 * RETURNS: 0 - no overlap 438 * 1 - overlap 439 */ 440 int 441 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2) 442 { 443 if (cs1->cs_blkno > cs2->cs_lastblk) 444 return (0); 445 if (cs1->cs_lastblk < cs2->cs_blkno) 446 return (0); 447 return (1); 448 } 449 450 /* 451 * NAMES: raid_parent_constructor 452 * DESCRIPTION: parent structure constructor routine 453 * PARAMETERS: 454 */ 455 /*ARGSUSED1*/ 456 static int 457 raid_parent_constructor(void *p, void *d1, int d2) 458 { 459 mutex_init(&((md_raidps_t *)p)->ps_mx, 460 NULL, MUTEX_DEFAULT, NULL); 461 mutex_init(&((md_raidps_t *)p)->ps_mapin_mx, 462 NULL, MUTEX_DEFAULT, NULL); 463 return (0); 464 } 465 466 void 467 raid_parent_init(md_raidps_t *ps) 468 { 469 bzero(ps, offsetof(md_raidps_t, ps_mx)); 470 ((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE; 471 ((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC; 472 } 473 474 /*ARGSUSED1*/ 475 static void 476 raid_parent_destructor(void *p, void *d) 477 { 478 mutex_destroy(&((md_raidps_t *)p)->ps_mx); 479 mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx); 480 } 481 482 /* 483 * NAMES: raid_child_constructor 484 * DESCRIPTION: child structure constructor routine 485 * PARAMETERS: 486 */ 487 /*ARGSUSED1*/ 488 static int 489 raid_child_constructor(void *p, void *d1, int d2) 490 { 491 md_raidcs_t *cs = (md_raidcs_t *)p; 492 mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL); 493 bioinit(&cs->cs_dbuf); 494 bioinit(&cs->cs_pbuf); 495 bioinit(&cs->cs_hbuf); 496 return (0); 497 } 498 499 void 500 raid_child_init(md_raidcs_t *cs) 501 { 502 bzero(cs, offsetof(md_raidcs_t, cs_mx)); 503 504 md_bioreset(&cs->cs_dbuf); 505 md_bioreset(&cs->cs_pbuf); 506 md_bioreset(&cs->cs_hbuf); 507 508 ((md_raidcs_t *)cs)->cs_dbuf.b_chain = 509 ((md_raidcs_t *)cs)->cs_pbuf.b_chain = 510 ((md_raidcs_t *)cs)->cs_hbuf.b_chain = 511 (struct buf *)(cs); 512 513 cs->cs_magic = RAID_CSMAGIC; 514 cs->cs_line = MD_DISKADDR_ERROR; 515 cs->cs_dpwslot = -1; 516 cs->cs_ppwslot = -1; 517 } 518 519 /*ARGSUSED1*/ 520 static void 521 raid_child_destructor(void *p, void *d) 522 { 523 biofini(&((md_raidcs_t *)p)->cs_dbuf); 524 biofini(&((md_raidcs_t *)p)->cs_hbuf); 525 biofini(&((md_raidcs_t *)p)->cs_pbuf); 526 mutex_destroy(&((md_raidcs_t *)p)->cs_mx); 527 } 528 529 /*ARGSUSED1*/ 530 static int 531 raid_cbuf_constructor(void *p, void *d1, int d2) 532 { 533 bioinit(&((md_raidcbuf_t *)p)->cbuf_bp); 534 return (0); 535 } 536 537 static void 538 raid_cbuf_init(md_raidcbuf_t *cb) 539 { 540 bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp)); 541 md_bioreset(&cb->cbuf_bp); 542 cb->cbuf_magic = RAID_BUFMAGIC; 543 cb->cbuf_pwslot = -1; 544 cb->cbuf_flags = CBUF_WRITE; 545 } 546 547 /*ARGSUSED1*/ 548 
static void 549 raid_cbuf_destructor(void *p, void *d) 550 { 551 biofini(&((md_raidcbuf_t *)p)->cbuf_bp); 552 } 553 554 /* 555 * NAMES: raid_run_queue 556 * DESCRIPTION: spawn a backend processing daemon for RAID metadevice. 557 * PARAMETERS: 558 */ 559 /*ARGSUSED*/ 560 static void 561 raid_run_queue(void *d) 562 { 563 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 564 md_daemon(1, &md_done_daemon); 565 } 566 567 /* 568 * NAME: raid_build_pwslot 569 * DESCRIPTION: builds mr_pw_reserve for the column 570 * PARAMETERS: un is the pointer to the unit structure 571 * colindex is the column to create the structure for 572 */ 573 int 574 raid_build_pw_reservation(mr_unit_t *un, int colindex) 575 { 576 mr_pw_reserve_t *pw; 577 mr_scoreboard_t *sb; 578 int i; 579 580 pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + 581 (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); 582 pw->pw_magic = RAID_PWMAGIC; 583 pw->pw_column = colindex; 584 pw->pw_free = un->un_pwcnt; 585 sb = &pw->pw_sb[0]; 586 for (i = 0; i < un->un_pwcnt; i++) { 587 sb[i].sb_column = colindex; 588 sb[i].sb_flags = SB_UNUSED; 589 sb[i].sb_start_blk = 0; 590 sb[i].sb_last_blk = 0; 591 sb[i].sb_cs = NULL; 592 } 593 un->un_column_ic[colindex].un_pw_reserve = pw; 594 return (0); 595 } 596 /* 597 * NAME: raid_free_pw_reservation 598 * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine 599 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 600 * int colindex - index of the column whose pre-write slot struct 601 * is to be destroyed. 602 */ 603 void 604 raid_free_pw_reservation(mr_unit_t *un, int colindex) 605 { 606 mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; 607 608 kmem_free(pw, sizeof (mr_pw_reserve_t) + 609 (sizeof (mr_scoreboard_t) * un->un_pwcnt)); 610 } 611 612 /* 613 * NAME: raid_cancel_pwslot 614 * DESCRIPTION: RAID metadevice write routine 615 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 616 */ 617 static void 618 raid_cancel_pwslot(md_raidcs_t *cs) 619 { 620 mr_unit_t *un = cs->cs_un; 621 mr_pw_reserve_t *pw; 622 mr_scoreboard_t *sb; 623 mr_column_ic_t *col; 624 md_raidcbuf_t *cbuf; 625 int broadcast = 0; 626 627 if (cs->cs_ps->ps_flags & MD_RPS_READ) 628 return; 629 if (cs->cs_dpwslot != -1) { 630 col = &un->un_column_ic[cs->cs_dcolumn]; 631 pw = col->un_pw_reserve; 632 sb = &pw->pw_sb[cs->cs_dpwslot]; 633 sb->sb_flags = SB_AVAIL; 634 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 635 broadcast++; 636 sb->sb_cs = NULL; 637 } 638 639 if (cs->cs_ppwslot != -1) { 640 col = &un->un_column_ic[cs->cs_pcolumn]; 641 pw = col->un_pw_reserve; 642 sb = &pw->pw_sb[cs->cs_ppwslot]; 643 sb->sb_flags = SB_AVAIL; 644 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 645 broadcast++; 646 sb->sb_cs = NULL; 647 } 648 649 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 650 if (cbuf->cbuf_pwslot == -1) 651 continue; 652 col = &un->un_column_ic[cbuf->cbuf_column]; 653 pw = col->un_pw_reserve; 654 sb = &pw->pw_sb[cbuf->cbuf_pwslot]; 655 sb->sb_flags = SB_AVAIL; 656 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 657 broadcast++; 658 sb->sb_cs = NULL; 659 } 660 if (broadcast) { 661 cv_broadcast(&un->un_cv); 662 return; 663 } 664 mutex_enter(&un->un_mx); 665 if (un->un_rflags & MD_RFLAG_NEEDPW) 666 cv_broadcast(&un->un_cv); 667 mutex_exit(&un->un_mx); 668 } 669 670 static void 671 raid_free_pwinvalidate(md_raidcs_t *cs) 672 { 673 md_raidcbuf_t *cbuf; 674 md_raidcbuf_t *cbuf_to_free; 675 mr_unit_t *un = cs->cs_un; 676 mdi_unit_t 
*ui = MDI_UNIT(MD_SID(un)); 677 mr_pw_reserve_t *pw; 678 mr_scoreboard_t *sb; 679 int broadcast = 0; 680 681 cbuf = cs->cs_pw_inval_list; 682 ASSERT(cbuf); 683 mutex_enter(&un->un_linlck_mx); 684 while (cbuf) { 685 pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve; 686 sb = &pw->pw_sb[0]; 687 ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND); 688 sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED; 689 sb[cbuf->cbuf_pwslot].sb_cs = NULL; 690 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 691 broadcast++; 692 cbuf_to_free = cbuf; 693 cbuf = cbuf->cbuf_next; 694 kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize)); 695 kmem_cache_free(raid_cbuf_cache, cbuf_to_free); 696 } 697 cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL; 698 /* 699 * now that there is a free prewrite slot, check to see if there 700 * are any io operations waiting first wake up the raid_io_startup 701 * then signal the the processes waiting in raid_write. 702 */ 703 if (ui->ui_io_lock->io_list_front) 704 raid_io_startup(un); 705 mutex_exit(&un->un_linlck_mx); 706 if (broadcast) { 707 cv_broadcast(&un->un_cv); 708 return; 709 } 710 mutex_enter(&un->un_mx); 711 if (un->un_rflags & MD_RFLAG_NEEDPW) 712 cv_broadcast(&un->un_cv); 713 mutex_exit(&un->un_mx); 714 } 715 716 717 static int 718 raid_get_pwslot(md_raidcs_t *cs, int column) 719 { 720 mr_scoreboard_t *sb; 721 mr_pw_reserve_t *pw; 722 mr_unit_t *un = cs->cs_un; 723 diskaddr_t start_blk = cs->cs_blkno; 724 diskaddr_t last_blk = cs->cs_lastblk; 725 int i; 726 int pwcnt = un->un_pwcnt; 727 int avail = -1; 728 int use = -1; 729 int flags; 730 731 732 /* start with the data column */ 733 pw = cs->cs_un->un_column_ic[column].un_pw_reserve; 734 sb = &pw->pw_sb[0]; 735 ASSERT(pw->pw_free > 0); 736 for (i = 0; i < pwcnt; i++) { 737 flags = sb[i].sb_flags; 738 if (flags & SB_INVAL_PEND) 739 continue; 740 741 if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED))) 742 avail = i; 743 744 if ((start_blk > sb[i].sb_last_blk) || 745 (last_blk < sb[i].sb_start_blk)) 746 continue; 747 748 /* OVERLAP */ 749 ASSERT(! (sb[i].sb_flags & SB_INUSE)); 750 751 /* 752 * raid_invalidate_pwslot attempts to zero out prewrite entry 753 * in parallel with other disk reads/writes related to current 754 * transaction. however cs_frags accounting for this case is 755 * broken because raid_write_io resets cs_frags i.e. ignoring 756 * that it could have been been set to > 0 value by 757 * raid_invalidate_pwslot. While this can be fixed an 758 * additional problem is that we don't seem to handle 759 * correctly the case of getting a disk error for prewrite 760 * entry invalidation. 761 * It does not look like we really need 762 * to invalidate prewrite slots because raid_replay sorts 763 * prewrite id's in ascending order and during recovery the 764 * latest prewrite entry for the same block will be replay 765 * last. That's why i ifdef'd out the call to 766 * raid_invalidate_pwslot. --aguzovsk@east 767 */ 768 769 if (use == -1) { 770 use = i; 771 } 772 } 773 774 ASSERT(avail != -1); 775 pw->pw_free--; 776 if (use == -1) 777 use = avail; 778 779 ASSERT(! (sb[use].sb_flags & SB_INUSE)); 780 sb[use].sb_flags = SB_INUSE; 781 sb[use].sb_cs = cs; 782 sb[use].sb_start_blk = start_blk; 783 sb[use].sb_last_blk = last_blk; 784 ASSERT((use >= 0) && (use < un->un_pwcnt)); 785 return (use); 786 } 787 788 static int 789 raid_check_pw(md_raidcs_t *cs) 790 { 791 792 mr_unit_t *un = cs->cs_un; 793 int i; 794 795 ASSERT(! 
(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 796 /* 797 * check to be sure there is a prewrite slot available 798 * if not just return. 799 */ 800 if (cs->cs_flags & MD_RCS_LINE) { 801 for (i = 0; i < un->un_totalcolumncnt; i++) 802 if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0) 803 return (1); 804 return (0); 805 } 806 807 if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0) 808 return (1); 809 if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0) 810 return (1); 811 return (0); 812 } 813 static int 814 raid_alloc_pwslot(md_raidcs_t *cs) 815 { 816 mr_unit_t *un = cs->cs_un; 817 md_raidcbuf_t *cbuf; 818 819 ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 820 if (raid_check_pw(cs)) 821 return (1); 822 823 mutex_enter(&un->un_mx); 824 un->un_pwid++; 825 cs->cs_pwid = un->un_pwid; 826 mutex_exit(&un->un_mx); 827 828 cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn); 829 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 830 cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column); 831 } 832 cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn); 833 834 cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS; 835 836 return (0); 837 } 838 839 /* 840 * NAMES: raid_build_incore 841 * DESCRIPTION: RAID metadevice incore structure building routine 842 * PARAMETERS: void *p - pointer to a unit structure 843 * int snarfing - a flag to indicate snarfing is required 844 */ 845 int 846 raid_build_incore(void *p, int snarfing) 847 { 848 mr_unit_t *un = (mr_unit_t *)p; 849 minor_t mnum = MD_SID(un); 850 mddb_recid_t hs_recid = 0; 851 int i; 852 int preserve_flags; 853 mr_column_t *column; 854 int iosize; 855 md_dev64_t hs, dev; 856 int resync_cnt = 0, error_cnt = 0; 857 858 hs = NODEV64; 859 dev = NODEV64; 860 861 /* clear out bogus pointer incase we return(1) prior to alloc */ 862 un->mr_ic = NULL; 863 864 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 865 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 866 return (1); 867 } 868 869 if (MD_UNIT(mnum) != NULL) 870 return (0); 871 872 if (snarfing) 873 MD_STATUS(un) = 0; 874 875 un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic), 876 KM_SLEEP); 877 878 un->un_column_ic = (mr_column_ic_t *) 879 kmem_zalloc(sizeof (mr_column_ic_t) * 880 un->un_totalcolumncnt, KM_SLEEP); 881 882 for (i = 0; i < un->un_totalcolumncnt; i++) { 883 884 column = &un->un_column[i]; 885 preserve_flags = column->un_devflags & 886 (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); 887 column->un_devflags &= 888 ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN | 889 MD_RAID_WRITE_ALT); 890 if (raid_build_pw_reservation(un, i) != 0) { 891 /* could not build pwslot */ 892 return (1); 893 } 894 895 if (snarfing) { 896 set_t setno = MD_MIN2SET(mnum); 897 dev = md_getdevnum(setno, mddb_getsidenum(setno), 898 column->un_orig_key, MD_NOTRUST_DEVT); 899 /* 900 * Comment out instead of remove so we have history 901 * In the pre-SVM releases stored devt is used so 902 * as long as there is one snarf is always happy 903 * even the component is powered off. This is not 904 * the case in current SVM implementation. NODEV64 905 * can be returned and in this case since we resolve 906 * the devt at 'open' time (first use of metadevice) 907 * we will allow snarf continue. 908 * 909 * if (dev == NODEV64) 910 * return (1); 911 */ 912 913 /* 914 * Setup un_orig_dev from device id info if the device 915 * is valid (not NODEV64). 
916 */ 917 if (dev != NODEV64) 918 column->un_orig_dev = dev; 919 920 if (column->un_devstate & RCS_RESYNC) 921 resync_cnt++; 922 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 923 error_cnt++; 924 925 if (HOTSPARED(un, i)) { 926 (void) md_hot_spare_ifc(HS_MKDEV, 927 0, 0, 0, &column->un_hs_id, NULL, 928 &hs, NULL); 929 /* 930 * Same here 931 * 932 * if (hs == NODEV64) 933 * return (1); 934 */ 935 } 936 937 if (HOTSPARED(un, i)) { 938 if (column->un_devstate & 939 (RCS_OKAY | RCS_LAST_ERRED)) { 940 column->un_dev = hs; 941 column->un_pwstart = 942 column->un_hs_pwstart; 943 column->un_devstart = 944 column->un_hs_devstart; 945 preserve_flags &= 946 ~(MD_RAID_COPY_RESYNC | 947 MD_RAID_REGEN_RESYNC); 948 } else if (column->un_devstate & RCS_RESYNC) { 949 /* 950 * if previous system was 4.0 set 951 * the direction flags 952 */ 953 if ((preserve_flags & 954 (MD_RAID_COPY_RESYNC | 955 MD_RAID_REGEN_RESYNC)) == 0) { 956 if (column->un_alt_dev != 957 NODEV64) 958 preserve_flags |= 959 MD_RAID_COPY_RESYNC; 960 else 961 preserve_flags |= 962 MD_RAID_REGEN_RESYNC; 963 } 964 } 965 } else { /* no hot spares */ 966 column->un_dev = dev; 967 column->un_pwstart = column->un_orig_pwstart; 968 column->un_devstart = column->un_orig_devstart; 969 if (column->un_devstate & RCS_RESYNC) { 970 preserve_flags |= MD_RAID_REGEN_RESYNC; 971 preserve_flags &= ~MD_RAID_COPY_RESYNC; 972 } 973 } 974 if (! (column->un_devstate & RCS_RESYNC)) { 975 preserve_flags &= 976 ~(MD_RAID_REGEN_RESYNC | 977 MD_RAID_COPY_RESYNC); 978 } 979 980 column->un_devflags = preserve_flags; 981 column->un_alt_dev = NODEV64; 982 column->un_alt_pwstart = 0; 983 column->un_alt_devstart = 0; 984 un->un_resync_line_index = 0; 985 un->un_resync_index = 0; 986 un->un_percent_done = 0; 987 } 988 } 989 990 if (resync_cnt && error_cnt) { 991 for (i = 0; i < un->un_totalcolumncnt; i++) { 992 column = &un->un_column[i]; 993 if (HOTSPARED(un, i) && 994 (column->un_devstate & RCS_RESYNC) && 995 (column->un_devflags & MD_RAID_COPY_RESYNC)) 996 /* hotspare has data */ 997 continue; 998 999 if (HOTSPARED(un, i) && 1000 (column->un_devstate & RCS_RESYNC)) { 1001 /* hotspare does not have data */ 1002 raid_hs_release(HS_FREE, un, &hs_recid, i); 1003 column->un_dev = column->un_orig_dev; 1004 column->un_pwstart = column->un_orig_pwstart; 1005 column->un_devstart = column->un_orig_devstart; 1006 mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); 1007 } 1008 1009 if (column->un_devstate & RCS_ERRED) 1010 column->un_devstate = RCS_LAST_ERRED; 1011 1012 if (column->un_devstate & RCS_RESYNC) 1013 column->un_devstate = RCS_ERRED; 1014 } 1015 } 1016 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); 1017 1018 un->un_pwid = 1; /* or some other possible value */ 1019 un->un_magic = RAID_UNMAGIC; 1020 iosize = un->un_iosize; 1021 un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1022 un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1023 mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); 1024 cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); 1025 un->un_linlck_chn = NULL; 1026 1027 /* place various information in the in-core data structures */ 1028 md_nblocks_set(mnum, un->c.un_total_blocks); 1029 MD_UNIT(mnum) = un; 1030 1031 return (0); 1032 } 1033 1034 /* 1035 * NAMES: reset_raid 1036 * DESCRIPTION: RAID metadevice reset routine 1037 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 1038 * minor_t mnum - RAID metadevice minor number 1039 * int removing - a flag to imply removing device name from 1040 * MDDB database. 
1041 */ 1042 void 1043 reset_raid(mr_unit_t *un, minor_t mnum, int removing) 1044 { 1045 int i, n = 0; 1046 sv_dev_t *sv; 1047 mr_column_t *column; 1048 int column_cnt = un->un_totalcolumncnt; 1049 mddb_recid_t *recids, vtoc_id; 1050 int hserr; 1051 1052 ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && 1053 (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); 1054 1055 md_destroy_unit_incore(mnum, &raid_md_ops); 1056 1057 md_nblocks_set(mnum, -1ULL); 1058 MD_UNIT(mnum) = NULL; 1059 1060 if (un->un_pbuffer) { 1061 kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); 1062 un->un_pbuffer = NULL; 1063 } 1064 if (un->un_dbuffer) { 1065 kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); 1066 un->un_dbuffer = NULL; 1067 } 1068 1069 /* free all pre-write slots created during build incore */ 1070 for (i = 0; i < un->un_totalcolumncnt; i++) 1071 raid_free_pw_reservation(un, i); 1072 1073 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 1074 un->un_totalcolumncnt); 1075 1076 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 1077 1078 /* 1079 * Attempt release of its minor node 1080 */ 1081 md_remove_minor_node(mnum); 1082 1083 if (!removing) 1084 return; 1085 1086 sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), 1087 KM_SLEEP); 1088 1089 recids = (mddb_recid_t *) 1090 kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); 1091 1092 for (i = 0; i < column_cnt; i++) { 1093 md_unit_t *comp_un; 1094 md_dev64_t comp_dev; 1095 1096 column = &un->un_column[i]; 1097 sv[i].setno = MD_MIN2SET(mnum); 1098 sv[i].key = column->un_orig_key; 1099 if (HOTSPARED(un, i)) { 1100 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 1101 hserr = HS_BAD; 1102 else 1103 hserr = HS_FREE; 1104 raid_hs_release(hserr, un, &recids[n++], i); 1105 } 1106 /* 1107 * deparent any metadevices. 1108 * NOTE: currently soft partitions are the only metadevices 1109 * allowed in RAID metadevices. 1110 */ 1111 comp_dev = column->un_dev; 1112 if (md_getmajor(comp_dev) == md_major) { 1113 comp_un = MD_UNIT(md_getminor(comp_dev)); 1114 recids[n++] = MD_RECID(comp_un); 1115 md_reset_parent(comp_dev); 1116 } 1117 } 1118 /* decrement the reference count of the old hsp */ 1119 if (un->un_hsp_id != -1) 1120 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 1121 &recids[n++], NULL, NULL, NULL); 1122 recids[n] = 0; 1123 MD_STATUS(un) |= MD_UN_BEING_RESET; 1124 vtoc_id = un->c.un_vtoc_id; 1125 1126 raid_commit(un, recids); 1127 1128 /* 1129 * Remove self from the namespace 1130 */ 1131 if (un->c.un_revision & MD_FN_META_DEV) { 1132 (void) md_rem_selfname(un->c.un_self_id); 1133 } 1134 1135 /* Remove the unit structure */ 1136 mddb_deleterec_wrapper(un->c.un_record_id); 1137 1138 /* Remove the vtoc, if present */ 1139 if (vtoc_id) 1140 mddb_deleterec_wrapper(vtoc_id); 1141 md_rem_names(sv, column_cnt); 1142 kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); 1143 kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); 1144 1145 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1146 MD_MIN2SET(mnum), mnum); 1147 } 1148 1149 /* 1150 * NAMES: raid_error_parent 1151 * DESCRIPTION: mark a parent structure in error 1152 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1153 * int error - error value to set 1154 * NOTE: (TBR) - this routine currently is not in use. 
1155 */ 1156 static void 1157 raid_error_parent(md_raidps_t *ps, int error) 1158 { 1159 mutex_enter(&ps->ps_mx); 1160 ps->ps_flags |= MD_RPS_ERROR; 1161 ps->ps_error = error; 1162 mutex_exit(&ps->ps_mx); 1163 } 1164 1165 /* 1166 * The following defines tell raid_free_parent 1167 * RFP_RLS_LOCK release the unit reader lock when done. 1168 * RFP_DECR_PWFRAGS decrement ps_pwfrags 1169 * RFP_DECR_FRAGS decrement ps_frags 1170 * RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep 1171 */ 1172 #define RFP_RLS_LOCK 0x00001 1173 #define RFP_DECR_PWFRAGS 0x00002 1174 #define RFP_DECR_FRAGS 0x00004 1175 #define RFP_DECR_READFRAGS (RFP_DECR_PWFRAGS | RFP_DECR_FRAGS) 1176 1177 /* 1178 * NAMES: raid_free_parent 1179 * DESCRIPTION: free a parent structure 1180 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1181 * int todo - indicates what needs to be done 1182 */ 1183 static void 1184 raid_free_parent(md_raidps_t *ps, int todo) 1185 { 1186 mdi_unit_t *ui = ps->ps_ui; 1187 1188 ASSERT(ps->ps_magic == RAID_PSMAGIC); 1189 ASSERT(ps->ps_flags & MD_RPS_INUSE); 1190 mutex_enter(&ps->ps_mx); 1191 if (todo & RFP_DECR_PWFRAGS) { 1192 ASSERT(ps->ps_pwfrags); 1193 ps->ps_pwfrags--; 1194 if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) { 1195 if (ps->ps_flags & MD_RPS_ERROR) { 1196 ps->ps_bp->b_flags |= B_ERROR; 1197 ps->ps_bp->b_error = ps->ps_error; 1198 } 1199 md_kstat_done(ui, ps->ps_bp, 0); 1200 biodone(ps->ps_bp); 1201 ps->ps_flags |= MD_RPS_IODONE; 1202 } 1203 } 1204 1205 if (todo & RFP_DECR_FRAGS) { 1206 ASSERT(ps->ps_frags); 1207 ps->ps_frags--; 1208 } 1209 1210 if (ps->ps_frags != 0) { 1211 mutex_exit(&ps->ps_mx); 1212 return; 1213 } 1214 1215 ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0)); 1216 mutex_exit(&ps->ps_mx); 1217 1218 if (todo & RFP_RLS_LOCK) 1219 md_io_readerexit(ui); 1220 1221 if (panicstr) { 1222 ps->ps_flags |= MD_RPS_DONE; 1223 return; 1224 } 1225 1226 if (ps->ps_flags & MD_RPS_HSREQ) 1227 (void) raid_hotspares(); 1228 1229 ASSERT(todo & RFP_RLS_LOCK); 1230 ps->ps_flags &= ~MD_RPS_INUSE; 1231 1232 md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id)); 1233 1234 kmem_cache_free(raid_parent_cache, ps); 1235 } 1236 1237 /* 1238 * NAMES: raid_free_child 1239 * DESCRIPTION: free a parent structure 1240 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1241 * int drop_locks - 0 for no locks held 1242 * NOTE: (TBR) - this routine currently is not in use. 1243 */ 1244 static void 1245 raid_free_child(md_raidcs_t *cs, int drop_locks) 1246 { 1247 mr_unit_t *un = cs->cs_un; 1248 md_raidcbuf_t *cbuf, *cbuf1; 1249 1250 if (cs->cs_pw_inval_list) 1251 raid_free_pwinvalidate(cs); 1252 1253 if (drop_locks) { 1254 ASSERT(cs->cs_flags & MD_RCS_LLOCKD && 1255 (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER))); 1256 md_unit_readerexit(MDI_UNIT(MD_SID(un))); 1257 raid_line_exit(cs); 1258 } else { 1259 ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD)); 1260 } 1261 1262 freebuffers(cs); 1263 cbuf = cs->cs_buflist; 1264 while (cbuf) { 1265 cbuf1 = cbuf->cbuf_next; 1266 kmem_cache_free(raid_cbuf_cache, cbuf); 1267 cbuf = cbuf1; 1268 } 1269 if (cs->cs_dbuf.b_flags & B_REMAPPED) 1270 bp_mapout(&cs->cs_dbuf); 1271 kmem_cache_free(raid_child_cache, cs); 1272 } 1273 1274 /* 1275 * NAME: raid_regen_parity 1276 * 1277 * DESCRIPTION: This routine is used to regenerate the parity blocks 1278 * for the entire raid device. It is called from 1279 * both the regen thread and the IO path. 
1280 * 1281 * On error the entire device is marked as in error by 1282 * placing the erroring device in error and all other 1283 * devices in last_errored. 1284 * 1285 * PARAMETERS: md_raidcs_t *cs 1286 */ 1287 void 1288 raid_regen_parity(md_raidcs_t *cs) 1289 { 1290 mr_unit_t *un = cs->cs_un; 1291 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1292 caddr_t buffer; 1293 caddr_t parity_buffer; 1294 buf_t *bp; 1295 uint_t *dbuf, *pbuf; 1296 uint_t colcnt = un->un_totalcolumncnt; 1297 int column; 1298 int parity_column = cs->cs_pcolumn; 1299 size_t bcount; 1300 int j; 1301 1302 /* 1303 * This routine uses the data and parity buffers allocated to a 1304 * write. In the case of a read the buffers are allocated and 1305 * freed at the end. 1306 */ 1307 1308 ASSERT(IO_READER_HELD(un)); 1309 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 1310 ASSERT(UNIT_READER_HELD(un)); 1311 1312 if (raid_state_cnt(un, RCS_OKAY) != colcnt) 1313 return; 1314 1315 if (cs->cs_flags & MD_RCS_READER) { 1316 getpbuffer(cs); 1317 getdbuffer(cs); 1318 } 1319 ASSERT(cs->cs_dbuffer && cs->cs_pbuffer); 1320 bcount = cs->cs_bcount; 1321 buffer = cs->cs_dbuffer; 1322 parity_buffer = cs->cs_pbuffer; 1323 bzero(parity_buffer, bcount); 1324 bp = &cs->cs_dbuf; 1325 for (column = 0; column < colcnt; column++) { 1326 if (column == parity_column) 1327 continue; 1328 reset_buf(bp, B_READ | B_BUSY, bcount); 1329 bp->b_un.b_addr = buffer; 1330 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev); 1331 bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart; 1332 bp->b_bcount = bcount; 1333 bp->b_bufsize = bcount; 1334 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1335 if (biowait(bp)) 1336 goto bail; 1337 pbuf = (uint_t *)(void *)parity_buffer; 1338 dbuf = (uint_t *)(void *)buffer; 1339 for (j = 0; j < (bcount / (sizeof (uint_t))); j++) { 1340 *pbuf = *pbuf ^ *dbuf; 1341 pbuf++; 1342 dbuf++; 1343 } 1344 } 1345 1346 reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount); 1347 bp->b_un.b_addr = parity_buffer; 1348 bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev); 1349 bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart; 1350 bp->b_bcount = bcount; 1351 bp->b_bufsize = bcount; 1352 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1353 if (biowait(bp)) 1354 goto bail; 1355 1356 if (cs->cs_flags & MD_RCS_READER) { 1357 freebuffers(cs); 1358 cs->cs_pbuffer = NULL; 1359 cs->cs_dbuffer = NULL; 1360 } 1361 bp->b_chain = (struct buf *)cs; 1362 return; 1363 bail: 1364 if (cs->cs_flags & MD_RCS_READER) { 1365 freebuffers(cs); 1366 cs->cs_pbuffer = NULL; 1367 cs->cs_dbuffer = NULL; 1368 } 1369 md_unit_readerexit(ui); 1370 un = md_unit_writerlock(ui); 1371 raid_set_state(un, column, RCS_ERRED, 0); 1372 for (column = 0; column < colcnt; column++) 1373 raid_set_state(un, column, RCS_ERRED, 0); 1374 raid_commit(un, NULL); 1375 md_unit_writerexit(ui); 1376 un = md_unit_readerlock(ui); 1377 bp->b_chain = (struct buf *)cs; 1378 } 1379 1380 /* 1381 * NAMES: raid_error_state 1382 * DESCRIPTION: check unit and column states' impact on I/O error 1383 * NOTE: the state now may not be the state when the 1384 * I/O completed due to race conditions. 
1385 * PARAMETERS: mr_unit_t *un - pointer to raid unit structure 1386 * md_raidcs_t *cs - pointer to child structure 1387 * buf_t *bp - pointer to buffer structure 1388 */ 1389 static int 1390 raid_error_state(mr_unit_t *un, buf_t *bp) 1391 { 1392 int column; 1393 int i; 1394 1395 ASSERT(IO_READER_HELD(un)); 1396 ASSERT(UNIT_WRITER_HELD(un)); 1397 1398 column = -1; 1399 for (i = 0; i < un->un_totalcolumncnt; i++) { 1400 if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) { 1401 column = i; 1402 break; 1403 } 1404 if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) { 1405 column = i; 1406 break; 1407 } 1408 } 1409 1410 /* in case a replace snuck in while waiting on unit writer lock */ 1411 1412 if (column == -1) { 1413 return (0); 1414 } 1415 1416 (void) raid_set_state(un, column, RCS_ERRED, 0); 1417 ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED)); 1418 1419 raid_commit(un, NULL); 1420 if (un->un_state & RUS_ERRED) { 1421 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, 1422 MD_UN2SET(un), MD_SID(un)); 1423 } else if (un->un_state & RUS_LAST_ERRED) { 1424 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, 1425 MD_UN2SET(un), MD_SID(un)); 1426 } 1427 1428 return (EIO); 1429 } 1430 1431 /* 1432 * NAME: raid_mapin_buf 1433 * DESCRIPTION: wait for the input buffer header to be maped in 1434 * PARAMETERS: md_raidps_t *ps 1435 */ 1436 static void 1437 raid_mapin_buf(md_raidcs_t *cs) 1438 { 1439 md_raidps_t *ps = cs->cs_ps; 1440 1441 /* 1442 * check to see if the buffer is maped. If all is ok return the 1443 * offset of the data and return. Since it is expensive to grab 1444 * a mutex this is only done if the mapin is not complete. 1445 * Once the mutex is aquired it is possible that the mapin was 1446 * not done so recheck and if necessary do the mapin. 1447 */ 1448 if (ps->ps_mapin > 0) { 1449 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1450 return; 1451 } 1452 mutex_enter(&ps->ps_mapin_mx); 1453 if (ps->ps_mapin > 0) { 1454 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1455 mutex_exit(&ps->ps_mapin_mx); 1456 return; 1457 } 1458 bp_mapin(ps->ps_bp); 1459 /* 1460 * get the new b_addr out of the parent since bp_mapin just changed it 1461 */ 1462 ps->ps_addr = ps->ps_bp->b_un.b_addr; 1463 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1464 ps->ps_mapin++; 1465 mutex_exit(&ps->ps_mapin_mx); 1466 } 1467 1468 /* 1469 * NAMES: raid_read_no_retry 1470 * DESCRIPTION: I/O retry routine for a RAID metadevice read 1471 * read failed attempting to regenerate the data, 1472 * no retry possible, error occured in raid_raidregenloop(). 
1473 * PARAMETERS: mr_unit_t *un - pointer to raid unit structure 1474 * md_raidcs_t *cs - pointer to child structure 1475 */ 1476 /*ARGSUSED*/ 1477 static void 1478 raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs) 1479 { 1480 md_raidps_t *ps = cs->cs_ps; 1481 1482 raid_error_parent(ps, EIO); 1483 raid_free_child(cs, 1); 1484 1485 /* decrement readfrags */ 1486 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 1487 } 1488 1489 /* 1490 * NAMES: raid_read_retry 1491 * DESCRIPTION: I/O retry routine for a RAID metadevice read 1492 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1493 */ 1494 static void 1495 raid_read_retry(mr_unit_t *un, md_raidcs_t *cs) 1496 { 1497 /* re-initialize the buf_t structure for raid_read() */ 1498 cs->cs_dbuf.b_chain = (struct buf *)cs; 1499 cs->cs_dbuf.b_back = &cs->cs_dbuf; 1500 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 1501 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 1502 cs->cs_dbuf.b_error = 0; /* initialize error */ 1503 cs->cs_dbuf.b_offset = -1; 1504 /* Initialize semaphores */ 1505 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 1506 SEMA_DEFAULT, NULL); 1507 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 1508 SEMA_DEFAULT, NULL); 1509 1510 cs->cs_pbuf.b_chain = (struct buf *)cs; 1511 cs->cs_pbuf.b_back = &cs->cs_pbuf; 1512 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 1513 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 1514 cs->cs_pbuf.b_error = 0; /* initialize error */ 1515 cs->cs_pbuf.b_offset = -1; 1516 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 1517 SEMA_DEFAULT, NULL); 1518 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 1519 SEMA_DEFAULT, NULL); 1520 1521 cs->cs_flags &= ~MD_RCS_ERROR; /* reset child error flag */ 1522 cs->cs_flags |= MD_RCS_RECOVERY; /* set RECOVERY flag */ 1523 1524 /* 1525 * re-scheduling I/O with raid_read_io() is simpler. basically, 1526 * raid_read_io() is invoked again with same child structure. 1527 * (NOTE: we aren`t supposed to do any error recovery when an I/O 1528 * error occured in raid_raidregenloop(). 1529 */ 1530 raid_mapin_buf(cs); 1531 raid_read_io(un, cs); 1532 } 1533 1534 /* 1535 * NAMES: raid_rderr 1536 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1537 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1538 * LOCKS: must obtain unit writer lock while calling raid_error_state 1539 * since a unit or column state transition may take place. 1540 * must obtain unit reader lock to retry I/O. 
1541 */ 1542 /*ARGSUSED*/ 1543 static void 1544 raid_rderr(md_raidcs_t *cs) 1545 { 1546 md_raidps_t *ps; 1547 mdi_unit_t *ui; 1548 mr_unit_t *un; 1549 int error = 0; 1550 1551 ps = cs->cs_ps; 1552 ui = ps->ps_ui; 1553 un = (mr_unit_t *)md_unit_writerlock(ui); 1554 ASSERT(un != 0); 1555 1556 if (cs->cs_dbuf.b_flags & B_ERROR) 1557 error = raid_error_state(un, &cs->cs_dbuf); 1558 if (cs->cs_pbuf.b_flags & B_ERROR) 1559 error |= raid_error_state(un, &cs->cs_pbuf); 1560 1561 md_unit_writerexit(ui); 1562 1563 ps->ps_flags |= MD_RPS_HSREQ; 1564 1565 un = (mr_unit_t *)md_unit_readerlock(ui); 1566 ASSERT(un != 0); 1567 /* now attempt the appropriate retry routine */ 1568 (*(cs->cs_retry_call))(un, cs); 1569 } 1570 1571 1572 /* 1573 * NAMES: raid_read_error 1574 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1575 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1576 */ 1577 /*ARGSUSED*/ 1578 static void 1579 raid_read_error(md_raidcs_t *cs) 1580 { 1581 md_raidps_t *ps; 1582 mdi_unit_t *ui; 1583 mr_unit_t *un; 1584 set_t setno; 1585 1586 ps = cs->cs_ps; 1587 ui = ps->ps_ui; 1588 un = cs->cs_un; 1589 1590 setno = MD_UN2SET(un); 1591 1592 if ((cs->cs_dbuf.b_flags & B_ERROR) && 1593 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 1594 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 1595 cmn_err(CE_WARN, "md %s: read error on %s", 1596 md_shortname(MD_SID(un)), 1597 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 1598 1599 if ((cs->cs_pbuf.b_flags & B_ERROR) && 1600 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 1601 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 1602 cmn_err(CE_WARN, "md %s: read error on %s", 1603 md_shortname(MD_SID(un)), 1604 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 1605 1606 md_unit_readerexit(ui); 1607 1608 ASSERT(cs->cs_frags == 0); 1609 1610 /* now schedule processing for possible state change */ 1611 daemon_request(&md_mstr_daemon, raid_rderr, 1612 (daemon_queue_t *)cs, REQ_OLD); 1613 1614 } 1615 1616 /* 1617 * NAMES: getdbuffer 1618 * DESCRIPTION: data buffer allocation for a child structure 1619 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1620 * 1621 * NOTE: always get dbuffer before pbuffer 1622 * and get both buffers before pwslot 1623 * otherwise a deadlock could be introduced. 1624 */ 1625 static void 1626 getdbuffer(md_raidcs_t *cs) 1627 { 1628 mr_unit_t *un; 1629 1630 cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1631 if (cs->cs_dbuffer != NULL) 1632 return; 1633 un = cs->cs_ps->ps_un; 1634 mutex_enter(&un->un_mx); 1635 while (un->un_dbuffer == NULL) { 1636 STAT_INC(data_buffer_waits); 1637 un->un_rflags |= MD_RFLAG_NEEDBUF; 1638 cv_wait(&un->un_cv, &un->un_mx); 1639 } 1640 cs->cs_dbuffer = un->un_dbuffer; 1641 cs->cs_flags |= MD_RCS_UNDBUF; 1642 un->un_dbuffer = NULL; 1643 mutex_exit(&un->un_mx); 1644 } 1645 1646 /* 1647 * NAMES: getpbuffer 1648 * DESCRIPTION: parity buffer allocation for a child structure 1649 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1650 * 1651 * NOTE: always get dbuffer before pbuffer 1652 * and get both buffers before pwslot 1653 * otherwise a deadlock could be introduced. 
1654 */ 1655 static void 1656 getpbuffer(md_raidcs_t *cs) 1657 { 1658 mr_unit_t *un; 1659 1660 cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1661 if (cs->cs_pbuffer != NULL) 1662 return; 1663 un = cs->cs_ps->ps_un; 1664 mutex_enter(&un->un_mx); 1665 while (un->un_pbuffer == NULL) { 1666 STAT_INC(parity_buffer_waits); 1667 un->un_rflags |= MD_RFLAG_NEEDBUF; 1668 cv_wait(&un->un_cv, &un->un_mx); 1669 } 1670 cs->cs_pbuffer = un->un_pbuffer; 1671 cs->cs_flags |= MD_RCS_UNPBUF; 1672 un->un_pbuffer = NULL; 1673 mutex_exit(&un->un_mx); 1674 } 1675 static void 1676 getresources(md_raidcs_t *cs) 1677 { 1678 md_raidcbuf_t *cbuf; 1679 /* 1680 * NOTE: always get dbuffer before pbuffer 1681 * and get both buffers before pwslot 1682 * otherwise a deadlock could be introduced. 1683 */ 1684 getdbuffer(cs); 1685 getpbuffer(cs); 1686 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 1687 cbuf->cbuf_buffer = 1688 kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); 1689 } 1690 /* 1691 * NAMES: freebuffers 1692 * DESCRIPTION: child structure buffer freeing routine 1693 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1694 */ 1695 static void 1696 freebuffers(md_raidcs_t *cs) 1697 { 1698 mr_unit_t *un; 1699 md_raidcbuf_t *cbuf; 1700 1701 /* free buffers used for full line write */ 1702 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 1703 if (cbuf->cbuf_buffer == NULL) 1704 continue; 1705 kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); 1706 cbuf->cbuf_buffer = NULL; 1707 cbuf->cbuf_bcount = 0; 1708 } 1709 1710 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1711 un = cs->cs_un; 1712 mutex_enter(&un->un_mx); 1713 } 1714 if (cs->cs_dbuffer) { 1715 if (cs->cs_flags & MD_RCS_UNDBUF) 1716 un->un_dbuffer = cs->cs_dbuffer; 1717 else 1718 kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); 1719 } 1720 if (cs->cs_pbuffer) { 1721 if (cs->cs_flags & MD_RCS_UNPBUF) 1722 un->un_pbuffer = cs->cs_pbuffer; 1723 else 1724 kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); 1725 } 1726 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1727 un->un_rflags &= ~MD_RFLAG_NEEDBUF; 1728 cv_broadcast(&un->un_cv); 1729 mutex_exit(&un->un_mx); 1730 } 1731 } 1732 1733 /* 1734 * NAMES: raid_line_reader_lock, raid_line_writer_lock 1735 * DESCRIPTION: RAID metadevice line reader and writer lock routines 1736 * data column # and parity column #. 
1737 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1738 */ 1739 1740 void 1741 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) 1742 { 1743 mr_unit_t *un; 1744 md_raidcs_t *cs1; 1745 1746 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1747 un = cs->cs_un; 1748 cs->cs_flags |= MD_RCS_READER; 1749 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1750 if (!panicstr) 1751 mutex_enter(&un->un_linlck_mx); 1752 cs1 = un->un_linlck_chn; 1753 while (cs1 != NULL) { 1754 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1755 if (raid_io_overlaps(cs, cs1) == 1) 1756 if (cs1->cs_flags & MD_RCS_WRITER) 1757 break; 1758 1759 if (cs1 != NULL) { 1760 if (panicstr) 1761 panic("md; raid line write lock held"); 1762 un->un_linlck_flg = 1; 1763 cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); 1764 STAT_INC(raid_read_waits); 1765 } 1766 } 1767 STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); 1768 STAT_INC(raid_reader_locks); 1769 cs1 = un->un_linlck_chn; 1770 if (cs1 != NULL) 1771 cs1->cs_linlck_prev = cs; 1772 cs->cs_linlck_next = cs1; 1773 cs->cs_linlck_prev = NULL; 1774 un->un_linlck_chn = cs; 1775 cs->cs_flags |= MD_RCS_LLOCKD; 1776 if (resync_thread) { 1777 diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 1778 diskaddr_t line = (lastblk + 1) / un->un_segsize; 1779 ASSERT(raid_state_cnt(un, RCS_RESYNC)); 1780 mutex_enter(&un->un_mx); 1781 un->un_resync_line_index = line; 1782 mutex_exit(&un->un_mx); 1783 } 1784 if (!panicstr) 1785 mutex_exit(&un->un_linlck_mx); 1786 } 1787 1788 int 1789 raid_line_writer_lock(md_raidcs_t *cs, int lock) 1790 { 1791 mr_unit_t *un; 1792 md_raidcs_t *cs1; 1793 1794 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1795 cs->cs_flags |= MD_RCS_WRITER; 1796 un = cs->cs_ps->ps_un; 1797 1798 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1799 if (lock && !panicstr) 1800 mutex_enter(&un->un_linlck_mx); 1801 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1802 1803 cs1 = un->un_linlck_chn; 1804 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1805 if (raid_io_overlaps(cs, cs1)) 1806 break; 1807 1808 if (cs1 != NULL) { 1809 if (panicstr) 1810 panic("md: line writer lock inaccessible"); 1811 goto no_lock_exit; 1812 } 1813 1814 if (raid_alloc_pwslot(cs)) { 1815 if (panicstr) 1816 panic("md: no prewrite slots"); 1817 STAT_INC(raid_prewrite_waits); 1818 goto no_lock_exit; 1819 } 1820 1821 cs1 = un->un_linlck_chn; 1822 if (cs1 != NULL) 1823 cs1->cs_linlck_prev = cs; 1824 cs->cs_linlck_next = cs1; 1825 cs->cs_linlck_prev = NULL; 1826 un->un_linlck_chn = cs; 1827 cs->cs_flags |= MD_RCS_LLOCKD; 1828 cs->cs_flags &= ~MD_RCS_WAITING; 1829 STAT_INC(raid_writer_locks); 1830 STAT_MAX(raid_max_write_locks, raid_write_locks_active); 1831 if (lock && !panicstr) 1832 mutex_exit(&un->un_linlck_mx); 1833 return (0); 1834 1835 no_lock_exit: 1836 /* if this is already queued then do not requeue it */ 1837 ASSERT(! 
(cs->cs_flags & MD_RCS_LLOCKD)); 1838 if (!lock || (cs->cs_flags & MD_RCS_WAITING)) 1839 return (1); 1840 cs->cs_flags |= MD_RCS_WAITING; 1841 cs->cs_un = un; 1842 raid_enqueue(cs); 1843 if (lock && !panicstr) 1844 mutex_exit(&un->un_linlck_mx); 1845 return (1); 1846 } 1847 1848 static void 1849 raid_startio(md_raidcs_t *cs) 1850 { 1851 mdi_unit_t *ui = cs->cs_ps->ps_ui; 1852 mr_unit_t *un = cs->cs_un; 1853 1854 un = md_unit_readerlock(ui); 1855 raid_write_io(un, cs); 1856 } 1857 1858 void 1859 raid_io_startup(mr_unit_t *un) 1860 { 1861 md_raidcs_t *waiting_list, *cs1; 1862 md_raidcs_t *previous = NULL, *next = NULL; 1863 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1864 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 1865 1866 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1867 mutex_enter(io_list_mutex); 1868 1869 /* 1870 * check to be sure there are no reader locks outstanding. If 1871 * there are not then pass on the writer lock. 1872 */ 1873 waiting_list = ui->ui_io_lock->io_list_front; 1874 while (waiting_list) { 1875 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1876 ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); 1877 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1878 if (raid_io_overlaps(waiting_list, cs1) == 1) 1879 break; 1880 /* 1881 * there was an IOs that overlaps this io so go onto 1882 * the next io in the waiting list 1883 */ 1884 if (cs1) { 1885 previous = waiting_list; 1886 waiting_list = waiting_list->cs_linlck_next; 1887 continue; 1888 } 1889 1890 /* 1891 * There are no IOs that overlap this, so remove it from 1892 * the waiting queue, and start it 1893 */ 1894 1895 if (raid_check_pw(waiting_list)) { 1896 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1897 previous = waiting_list; 1898 waiting_list = waiting_list->cs_linlck_next; 1899 continue; 1900 } 1901 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1902 1903 next = waiting_list->cs_linlck_next; 1904 if (previous) 1905 previous->cs_linlck_next = next; 1906 else 1907 ui->ui_io_lock->io_list_front = next; 1908 1909 if (ui->ui_io_lock->io_list_front == NULL) 1910 ui->ui_io_lock->io_list_back = NULL; 1911 1912 if (ui->ui_io_lock->io_list_back == waiting_list) 1913 ui->ui_io_lock->io_list_back = previous; 1914 1915 waiting_list->cs_linlck_next = NULL; 1916 waiting_list->cs_flags &= ~MD_RCS_WAITING; 1917 STAT_DEC(raid_write_queue_length); 1918 if (raid_line_writer_lock(waiting_list, 0)) 1919 panic("region locking corrupted"); 1920 1921 ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); 1922 daemon_request(&md_mstr_daemon, raid_startio, 1923 (daemon_queue_t *)waiting_list, REQ_OLD); 1924 waiting_list = next; 1925 1926 } 1927 mutex_exit(io_list_mutex); 1928 } 1929 1930 void 1931 raid_line_exit(md_raidcs_t *cs) 1932 { 1933 mr_unit_t *un; 1934 1935 un = cs->cs_ps->ps_un; 1936 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1937 mutex_enter(&un->un_linlck_mx); 1938 if (cs->cs_flags & MD_RCS_READER) 1939 STAT_DEC(raid_reader_locks_active); 1940 else 1941 STAT_DEC(raid_write_locks_active); 1942 1943 if (cs->cs_linlck_prev) 1944 cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; 1945 else 1946 un->un_linlck_chn = cs->cs_linlck_next; 1947 if (cs->cs_linlck_next) 1948 cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; 1949 1950 cs->cs_flags &= ~MD_RCS_LLOCKD; 1951 1952 if (un->un_linlck_flg) 1953 cv_broadcast(&un->un_linlck_cv); 1954 1955 un->un_linlck_flg = 0; 1956 cs->cs_line = MD_DISKADDR_ERROR; 1957 1958 raid_cancel_pwslot(cs); 1959 /* 1960 * now that the lock is droped go 
ahead and see if there are any 1961 * other writes that can be started up 1962 */ 1963 raid_io_startup(un); 1964 1965 mutex_exit(&un->un_linlck_mx); 1966 } 1967 1968 /* 1969 * NAMES: raid_line, raid_pcolumn, raid_dcolumn 1970 * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, 1971 * data column # and parity column #. 1972 * PARAMETERS: int segment - segment number 1973 * mr_unit_t *un - pointer to an unit structure 1974 * RETURNS: raid_line returns line # 1975 * raid_dcolumn returns data column # 1976 * raid_pcolumn returns parity column # 1977 */ 1978 static diskaddr_t 1979 raid_line(diskaddr_t segment, mr_unit_t *un) 1980 { 1981 diskaddr_t adj_seg; 1982 diskaddr_t line; 1983 diskaddr_t max_orig_segment; 1984 1985 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1986 if (segment >= max_orig_segment) { 1987 adj_seg = segment - max_orig_segment; 1988 line = adj_seg % un->un_segsincolumn; 1989 } else { 1990 line = segment / (un->un_origcolumncnt - 1); 1991 } 1992 return (line); 1993 } 1994 1995 uint_t 1996 raid_dcolumn(diskaddr_t segment, mr_unit_t *un) 1997 { 1998 diskaddr_t adj_seg; 1999 diskaddr_t line; 2000 diskaddr_t max_orig_segment; 2001 uint_t column; 2002 2003 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2004 if (segment >= max_orig_segment) { 2005 adj_seg = segment - max_orig_segment; 2006 column = un->un_origcolumncnt + 2007 (uint_t)(adj_seg / un->un_segsincolumn); 2008 } else { 2009 line = segment / (un->un_origcolumncnt - 1); 2010 column = (uint_t)((segment % 2011 (un->un_origcolumncnt - 1) + line) % un->un_origcolumncnt); 2012 } 2013 return (column); 2014 } 2015 2016 uint_t 2017 raid_pcolumn(diskaddr_t segment, mr_unit_t *un) 2018 { 2019 diskaddr_t adj_seg; 2020 diskaddr_t line; 2021 diskaddr_t max_orig_segment; 2022 uint_t column; 2023 2024 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2025 if (segment >= max_orig_segment) { 2026 adj_seg = segment - max_orig_segment; 2027 line = adj_seg % un->un_segsincolumn; 2028 } else { 2029 line = segment / (un->un_origcolumncnt - 1); 2030 } 2031 column = (uint_t)((line + (un->un_origcolumncnt - 1)) % 2032 un->un_origcolumncnt); 2033 return (column); 2034 } 2035 2036 2037 /* 2038 * Is called in raid_iosetup to probe each column to insure 2039 * that all the columns are in 'okay' state and meet the 2040 * 'full line' requirement. If any column is in error, 2041 * we don't want to enable the 'full line' flag. Previously, 2042 * we would do so and disable it only when a error is 2043 * detected after the first 'full line' io which is too late 2044 * and leads to the potential data corruption. 2045 */ 2046 static int 2047 raid_check_cols(mr_unit_t *un) 2048 { 2049 buf_t bp; 2050 char *buf; 2051 mr_column_t *colptr; 2052 minor_t mnum = MD_SID(un); 2053 int i; 2054 int err = 0; 2055 2056 buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); 2057 2058 for (i = 0; i < un->un_totalcolumncnt; i++) { 2059 md_dev64_t tmpdev; 2060 2061 colptr = &un->un_column[i]; 2062 2063 tmpdev = colptr->un_dev; 2064 /* 2065 * Open by device id 2066 * If this device is hotspared 2067 * use the hotspare key 2068 */ 2069 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 
2070 colptr->un_hs_key : colptr->un_orig_key); 2071 2072 if (tmpdev == NODEV64) { 2073 err = 1; 2074 break; 2075 } 2076 2077 colptr->un_dev = tmpdev; 2078 2079 bzero((caddr_t)&bp, sizeof (buf_t)); 2080 bp.b_back = &bp; 2081 bp.b_forw = &bp; 2082 bp.b_flags = (B_READ | B_BUSY); 2083 sema_init(&bp.b_io, 0, NULL, 2084 SEMA_DEFAULT, NULL); 2085 sema_init(&bp.b_sem, 0, NULL, 2086 SEMA_DEFAULT, NULL); 2087 bp.b_edev = md_dev64_to_dev(colptr->un_dev); 2088 bp.b_lblkno = colptr->un_pwstart; 2089 bp.b_bcount = DEV_BSIZE; 2090 bp.b_bufsize = DEV_BSIZE; 2091 bp.b_un.b_addr = (caddr_t)buf; 2092 (void) md_call_strategy(&bp, 0, NULL); 2093 if (biowait(&bp)) { 2094 err = 1; 2095 break; 2096 } 2097 } 2098 2099 kmem_free(buf, DEV_BSIZE); 2100 return (err); 2101 } 2102 2103 /* 2104 * NAME: raid_iosetup 2105 * DESCRIPTION: RAID metadevice specific I/O set up routine which does 2106 * all the necessary calculations to determine the location 2107 * of the segement for the I/O. 2108 * PARAMETERS: mr_unit_t *un - unit number of RAID metadevice 2109 * diskaddr_t blkno - block number of the I/O attempt 2110 * size_t blkcnt - block count for this I/O 2111 * md_raidcs_t *cs - child structure for each segmented I/O 2112 * 2113 * NOTE: The following is an example of a raid disk layer out: 2114 * 2115 * Total Column = 5 2116 * Original Column = 4 2117 * Segment Per Column = 10 2118 * 2119 * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6 2120 * ------------------------------------------------------------- 2121 * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40 2122 * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31 2123 * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32 2124 * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33 2125 * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34 2126 * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35 2127 * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36 2128 * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37 2129 * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38 2130 * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39 2131 */ 2132 static size_t 2133 raid_iosetup( 2134 mr_unit_t *un, 2135 diskaddr_t blkno, 2136 size_t blkcnt, 2137 md_raidcs_t *cs 2138 ) 2139 { 2140 diskaddr_t segment; 2141 diskaddr_t segstart; 2142 diskaddr_t segoff; 2143 size_t leftover; 2144 diskaddr_t line; 2145 uint_t iosize; 2146 uint_t colcnt; 2147 2148 /* caculate the segment# and offset for the block */ 2149 segment = blkno / un->un_segsize; 2150 segstart = segment * un->un_segsize; 2151 segoff = blkno - segstart; 2152 iosize = un->un_iosize - 1; 2153 colcnt = un->un_totalcolumncnt - 1; 2154 line = raid_line(segment, un); 2155 cs->cs_dcolumn = raid_dcolumn(segment, un); 2156 cs->cs_pcolumn = raid_pcolumn(segment, un); 2157 cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags; 2158 cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags; 2159 cs->cs_line = line; 2160 2161 if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) && 2162 (UNIT_STATE(un) & RCS_OKAY) && 2163 (segoff == 0) && 2164 (un->un_totalcolumncnt == un->un_origcolumncnt) && 2165 (un->un_segsize < un->un_iosize) && 2166 (un->un_iosize <= un->un_maxio) && 2167 (blkno == line * un->un_segsize * colcnt) && 2168 (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) && 2169 (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) && 2170 (raid_check_cols(un) == 0)) { 2171 2172 md_raidcbuf_t **cbufp; 2173 md_raidcbuf_t *cbuf; 2174 int i, j; 2175 2176 STAT_INC(raid_full_line_writes); 2177 leftover = blkcnt - (un->un_segsize * colcnt); 2178 ASSERT(blkcnt >= (un->un_segsize * colcnt)); 2179 cs->cs_blkno = line * un->un_segsize; 2180 
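/* Full-line write: this child covers one segment in the data and parity columns; a cbuf is allocated below for each remaining data column in the line. */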
cs->cs_blkcnt = un->un_segsize; 2181 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2182 cs->cs_bcount = dbtob(cs->cs_blkcnt); 2183 cs->cs_flags |= MD_RCS_LINE; 2184 2185 cbufp = &cs->cs_buflist; 2186 for (i = 0; i < un->un_totalcolumncnt; i++) { 2187 j = cs->cs_dcolumn + i; 2188 j = j % un->un_totalcolumncnt; 2189 2190 if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn)) 2191 continue; 2192 cbuf = kmem_cache_alloc(raid_cbuf_cache, 2193 MD_ALLOCFLAGS); 2194 raid_cbuf_init(cbuf); 2195 cbuf->cbuf_un = cs->cs_un; 2196 cbuf->cbuf_ps = cs->cs_ps; 2197 cbuf->cbuf_column = j; 2198 cbuf->cbuf_bcount = dbtob(un->un_segsize); 2199 *cbufp = cbuf; 2200 cbufp = &cbuf->cbuf_next; 2201 } 2202 return (leftover); 2203 } 2204 2205 leftover = blkcnt - (un->un_segsize - segoff); 2206 if (blkcnt > (un->un_segsize - segoff)) 2207 blkcnt -= leftover; 2208 else 2209 leftover = 0; 2210 2211 if (blkcnt > (size_t)iosize) { 2212 leftover += (blkcnt - iosize); 2213 blkcnt = iosize; 2214 } 2215 2216 /* calculate the line# and column# for the segment */ 2217 cs->cs_flags &= ~MD_RCS_LINE; 2218 cs->cs_blkno = line * un->un_segsize + segoff; 2219 cs->cs_blkcnt = (uint_t)blkcnt; 2220 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2221 cs->cs_bcount = dbtob((uint_t)blkcnt); 2222 return (leftover); 2223 } 2224 2225 /* 2226 * NAME: raid_done 2227 * DESCRIPTION: RAID metadevice I/O done interrupt routine 2228 * PARAMETERS: struct buf *bp - pointer to a buffer structure 2229 */ 2230 static void 2231 raid_done(struct buf *bp) 2232 { 2233 md_raidcs_t *cs; 2234 int flags, frags; 2235 2236 sema_v(&bp->b_io); 2237 cs = (md_raidcs_t *)bp->b_chain; 2238 2239 ASSERT(cs != NULL); 2240 2241 mutex_enter(&cs->cs_mx); 2242 if (bp->b_flags & B_ERROR) { 2243 cs->cs_flags |= MD_RCS_ERROR; 2244 cs->cs_flags &= ~(MD_RCS_ISCALL); 2245 } 2246 2247 flags = cs->cs_flags; 2248 frags = --cs->cs_frags; 2249 mutex_exit(&cs->cs_mx); 2250 if (frags != 0) { 2251 return; 2252 } 2253 2254 if (flags & MD_RCS_ERROR) { 2255 if (cs->cs_error_call) { 2256 daemon_request(&md_done_daemon, cs->cs_error_call, 2257 (daemon_queue_t *)cs, REQ_OLD); 2258 } 2259 return; 2260 } 2261 2262 if (flags & MD_RCS_ISCALL) { 2263 cs->cs_flags &= ~(MD_RCS_ISCALL); 2264 (*(cs->cs_call))(cs); 2265 return; 2266 } 2267 daemon_request(&md_done_daemon, cs->cs_call, 2268 (daemon_queue_t *)cs, REQ_OLD); 2269 } 2270 /* 2271 * the flag RIO_EXTRA is used when dealing with a column in the process 2272 * of being resynced. During the resync, writes may have to take place 2273 * on both the original component and a hotspare component. 
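 * In raidio() below, RIO_ALT selects the column's alternate (hotspare)
 * device for the transfer and RIO_EXTRA selects the spare buf structure
 * (cs_hbuf), so the original and alternate writes can be outstanding at
 * the same time.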
2274 */ 2275 #define RIO_DATA 0x00100 /* use data buffer & data column */ 2276 #define RIO_PARITY 0x00200 /* use parity buffer & parity column */ 2277 #define RIO_WRITE 0x00400 /* issue a write */ 2278 #define RIO_READ 0x00800 /* issue a read */ 2279 #define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ 2280 #define RIO_ALT 0x02000 /* do write to alternate device */ 2281 #define RIO_EXTRA 0x04000 /* use extra buffer */ 2282 2283 #define RIO_COLMASK 0x000ff 2284 2285 #define RIO_PREWRITE RIO_WRITE | RIO_PWIO 2286 2287 /* 2288 * NAME: raidio 2289 * DESCRIPTION: RAID metadevice component I/O routine; issues a single read or write to a data or parity column, or to its prewrite area 2290 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure; int flags - RIO_* flags selecting the buffer, column and type of I/O 2291 */ 2292 static void 2293 raidio(md_raidcs_t *cs, int flags) 2294 { 2295 buf_t *bp; 2296 int column; 2297 int flag; 2298 void *private; 2299 mr_unit_t *un; 2300 int iosize; 2301 diskaddr_t pwstart; 2302 diskaddr_t devstart; 2303 md_dev64_t dev; 2304 2305 un = cs->cs_un; 2306 2307 ASSERT(IO_READER_HELD(un)); 2308 ASSERT(UNIT_READER_HELD(un)); 2309 2310 if (flags & RIO_DATA) { 2311 if (flags & RIO_EXTRA) 2312 bp = &cs->cs_hbuf; 2313 else 2314 bp = &cs->cs_dbuf; 2315 bp->b_un.b_addr = cs->cs_dbuffer; 2316 column = cs->cs_dcolumn; 2317 } else { 2318 if (flags & RIO_EXTRA) 2319 bp = &cs->cs_hbuf; 2320 else 2321 bp = &cs->cs_pbuf; 2322 bp->b_un.b_addr = cs->cs_pbuffer; 2323 column = cs->cs_pcolumn; 2324 } 2325 if (flags & RIO_COLMASK) 2326 column = (flags & RIO_COLMASK) - 1; 2327 2328 bp->b_bcount = cs->cs_bcount; 2329 bp->b_bufsize = cs->cs_bcount; 2330 iosize = un->un_iosize; 2331 2332 /* check if the hotspared device will be used */ 2333 if (flags & RIO_ALT && (flags & RIO_WRITE)) { 2334 pwstart = un->un_column[column].un_alt_pwstart; 2335 devstart = un->un_column[column].un_alt_devstart; 2336 dev = un->un_column[column].un_alt_dev; 2337 } else { 2338 pwstart = un->un_column[column].un_pwstart; 2339 devstart = un->un_column[column].un_devstart; 2340 dev = un->un_column[column].un_dev; 2341 } 2342 2343 /* if not doing prewrite I/O, skip the prewrite header in the buffer */ 2344 if ((flags & RIO_PWIO) == 0) { 2345 bp->b_lblkno = devstart + cs->cs_blkno; 2346 bp->b_un.b_addr += DEV_BSIZE; 2347 } else { 2348 bp->b_bcount += DEV_BSIZE; 2349 bp->b_bufsize = bp->b_bcount; 2350 if (flags & RIO_DATA) { 2351 bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; 2352 } else { /* not DATA -> PARITY */ 2353 bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; 2354 } 2355 } 2356 2357 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); 2358 bp->b_flags |= B_BUSY; 2359 if (flags & RIO_READ) { 2360 bp->b_flags |= B_READ; 2361 } else { 2362 bp->b_flags |= B_WRITE; 2363 if ((nv_available && nv_parity && (flags & RIO_PARITY)) || 2364 (nv_available && nv_prewrite && (flags & RIO_PWIO))) 2365 bp->b_flags |= nv_available; 2366 } 2367 bp->b_iodone = (int (*)())raid_done; 2368 bp->b_edev = md_dev64_to_dev(dev); 2369 2370 ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); 2371 2372 private = cs->cs_strategy_private; 2373 flag = cs->cs_strategy_flag; 2374 2375 md_call_strategy(bp, flag, private); 2376 } 2377 2378 /* 2379 * NAME: genstandardparity 2380 * DESCRIPTION: This routine computes the new parity for a read-modify-write update: it XORs the old data, old parity and new user data into the parity buffer, copies the new data into the data buffer, and fills in the prewrite headers of both buffers 2381 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2382 */ 2383 static void 2384 genstandardparity(md_raidcs_t *cs) 2385 { 2386 uint_t *dbuf, *pbuf; 2387 size_t wordcnt; 2388 uint_t dsum = 0; 2389 uint_t psum = 0; 2390 2391 ASSERT((cs->cs_bcount & 0x3) == 0); 2392 2393 wordcnt = cs->cs_bcount / sizeof (uint_t); 2394 2395 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2396 pbuf =
(uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2397 2398 /* Word aligned */ 2399 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2400 uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); 2401 uint_t uval; 2402 2403 while (wordcnt--) { 2404 uval = *uwbuf++; 2405 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); 2406 ++pbuf; 2407 *dbuf = uval; 2408 dsum ^= uval; 2409 ++dbuf; 2410 } 2411 } else { 2412 uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); 2413 union { 2414 uint_t wb; 2415 uchar_t bb[4]; 2416 } cb; 2417 2418 while (wordcnt--) { 2419 cb.bb[0] = *ubbuf++; 2420 cb.bb[1] = *ubbuf++; 2421 cb.bb[2] = *ubbuf++; 2422 cb.bb[3] = *ubbuf++; 2423 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); 2424 ++pbuf; 2425 *dbuf = cb.wb; 2426 dsum ^= cb.wb; 2427 ++dbuf; 2428 } 2429 } 2430 2431 RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, 2432 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2433 2, cs->cs_dcolumn, RAID_PWMAGIC); 2434 2435 RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, 2436 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2437 2, cs->cs_pcolumn, RAID_PWMAGIC); 2438 } 2439 2440 static void 2441 genlineparity(md_raidcs_t *cs) 2442 { 2443 2444 mr_unit_t *un = cs->cs_un; 2445 md_raidcbuf_t *cbuf; 2446 uint_t *pbuf, *dbuf; 2447 uint_t *uwbuf; 2448 uchar_t *ubbuf; 2449 size_t wordcnt; 2450 uint_t psum = 0, dsum = 0; 2451 size_t count = un->un_segsize * DEV_BSIZE; 2452 uint_t col; 2453 buf_t *bp; 2454 2455 ASSERT((cs->cs_bcount & 0x3) == 0); 2456 2457 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2458 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2459 uwbuf = (uint_t *)(void *)(cs->cs_addr); 2460 ubbuf = (uchar_t *)(void *)(cs->cs_addr); 2461 2462 wordcnt = count / sizeof (uint_t); 2463 2464 /* Word aligned */ 2465 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2466 uint_t uval; 2467 2468 while (wordcnt--) { 2469 uval = *uwbuf++; 2470 *dbuf = uval; 2471 *pbuf = uval; 2472 dsum ^= uval; 2473 ++pbuf; 2474 ++dbuf; 2475 } 2476 } else { 2477 union { 2478 uint_t wb; 2479 uchar_t bb[4]; 2480 } cb; 2481 2482 while (wordcnt--) { 2483 cb.bb[0] = *ubbuf++; 2484 cb.bb[1] = *ubbuf++; 2485 cb.bb[2] = *ubbuf++; 2486 cb.bb[3] = *ubbuf++; 2487 *dbuf = cb.wb; 2488 *pbuf = cb.wb; 2489 dsum ^= cb.wb; 2490 ++pbuf; 2491 ++dbuf; 2492 } 2493 } 2494 2495 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, 2496 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2497 un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); 2498 2499 raidio(cs, RIO_PREWRITE | RIO_DATA); 2500 2501 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 2502 2503 dsum = 0; 2504 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2505 dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); 2506 2507 wordcnt = count / sizeof (uint_t); 2508 2509 col = cbuf->cbuf_column; 2510 2511 /* Word aligned */ 2512 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2513 uint_t uval; 2514 2515 /* 2516 * Only calculate psum when working on the last 2517 * data buffer. 2518 */ 2519 if (cbuf->cbuf_next == NULL) { 2520 psum = 0; 2521 while (wordcnt--) { 2522 uval = *uwbuf++; 2523 *dbuf = uval; 2524 psum ^= (*pbuf ^= uval); 2525 dsum ^= uval; 2526 ++dbuf; 2527 ++pbuf; 2528 } 2529 } else { 2530 while (wordcnt--) { 2531 uval = *uwbuf++; 2532 *dbuf = uval; 2533 *pbuf ^= uval; 2534 dsum ^= uval; 2535 ++dbuf; 2536 ++pbuf; 2537 } 2538 } 2539 } else { 2540 union { 2541 uint_t wb; 2542 uchar_t bb[4]; 2543 } cb; 2544 2545 /* 2546 * Only calculate psum when working on the last 2547 * data buffer. 
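 * The parity buffer accumulates the XOR of every column, so psum (the
 * parity checksum) is only valid once the last column has been folded in.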
2548 */ 2549 if (cbuf->cbuf_next == NULL) { 2550 psum = 0; 2551 while (wordcnt--) { 2552 cb.bb[0] = *ubbuf++; 2553 cb.bb[1] = *ubbuf++; 2554 cb.bb[2] = *ubbuf++; 2555 cb.bb[3] = *ubbuf++; 2556 *dbuf = cb.wb; 2557 psum ^= (*pbuf ^= cb.wb); 2558 dsum ^= cb.wb; 2559 ++dbuf; 2560 ++pbuf; 2561 } 2562 } else { 2563 while (wordcnt--) { 2564 cb.bb[0] = *ubbuf++; 2565 cb.bb[1] = *ubbuf++; 2566 cb.bb[2] = *ubbuf++; 2567 cb.bb[3] = *ubbuf++; 2568 *dbuf = cb.wb; 2569 *pbuf ^= cb.wb; 2570 dsum ^= cb.wb; 2571 ++dbuf; 2572 ++pbuf; 2573 } 2574 } 2575 } 2576 RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, 2577 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2578 un->un_totalcolumncnt, col, RAID_PWMAGIC); 2579 2580 /* 2581 * fill in buffer for write to prewrite area 2582 */ 2583 bp = &cbuf->cbuf_bp; 2584 bp->b_un.b_addr = cbuf->cbuf_buffer; 2585 bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; 2586 bp->b_bufsize = bp->b_bcount; 2587 bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + 2588 un->un_column[col].un_pwstart; 2589 bp->b_flags = B_WRITE | B_BUSY; 2590 if (nv_available && nv_prewrite) 2591 bp->b_flags |= nv_available; 2592 bp->b_iodone = (int (*)())raid_done; 2593 bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); 2594 bp->b_chain = (struct buf *)cs; 2595 md_call_strategy(bp, 2596 cs->cs_strategy_flag, cs->cs_strategy_private); 2597 } 2598 2599 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, 2600 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2601 un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); 2602 2603 raidio(cs, RIO_PREWRITE | RIO_PARITY); 2604 } 2605 2606 /* 2607 * NAME: raid_readregenloop 2608 * DESCRIPTION: RAID metadevice write routine 2609 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2610 */ 2611 static void 2612 raid_readregenloop(md_raidcs_t *cs) 2613 { 2614 mr_unit_t *un; 2615 md_raidps_t *ps; 2616 uint_t *dbuf; 2617 uint_t *pbuf; 2618 size_t wordcnt; 2619 2620 un = cs->cs_un; 2621 2622 /* 2623 * XOR the parity with data bytes, must skip the 2624 * pre-write entry header in all data/parity buffers 2625 */ 2626 wordcnt = cs->cs_bcount / sizeof (uint_t); 2627 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2628 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2629 while (wordcnt--) 2630 *dbuf++ ^= *pbuf++; 2631 2632 /* bump up the loop count */ 2633 cs->cs_loop++; 2634 2635 /* skip the errored component */ 2636 if (cs->cs_loop == cs->cs_dcolumn) 2637 cs->cs_loop++; 2638 2639 if (cs->cs_loop != un->un_totalcolumncnt) { 2640 cs->cs_frags = 1; 2641 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2642 return; 2643 } 2644 /* reaching the end sof loop */ 2645 ps = cs->cs_ps; 2646 bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); 2647 raid_free_child(cs, 1); 2648 2649 /* decrement readfrags */ 2650 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2651 } 2652 2653 /* 2654 * NAME: raid_read_io 2655 * DESCRIPTION: RAID metadevice read I/O routine 2656 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2657 * md_raidcs_t *cs - pointer to a child structure 2658 */ 2659 static void 2660 raid_read_io(mr_unit_t *un, md_raidcs_t *cs) 2661 { 2662 int flag; 2663 void *private; 2664 buf_t *bp; 2665 buf_t *pb = cs->cs_ps->ps_bp; 2666 mr_column_t *column; 2667 2668 flag = cs->cs_strategy_flag; 2669 private = cs->cs_strategy_private; 2670 column = &un->un_column[cs->cs_dcolumn]; 2671 2672 /* 2673 * The component to be read is good, simply set up bp structure 2674 * and call low level md routine doing the read. 
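 * A column in LAST_ERRED state is also read directly here unless this is
 * a recovery I/O; otherwise the data is regenerated from the remaining
 * columns via raid_readregenloop().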
2675 */ 2676 2677 if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || 2678 (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && 2679 (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { 2680 dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ 2681 ddi_dev = md_dev64_to_dev(column->un_dev); 2682 2683 bp = &cs->cs_dbuf; 2684 bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, 2685 column->un_devstart + cs->cs_blkno, 2686 (int (*)())raid_done, bp, KM_NOSLEEP); 2687 2688 bp->b_chain = (buf_t *)cs; 2689 2690 cs->cs_frags = 1; 2691 cs->cs_error_call = raid_read_error; 2692 cs->cs_retry_call = raid_read_retry; 2693 cs->cs_flags |= MD_RCS_ISCALL; 2694 cs->cs_stage = RAID_READ_DONE; 2695 cs->cs_call = raid_stage; 2696 2697 ASSERT(bp->b_edev != 0); 2698 2699 md_call_strategy(bp, flag, private); 2700 return; 2701 } 2702 2703 /* 2704 * The component to be read is bad, have to go through 2705 * raid specific method to read data from other members. 2706 */ 2707 cs->cs_loop = 0; 2708 /* 2709 * NOTE: always get dbuffer before pbuffer 2710 * and get both buffers before pwslot 2711 * otherwise a deadlock could be introduced. 2712 */ 2713 raid_mapin_buf(cs); 2714 getdbuffer(cs); 2715 getpbuffer(cs); 2716 if (cs->cs_loop == cs->cs_dcolumn) 2717 cs->cs_loop++; 2718 2719 /* zero out data buffer for use as a data sink */ 2720 bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); 2721 cs->cs_stage = RAID_NONE; 2722 cs->cs_call = raid_readregenloop; 2723 cs->cs_error_call = raid_read_error; 2724 cs->cs_retry_call = raid_read_no_retry; 2725 cs->cs_frags = 1; 2726 2727 /* use parity buffer to read other columns */ 2728 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2729 } 2730 2731 /* 2732 * NAME: raid_read 2733 * DESCRIPTION: RAID metadevice write routine 2734 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2735 * md_raidcs_t *cs - pointer to a child structure 2736 */ 2737 static int 2738 raid_read(mr_unit_t *un, md_raidcs_t *cs) 2739 { 2740 int error = 0; 2741 md_raidps_t *ps; 2742 mdi_unit_t *ui; 2743 minor_t mnum; 2744 2745 ASSERT(IO_READER_HELD(un)); 2746 ps = cs->cs_ps; 2747 ui = ps->ps_ui; 2748 raid_line_reader_lock(cs, 0); 2749 un = (mr_unit_t *)md_unit_readerlock(ui); 2750 ASSERT(UNIT_STATE(un) != RUS_INIT); 2751 mnum = MD_SID(un); 2752 cs->cs_un = un; 2753 2754 /* make sure the read doesn't go beyond the end of the column */ 2755 if (cs->cs_blkno + cs->cs_blkcnt > 2756 un->un_segsize * un->un_segsincolumn) { 2757 error = ENXIO; 2758 } 2759 if (error) 2760 goto rerror; 2761 2762 if (un->un_state & RUS_REGEN) { 2763 raid_regen_parity(cs); 2764 un = MD_UNIT(mnum); 2765 cs->cs_un = un; 2766 } 2767 2768 raid_read_io(un, cs); 2769 return (0); 2770 2771 rerror: 2772 raid_error_parent(ps, error); 2773 raid_free_child(cs, 1); 2774 /* decrement readfrags */ 2775 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2776 return (0); 2777 } 2778 2779 /* 2780 * NAME: raid_write_err_retry 2781 * DESCRIPTION: RAID metadevice write retry routine 2782 * write was for parity or data only; 2783 * complete write with error, no recovery possible 2784 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2785 * md_raidcs_t *cs - pointer to a child structure 2786 */ 2787 /*ARGSUSED*/ 2788 static void 2789 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) 2790 { 2791 md_raidps_t *ps = cs->cs_ps; 2792 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2793 2794 /* decrement pwfrags if needed, and frags */ 2795 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2796 flags |= RFP_DECR_PWFRAGS; 2797 raid_error_parent(ps, EIO); 2798 
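/* free the child and drop this child's fragment counts on the parent */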
raid_free_child(cs, 1); 2799 raid_free_parent(ps, flags); 2800 } 2801 2802 /* 2803 * NAME: raid_write_no_retry 2804 * DESCRIPTION: RAID metadevice write retry routine 2805 * write is too far along to retry and parent 2806 * has already been signaled with iodone. 2807 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2808 * md_raidcs_t *cs - pointer to a child structure 2809 */ 2810 /*ARGSUSED*/ 2811 static void 2812 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) 2813 { 2814 md_raidps_t *ps = cs->cs_ps; 2815 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2816 2817 /* decrement pwfrags if needed, and frags */ 2818 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2819 flags |= RFP_DECR_PWFRAGS; 2820 raid_free_child(cs, 1); 2821 raid_free_parent(ps, flags); 2822 } 2823 2824 /* 2825 * NAME: raid_write_retry 2826 * DESCRIPTION: RAID metadevice write retry routine 2827 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2828 * md_raidcs_t *cs - pointer to a child structure 2829 */ 2830 static void 2831 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) 2832 { 2833 md_raidps_t *ps; 2834 2835 ps = cs->cs_ps; 2836 2837 /* re-initialize the buf_t structure for raid_write() */ 2838 cs->cs_dbuf.b_chain = (struct buf *)cs; 2839 cs->cs_dbuf.b_back = &cs->cs_dbuf; 2840 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 2841 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 2842 cs->cs_dbuf.b_error = 0; /* initialize error */ 2843 cs->cs_dbuf.b_offset = -1; 2844 /* Initialize semaphores */ 2845 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 2846 SEMA_DEFAULT, NULL); 2847 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 2848 SEMA_DEFAULT, NULL); 2849 2850 cs->cs_pbuf.b_chain = (struct buf *)cs; 2851 cs->cs_pbuf.b_back = &cs->cs_pbuf; 2852 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 2853 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 2854 cs->cs_pbuf.b_error = 0; /* initialize error */ 2855 cs->cs_pbuf.b_offset = -1; 2856 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 2857 SEMA_DEFAULT, NULL); 2858 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 2859 SEMA_DEFAULT, NULL); 2860 2861 cs->cs_hbuf.b_chain = (struct buf *)cs; 2862 cs->cs_hbuf.b_back = &cs->cs_hbuf; 2863 cs->cs_hbuf.b_forw = &cs->cs_hbuf; 2864 cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ 2865 cs->cs_hbuf.b_error = 0; /* initialize error */ 2866 cs->cs_hbuf.b_offset = -1; 2867 sema_init(&cs->cs_hbuf.b_io, 0, NULL, 2868 SEMA_DEFAULT, NULL); 2869 sema_init(&cs->cs_hbuf.b_sem, 0, NULL, 2870 SEMA_DEFAULT, NULL); 2871 2872 cs->cs_flags &= ~(MD_RCS_ERROR); 2873 /* 2874 * If the prewrite i/o has already been 'done'ed on this child 2875 * (MD_RCS_PWDONE is set), then reset the PWDONE flag and bump pwfrags 2876 * before restarting the i/o. 2877 * If pwfrags is zero, we have already 'iodone'd the i/o so 2878 * leave things alone. We don't want to re-'done' it. 2879 */ 2880 mutex_enter(&ps->ps_mx); 2881 if (cs->cs_flags & MD_RCS_PWDONE) { 2882 cs->cs_flags &= ~MD_RCS_PWDONE; 2883 ps->ps_pwfrags++; 2884 } 2885 mutex_exit(&ps->ps_mx); 2886 raid_write_io(un, cs); 2887 } 2888 2889 /* 2890 * NAME: raid_wrerr 2891 * DESCRIPTION: RAID metadevice write error handling routine; updates the column state for each errored buffer and then invokes the child's retry routine 2892 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2893 * LOCKS: must obtain unit writer lock while calling raid_error_state 2894 * since a unit or column state transition may take place. 2895 * must obtain unit reader lock to retry I/O.
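 * The actual retry is performed by the cs_retry_call routine that was
 * selected when the write was originally set up.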
2896 */ 2897 static void 2898 raid_wrerr(md_raidcs_t *cs) 2899 { 2900 md_raidps_t *ps; 2901 mdi_unit_t *ui; 2902 mr_unit_t *un; 2903 md_raidcbuf_t *cbuf; 2904 2905 ps = cs->cs_ps; 2906 ui = ps->ps_ui; 2907 2908 un = (mr_unit_t *)md_unit_writerlock(ui); 2909 ASSERT(un != 0); 2910 2911 if (cs->cs_dbuf.b_flags & B_ERROR) 2912 (void) raid_error_state(un, &cs->cs_dbuf); 2913 if (cs->cs_pbuf.b_flags & B_ERROR) 2914 (void) raid_error_state(un, &cs->cs_pbuf); 2915 if (cs->cs_hbuf.b_flags & B_ERROR) 2916 (void) raid_error_state(un, &cs->cs_hbuf); 2917 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2918 if (cbuf->cbuf_bp.b_flags & B_ERROR) 2919 (void) raid_error_state(un, &cbuf->cbuf_bp); 2920 2921 md_unit_writerexit(ui); 2922 2923 ps->ps_flags |= MD_RPS_HSREQ; 2924 2925 un = (mr_unit_t *)md_unit_readerlock(ui); 2926 2927 /* now attempt the appropriate retry routine */ 2928 (*(cs->cs_retry_call))(un, cs); 2929 } 2930 /* 2931 * NAMES: raid_write_error 2932 * DESCRIPTION: I/O error handling routine for a RAID metadevice write 2933 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 2934 */ 2935 /*ARGSUSED*/ 2936 static void 2937 raid_write_error(md_raidcs_t *cs) 2938 { 2939 md_raidps_t *ps; 2940 mdi_unit_t *ui; 2941 mr_unit_t *un; 2942 md_raidcbuf_t *cbuf; 2943 set_t setno; 2944 2945 ps = cs->cs_ps; 2946 ui = ps->ps_ui; 2947 un = cs->cs_un; 2948 2949 setno = MD_UN2SET(un); 2950 2951 /* 2952 * locate each buf that is in error on this io and then 2953 * output an error message 2954 */ 2955 if ((cs->cs_dbuf.b_flags & B_ERROR) && 2956 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 2957 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 2958 cmn_err(CE_WARN, "md %s: write error on %s", 2959 md_shortname(MD_SID(un)), 2960 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 2961 2962 if ((cs->cs_pbuf.b_flags & B_ERROR) && 2963 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 2964 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 2965 cmn_err(CE_WARN, "md %s: write error on %s", 2966 md_shortname(MD_SID(un)), 2967 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 2968 2969 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2970 if ((cbuf->cbuf_bp.b_flags & B_ERROR) && 2971 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && 2972 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) 2973 cmn_err(CE_WARN, "md %s: write error on %s", 2974 md_shortname(MD_SID(un)), 2975 md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), 2976 NULL, 0)); 2977 2978 md_unit_readerexit(ui); 2979 2980 ASSERT(cs->cs_frags == 0); 2981 2982 /* now schedule processing for possible state change */ 2983 daemon_request(&md_mstr_daemon, raid_wrerr, 2984 (daemon_queue_t *)cs, REQ_OLD); 2985 2986 } 2987 2988 /* 2989 * NAME: raid_write_ponly 2990 * DESCRIPTION: RAID metadevice write routine 2991 * in the case where only the parity column can be written 2992 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2993 */ 2994 static void 2995 raid_write_ponly(md_raidcs_t *cs) 2996 { 2997 md_raidps_t *ps; 2998 mr_unit_t *un = cs->cs_un; 2999 3000 ps = cs->cs_ps; 3001 /* decrement pwfrags if needed, but not frags */ 3002 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3003 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3004 cs->cs_flags |= MD_RCS_PWDONE; 3005 cs->cs_frags = 1; 3006 cs->cs_stage = RAID_WRITE_PONLY_DONE; 3007 cs->cs_call = raid_stage; 3008 cs->cs_error_call = raid_write_error; 3009 cs->cs_retry_call = raid_write_no_retry; 3010 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3011 
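/* the parity write must also go to the resync alternate (hotspare) device */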
cs->cs_frags++; 3012 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); 3013 } 3014 raidio(cs, RIO_PARITY | RIO_WRITE); 3015 } 3016 3017 /* 3018 * NAME: raid_write_ploop 3019 * DESCRIPTION: RAID metadevice write routine, constructs parity from 3020 * data in other columns. 3021 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3022 */ 3023 static void 3024 raid_write_ploop(md_raidcs_t *cs) 3025 { 3026 mr_unit_t *un = cs->cs_un; 3027 uint_t *dbuf; 3028 uint_t *pbuf; 3029 size_t wordcnt; 3030 uint_t psum = 0; 3031 3032 wordcnt = cs->cs_bcount / sizeof (uint_t); 3033 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3034 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3035 while (wordcnt--) 3036 *pbuf++ ^= *dbuf++; 3037 cs->cs_loop++; 3038 3039 /* 3040 * build parity from scratch using new data, 3041 * skip reading the data and parity columns. 3042 */ 3043 while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) 3044 cs->cs_loop++; 3045 3046 if (cs->cs_loop != un->un_totalcolumncnt) { 3047 cs->cs_frags = 1; 3048 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3049 return; 3050 } 3051 3052 /* construct checksum for parity buffer */ 3053 wordcnt = cs->cs_bcount / sizeof (uint_t); 3054 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3055 while (wordcnt--) { 3056 psum ^= *pbuf; 3057 pbuf++; 3058 } 3059 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, 3060 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3061 1, cs->cs_pcolumn, RAID_PWMAGIC); 3062 3063 cs->cs_stage = RAID_NONE; 3064 cs->cs_call = raid_write_ponly; 3065 cs->cs_error_call = raid_write_error; 3066 cs->cs_retry_call = raid_write_err_retry; 3067 cs->cs_frags = 1; 3068 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3069 cs->cs_frags++; 3070 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3071 } 3072 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3073 } 3074 3075 /* 3076 * NAME: raid_write_donly 3077 * DESCRIPTION: RAID metadevice write routine 3078 * Completed writing data to prewrite entry 3079 * in the case where only the data column can be written 3080 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3081 */ 3082 static void 3083 raid_write_donly(md_raidcs_t *cs) 3084 { 3085 md_raidps_t *ps; 3086 mr_unit_t *un = cs->cs_un; 3087 3088 ps = cs->cs_ps; 3089 /* WARNING: don't release unit reader lock here... 
*/ 3090 /* decrement pwfrags if needed, but not frags */ 3091 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3092 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3093 cs->cs_flags |= MD_RCS_PWDONE; 3094 cs->cs_frags = 1; 3095 cs->cs_stage = RAID_WRITE_DONLY_DONE; 3096 cs->cs_call = raid_stage; 3097 cs->cs_error_call = raid_write_error; 3098 cs->cs_retry_call = raid_write_err_retry; 3099 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3100 cs->cs_frags++; 3101 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3102 } 3103 raidio(cs, RIO_DATA | RIO_WRITE); 3104 } 3105 3106 /* 3107 * NAME: raid_write_got_old 3108 * DESCRIPTION: RAID metadevice write routine 3109 * completed read of old data and old parity 3110 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3111 */ 3112 static void 3113 raid_write_got_old(md_raidcs_t *cs) 3114 { 3115 mr_unit_t *un = cs->cs_un; 3116 3117 ASSERT(IO_READER_HELD(cs->cs_un)); 3118 ASSERT(UNIT_READER_HELD(cs->cs_un)); 3119 3120 raid_mapin_buf(cs); 3121 genstandardparity(cs); 3122 cs->cs_frags = 2; 3123 cs->cs_call = raid_stage; 3124 cs->cs_stage = RAID_PREWRITE_DONE; 3125 cs->cs_error_call = raid_write_error; 3126 cs->cs_retry_call = raid_write_retry; 3127 3128 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3129 cs->cs_frags++; 3130 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); 3131 } 3132 3133 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3134 cs->cs_frags++; 3135 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3136 } 3137 ASSERT(cs->cs_frags < 4); 3138 raidio(cs, RIO_DATA | RIO_PREWRITE); 3139 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3140 } 3141 3142 /* 3143 * NAME: raid_write_io 3144 * DESCRIPTION: RAID metadevice write I/O routine 3145 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3146 * md_raidcs_t *cs - pointer to a child structure 3147 */ 3148 3149 /*ARGSUSED*/ 3150 static void 3151 raid_write_io(mr_unit_t *un, md_raidcs_t *cs) 3152 { 3153 md_raidps_t *ps = cs->cs_ps; 3154 uint_t *dbuf; 3155 uint_t *ubuf; 3156 size_t wordcnt; 3157 uint_t dsum = 0; 3158 int pcheck; 3159 int dcheck; 3160 3161 ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & 3162 RCS_INIT) == 0); 3163 ASSERT((un->un_column[cs->cs_dcolumn].un_devstate & 3164 RCS_INIT) == 0); 3165 ASSERT(IO_READER_HELD(un)); 3166 ASSERT(UNIT_READER_HELD(un)); 3167 ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); 3168 if (cs->cs_flags & MD_RCS_LINE) { 3169 3170 mr_unit_t *un = cs->cs_un; 3171 3172 ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); 3173 raid_mapin_buf(cs); 3174 cs->cs_frags = un->un_origcolumncnt; 3175 cs->cs_call = raid_stage; 3176 cs->cs_error_call = raid_write_error; 3177 cs->cs_retry_call = raid_write_no_retry; 3178 cs->cs_stage = RAID_LINE_PWDONE; 3179 genlineparity(cs); 3180 return; 3181 } 3182 3183 pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); 3184 dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); 3185 cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck; 3186 3187 if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { 3188 int err = EIO; 3189 3190 if ((un->un_column[cs->cs_pcolumn].un_devstate == 3191 RCS_LAST_ERRED) || 3192 (un->un_column[cs->cs_dcolumn].un_devstate == 3193 RCS_LAST_ERRED)) 3194 err = ENXIO; 3195 raid_error_parent(ps, err); 3196 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3197 raid_free_child(cs, 1); 3198 raid_free_parent(ps, RFP_DECR_FRAGS 3199 | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); 3200 return; 3201 } 3202 3203 if (pcheck & RCL_ERRED) { 3204 /* 3205 * handle case of only having data drive 3206 */ 3207 raid_mapin_buf(cs); 
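/* copy the new user data into the data buffer and checksum it for the prewrite header filled in below */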
3208 wordcnt = cs->cs_bcount / sizeof (uint_t); 3209 3210 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3211 ubuf = (uint_t *)(void *)(cs->cs_addr); 3212 3213 while (wordcnt--) { 3214 *dbuf = *ubuf; 3215 dsum ^= *ubuf; 3216 dbuf++; 3217 ubuf++; 3218 } 3219 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, 3220 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3221 1, cs->cs_dcolumn, RAID_PWMAGIC); 3222 cs->cs_frags = 1; 3223 cs->cs_stage = RAID_NONE; 3224 cs->cs_call = raid_write_donly; 3225 cs->cs_error_call = raid_write_error; 3226 cs->cs_retry_call = raid_write_err_retry; 3227 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3228 cs->cs_frags++; 3229 raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | 3230 RIO_PREWRITE); 3231 } 3232 raidio(cs, RIO_DATA | RIO_PREWRITE); 3233 return; 3234 } 3235 3236 if (dcheck & RCL_ERRED) { 3237 /* 3238 * handle case of only having parity drive 3239 * build parity from scratch using new data, 3240 * skip reading the data and parity columns. 3241 */ 3242 raid_mapin_buf(cs); 3243 cs->cs_loop = 0; 3244 while (cs->cs_loop == cs->cs_dcolumn || 3245 cs->cs_loop == cs->cs_pcolumn) 3246 cs->cs_loop++; 3247 3248 /* copy new data in to begin building parity */ 3249 bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); 3250 cs->cs_stage = RAID_NONE; 3251 cs->cs_call = raid_write_ploop; 3252 cs->cs_error_call = raid_write_error; 3253 cs->cs_retry_call = raid_write_err_retry; 3254 cs->cs_frags = 1; 3255 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3256 return; 3257 } 3258 /* 3259 * handle normal cases 3260 * read old data and old parity 3261 */ 3262 cs->cs_frags = 2; 3263 cs->cs_stage = RAID_NONE; 3264 cs->cs_call = raid_write_got_old; 3265 cs->cs_error_call = raid_write_error; 3266 cs->cs_retry_call = raid_write_retry; 3267 ASSERT(ps->ps_magic == RAID_PSMAGIC); 3268 raidio(cs, RIO_DATA | RIO_READ); 3269 raidio(cs, RIO_PARITY | RIO_READ); 3270 } 3271 3272 static void 3273 raid_enqueue(md_raidcs_t *cs) 3274 { 3275 mdi_unit_t *ui = cs->cs_ps->ps_ui; 3276 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 3277 md_raidcs_t *cs1; 3278 3279 mutex_enter(io_list_mutex); 3280 ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); 3281 if (ui->ui_io_lock->io_list_front == NULL) { 3282 ui->ui_io_lock->io_list_front = cs; 3283 ui->ui_io_lock->io_list_back = cs; 3284 } else { 3285 cs1 = ui->ui_io_lock->io_list_back; 3286 cs1->cs_linlck_next = cs; 3287 ui->ui_io_lock->io_list_back = cs; 3288 } 3289 STAT_INC(raid_write_waits); 3290 STAT_MAX(raid_max_write_q_length, raid_write_queue_length); 3291 cs->cs_linlck_next = NULL; 3292 mutex_exit(io_list_mutex); 3293 } 3294 3295 /* 3296 * NAME: raid_write 3297 * DESCRIPTION: RAID metadevice write routine 3298 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3299 * md_raidcs_t *cs - pointer to a child structure 3300 */ 3301 3302 /*ARGSUSED*/ 3303 static int 3304 raid_write(mr_unit_t *un, md_raidcs_t *cs) 3305 { 3306 int error = 0; 3307 md_raidps_t *ps; 3308 mdi_unit_t *ui; 3309 minor_t mnum; 3310 clock_t timeout; 3311 3312 ASSERT(IO_READER_HELD(un)); 3313 ps = cs->cs_ps; 3314 ui = ps->ps_ui; 3315 3316 ASSERT(UNIT_STATE(un) != RUS_INIT); 3317 if (UNIT_STATE(un) == RUS_LAST_ERRED) 3318 error = EIO; 3319 3320 /* make sure the write doesn't go beyond the column */ 3321 if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) 3322 error = ENXIO; 3323 if (error) 3324 goto werror; 3325 3326 getresources(cs); 3327 3328 /* 3329 * this is an advisory loop that keeps the waiting lists short 3330 * to reduce cpu time. 
Since there is a race introduced by not 3331 * aquiring all the correct mutexes, use a cv_timedwait to be 3332 * sure the write always will wake up and start. 3333 */ 3334 while (raid_check_pw(cs)) { 3335 mutex_enter(&un->un_mx); 3336 (void) drv_getparm(LBOLT, &timeout); 3337 timeout += md_wr_wait; 3338 un->un_rflags |= MD_RFLAG_NEEDPW; 3339 STAT_INC(raid_prewrite_waits); 3340 (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout); 3341 un->un_rflags &= ~MD_RFLAG_NEEDPW; 3342 mutex_exit(&un->un_mx); 3343 } 3344 3345 if (raid_line_writer_lock(cs, 1)) 3346 return (0); 3347 3348 un = (mr_unit_t *)md_unit_readerlock(ui); 3349 cs->cs_un = un; 3350 mnum = MD_SID(un); 3351 3352 if (un->un_state & RUS_REGEN) { 3353 raid_regen_parity(cs); 3354 un = MD_UNIT(mnum); 3355 cs->cs_un = un; 3356 } 3357 3358 raid_write_io(un, cs); 3359 return (0); 3360 werror: 3361 /* aquire unit reader lock sinc raid_free_child always drops it */ 3362 raid_error_parent(ps, error); 3363 raid_free_child(cs, 0); 3364 /* decrement both pwfrags and frags */ 3365 raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); 3366 return (0); 3367 } 3368 3369 3370 /* 3371 * NAMES: raid_stage 3372 * DESCRIPTION: post-processing routine for a RAID metadevice 3373 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 3374 */ 3375 static void 3376 raid_stage(md_raidcs_t *cs) 3377 { 3378 md_raidps_t *ps = cs->cs_ps; 3379 mr_unit_t *un = cs->cs_un; 3380 md_raidcbuf_t *cbuf; 3381 buf_t *bp; 3382 void *private; 3383 int flag; 3384 3385 switch (cs->cs_stage) { 3386 case RAID_READ_DONE: 3387 raid_free_child(cs, 1); 3388 /* decrement readfrags */ 3389 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 3390 return; 3391 3392 case RAID_WRITE_DONE: 3393 case RAID_WRITE_PONLY_DONE: 3394 case RAID_WRITE_DONLY_DONE: 3395 /* 3396 * Completed writing real parity and/or data. 3397 */ 3398 ASSERT(cs->cs_flags & MD_RCS_PWDONE); 3399 raid_free_child(cs, 1); 3400 /* decrement frags but not pwfrags */ 3401 raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); 3402 return; 3403 3404 case RAID_PREWRITE_DONE: 3405 /* 3406 * completed writing data and parity to prewrite entries 3407 */ 3408 /* 3409 * WARNING: don't release unit reader lock here.. 
3410 * decrement pwfrags but not frags 3411 */ 3412 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3413 cs->cs_flags |= MD_RCS_PWDONE; 3414 cs->cs_frags = 2; 3415 cs->cs_stage = RAID_WRITE_DONE; 3416 cs->cs_call = raid_stage; 3417 cs->cs_error_call = raid_write_error; 3418 cs->cs_retry_call = raid_write_no_retry; 3419 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3420 cs->cs_frags++; 3421 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | 3422 RIO_WRITE); 3423 } 3424 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3425 cs->cs_frags++; 3426 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3427 } 3428 ASSERT(cs->cs_frags < 4); 3429 raidio(cs, RIO_DATA | RIO_WRITE); 3430 raidio(cs, RIO_PARITY | RIO_WRITE); 3431 if (cs->cs_pw_inval_list) { 3432 raid_free_pwinvalidate(cs); 3433 } 3434 return; 3435 3436 case RAID_LINE_PWDONE: 3437 ASSERT(cs->cs_frags == 0); 3438 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3439 cs->cs_flags |= MD_RCS_PWDONE; 3440 cs->cs_frags = un->un_origcolumncnt; 3441 cs->cs_call = raid_stage; 3442 cs->cs_error_call = raid_write_error; 3443 cs->cs_retry_call = raid_write_no_retry; 3444 cs->cs_stage = RAID_WRITE_DONE; 3445 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 3446 /* 3447 * fill in buffer for write to prewrite area 3448 */ 3449 bp = &cbuf->cbuf_bp; 3450 bp->b_back = bp; 3451 bp->b_forw = bp; 3452 bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; 3453 bp->b_bcount = cbuf->cbuf_bcount; 3454 bp->b_bufsize = cbuf->cbuf_bcount; 3455 bp->b_lblkno = 3456 un->un_column[cbuf->cbuf_column].un_devstart + 3457 cs->cs_blkno; 3458 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); 3459 bp->b_flags &= ~nv_available; 3460 bp->b_flags |= B_WRITE | B_BUSY; 3461 bp->b_iodone = (int (*)())raid_done; 3462 bp->b_edev = md_dev64_to_dev( 3463 un->un_column[cbuf->cbuf_column].un_dev); 3464 bp->b_chain = (struct buf *)cs; 3465 private = cs->cs_strategy_private; 3466 flag = cs->cs_strategy_flag; 3467 md_call_strategy(bp, flag, private); 3468 } 3469 raidio(cs, RIO_DATA | RIO_WRITE); 3470 raidio(cs, RIO_PARITY | RIO_WRITE); 3471 if (cs->cs_pw_inval_list) { 3472 raid_free_pwinvalidate(cs); 3473 } 3474 return; 3475 3476 default: 3477 ASSERT(0); 3478 break; 3479 } 3480 } 3481 /* 3482 * NAME: md_raid_strategy 3483 * DESCRIPTION: RAID metadevice I/O oprations entry point. 3484 * PARAMETERS: buf_t *pb - pointer to a user I/O buffer 3485 * int flag - metadevice specific flag 3486 * void *private - carry over flag ?? 
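 * NOTE: the request is broken into one child structure (md_raidcs_t) per
 * segment by raid_iosetup(); each child is then issued through raid_read()
 * or raid_write().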
3487 * 3488 */ 3489 3490 void 3491 md_raid_strategy(buf_t *pb, int flag, void *private) 3492 { 3493 md_raidps_t *ps; 3494 md_raidcs_t *cs; 3495 int doing_writes; 3496 int err; 3497 mr_unit_t *un; 3498 mdi_unit_t *ui; 3499 size_t count; 3500 diskaddr_t blkno; 3501 caddr_t addr; 3502 off_t offset; 3503 int colcnt; 3504 minor_t mnum; 3505 set_t setno; 3506 3507 ui = MDI_UNIT(getminor(pb->b_edev)); 3508 md_kstat_waitq_enter(ui); 3509 un = (mr_unit_t *)md_io_readerlock(ui); 3510 setno = MD_MIN2SET(getminor(pb->b_edev)); 3511 3512 if ((flag & MD_NOBLOCK) == 0) { 3513 if (md_inc_iocount(setno) != 0) { 3514 pb->b_flags |= B_ERROR; 3515 pb->b_error = ENXIO; 3516 pb->b_resid = pb->b_bcount; 3517 md_kstat_waitq_exit(ui); 3518 md_io_readerexit(ui); 3519 biodone(pb); 3520 return; 3521 } 3522 } else { 3523 md_inc_iocount_noblock(setno); 3524 } 3525 3526 mnum = MD_SID(un); 3527 colcnt = un->un_totalcolumncnt - 1; 3528 count = pb->b_bcount; 3529 3530 STAT_CHECK(raid_512, count == 512); 3531 STAT_CHECK(raid_1024, count == 1024); 3532 STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); 3533 STAT_CHECK(raid_8192, count == 8192); 3534 STAT_CHECK(raid_8192_bigger, count > 8192); 3535 3536 (void *) md_unit_readerlock(ui); 3537 if (!(flag & MD_STR_NOTTOP)) { 3538 err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ 3539 if (err != 0) { 3540 md_kstat_waitq_exit(ui); 3541 md_io_readerexit(ui); 3542 return; 3543 } 3544 } 3545 md_unit_readerexit(ui); 3546 3547 STAT_INC(raid_total_io); 3548 3549 /* allocate a parent structure for the user I/O */ 3550 ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); 3551 raid_parent_init(ps); 3552 3553 /* 3554 * Save essential information from the original buffhdr 3555 * in the md_save structure. 3556 */ 3557 ps->ps_un = un; 3558 ps->ps_ui = ui; 3559 ps->ps_bp = pb; 3560 ps->ps_addr = pb->b_un.b_addr; 3561 3562 if ((pb->b_flags & B_READ) == 0) { 3563 ps->ps_flags |= MD_RPS_WRITE; 3564 doing_writes = 1; 3565 STAT_INC(raid_writes); 3566 } else { 3567 ps->ps_flags |= MD_RPS_READ; 3568 doing_writes = 0; 3569 STAT_INC(raid_reads); 3570 } 3571 3572 count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ 3573 blkno = pb->b_lblkno; /* block number on device */ 3574 addr = 0; 3575 offset = 0; 3576 ps->ps_pwfrags = 1; 3577 ps->ps_frags = 1; 3578 md_kstat_waitq_to_runq(ui); 3579 3580 do { 3581 cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); 3582 raid_child_init(cs); 3583 cs->cs_ps = ps; 3584 cs->cs_un = un; 3585 cs->cs_mdunit = mnum; 3586 cs->cs_strategy_flag = flag; 3587 cs->cs_strategy_private = private; 3588 cs->cs_addr = addr; 3589 cs->cs_offset = offset; 3590 count = raid_iosetup(un, blkno, count, cs); 3591 if (cs->cs_flags & MD_RCS_LINE) { 3592 blkno += (cs->cs_blkcnt * colcnt); 3593 offset += (cs->cs_bcount * colcnt); 3594 } else { 3595 blkno += cs->cs_blkcnt; 3596 offset += cs->cs_bcount; 3597 } 3598 /* for each cs bump up the ps_pwfrags and ps_frags fields */ 3599 if (count) { 3600 mutex_enter(&ps->ps_mx); 3601 ps->ps_pwfrags++; 3602 ps->ps_frags++; 3603 mutex_exit(&ps->ps_mx); 3604 if (doing_writes) 3605 (void) raid_write(un, cs); 3606 else 3607 (void) raid_read(un, cs); 3608 } 3609 } while (count); 3610 if (doing_writes) { 3611 (void) raid_write(un, cs); 3612 } else 3613 (void) raid_read(un, cs); 3614 3615 if (! (flag & MD_STR_NOTTOP) && panicstr) { 3616 while (! 
(ps->ps_flags & MD_RPS_DONE)) { 3617 md_daemon(1, &md_done_daemon); 3618 drv_usecwait(10); 3619 } 3620 kmem_cache_free(raid_parent_cache, ps); 3621 } 3622 } 3623 3624 /* 3625 * NAMES: raid_snarf 3626 * DESCRIPTION: RAID metadevice SNARF entry point 3627 * PARAMETERS: md_snarfcmd_t cmd, 3628 * set_t setno 3629 * RETURNS: 3630 */ 3631 static int 3632 raid_snarf(md_snarfcmd_t cmd, set_t setno) 3633 { 3634 mr_unit_t *un; 3635 mddb_recid_t recid; 3636 int gotsomething; 3637 int all_raid_gotten; 3638 mddb_type_t typ1; 3639 uint_t ncol; 3640 mddb_de_ic_t *dep; 3641 mddb_rb32_t *rbp; 3642 size_t newreqsize; 3643 mr_unit_t *big_un; 3644 mr_unit32_od_t *small_un; 3645 3646 3647 if (cmd == MD_SNARF_CLEANUP) 3648 return (0); 3649 3650 all_raid_gotten = 1; 3651 gotsomething = 0; 3652 typ1 = (mddb_type_t)md_getshared_key(setno, 3653 raid_md_ops.md_driver.md_drivername); 3654 recid = mddb_makerecid(setno, 0); 3655 3656 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 3657 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { 3658 continue; 3659 } 3660 3661 dep = mddb_getrecdep(recid); 3662 dep->de_flags = MDDB_F_RAID; 3663 rbp = dep->de_rb; 3664 switch (rbp->rb_revision) { 3665 case MDDB_REV_RB: 3666 case MDDB_REV_RBFN: 3667 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 3668 /* 3669 * This means, we have an old and small record 3670 * and this record hasn't already been 3671 * converted. Before we create an incore 3672 * metadevice from this we have to convert it to 3673 * a big record. 3674 */ 3675 small_un = 3676 (mr_unit32_od_t *)mddb_getrecaddr(recid); 3677 ncol = small_un->un_totalcolumncnt; 3678 newreqsize = sizeof (mr_unit_t) + 3679 ((ncol - 1) * sizeof (mr_column_t)); 3680 big_un = (mr_unit_t *)kmem_zalloc(newreqsize, 3681 KM_SLEEP); 3682 raid_convert((caddr_t)small_un, (caddr_t)big_un, 3683 SMALL_2_BIG); 3684 kmem_free(small_un, dep->de_reqsize); 3685 dep->de_rb_userdata = big_un; 3686 dep->de_reqsize = newreqsize; 3687 un = big_un; 3688 rbp->rb_private |= MD_PRV_CONVD; 3689 } else { 3690 /* 3691 * Record has already been converted. Just 3692 * get its address. 3693 */ 3694 un = (mr_unit_t *)mddb_getrecaddr(recid); 3695 } 3696 un->c.un_revision &= ~MD_64BIT_META_DEV; 3697 break; 3698 case MDDB_REV_RB64: 3699 case MDDB_REV_RB64FN: 3700 /* Big device */ 3701 un = (mr_unit_t *)mddb_getrecaddr(recid); 3702 un->c.un_revision |= MD_64BIT_META_DEV; 3703 un->c.un_flag |= MD_EFILABEL; 3704 break; 3705 } 3706 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 3707 3708 /* 3709 * Create minor device node for snarfed entry. 
3710 */ 3711 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 3712 3713 if (MD_UNIT(MD_SID(un)) != NULL) { 3714 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3715 continue; 3716 } 3717 all_raid_gotten = 0; 3718 if (raid_build_incore((void *)un, 1) == 0) { 3719 mddb_setrecprivate(recid, MD_PRV_GOTIT); 3720 md_create_unit_incore(MD_SID(un), &raid_md_ops, 1); 3721 gotsomething = 1; 3722 } else if (un->mr_ic) { 3723 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 3724 un->un_totalcolumncnt); 3725 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 3726 } 3727 } 3728 3729 if (!all_raid_gotten) { 3730 return (gotsomething); 3731 } 3732 3733 recid = mddb_makerecid(setno, 0); 3734 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) 3735 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 3736 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3737 3738 return (0); 3739 } 3740 3741 /* 3742 * NAMES: raid_halt 3743 * DESCRIPTION: RAID metadevice HALT entry point 3744 * PARAMETERS: md_haltcmd_t cmd - 3745 * set_t setno - 3746 * RETURNS: 3747 */ 3748 static int 3749 raid_halt(md_haltcmd_t cmd, set_t setno) 3750 { 3751 set_t i; 3752 mdi_unit_t *ui; 3753 minor_t mnum; 3754 3755 if (cmd == MD_HALT_CLOSE) 3756 return (0); 3757 3758 if (cmd == MD_HALT_OPEN) 3759 return (0); 3760 3761 if (cmd == MD_HALT_UNLOAD) 3762 return (0); 3763 3764 if (cmd == MD_HALT_CHECK) { 3765 for (i = 0; i < md_nunits; i++) { 3766 mnum = MD_MKMIN(setno, i); 3767 if ((ui = MDI_UNIT(mnum)) == NULL) 3768 continue; 3769 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3770 continue; 3771 if (md_unit_isopen(ui)) 3772 return (1); 3773 } 3774 return (0); 3775 } 3776 3777 if (cmd != MD_HALT_DOIT) 3778 return (1); 3779 3780 for (i = 0; i < md_nunits; i++) { 3781 mnum = MD_MKMIN(setno, i); 3782 if ((ui = MDI_UNIT(mnum)) == NULL) 3783 continue; 3784 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3785 continue; 3786 reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); 3787 } 3788 return (0); 3789 } 3790 3791 /* 3792 * NAMES: raid_close_all_devs 3793 * DESCRIPTION: Close all the devices of the unit. 3794 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3795 * RETURNS: 3796 */ 3797 void 3798 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) 3799 { 3800 int i; 3801 mr_column_t *device; 3802 3803 for (i = 0; i < un->un_totalcolumncnt; i++) { 3804 device = &un->un_column[i]; 3805 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3806 ASSERT((device->un_dev != (md_dev64_t)0) && 3807 (device->un_dev != NODEV64)); 3808 if ((device->un_devstate & RCS_OKAY) && init_pw) 3809 (void) init_pw_area(un, device->un_dev, 3810 device->un_pwstart, i); 3811 md_layered_close(device->un_dev, md_cflags); 3812 device->un_devflags &= ~MD_RAID_DEV_ISOPEN; 3813 } 3814 } 3815 } 3816 3817 /* 3818 * NAMES: raid_open_all_devs 3819 * DESCRIPTION: Open all the components (columns) of the device unit. 
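 *              A single column that fails to open is marked errored; more
 *              than one open failure leaves the unit inaccessible and the
 *              open fails.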
3820 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3821 * RETURNS: 3822 */ 3823 static int 3824 raid_open_all_devs(mr_unit_t *un, int md_oflags) 3825 { 3826 minor_t mnum = MD_SID(un); 3827 int i; 3828 int not_opened = 0; 3829 int commit = 0; 3830 int col = -1; 3831 mr_column_t *device; 3832 set_t setno = MD_MIN2SET(MD_SID(un)); 3833 side_t side = mddb_getsidenum(setno); 3834 mdkey_t key; 3835 mdi_unit_t *ui = MDI_UNIT(mnum); 3836 3837 ui->ui_tstate &= ~MD_INACCESSIBLE; 3838 3839 for (i = 0; i < un->un_totalcolumncnt; i++) { 3840 md_dev64_t tmpdev; 3841 3842 device = &un->un_column[i]; 3843 3844 if (COLUMN_STATE(un, i) & RCS_ERRED) { 3845 not_opened++; 3846 continue; 3847 } 3848 3849 if (device->un_devflags & MD_RAID_DEV_ISOPEN) 3850 continue; 3851 3852 tmpdev = device->un_dev; 3853 /* 3854 * Open by device id 3855 */ 3856 key = HOTSPARED(un, i) ? 3857 device->un_hs_key : device->un_orig_key; 3858 if ((md_getmajor(tmpdev) != md_major) && 3859 md_devid_found(setno, side, key) == 1) { 3860 tmpdev = md_resolve_bydevid(mnum, tmpdev, key); 3861 } 3862 if (md_layered_open(mnum, &tmpdev, md_oflags)) { 3863 device->un_dev = tmpdev; 3864 not_opened++; 3865 continue; 3866 } 3867 device->un_dev = tmpdev; 3868 device->un_devflags |= MD_RAID_DEV_ISOPEN; 3869 } 3870 3871 /* if open errors and errored devices are 1 then device can run */ 3872 if (not_opened > 1) { 3873 cmn_err(CE_WARN, 3874 "md: %s failed to open. open error on %s\n", 3875 md_shortname(MD_SID(un)), 3876 md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0)); 3877 3878 ui->ui_tstate |= MD_INACCESSIBLE; 3879 3880 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3881 MD_UN2SET(un), MD_SID(un)); 3882 3883 return (not_opened > 1); 3884 } 3885 3886 for (i = 0; i < un->un_totalcolumncnt; i++) { 3887 device = &un->un_column[i]; 3888 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3889 if (device->un_devstate & RCS_LAST_ERRED) { 3890 /* 3891 * At this point in time there is a possibility 3892 * that errors were the result of a controller 3893 * failure with more than a single column on it 3894 * so clear out last errored columns and let errors 3895 * re-occur is necessary. 3896 */ 3897 raid_set_state(un, i, RCS_OKAY, 0); 3898 commit++; 3899 } 3900 continue; 3901 } 3902 ASSERT(col == -1); 3903 col = i; 3904 } 3905 3906 if (col != -1) { 3907 raid_set_state(un, col, RCS_ERRED, 0); 3908 commit++; 3909 } 3910 3911 if (commit) 3912 raid_commit(un, NULL); 3913 3914 if (col != -1) { 3915 if (COLUMN_STATE(un, col) & RCS_ERRED) { 3916 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 3917 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3918 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 3919 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 3920 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3921 } 3922 } 3923 3924 return (0); 3925 } 3926 3927 /* 3928 * NAMES: raid_internal_open 3929 * DESCRIPTION: Do the actual RAID open 3930 * PARAMETERS: minor_t mnum - minor number of the RAID device 3931 * int flag - 3932 * int otyp - 3933 * int md_oflags - RAID open flags 3934 * RETURNS: 0 if successful, nonzero otherwise 3935 */ 3936 int 3937 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) 3938 { 3939 mr_unit_t *un; 3940 mdi_unit_t *ui; 3941 int err = 0; 3942 int replay_error = 0; 3943 3944 ui = MDI_UNIT(mnum); 3945 ASSERT(ui != NULL); 3946 3947 un = (mr_unit_t *)md_unit_openclose_enter(ui); 3948 /* 3949 * this MUST be checked before md_unit_isopen is checked. 3950 * raid_init_columns sets md_unit_isopen to block reset, halt. 
3951 */ 3952 if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && 3953 !(md_oflags & MD_OFLG_ISINIT)) { 3954 md_unit_openclose_exit(ui); 3955 return (EAGAIN); 3956 } 3957 3958 if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { 3959 err = md_unit_incopen(mnum, flag, otyp); 3960 goto out; 3961 } 3962 3963 md_unit_readerexit(ui); 3964 3965 un = (mr_unit_t *)md_unit_writerlock(ui); 3966 if (raid_open_all_devs(un, md_oflags) == 0) { 3967 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { 3968 md_unit_writerexit(ui); 3969 un = (mr_unit_t *)md_unit_readerlock(ui); 3970 raid_close_all_devs(un, 0, md_oflags); 3971 goto out; 3972 } 3973 } else { 3974 /* 3975 * if this unit contains more than two errored components 3976 * should return error and close all opened devices 3977 */ 3978 3979 md_unit_writerexit(ui); 3980 un = (mr_unit_t *)md_unit_readerlock(ui); 3981 raid_close_all_devs(un, 0, md_oflags); 3982 md_unit_openclose_exit(ui); 3983 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3984 MD_UN2SET(un), MD_SID(un)); 3985 return (ENXIO); 3986 } 3987 3988 if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { 3989 replay_error = raid_replay(un); 3990 MD_STATUS(un) |= MD_UN_REPLAYED; 3991 } 3992 3993 md_unit_writerexit(ui); 3994 un = (mr_unit_t *)md_unit_readerlock(ui); 3995 3996 if ((replay_error == RAID_RPLY_READONLY) && 3997 ((flag & (FREAD | FWRITE)) == FREAD)) { 3998 md_unit_openclose_exit(ui); 3999 return (0); 4000 } 4001 4002 /* allocate hotspare if possible */ 4003 (void) raid_hotspares(); 4004 4005 4006 out: 4007 md_unit_openclose_exit(ui); 4008 return (err); 4009 } 4010 /* 4011 * NAMES: raid_open 4012 * DESCRIPTION: RAID metadevice OPEN entry point 4013 * PARAMETERS: dev_t dev - 4014 * int flag - 4015 * int otyp - 4016 * cred_t * cred_p - 4017 * int md_oflags - 4018 * RETURNS: 4019 */ 4020 /*ARGSUSED1*/ 4021 static int 4022 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 4023 { 4024 int error = 0; 4025 4026 if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { 4027 return (error); 4028 } 4029 return (0); 4030 } 4031 4032 /* 4033 * NAMES: raid_internal_close 4034 * DESCRIPTION: RAID metadevice CLOSE actual implementation 4035 * PARAMETERS: minor_t - minor number of the RAID device 4036 * int otyp - 4037 * int init_pw - 4038 * int md_cflags - RAID close flags 4039 * RETURNS: 0 if successful, nonzero otherwise 4040 */ 4041 /*ARGSUSED*/ 4042 int 4043 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) 4044 { 4045 mdi_unit_t *ui = MDI_UNIT(mnum); 4046 mr_unit_t *un; 4047 int err = 0; 4048 4049 /* single thread */ 4050 un = (mr_unit_t *)md_unit_openclose_enter(ui); 4051 4052 /* count closed */ 4053 if ((err = md_unit_decopen(mnum, otyp)) != 0) 4054 goto out; 4055 /* close devices, if necessary */ 4056 if (! 
                raid_close_all_devs(un, init_pw, md_cflags);
        }

        /* unlock, return success */
out:
        md_unit_openclose_exit(ui);
        return (err);
}

/*
 * NAMES:       raid_close
 * DESCRIPTION: RAID metadevice close entry point
 * PARAMETERS:  dev_t dev - device number
 *              int flag - open flags
 *              int otyp - open type (OTYP_BLK, OTYP_CHR or OTYP_LYR)
 *              cred_t *cred_p - credentials of the caller
 *              int md_cflags - RAID close flags
 * RETURNS:     0 if successful, nonzero otherwise
 */
/*ARGSUSED1*/
static int
raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
        int     retval;

        (void) md_io_writerlock(MDI_UNIT(getminor(dev)));
        retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
        (void) md_io_writerexit(MDI_UNIT(getminor(dev)));
        return (retval);
}

/*
 * raid_probe_close_all_devs
 *
 * Close any column that was opened by a probe (MD_RAID_DEV_PROBEOPEN)
 * and clear its probe-open flag.
 */
void
raid_probe_close_all_devs(mr_unit_t *un)
{
        int             i;
        mr_column_t     *device;

        for (i = 0; i < un->un_totalcolumncnt; i++) {
                device = &un->un_column[i];

                if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
                        md_layered_close(device->un_dev,
                            MD_OFLG_PROBEDEV);
                        device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
                }
        }
}

/*
 * raid_probe_dev:
 *
 * On entry the unit writerlock is held.
 */
static int
raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
        mr_unit_t       *un;
        int             i;
        int             not_opened = 0;
        int             commit = 0;
        int             col = -1;
        mr_column_t     *device;
        int             md_devopen = 0;

        if (md_unit_isopen(ui))
                md_devopen++;

        un = MD_UNIT(mnum);
        /*
         * If the state has been set to LAST_ERRED because
         * of an error when the raid device was open at some
         * point in the past, don't probe. We really don't want
         * to reset the state in this case.
         */
        if (UNIT_STATE(un) == RUS_LAST_ERRED)
                return (0);

        ui->ui_tstate &= ~MD_INACCESSIBLE;

        for (i = 0; i < un->un_totalcolumncnt; i++) {
                md_dev64_t tmpdev;

                device = &un->un_column[i];
                if (COLUMN_STATE(un, i) & RCS_ERRED) {
                        not_opened++;
                        continue;
                }

                tmpdev = device->un_dev;
                /*
                 * Currently the flags passed are not needed since
                 * there cannot be an underlying metadevice. However
                 * they are kept here for consistency.
                 *
                 * Open by device id
                 */
                tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
                    device->un_hs_key : device->un_orig_key);
                if (md_layered_open(mnum, &tmpdev,
                    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
                        device->un_dev = tmpdev;
                        not_opened++;
                        continue;
                }
                device->un_dev = tmpdev;

                device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
        }

        /*
         * The code below is careful about setting the LAST_ERRED state.
         *
         * If open errors occur and exactly one device has failed, we can run.
         * If more than one device fails, we have to figure out when to set
         * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
         * since they are painful and time consuming.
         *
         * When more than one component/column fails there are 2 scenarios.
         *
         * 1. Metadevice has NOT been opened: In this case, the behavior
         *    mimics the open semantics, i.e. only the first failed device
         *    is ERRED and LAST_ERRED is not set.
         *
         * 2. Metadevice has been opened: Here the read/write semantics are
         *    followed. The first failed device is ERRED and on the next
         *    failed device LAST_ERRED is set.
         */

        if (not_opened > 1 && !md_devopen) {
                cmn_err(CE_WARN,
                    "md: %s failed to open. open error on %s\n",
                    md_shortname(MD_SID(un)),
                    md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0));
                SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
                    MD_UN2SET(un), MD_SID(un));
                raid_probe_close_all_devs(un);
                ui->ui_tstate |= MD_INACCESSIBLE;
                return (not_opened > 1);
        }

        if (!md_devopen) {
                for (i = 0; i < un->un_totalcolumncnt; i++) {
                        device = &un->un_column[i];
                        if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
                                if (device->un_devstate & RCS_LAST_ERRED) {
                                        /*
                                         * At this point in time there is a
                                         * possibility that errors were the
                                         * result of a controller failure with
                                         * more than a single column on it so
                                         * clear out last errored columns and
                                         * let errors re-occur if necessary.
                                         */
                                        raid_set_state(un, i, RCS_OKAY, 0);
                                        commit++;
                                }
                                continue;
                        }
                        ASSERT(col == -1);
                        /*
                         * note that if multiple devices are failing, only
                         * the last one is marked as errored
                         */
                        col = i;
                }

                if (col != -1) {
                        raid_set_state(un, col, RCS_ERRED, 0);
                        commit++;
                }

        } else {
                for (i = 0; i < un->un_totalcolumncnt; i++) {
                        device = &un->un_column[i];

                        /* if we have LAST_ERRED go ahead and commit. */
                        if (un->un_state & RUS_LAST_ERRED)
                                break;
                        /*
                         * could not open the component
                         */

                        if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
                                col = i;
                                raid_set_state(un, col, RCS_ERRED, 0);
                                commit++;
                        }
                }
        }

        if (commit)
                raid_commit(un, NULL);

        if (col != -1) {
                if (COLUMN_STATE(un, col) & RCS_ERRED) {
                        SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
                            SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
                } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
                        SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
                            SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
                }
        }

        raid_probe_close_all_devs(un);
        return (0);
}

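/*
 * NAMES:       raid_imp_set
 * DESCRIPTION: Import-set support.  Walk this driver's unit records in
 *              the imported set and remap their self, parent, hotspare
 *              pool and record ids (and the minors stored in the
 *              namespace) to the new set number.
 * PARAMETERS:  set_t setno - set number of the imported set
 * RETURNS:     1 if at least one record was updated, 0 otherwise
 */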
static int
raid_imp_set(set_t setno)
{
        mddb_recid_t    recid;
        int             i, gotsomething;
        mddb_type_t     typ1;
        mddb_de_ic_t    *dep;
        mddb_rb32_t     *rbp;
        mr_unit_t       *un64;
        mr_unit32_od_t  *un32;
        md_dev64_t      self_devt;
        minor_t         *self_id;       /* minor needs to be updated */
        md_parent_t     *parent_id;     /* parent needs to be updated */
        mddb_recid_t    *record_id;     /* record id needs to be updated */
        hsp_t           *hsp_id;

        gotsomething = 0;

        typ1 = (mddb_type_t)md_getshared_key(setno,
            raid_md_ops.md_driver.md_drivername);
        recid = mddb_makerecid(setno, 0);

        while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
                if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
                        continue;

                dep = mddb_getrecdep(recid);
                rbp = dep->de_rb;

                switch (rbp->rb_revision) {
                case MDDB_REV_RB:
                case MDDB_REV_RBFN:
                        /*
                         * Small device
                         */
                        un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
                        self_id = &(un32->c.un_self_id);
                        parent_id = &(un32->c.un_parent);
                        record_id = &(un32->c.un_record_id);
                        hsp_id = &(un32->un_hsp_id);

                        for (i = 0; i < un32->un_totalcolumncnt; i++) {
                                mr_column32_od_t *device;

                                device = &un32->un_column[i];
                                if (!md_update_minor(setno, mddb_getsidenum
                                    (setno), device->un_orig_key))
                                        goto out;

                                if (device->un_hs_id != 0)
                                        device->un_hs_id =
                                            MAKERECID(setno, device->un_hs_id);
                        }
                        break;
                case MDDB_REV_RB64:
                case MDDB_REV_RB64FN:
                        un64 = (mr_unit_t *)mddb_getrecaddr(recid);
                        self_id = &(un64->c.un_self_id);
                        parent_id = &(un64->c.un_parent);
                        record_id = &(un64->c.un_record_id);
                        hsp_id = &(un64->un_hsp_id);

                        for (i = 0; i < un64->un_totalcolumncnt; i++) {
                                mr_column_t *device;

                                device = &un64->un_column[i];
                                if (!md_update_minor(setno, mddb_getsidenum
                                    (setno), device->un_orig_key))
                                        goto out;

                                if (device->un_hs_id != 0)
                                        device->un_hs_id =
                                            MAKERECID(setno, device->un_hs_id);
                        }
                        break;
                }

                /*
                 * If this is a top level and a friendly name metadevice,
                 * update its minor in the namespace.
                 */
                if ((*parent_id == MD_NO_PARENT) &&
                    ((rbp->rb_revision == MDDB_REV_RBFN) ||
                    (rbp->rb_revision == MDDB_REV_RB64FN))) {

                        self_devt = md_makedevice(md_major, *self_id);
                        if (!md_update_top_device_minor(setno,
                            mddb_getsidenum(setno), self_devt))
                                goto out;
                }

                /*
                 * Update unit with the imported setno
                 */
                mddb_setrecprivate(recid, MD_PRV_GOTIT);

                *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));

                if (*hsp_id != -1)
                        *hsp_id = MAKERECID(setno, DBID(*hsp_id));

                if (*parent_id != MD_NO_PARENT)
                        *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
                *record_id = MAKERECID(setno, DBID(*record_id));
                gotsomething = 1;
        }

out:
        return (gotsomething);
}

/* named services exported by the RAID module */
static md_named_services_t raid_named_services[] = {
        {raid_hotspares,                        "poke hotspares"        },
        {raid_rename_check,                     MDRNM_CHECK             },
        {raid_rename_lock,                      MDRNM_LOCK              },
        {(intptr_t (*)()) raid_rename_unlock,   MDRNM_UNLOCK            },
        {(intptr_t (*)()) raid_probe_dev,       "probe open test"       },
        {NULL,                                  0                       }
};

md_ops_t raid_md_ops = {
        raid_open,              /* open */
        raid_close,             /* close */
        md_raid_strategy,       /* strategy */
        NULL,                   /* print */
        NULL,                   /* dump */
        NULL,                   /* read */
        NULL,                   /* write */
        md_raid_ioctl,          /* ioctl */
        raid_snarf,             /* raid_snarf */
        raid_halt,              /* raid_halt */
        NULL,                   /* aread */
        NULL,                   /* awrite */
        raid_imp_set,           /* import set */
        raid_named_services
};

/* module initialization: create the kmem caches used by the RAID I/O path */
static void
init_init()
{
        /* default to half a second (md_hz is in ticks per second) */
        if (md_wr_wait == 0)
                md_wr_wait = md_hz >> 1;

        raid_parent_cache = kmem_cache_create("md_raid_parent",
            sizeof (md_raidps_t), 0, raid_parent_constructor,
            raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
        raid_child_cache = kmem_cache_create("md_raid_child",
            sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
            raid_child_constructor, raid_child_destructor,
            raid_run_queue, NULL, NULL, 0);
        raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
            sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
            raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
}

/* module teardown: destroy the caches created in init_init() */
static void
fini_uninit()
{
        kmem_cache_destroy(raid_parent_cache);
        kmem_cache_destroy(raid_child_cache);
        kmem_cache_destroy(raid_cbuf_cache);
        raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("raid module", init_init(), fini_uninit())