1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * NAME: raid.c 30 * 31 * DESCRIPTION: Main RAID driver source file containing open, close and I/O 32 * operations. 33 * 34 * ROUTINES PROVIDED FOR EXTERNAL USE: 35 * raid_open() - open the RAID metadevice for access. 36 * raid_internal_open() - internal open routine of RAID metdevice. 37 * md_raid_strategy() - perform normal I/O operations, 38 * such as read and write. 39 * raid_close() - close the RAID metadevice. 40 * raid_internal_close() - internal close routine of RAID metadevice. 41 * raid_snarf() - initialize and clean up MDD records. 42 * raid_halt() - reset the RAID metadevice 43 * raid_line() - return the line # of this segment 44 * raid_dcolumn() - return the data column # of this segment 45 * raid_pcolumn() - return the parity column # of this segment 46 */ 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/conf.h> 51 #include <sys/file.h> 52 #include <sys/user.h> 53 #include <sys/uio.h> 54 #include <sys/t_lock.h> 55 #include <sys/buf.h> 56 #include <sys/dkio.h> 57 #include <sys/vtoc.h> 58 #include <sys/kmem.h> 59 #include <vm/page.h> 60 #include <sys/cmn_err.h> 61 #include <sys/sysmacros.h> 62 #include <sys/types.h> 63 #include <sys/mkdev.h> 64 #include <sys/stat.h> 65 #include <sys/open.h> 66 #include <sys/modctl.h> 67 #include <sys/ddi.h> 68 #include <sys/sunddi.h> 69 #include <sys/debug.h> 70 #include <sys/lvm/md_raid.h> 71 #include <sys/lvm/mdvar.h> 72 #include <sys/lvm/md_convert.h> 73 74 #include <sys/sysevent/eventdefs.h> 75 #include <sys/sysevent/svm.h> 76 77 md_ops_t raid_md_ops; 78 #ifndef lint 79 char _depends_on[] = "drv/md"; 80 md_ops_t *md_interface_ops = &raid_md_ops; 81 #endif /* lint */ 82 83 extern unit_t md_nunits; 84 extern unit_t md_nsets; 85 extern md_set_t md_set[]; 86 extern int md_status; 87 extern major_t md_major; 88 extern mdq_anchor_t md_done_daemon; 89 extern mdq_anchor_t md_mstr_daemon; 90 extern int md_sleep_for_test; 91 extern clock_t md_hz; 92 93 extern md_event_queue_t *md_event_queue; 94 95 96 int pchunks = 16; 97 int phigh = 1024; 98 int plow = 128; 99 int cchunks = 64; 100 int chigh = 1024; 101 int clow = 512; 102 int bchunks = 32; 103 int bhigh = 256; 104 int blow = 128; 105 106 int raid_total_io = 0; 107 int raid_reads = 0; 108 int raid_writes = 0; 109 int raid_no_bpmaps = 0; 110 int raid_512 = 0; 111 int raid_1024 = 0; 112 int raid_1024_8192 = 0; 113 int raid_8192 = 0; 114 int raid_8192_bigger = 0; 115 int raid_line_lock_wait = 0; 116 117 int data_buffer_waits = 0; 118 int parity_buffer_waits = 0; 119 120 /* writer 
line locks */ 121 int raid_writer_locks = 0; /* total writer locks */ 122 int raid_write_waits = 0; /* total writer locks that waited */ 123 int raid_full_line_writes = 0; /* total full line writes */ 124 int raid_write_queue_length = 0; /* wait queue length */ 125 int raid_max_write_q_length = 0; /* maximum queue length */ 126 int raid_write_locks_active = 0; /* writer locks at any time */ 127 int raid_max_write_locks = 0; /* maximum writer locks active */ 128 129 /* read line locks */ 130 int raid_reader_locks = 0; /* total reader locks held */ 131 int raid_reader_locks_active = 0; /* reader locks held */ 132 int raid_max_reader_locks = 0; /* maximum reader locks held in run */ 133 int raid_read_overlaps = 0; /* number of times 2 reads hit same line */ 134 int raid_read_waits = 0; /* times a reader waited on writer */ 135 136 /* prewrite stats */ 137 int raid_prewrite_waits = 0; /* number of waits for a pw slot */ 138 int raid_pw = 0; /* number of pw slots in use */ 139 int raid_prewrite_max = 0; /* maximum number of pw slots in use */ 140 int raid_pw_invalidates = 0; 141 142 static clock_t md_wr_wait = 0; 143 144 int nv_available = 0; /* presence of nv-ram support in device */ 145 int nv_prewrite = 1; /* mark prewrites with nv_available */ 146 int nv_parity = 1; /* mark parity with nv_available */ 147 148 kmem_cache_t *raid_parent_cache = NULL; 149 kmem_cache_t *raid_child_cache = NULL; 150 kmem_cache_t *raid_cbuf_cache = NULL; 151 152 int raid_internal_open(minor_t mnum, int flag, int otyp, 153 int md_oflags); 154 155 static void freebuffers(md_raidcs_t *cs); 156 static int raid_read(mr_unit_t *un, md_raidcs_t *cs); 157 static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); 158 static int raid_write(mr_unit_t *un, md_raidcs_t *cs); 159 static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); 160 static void raid_stage(md_raidcs_t *cs); 161 static void raid_enqueue(md_raidcs_t *cs); 162 static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); 163 uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); 164 static void getpbuffer(md_raidcs_t *cs); 165 static void getdbuffer(md_raidcs_t *cs); 166 static void raid_done(buf_t *bp); 167 static void raid_io_startup(mr_unit_t *un); 168 169 static rus_state_t 170 raid_col2unit(rcs_state_t state, rus_state_t unitstate) 171 { 172 switch (state) { 173 case RCS_INIT: 174 return (RUS_INIT); 175 case RCS_OKAY: 176 return (RUS_OKAY); 177 case RCS_RESYNC: 178 if (unitstate & RUS_LAST_ERRED) 179 return (RUS_LAST_ERRED); 180 else 181 return (RUS_ERRED); 182 case RCS_ERRED: 183 return (RUS_ERRED); 184 case RCS_LAST_ERRED: 185 return (RUS_ERRED); 186 default: 187 break; 188 } 189 panic("raid_col2unit"); 190 /*NOTREACHED*/ 191 } 192 193 void 194 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) 195 { 196 197 rus_state_t unitstate, origstate; 198 rcs_state_t colstate; 199 rcs_state_t orig_colstate; 200 int errcnt = 0, 201 okaycnt = 0, 202 resynccnt = 0; 203 int i; 204 char *devname; 205 206 ASSERT(un); 207 ASSERT(col < un->un_totalcolumncnt); 208 ASSERT(newstate & 209 (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 210 RCS_LAST_ERRED | RCS_REGEN)); 211 ASSERT((newstate & 212 ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 213 RCS_LAST_ERRED | RCS_REGEN)) 214 == 0); 215 216 ASSERT(MDI_UNIT(MD_SID(un)) ? 
UNIT_WRITER_HELD(un) : 1); 217 218 unitstate = un->un_state; 219 origstate = unitstate; 220 221 if (force) { 222 un->un_column[col].un_devstate = newstate; 223 un->un_state = raid_col2unit(newstate, unitstate); 224 uniqtime32(&un->un_column[col].un_devtimestamp); 225 uniqtime32(&un->un_timestamp); 226 return; 227 } 228 229 ASSERT(un->un_state & 230 (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | 231 RUS_REGEN)); 232 ASSERT((un->un_state & ~(RUS_INIT | 233 RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); 234 235 if (un->un_column[col].un_devstate == newstate) 236 return; 237 238 if (newstate == RCS_REGEN) { 239 if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) 240 return; 241 un->un_state = RUS_REGEN; 242 return; 243 } 244 245 orig_colstate = un->un_column[col].un_devstate; 246 247 /* 248 * if there is another column in the error state then this 249 * column should go to the last errored state 250 */ 251 for (i = 0; i < un->un_totalcolumncnt; i++) { 252 if (i == col) 253 colstate = newstate; 254 else 255 colstate = un->un_column[i].un_devstate; 256 if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) 257 errcnt++; 258 if (colstate & RCS_OKAY) 259 okaycnt++; 260 if (colstate & RCS_RESYNC) 261 resynccnt++; 262 } 263 ASSERT(resynccnt < 2); 264 265 if (okaycnt == un->un_totalcolumncnt) 266 unitstate = RUS_OKAY; 267 else if (errcnt > 1) { 268 unitstate = RUS_LAST_ERRED; 269 if (newstate & RCS_ERRED) 270 newstate = RCS_LAST_ERRED; 271 } else if (errcnt == 1) 272 if (!(unitstate & RUS_LAST_ERRED)) 273 unitstate = RUS_ERRED; 274 275 if (un->un_state == RUS_DOI) 276 unitstate = RUS_DOI; 277 278 un->un_column[col].un_devstate = newstate; 279 uniqtime32(&un->un_column[col].un_devtimestamp); 280 /* 281 * if there are last errored column being brought back online 282 * by open or snarf, then be sure to clear the RUS_LAST_ERRED 283 * bit to allow writes. If there is a real error then the 284 * column will go back into last erred. 285 */ 286 if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && 287 (raid_state_cnt(un, RCS_ERRED) == 1)) 288 unitstate = RUS_ERRED; 289 290 un->un_state = unitstate; 291 uniqtime32(&un->un_timestamp); 292 293 if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && 294 (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { 295 devname = md_devname(MD_UN2SET(un), 296 un->un_column[col].un_dev, NULL, 0); 297 298 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 299 md_shortname(MD_SID(un)), devname); 300 301 if (unitstate & RUS_LAST_ERRED) { 302 cmn_err(CE_WARN, "md: %s: %s last erred", 303 md_shortname(MD_SID(un)), devname); 304 305 } else if (un->un_column[col].un_devflags & 306 MD_RAID_DEV_ISOPEN) { 307 /* 308 * Close the broken device and clear the open flag on 309 * it. We have to check that the device is open, 310 * otherwise the first open on it has resulted in the 311 * error that is being processed and the actual un_dev 312 * will be NODEV64. 313 */ 314 md_layered_close(un->un_column[col].un_dev, 315 MD_OFLG_NULL); 316 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 317 } 318 } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && 319 un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { 320 /* 321 * Similar to logic above except no log messages since we 322 * are just transitioning from Last Erred to Erred. 
		 */
		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
	}

	/*
	 * If a resync has completed, see if there is a Last Erred
	 * component that we can change to the Erred state.
	 */
	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			if (i != col &&
			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
				raid_set_state(un, i, RCS_ERRED, 0);
				break;
			}
		}
	}
}

/*
 * NAME:	erred_check_line
 *
 * DESCRIPTION: Return the type of write to perform on an erred column based
 *		upon any resync activity.
 *
 *		If a column is being resynced and the write is above the
 *		resync point, the write may also have to go to the target
 *		being resynced.
 *
 *		Column state may make it impossible to do the write
 *		in which case RCL_EIO or RCL_ENXIO is returned.
 *
 *		If a column cannot be written directly, RCL_ERRED is
 *		returned and processing should proceed accordingly.
 *
 * PARAMETERS:	mr_unit_t *un - pointer to the RAID unit structure
 *		md_raidcs_t *cs - child save structure
 *		mr_column_t *column - pointer to the column being checked
 *
 * RETURNS:	RCL_OKAY, RCL_ERRED
 *
 * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
 *		across call.
 */

static int
erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
{

	ASSERT(un != NULL);
	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);

	if (column->un_devstate & RCS_OKAY)
		return (RCL_OKAY);

	if (column->un_devstate & RCS_ERRED)
		return (RCL_ERRED);	/* do not read from errored disk */

	/*
	 * For the last errored case there are two considerations.
	 * When the last errored column is the only errored column then
	 * treat it like a maintenance column, not doing I/O from it.
	 * When there are other failures then just attempt to use it.
	 */
	if (column->un_devstate & RCS_LAST_ERRED)
		return (RCL_ERRED);

	ASSERT(column->un_devstate & RCS_RESYNC);

	/*
	 * When a resync from a hotspare is being done (copy resync)
	 * then always treat it as an OKAY column, since no regen
	 * is required.
398 */ 399 if (column->un_devflags & MD_RAID_COPY_RESYNC) { 400 return (RCL_OKAY); 401 } 402 403 mutex_enter(&un->un_mx); 404 if (cs->cs_line < un->un_resync_line_index) { 405 mutex_exit(&un->un_mx); 406 return (RCL_OKAY); 407 } 408 mutex_exit(&un->un_mx); 409 return (RCL_ERRED); 410 411 } 412 413 /* 414 * NAMES: raid_state_cnt 415 * 416 * DESCRIPTION: counts number of column in a specific state 417 * 418 * PARAMETERS: md_raid_t *un 419 * rcs_state state 420 */ 421 int 422 raid_state_cnt(mr_unit_t *un, rcs_state_t state) 423 { 424 int i, retval = 0; 425 426 for (i = 0; i < un->un_totalcolumncnt; i++) 427 if (un->un_column[i].un_devstate & state) 428 retval++; 429 return (retval); 430 } 431 432 /* 433 * NAMES: raid_io_overlaps 434 * 435 * DESCRIPTION: checkst for overlap of 2 child save structures 436 * 437 * PARAMETERS: md_raidcs_t cs1 438 * md_raidcs_t cs2 439 * 440 * RETURNS: 0 - no overlap 441 * 1 - overlap 442 */ 443 int 444 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2) 445 { 446 if (cs1->cs_blkno > cs2->cs_lastblk) 447 return (0); 448 if (cs1->cs_lastblk < cs2->cs_blkno) 449 return (0); 450 return (1); 451 } 452 453 /* 454 * NAMES: raid_parent_constructor 455 * DESCRIPTION: parent structure constructor routine 456 * PARAMETERS: 457 */ 458 /*ARGSUSED1*/ 459 static int 460 raid_parent_constructor(void *p, void *d1, int d2) 461 { 462 mutex_init(&((md_raidps_t *)p)->ps_mx, 463 NULL, MUTEX_DEFAULT, NULL); 464 mutex_init(&((md_raidps_t *)p)->ps_mapin_mx, 465 NULL, MUTEX_DEFAULT, NULL); 466 return (0); 467 } 468 469 void 470 raid_parent_init(md_raidps_t *ps) 471 { 472 bzero(ps, offsetof(md_raidps_t, ps_mx)); 473 ((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE; 474 ((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC; 475 } 476 477 /*ARGSUSED1*/ 478 static void 479 raid_parent_destructor(void *p, void *d) 480 { 481 mutex_destroy(&((md_raidps_t *)p)->ps_mx); 482 mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx); 483 } 484 485 /* 486 * NAMES: raid_child_constructor 487 * DESCRIPTION: child structure constructor routine 488 * PARAMETERS: 489 */ 490 /*ARGSUSED1*/ 491 static int 492 raid_child_constructor(void *p, void *d1, int d2) 493 { 494 md_raidcs_t *cs = (md_raidcs_t *)p; 495 mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL); 496 bioinit(&cs->cs_dbuf); 497 bioinit(&cs->cs_pbuf); 498 bioinit(&cs->cs_hbuf); 499 return (0); 500 } 501 502 void 503 raid_child_init(md_raidcs_t *cs) 504 { 505 bzero(cs, offsetof(md_raidcs_t, cs_mx)); 506 507 md_bioreset(&cs->cs_dbuf); 508 md_bioreset(&cs->cs_pbuf); 509 md_bioreset(&cs->cs_hbuf); 510 511 ((md_raidcs_t *)cs)->cs_dbuf.b_chain = 512 ((md_raidcs_t *)cs)->cs_pbuf.b_chain = 513 ((md_raidcs_t *)cs)->cs_hbuf.b_chain = 514 (struct buf *)(cs); 515 516 cs->cs_magic = RAID_CSMAGIC; 517 cs->cs_line = MD_DISKADDR_ERROR; 518 cs->cs_dpwslot = -1; 519 cs->cs_ppwslot = -1; 520 } 521 522 /*ARGSUSED1*/ 523 static void 524 raid_child_destructor(void *p, void *d) 525 { 526 biofini(&((md_raidcs_t *)p)->cs_dbuf); 527 biofini(&((md_raidcs_t *)p)->cs_hbuf); 528 biofini(&((md_raidcs_t *)p)->cs_pbuf); 529 mutex_destroy(&((md_raidcs_t *)p)->cs_mx); 530 } 531 532 /*ARGSUSED1*/ 533 static int 534 raid_cbuf_constructor(void *p, void *d1, int d2) 535 { 536 bioinit(&((md_raidcbuf_t *)p)->cbuf_bp); 537 return (0); 538 } 539 540 static void 541 raid_cbuf_init(md_raidcbuf_t *cb) 542 { 543 bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp)); 544 md_bioreset(&cb->cbuf_bp); 545 cb->cbuf_magic = RAID_BUFMAGIC; 546 cb->cbuf_pwslot = -1; 547 cb->cbuf_flags = CBUF_WRITE; 548 } 549 550 /*ARGSUSED1*/ 551 
static void 552 raid_cbuf_destructor(void *p, void *d) 553 { 554 biofini(&((md_raidcbuf_t *)p)->cbuf_bp); 555 } 556 557 /* 558 * NAMES: raid_run_queue 559 * DESCRIPTION: spawn a backend processing daemon for RAID metadevice. 560 * PARAMETERS: 561 */ 562 /*ARGSUSED*/ 563 static void 564 raid_run_queue(void *d) 565 { 566 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 567 md_daemon(1, &md_done_daemon); 568 } 569 570 /* 571 * NAME: raid_build_pwslot 572 * DESCRIPTION: builds mr_pw_reserve for the column 573 * PARAMETERS: un is the pointer to the unit structure 574 * colindex is the column to create the structure for 575 */ 576 int 577 raid_build_pw_reservation(mr_unit_t *un, int colindex) 578 { 579 mr_pw_reserve_t *pw; 580 mr_scoreboard_t *sb; 581 int i; 582 583 pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + 584 (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); 585 pw->pw_magic = RAID_PWMAGIC; 586 pw->pw_column = colindex; 587 pw->pw_free = un->un_pwcnt; 588 sb = &pw->pw_sb[0]; 589 for (i = 0; i < un->un_pwcnt; i++) { 590 sb[i].sb_column = colindex; 591 sb[i].sb_flags = SB_UNUSED; 592 sb[i].sb_start_blk = 0; 593 sb[i].sb_last_blk = 0; 594 sb[i].sb_cs = NULL; 595 } 596 un->un_column_ic[colindex].un_pw_reserve = pw; 597 return (0); 598 } 599 /* 600 * NAME: raid_free_pw_reservation 601 * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine 602 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 603 * int colindex - index of the column whose pre-write slot struct 604 * is to be destroyed. 605 */ 606 void 607 raid_free_pw_reservation(mr_unit_t *un, int colindex) 608 { 609 mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; 610 611 kmem_free(pw, sizeof (mr_pw_reserve_t) + 612 (sizeof (mr_scoreboard_t) * un->un_pwcnt)); 613 } 614 615 /* 616 * NAME: raid_cancel_pwslot 617 * DESCRIPTION: RAID metadevice write routine 618 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 619 */ 620 static void 621 raid_cancel_pwslot(md_raidcs_t *cs) 622 { 623 mr_unit_t *un = cs->cs_un; 624 mr_pw_reserve_t *pw; 625 mr_scoreboard_t *sb; 626 mr_column_ic_t *col; 627 md_raidcbuf_t *cbuf; 628 int broadcast = 0; 629 630 if (cs->cs_ps->ps_flags & MD_RPS_READ) 631 return; 632 if (cs->cs_dpwslot != -1) { 633 col = &un->un_column_ic[cs->cs_dcolumn]; 634 pw = col->un_pw_reserve; 635 sb = &pw->pw_sb[cs->cs_dpwslot]; 636 sb->sb_flags = SB_AVAIL; 637 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 638 broadcast++; 639 sb->sb_cs = NULL; 640 } 641 642 if (cs->cs_ppwslot != -1) { 643 col = &un->un_column_ic[cs->cs_pcolumn]; 644 pw = col->un_pw_reserve; 645 sb = &pw->pw_sb[cs->cs_ppwslot]; 646 sb->sb_flags = SB_AVAIL; 647 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 648 broadcast++; 649 sb->sb_cs = NULL; 650 } 651 652 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 653 if (cbuf->cbuf_pwslot == -1) 654 continue; 655 col = &un->un_column_ic[cbuf->cbuf_column]; 656 pw = col->un_pw_reserve; 657 sb = &pw->pw_sb[cbuf->cbuf_pwslot]; 658 sb->sb_flags = SB_AVAIL; 659 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 660 broadcast++; 661 sb->sb_cs = NULL; 662 } 663 if (broadcast) { 664 cv_broadcast(&un->un_cv); 665 return; 666 } 667 mutex_enter(&un->un_mx); 668 if (un->un_rflags & MD_RFLAG_NEEDPW) 669 cv_broadcast(&un->un_cv); 670 mutex_exit(&un->un_mx); 671 } 672 673 static void 674 raid_free_pwinvalidate(md_raidcs_t *cs) 675 { 676 md_raidcbuf_t *cbuf; 677 md_raidcbuf_t *cbuf_to_free; 678 mr_unit_t *un = cs->cs_un; 679 mdi_unit_t 
			*ui = MDI_UNIT(MD_SID(un));
	mr_pw_reserve_t	*pw;
	mr_scoreboard_t	*sb;
	int		broadcast = 0;

	cbuf = cs->cs_pw_inval_list;
	ASSERT(cbuf);
	mutex_enter(&un->un_linlck_mx);
	while (cbuf) {
		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
		sb = &pw->pw_sb[0];
		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
			broadcast++;
		cbuf_to_free = cbuf;
		cbuf = cbuf->cbuf_next;
		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
	}
	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
	/*
	 * Now that there is a free prewrite slot, check to see if there
	 * are any I/O operations waiting: first wake up raid_io_startup,
	 * then signal the processes waiting in raid_write.
	 */
	if (ui->ui_io_lock->io_list_front)
		raid_io_startup(un);
	mutex_exit(&un->un_linlck_mx);
	if (broadcast) {
		cv_broadcast(&un->un_cv);
		return;
	}
	mutex_enter(&un->un_mx);
	if (un->un_rflags & MD_RFLAG_NEEDPW)
		cv_broadcast(&un->un_cv);
	mutex_exit(&un->un_mx);
}


/*
 * NAMES:	raid_get_pwslot
 * DESCRIPTION:	reserve a pre-write scoreboard slot in the given column
 *		for this child structure.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 *		int column - column index to reserve a slot in
 */
static int
raid_get_pwslot(md_raidcs_t *cs, int column)
{
	mr_scoreboard_t	*sb;
	mr_pw_reserve_t	*pw;
	mr_unit_t	*un = cs->cs_un;
	diskaddr_t	start_blk = cs->cs_blkno;
	diskaddr_t	last_blk = cs->cs_lastblk;
	int		i;
	int		pwcnt = un->un_pwcnt;
	int		avail = -1;
	int		use = -1;
	int		flags;


	/* start with the data column */
	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
	sb = &pw->pw_sb[0];
	ASSERT(pw->pw_free > 0);
	for (i = 0; i < pwcnt; i++) {
		flags = sb[i].sb_flags;
		if (flags & SB_INVAL_PEND)
			continue;

		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
			avail = i;

		if ((start_blk > sb[i].sb_last_blk) ||
		    (last_blk < sb[i].sb_start_blk))
			continue;

		/* OVERLAP */
		ASSERT(! (sb[i].sb_flags & SB_INUSE));

		/*
		 * raid_invalidate_pwslot attempts to zero out a prewrite
		 * entry in parallel with other disk reads/writes related to
		 * the current transaction. However, cs_frags accounting for
		 * this case is broken because raid_write_io resets cs_frags,
		 * i.e. ignoring that it could have been set to a value > 0
		 * by raid_invalidate_pwslot. While this can be fixed, an
		 * additional problem is that we don't seem to handle
		 * correctly the case of getting a disk error for a prewrite
		 * entry invalidation.
		 * It does not look like we really need
		 * to invalidate prewrite slots because raid_replay sorts
		 * prewrite id's in ascending order and during recovery the
		 * latest prewrite entry for the same block will be replayed
		 * last. That's why I ifdef'd out the call to
		 * raid_invalidate_pwslot. --aguzovsk@east
		 */

		if (use == -1) {
			use = i;
		}
	}

	ASSERT(avail != -1);
	pw->pw_free--;
	if (use == -1)
		use = avail;

	ASSERT(! (sb[use].sb_flags & SB_INUSE));
	sb[use].sb_flags = SB_INUSE;
	sb[use].sb_cs = cs;
	sb[use].sb_start_blk = start_blk;
	sb[use].sb_last_blk = last_blk;
	ASSERT((use >= 0) && (use < un->un_pwcnt));
	return (use);
}

static int
raid_check_pw(md_raidcs_t *cs)
{

	mr_unit_t	*un = cs->cs_un;
	int		i;

	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
	/*
	 * check to be sure there is a prewrite slot available;
	 * if not, just return.
	 */
	if (cs->cs_flags & MD_RCS_LINE) {
		for (i = 0; i < un->un_totalcolumncnt; i++)
			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
				return (1);
		return (0);
	}

	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
		return (1);
	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
		return (1);
	return (0);
}

static int
raid_alloc_pwslot(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;
	md_raidcbuf_t	*cbuf;

	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
	if (raid_check_pw(cs))
		return (1);

	mutex_enter(&un->un_mx);
	un->un_pwid++;
	cs->cs_pwid = un->un_pwid;
	mutex_exit(&un->un_mx);

	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
	}
	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);

	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;

	return (0);
}

/*
 * NAMES:	raid_build_incore
 * DESCRIPTION:	RAID metadevice incore structure building routine
 * PARAMETERS:	void *p - pointer to a unit structure
 *		int snarfing - a flag to indicate snarfing is required
 */
int
raid_build_incore(void *p, int snarfing)
{
	mr_unit_t	*un = (mr_unit_t *)p;
	minor_t		mnum = MD_SID(un);
	mddb_recid_t	hs_recid = 0;
	int		i;
	int		preserve_flags;
	mr_column_t	*column;
	int		iosize;
	md_dev64_t	hs, dev;
	int		resync_cnt = 0,
			error_cnt = 0;

	hs = NODEV64;
	dev = NODEV64;

	/* clear out bogus pointer in case we return (1) prior to alloc */
	un->mr_ic = NULL;

	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	if (MD_UNIT(mnum) != NULL)
		return (0);

	if (snarfing)
		MD_STATUS(un) = 0;

	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
	    KM_SLEEP);

	un->un_column_ic = (mr_column_ic_t *)
	    kmem_zalloc(sizeof (mr_column_ic_t) *
	    un->un_totalcolumncnt, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {

		column = &un->un_column[i];
		preserve_flags = column->un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
		column->un_devflags &=
		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
		    MD_RAID_WRITE_ALT);
		if (raid_build_pw_reservation(un, i) != 0) {
			/* could not build pwslot */
			return (1);
		}

		if (snarfing) {
			set_t	setno = MD_MIN2SET(mnum);
			dev = md_getdevnum(setno, mddb_getsidenum(setno),
			    column->un_orig_key, MD_NOTRUST_DEVT);
			/*
			 * Commented out instead of removed so we have history.
			 * In the pre-SVM releases the stored devt was used,
			 * so as long as there was one, snarf was always happy
			 * even if the component was powered off. This is not
			 * the case in the current SVM implementation. NODEV64
			 * can be returned and in this case, since we resolve
			 * the devt at 'open' time (first use of metadevice),
			 * we allow snarf to continue.
			 *
			 * if (dev == NODEV64)
			 *	return (1);
			 */

			/*
			 * Setup un_orig_dev from device id info if the device
			 * is valid (not NODEV64).
920 */ 921 if (dev != NODEV64) 922 column->un_orig_dev = dev; 923 924 if (column->un_devstate & RCS_RESYNC) 925 resync_cnt++; 926 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 927 error_cnt++; 928 929 if (HOTSPARED(un, i)) { 930 (void) md_hot_spare_ifc(HS_MKDEV, 931 0, 0, 0, &column->un_hs_id, NULL, 932 &hs, NULL); 933 /* 934 * Same here 935 * 936 * if (hs == NODEV64) 937 * return (1); 938 */ 939 } 940 941 if (HOTSPARED(un, i)) { 942 if (column->un_devstate & 943 (RCS_OKAY | RCS_LAST_ERRED)) { 944 column->un_dev = hs; 945 column->un_pwstart = 946 column->un_hs_pwstart; 947 column->un_devstart = 948 column->un_hs_devstart; 949 preserve_flags &= 950 ~(MD_RAID_COPY_RESYNC | 951 MD_RAID_REGEN_RESYNC); 952 } else if (column->un_devstate & RCS_RESYNC) { 953 /* 954 * if previous system was 4.0 set 955 * the direction flags 956 */ 957 if ((preserve_flags & 958 (MD_RAID_COPY_RESYNC | 959 MD_RAID_REGEN_RESYNC)) == 0) { 960 if (column->un_alt_dev != NODEV64) 961 preserve_flags |= 962 MD_RAID_COPY_RESYNC; 963 else 964 preserve_flags |= 965 MD_RAID_REGEN_RESYNC; 966 } 967 } 968 } else { /* no hot spares */ 969 column->un_dev = dev; 970 column->un_pwstart = column->un_orig_pwstart; 971 column->un_devstart = column->un_orig_devstart; 972 if (column->un_devstate & RCS_RESYNC) { 973 preserve_flags |= MD_RAID_REGEN_RESYNC; 974 preserve_flags &= ~MD_RAID_COPY_RESYNC; 975 } 976 } 977 if (! (column->un_devstate & RCS_RESYNC)) { 978 preserve_flags &= 979 ~(MD_RAID_REGEN_RESYNC | 980 MD_RAID_COPY_RESYNC); 981 } 982 983 column->un_devflags = preserve_flags; 984 column->un_alt_dev = NODEV64; 985 column->un_alt_pwstart = 0; 986 column->un_alt_devstart = 0; 987 un->un_resync_line_index = 0; 988 un->un_resync_index = 0; 989 un->un_percent_done = 0; 990 } 991 } 992 993 if (resync_cnt && error_cnt) { 994 for (i = 0; i < un->un_totalcolumncnt; i++) { 995 column = &un->un_column[i]; 996 if (HOTSPARED(un, i) && 997 (column->un_devstate & RCS_RESYNC) && 998 (column->un_devflags & MD_RAID_COPY_RESYNC)) 999 /* hotspare has data */ 1000 continue; 1001 1002 if (HOTSPARED(un, i) && 1003 (column->un_devstate & RCS_RESYNC)) { 1004 /* hotspare does not have data */ 1005 raid_hs_release(HS_FREE, un, &hs_recid, i); 1006 column->un_dev = column->un_orig_dev; 1007 column->un_pwstart = column->un_orig_pwstart; 1008 column->un_devstart = column->un_orig_devstart; 1009 mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); 1010 } 1011 1012 if (column->un_devstate & RCS_ERRED) 1013 column->un_devstate = RCS_LAST_ERRED; 1014 1015 if (column->un_devstate & RCS_RESYNC) 1016 column->un_devstate = RCS_ERRED; 1017 } 1018 } 1019 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); 1020 1021 un->un_pwid = 1; /* or some other possible value */ 1022 un->un_magic = RAID_UNMAGIC; 1023 iosize = un->un_iosize; 1024 un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1025 un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1026 mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); 1027 cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); 1028 un->un_linlck_chn = NULL; 1029 MD_UNIT(mnum) = un; 1030 1031 1032 return (0); 1033 } 1034 1035 /* 1036 * NAMES: reset_raid 1037 * DESCRIPTION: RAID metadevice reset routine 1038 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 1039 * minor_t mnum - RAID metadevice minor number 1040 * int removing - a flag to imply removing device name from 1041 * MDDB database. 
1042 */ 1043 void 1044 reset_raid(mr_unit_t *un, minor_t mnum, int removing) 1045 { 1046 int i, n = 0; 1047 sv_dev_t *sv; 1048 mr_column_t *column; 1049 int column_cnt = un->un_totalcolumncnt; 1050 mddb_recid_t *recids, vtoc_id; 1051 int hserr; 1052 1053 ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && 1054 (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); 1055 1056 md_destroy_unit_incore(mnum, &raid_md_ops); 1057 1058 MD_UNIT(mnum) = NULL; 1059 1060 if (un->un_pbuffer) { 1061 kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); 1062 un->un_pbuffer = NULL; 1063 } 1064 if (un->un_dbuffer) { 1065 kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); 1066 un->un_dbuffer = NULL; 1067 } 1068 1069 /* free all pre-write slots created during build incore */ 1070 for (i = 0; i < un->un_totalcolumncnt; i++) 1071 raid_free_pw_reservation(un, i); 1072 1073 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 1074 un->un_totalcolumncnt); 1075 1076 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 1077 1078 /* 1079 * Attempt release of its minor node 1080 */ 1081 md_remove_minor_node(mnum); 1082 1083 if (!removing) 1084 return; 1085 1086 sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), 1087 KM_SLEEP); 1088 1089 recids = (mddb_recid_t *) 1090 kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); 1091 1092 for (i = 0; i < column_cnt; i++) { 1093 md_unit_t *comp_un; 1094 md_dev64_t comp_dev; 1095 1096 column = &un->un_column[i]; 1097 sv[i].setno = MD_MIN2SET(mnum); 1098 sv[i].key = column->un_orig_key; 1099 if (HOTSPARED(un, i)) { 1100 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 1101 hserr = HS_BAD; 1102 else 1103 hserr = HS_FREE; 1104 raid_hs_release(hserr, un, &recids[n++], i); 1105 } 1106 /* 1107 * deparent any metadevices. 1108 * NOTE: currently soft partitions are the only metadevices 1109 * allowed in RAID metadevices. 1110 */ 1111 comp_dev = column->un_dev; 1112 if (md_getmajor(comp_dev) == md_major) { 1113 comp_un = MD_UNIT(md_getminor(comp_dev)); 1114 recids[n++] = MD_RECID(comp_un); 1115 md_reset_parent(comp_dev); 1116 } 1117 } 1118 /* decrement the reference count of the old hsp */ 1119 if (un->un_hsp_id != -1) 1120 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 1121 &recids[n++], NULL, NULL, NULL); 1122 recids[n] = 0; 1123 MD_STATUS(un) |= MD_UN_BEING_RESET; 1124 vtoc_id = un->c.un_vtoc_id; 1125 1126 raid_commit(un, recids); 1127 1128 /* 1129 * Remove self from the namespace 1130 */ 1131 if (un->c.un_revision & MD_FN_META_DEV) { 1132 (void) md_rem_selfname(un->c.un_self_id); 1133 } 1134 1135 /* Remove the unit structure */ 1136 mddb_deleterec_wrapper(un->c.un_record_id); 1137 1138 /* Remove the vtoc, if present */ 1139 if (vtoc_id) 1140 mddb_deleterec_wrapper(vtoc_id); 1141 md_rem_names(sv, column_cnt); 1142 kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); 1143 kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); 1144 1145 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1146 MD_MIN2SET(mnum), mnum); 1147 } 1148 1149 /* 1150 * NAMES: raid_error_parent 1151 * DESCRIPTION: mark a parent structure in error 1152 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1153 * int error - error value to set 1154 * NOTE: (TBR) - this routine currently is not in use. 
1155 */ 1156 static void 1157 raid_error_parent(md_raidps_t *ps, int error) 1158 { 1159 mutex_enter(&ps->ps_mx); 1160 ps->ps_flags |= MD_RPS_ERROR; 1161 ps->ps_error = error; 1162 mutex_exit(&ps->ps_mx); 1163 } 1164 1165 /* 1166 * The following defines tell raid_free_parent 1167 * RFP_RLS_LOCK release the unit reader lock when done. 1168 * RFP_DECR_PWFRAGS decrement ps_pwfrags 1169 * RFP_DECR_FRAGS decrement ps_frags 1170 * RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep 1171 */ 1172 #define RFP_RLS_LOCK 0x00001 1173 #define RFP_DECR_PWFRAGS 0x00002 1174 #define RFP_DECR_FRAGS 0x00004 1175 #define RFP_DECR_READFRAGS (RFP_DECR_PWFRAGS | RFP_DECR_FRAGS) 1176 1177 /* 1178 * NAMES: raid_free_parent 1179 * DESCRIPTION: free a parent structure 1180 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1181 * int todo - indicates what needs to be done 1182 */ 1183 static void 1184 raid_free_parent(md_raidps_t *ps, int todo) 1185 { 1186 mdi_unit_t *ui = ps->ps_ui; 1187 1188 ASSERT(ps->ps_magic == RAID_PSMAGIC); 1189 ASSERT(ps->ps_flags & MD_RPS_INUSE); 1190 mutex_enter(&ps->ps_mx); 1191 if (todo & RFP_DECR_PWFRAGS) { 1192 ASSERT(ps->ps_pwfrags); 1193 ps->ps_pwfrags--; 1194 if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) { 1195 if (ps->ps_flags & MD_RPS_ERROR) { 1196 ps->ps_bp->b_flags |= B_ERROR; 1197 ps->ps_bp->b_error = ps->ps_error; 1198 } 1199 md_kstat_done(ui, ps->ps_bp, 0); 1200 biodone(ps->ps_bp); 1201 ps->ps_flags |= MD_RPS_IODONE; 1202 } 1203 } 1204 1205 if (todo & RFP_DECR_FRAGS) { 1206 ASSERT(ps->ps_frags); 1207 ps->ps_frags--; 1208 } 1209 1210 if (ps->ps_frags != 0) { 1211 mutex_exit(&ps->ps_mx); 1212 return; 1213 } 1214 1215 ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0)); 1216 mutex_exit(&ps->ps_mx); 1217 1218 if (todo & RFP_RLS_LOCK) 1219 md_io_readerexit(ui); 1220 1221 if (panicstr) { 1222 ps->ps_flags |= MD_RPS_DONE; 1223 return; 1224 } 1225 1226 if (ps->ps_flags & MD_RPS_HSREQ) 1227 (void) raid_hotspares(); 1228 1229 ASSERT(todo & RFP_RLS_LOCK); 1230 ps->ps_flags &= ~MD_RPS_INUSE; 1231 1232 md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id)); 1233 1234 kmem_cache_free(raid_parent_cache, ps); 1235 } 1236 1237 /* 1238 * NAMES: raid_free_child 1239 * DESCRIPTION: free a parent structure 1240 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1241 * int drop_locks - 0 for no locks held 1242 * NOTE: (TBR) - this routine currently is not in use. 1243 */ 1244 static void 1245 raid_free_child(md_raidcs_t *cs, int drop_locks) 1246 { 1247 mr_unit_t *un = cs->cs_un; 1248 md_raidcbuf_t *cbuf, *cbuf1; 1249 1250 if (cs->cs_pw_inval_list) 1251 raid_free_pwinvalidate(cs); 1252 1253 if (drop_locks) { 1254 ASSERT(cs->cs_flags & MD_RCS_LLOCKD && 1255 (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER))); 1256 md_unit_readerexit(MDI_UNIT(MD_SID(un))); 1257 raid_line_exit(cs); 1258 } else { 1259 ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD)); 1260 } 1261 1262 freebuffers(cs); 1263 cbuf = cs->cs_buflist; 1264 while (cbuf) { 1265 cbuf1 = cbuf->cbuf_next; 1266 kmem_cache_free(raid_cbuf_cache, cbuf); 1267 cbuf = cbuf1; 1268 } 1269 if (cs->cs_dbuf.b_flags & B_REMAPPED) 1270 bp_mapout(&cs->cs_dbuf); 1271 kmem_cache_free(raid_child_cache, cs); 1272 } 1273 1274 /* 1275 * NAME: raid_regen_parity 1276 * 1277 * DESCRIPTION: This routine is used to regenerate the parity blocks 1278 * for the entire raid device. It is called from 1279 * both the regen thread and the IO path. 
1280 * 1281 * On error the entire device is marked as in error by 1282 * placing the erroring device in error and all other 1283 * devices in last_errored. 1284 * 1285 * PARAMETERS: md_raidcs_t *cs 1286 */ 1287 void 1288 raid_regen_parity(md_raidcs_t *cs) 1289 { 1290 mr_unit_t *un = cs->cs_un; 1291 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1292 caddr_t buffer; 1293 caddr_t parity_buffer; 1294 buf_t *bp; 1295 uint_t *dbuf, *pbuf; 1296 uint_t colcnt = un->un_totalcolumncnt; 1297 int column; 1298 int parity_column = cs->cs_pcolumn; 1299 size_t bcount; 1300 int j; 1301 1302 /* 1303 * This routine uses the data and parity buffers allocated to a 1304 * write. In the case of a read the buffers are allocated and 1305 * freed at the end. 1306 */ 1307 1308 ASSERT(IO_READER_HELD(un)); 1309 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 1310 ASSERT(UNIT_READER_HELD(un)); 1311 1312 if (raid_state_cnt(un, RCS_OKAY) != colcnt) 1313 return; 1314 1315 if (cs->cs_flags & MD_RCS_READER) { 1316 getpbuffer(cs); 1317 getdbuffer(cs); 1318 } 1319 ASSERT(cs->cs_dbuffer && cs->cs_pbuffer); 1320 bcount = cs->cs_bcount; 1321 buffer = cs->cs_dbuffer; 1322 parity_buffer = cs->cs_pbuffer; 1323 bzero(parity_buffer, bcount); 1324 bp = &cs->cs_dbuf; 1325 for (column = 0; column < colcnt; column++) { 1326 if (column == parity_column) 1327 continue; 1328 reset_buf(bp, B_READ | B_BUSY, bcount); 1329 bp->b_un.b_addr = buffer; 1330 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev); 1331 bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart; 1332 bp->b_bcount = bcount; 1333 bp->b_bufsize = bcount; 1334 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1335 if (biowait(bp)) 1336 goto bail; 1337 pbuf = (uint_t *)(void *)parity_buffer; 1338 dbuf = (uint_t *)(void *)buffer; 1339 for (j = 0; j < (bcount / (sizeof (uint_t))); j++) { 1340 *pbuf = *pbuf ^ *dbuf; 1341 pbuf++; 1342 dbuf++; 1343 } 1344 } 1345 1346 reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount); 1347 bp->b_un.b_addr = parity_buffer; 1348 bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev); 1349 bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart; 1350 bp->b_bcount = bcount; 1351 bp->b_bufsize = bcount; 1352 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1353 if (biowait(bp)) 1354 goto bail; 1355 1356 if (cs->cs_flags & MD_RCS_READER) { 1357 freebuffers(cs); 1358 cs->cs_pbuffer = NULL; 1359 cs->cs_dbuffer = NULL; 1360 } 1361 bp->b_chain = (struct buf *)cs; 1362 return; 1363 bail: 1364 if (cs->cs_flags & MD_RCS_READER) { 1365 freebuffers(cs); 1366 cs->cs_pbuffer = NULL; 1367 cs->cs_dbuffer = NULL; 1368 } 1369 md_unit_readerexit(ui); 1370 un = md_unit_writerlock(ui); 1371 raid_set_state(un, column, RCS_ERRED, 0); 1372 for (column = 0; column < colcnt; column++) 1373 raid_set_state(un, column, RCS_ERRED, 0); 1374 raid_commit(un, NULL); 1375 md_unit_writerexit(ui); 1376 un = md_unit_readerlock(ui); 1377 bp->b_chain = (struct buf *)cs; 1378 } 1379 1380 /* 1381 * NAMES: raid_error_state 1382 * DESCRIPTION: check unit and column states' impact on I/O error 1383 * NOTE: the state now may not be the state when the 1384 * I/O completed due to race conditions. 
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		buf_t *bp - pointer to buffer structure
 */
static int
raid_error_state(mr_unit_t *un, buf_t *bp)
{
	int	column;
	int	i;

	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_WRITER_HELD(un));

	column = -1;
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
			column = i;
			break;
		}
		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
			column = i;
			break;
		}
	}

	/* in case a replace snuck in while waiting on unit writer lock */

	if (column == -1) {
		return (0);
	}

	(void) raid_set_state(un, column, RCS_ERRED, 0);
	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));

	raid_commit(un, NULL);
	if (un->un_state & RUS_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	} else if (un->un_state & RUS_LAST_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	return (EIO);
}

/*
 * NAME:	raid_mapin_buf
 * DESCRIPTION:	wait for the input buffer header to be mapped in
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
static void
raid_mapin_buf(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	/*
	 * Check to see if the buffer is mapped in. If it is, set the
	 * child's data address and return. Since it is expensive to grab
	 * a mutex, this is only done if the mapin is not complete.
	 * Once the mutex is acquired it is possible that the mapin was
	 * not done, so recheck and, if necessary, do the mapin.
	 */
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		return;
	}
	mutex_enter(&ps->ps_mapin_mx);
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		mutex_exit(&ps->ps_mapin_mx);
		return;
	}
	bp_mapin(ps->ps_bp);
	/*
	 * get the new b_addr out of the parent since bp_mapin just changed it
	 */
	ps->ps_addr = ps->ps_bp->b_un.b_addr;
	cs->cs_addr = ps->ps_addr + cs->cs_offset;
	ps->ps_mapin++;
	mutex_exit(&ps->ps_mapin_mx);
}

/*
 * NAMES:	raid_read_no_retry
 * DESCRIPTION:	I/O retry routine for a RAID metadevice read.
 *		The read failed while attempting to regenerate the data and
 *		no retry is possible; the error occurred in
 *		raid_raidregenloop().
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 */
/*ARGSUSED*/
static void
raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	raid_error_parent(ps, EIO);
	raid_free_child(cs, 1);

	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
}

/*
 * NAMES:	raid_read_retry
 * DESCRIPTION:	I/O retry routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
static void
raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	/* re-initialize the buf_t structure for raid_read() */
	cs->cs_dbuf.b_chain = (struct buf *)cs;
	cs->cs_dbuf.b_back = &cs->cs_dbuf;
	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_dbuf.b_error = 0;	/* initialize error */
	cs->cs_dbuf.b_offset = -1;
	/* Initialize semaphores */
	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_pbuf.b_chain = (struct buf *)cs;
	cs->cs_pbuf.b_back = &cs->cs_pbuf;
	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_pbuf.b_error = 0;	/* initialize error */
	cs->cs_pbuf.b_offset = -1;
	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_flags &= ~MD_RCS_ERROR;		/* reset child error flag */
	cs->cs_flags |= MD_RCS_RECOVERY;	/* set RECOVERY flag */

	/*
	 * Re-scheduling I/O with raid_read_io() is simpler: raid_read_io()
	 * is invoked again with the same child structure.
	 * (NOTE: we aren't supposed to do any error recovery when an I/O
	 * error occurred in raid_raidregenloop().)
	 */
	raid_mapin_buf(cs);
	raid_read_io(un, cs);
}

/*
 * NAMES:	raid_rderr
 * DESCRIPTION:	I/O error handling routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 * LOCKS:	must obtain unit writer lock while calling raid_error_state
 *		since a unit or column state transition may take place.
 *		must obtain unit reader lock to retry I/O.
1541 */ 1542 /*ARGSUSED*/ 1543 static void 1544 raid_rderr(md_raidcs_t *cs) 1545 { 1546 md_raidps_t *ps; 1547 mdi_unit_t *ui; 1548 mr_unit_t *un; 1549 int error = 0; 1550 1551 ps = cs->cs_ps; 1552 ui = ps->ps_ui; 1553 un = (mr_unit_t *)md_unit_writerlock(ui); 1554 ASSERT(un != 0); 1555 1556 if (cs->cs_dbuf.b_flags & B_ERROR) 1557 error = raid_error_state(un, &cs->cs_dbuf); 1558 if (cs->cs_pbuf.b_flags & B_ERROR) 1559 error |= raid_error_state(un, &cs->cs_pbuf); 1560 1561 md_unit_writerexit(ui); 1562 1563 ps->ps_flags |= MD_RPS_HSREQ; 1564 1565 un = (mr_unit_t *)md_unit_readerlock(ui); 1566 ASSERT(un != 0); 1567 /* now attempt the appropriate retry routine */ 1568 (*(cs->cs_retry_call))(un, cs); 1569 } 1570 1571 1572 /* 1573 * NAMES: raid_read_error 1574 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1575 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1576 */ 1577 /*ARGSUSED*/ 1578 static void 1579 raid_read_error(md_raidcs_t *cs) 1580 { 1581 md_raidps_t *ps; 1582 mdi_unit_t *ui; 1583 mr_unit_t *un; 1584 set_t setno; 1585 1586 ps = cs->cs_ps; 1587 ui = ps->ps_ui; 1588 un = cs->cs_un; 1589 1590 setno = MD_UN2SET(un); 1591 1592 if ((cs->cs_dbuf.b_flags & B_ERROR) && 1593 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 1594 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 1595 cmn_err(CE_WARN, "md %s: read error on %s", 1596 md_shortname(MD_SID(un)), 1597 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 1598 1599 if ((cs->cs_pbuf.b_flags & B_ERROR) && 1600 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 1601 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 1602 cmn_err(CE_WARN, "md %s: read error on %s", 1603 md_shortname(MD_SID(un)), 1604 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 1605 1606 md_unit_readerexit(ui); 1607 1608 ASSERT(cs->cs_frags == 0); 1609 1610 /* now schedule processing for possible state change */ 1611 daemon_request(&md_mstr_daemon, raid_rderr, 1612 (daemon_queue_t *)cs, REQ_OLD); 1613 1614 } 1615 1616 /* 1617 * NAMES: getdbuffer 1618 * DESCRIPTION: data buffer allocation for a child structure 1619 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1620 * 1621 * NOTE: always get dbuffer before pbuffer 1622 * and get both buffers before pwslot 1623 * otherwise a deadlock could be introduced. 1624 */ 1625 static void 1626 getdbuffer(md_raidcs_t *cs) 1627 { 1628 mr_unit_t *un; 1629 1630 cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1631 if (cs->cs_dbuffer != NULL) 1632 return; 1633 un = cs->cs_ps->ps_un; 1634 mutex_enter(&un->un_mx); 1635 while (un->un_dbuffer == NULL) { 1636 STAT_INC(data_buffer_waits); 1637 un->un_rflags |= MD_RFLAG_NEEDBUF; 1638 cv_wait(&un->un_cv, &un->un_mx); 1639 } 1640 cs->cs_dbuffer = un->un_dbuffer; 1641 cs->cs_flags |= MD_RCS_UNDBUF; 1642 un->un_dbuffer = NULL; 1643 mutex_exit(&un->un_mx); 1644 } 1645 1646 /* 1647 * NAMES: getpbuffer 1648 * DESCRIPTION: parity buffer allocation for a child structure 1649 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1650 * 1651 * NOTE: always get dbuffer before pbuffer 1652 * and get both buffers before pwslot 1653 * otherwise a deadlock could be introduced. 
1654 */ 1655 static void 1656 getpbuffer(md_raidcs_t *cs) 1657 { 1658 mr_unit_t *un; 1659 1660 cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1661 if (cs->cs_pbuffer != NULL) 1662 return; 1663 un = cs->cs_ps->ps_un; 1664 mutex_enter(&un->un_mx); 1665 while (un->un_pbuffer == NULL) { 1666 STAT_INC(parity_buffer_waits); 1667 un->un_rflags |= MD_RFLAG_NEEDBUF; 1668 cv_wait(&un->un_cv, &un->un_mx); 1669 } 1670 cs->cs_pbuffer = un->un_pbuffer; 1671 cs->cs_flags |= MD_RCS_UNPBUF; 1672 un->un_pbuffer = NULL; 1673 mutex_exit(&un->un_mx); 1674 } 1675 static void 1676 getresources(md_raidcs_t *cs) 1677 { 1678 md_raidcbuf_t *cbuf; 1679 /* 1680 * NOTE: always get dbuffer before pbuffer 1681 * and get both buffers before pwslot 1682 * otherwise a deadlock could be introduced. 1683 */ 1684 getdbuffer(cs); 1685 getpbuffer(cs); 1686 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 1687 cbuf->cbuf_buffer = 1688 kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); 1689 } 1690 /* 1691 * NAMES: freebuffers 1692 * DESCRIPTION: child structure buffer freeing routine 1693 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1694 */ 1695 static void 1696 freebuffers(md_raidcs_t *cs) 1697 { 1698 mr_unit_t *un; 1699 md_raidcbuf_t *cbuf; 1700 1701 /* free buffers used for full line write */ 1702 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 1703 if (cbuf->cbuf_buffer == NULL) 1704 continue; 1705 kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); 1706 cbuf->cbuf_buffer = NULL; 1707 cbuf->cbuf_bcount = 0; 1708 } 1709 1710 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1711 un = cs->cs_un; 1712 mutex_enter(&un->un_mx); 1713 } 1714 if (cs->cs_dbuffer) { 1715 if (cs->cs_flags & MD_RCS_UNDBUF) 1716 un->un_dbuffer = cs->cs_dbuffer; 1717 else 1718 kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); 1719 } 1720 if (cs->cs_pbuffer) { 1721 if (cs->cs_flags & MD_RCS_UNPBUF) 1722 un->un_pbuffer = cs->cs_pbuffer; 1723 else 1724 kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); 1725 } 1726 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1727 un->un_rflags &= ~MD_RFLAG_NEEDBUF; 1728 cv_broadcast(&un->un_cv); 1729 mutex_exit(&un->un_mx); 1730 } 1731 } 1732 1733 /* 1734 * NAMES: raid_line_reader_lock, raid_line_writer_lock 1735 * DESCRIPTION: RAID metadevice line reader and writer lock routines 1736 * data column # and parity column #. 
1737 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1738 */ 1739 1740 void 1741 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) 1742 { 1743 mr_unit_t *un; 1744 md_raidcs_t *cs1; 1745 1746 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1747 un = cs->cs_un; 1748 cs->cs_flags |= MD_RCS_READER; 1749 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1750 if (!panicstr) 1751 mutex_enter(&un->un_linlck_mx); 1752 cs1 = un->un_linlck_chn; 1753 while (cs1 != NULL) { 1754 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1755 if (raid_io_overlaps(cs, cs1) == 1) 1756 if (cs1->cs_flags & MD_RCS_WRITER) 1757 break; 1758 1759 if (cs1 != NULL) { 1760 if (panicstr) 1761 panic("md; raid line write lock held"); 1762 un->un_linlck_flg = 1; 1763 cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); 1764 STAT_INC(raid_read_waits); 1765 } 1766 } 1767 STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); 1768 STAT_INC(raid_reader_locks); 1769 cs1 = un->un_linlck_chn; 1770 if (cs1 != NULL) 1771 cs1->cs_linlck_prev = cs; 1772 cs->cs_linlck_next = cs1; 1773 cs->cs_linlck_prev = NULL; 1774 un->un_linlck_chn = cs; 1775 cs->cs_flags |= MD_RCS_LLOCKD; 1776 if (resync_thread) { 1777 diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 1778 diskaddr_t line = (lastblk + 1) / un->un_segsize; 1779 ASSERT(raid_state_cnt(un, RCS_RESYNC)); 1780 mutex_enter(&un->un_mx); 1781 un->un_resync_line_index = line; 1782 mutex_exit(&un->un_mx); 1783 } 1784 if (!panicstr) 1785 mutex_exit(&un->un_linlck_mx); 1786 } 1787 1788 int 1789 raid_line_writer_lock(md_raidcs_t *cs, int lock) 1790 { 1791 mr_unit_t *un; 1792 md_raidcs_t *cs1; 1793 1794 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1795 cs->cs_flags |= MD_RCS_WRITER; 1796 un = cs->cs_ps->ps_un; 1797 1798 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1799 if (lock && !panicstr) 1800 mutex_enter(&un->un_linlck_mx); 1801 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1802 1803 cs1 = un->un_linlck_chn; 1804 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1805 if (raid_io_overlaps(cs, cs1)) 1806 break; 1807 1808 if (cs1 != NULL) { 1809 if (panicstr) 1810 panic("md: line writer lock inaccessible"); 1811 goto no_lock_exit; 1812 } 1813 1814 if (raid_alloc_pwslot(cs)) { 1815 if (panicstr) 1816 panic("md: no prewrite slots"); 1817 STAT_INC(raid_prewrite_waits); 1818 goto no_lock_exit; 1819 } 1820 1821 cs1 = un->un_linlck_chn; 1822 if (cs1 != NULL) 1823 cs1->cs_linlck_prev = cs; 1824 cs->cs_linlck_next = cs1; 1825 cs->cs_linlck_prev = NULL; 1826 un->un_linlck_chn = cs; 1827 cs->cs_flags |= MD_RCS_LLOCKD; 1828 cs->cs_flags &= ~MD_RCS_WAITING; 1829 STAT_INC(raid_writer_locks); 1830 STAT_MAX(raid_max_write_locks, raid_write_locks_active); 1831 if (lock && !panicstr) 1832 mutex_exit(&un->un_linlck_mx); 1833 return (0); 1834 1835 no_lock_exit: 1836 /* if this is already queued then do not requeue it */ 1837 ASSERT(! 
(cs->cs_flags & MD_RCS_LLOCKD)); 1838 if (!lock || (cs->cs_flags & MD_RCS_WAITING)) 1839 return (1); 1840 cs->cs_flags |= MD_RCS_WAITING; 1841 cs->cs_un = un; 1842 raid_enqueue(cs); 1843 if (lock && !panicstr) 1844 mutex_exit(&un->un_linlck_mx); 1845 return (1); 1846 } 1847 1848 static void 1849 raid_startio(md_raidcs_t *cs) 1850 { 1851 mdi_unit_t *ui = cs->cs_ps->ps_ui; 1852 mr_unit_t *un = cs->cs_un; 1853 1854 un = md_unit_readerlock(ui); 1855 raid_write_io(un, cs); 1856 } 1857 1858 void 1859 raid_io_startup(mr_unit_t *un) 1860 { 1861 md_raidcs_t *waiting_list, *cs1; 1862 md_raidcs_t *previous = NULL, *next = NULL; 1863 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1864 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 1865 1866 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1867 mutex_enter(io_list_mutex); 1868 1869 /* 1870 * check to be sure there are no reader locks outstanding. If 1871 * there are not then pass on the writer lock. 1872 */ 1873 waiting_list = ui->ui_io_lock->io_list_front; 1874 while (waiting_list) { 1875 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1876 ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); 1877 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1878 if (raid_io_overlaps(waiting_list, cs1) == 1) 1879 break; 1880 /* 1881 * there was an IOs that overlaps this io so go onto 1882 * the next io in the waiting list 1883 */ 1884 if (cs1) { 1885 previous = waiting_list; 1886 waiting_list = waiting_list->cs_linlck_next; 1887 continue; 1888 } 1889 1890 /* 1891 * There are no IOs that overlap this, so remove it from 1892 * the waiting queue, and start it 1893 */ 1894 1895 if (raid_check_pw(waiting_list)) { 1896 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1897 previous = waiting_list; 1898 waiting_list = waiting_list->cs_linlck_next; 1899 continue; 1900 } 1901 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1902 1903 next = waiting_list->cs_linlck_next; 1904 if (previous) 1905 previous->cs_linlck_next = next; 1906 else 1907 ui->ui_io_lock->io_list_front = next; 1908 1909 if (ui->ui_io_lock->io_list_front == NULL) 1910 ui->ui_io_lock->io_list_back = NULL; 1911 1912 if (ui->ui_io_lock->io_list_back == waiting_list) 1913 ui->ui_io_lock->io_list_back = previous; 1914 1915 waiting_list->cs_linlck_next = NULL; 1916 waiting_list->cs_flags &= ~MD_RCS_WAITING; 1917 STAT_DEC(raid_write_queue_length); 1918 if (raid_line_writer_lock(waiting_list, 0)) 1919 panic("region locking corrupted"); 1920 1921 ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); 1922 daemon_request(&md_mstr_daemon, raid_startio, 1923 (daemon_queue_t *)waiting_list, REQ_OLD); 1924 waiting_list = next; 1925 1926 } 1927 mutex_exit(io_list_mutex); 1928 } 1929 1930 void 1931 raid_line_exit(md_raidcs_t *cs) 1932 { 1933 mr_unit_t *un; 1934 1935 un = cs->cs_ps->ps_un; 1936 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1937 mutex_enter(&un->un_linlck_mx); 1938 if (cs->cs_flags & MD_RCS_READER) 1939 STAT_DEC(raid_reader_locks_active); 1940 else 1941 STAT_DEC(raid_write_locks_active); 1942 1943 if (cs->cs_linlck_prev) 1944 cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; 1945 else 1946 un->un_linlck_chn = cs->cs_linlck_next; 1947 if (cs->cs_linlck_next) 1948 cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; 1949 1950 cs->cs_flags &= ~MD_RCS_LLOCKD; 1951 1952 if (un->un_linlck_flg) 1953 cv_broadcast(&un->un_linlck_cv); 1954 1955 un->un_linlck_flg = 0; 1956 cs->cs_line = MD_DISKADDR_ERROR; 1957 1958 raid_cancel_pwslot(cs); 1959 /* 1960 * now that the lock is droped go 
ahead and see if there are any 1961 * other writes that can be started up 1962 */ 1963 raid_io_startup(un); 1964 1965 mutex_exit(&un->un_linlck_mx); 1966 } 1967 1968 /* 1969 * NAMES: raid_line, raid_pcolumn, raid_dcolumn 1970 * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, 1971 * data column # and parity column #. 1972 * PARAMETERS: int segment - segment number 1973 * mr_unit_t *un - pointer to an unit structure 1974 * RETURNS: raid_line returns line # 1975 * raid_dcolumn returns data column # 1976 * raid_pcolumn returns parity column # 1977 */ 1978 static diskaddr_t 1979 raid_line(diskaddr_t segment, mr_unit_t *un) 1980 { 1981 diskaddr_t adj_seg; 1982 diskaddr_t line; 1983 diskaddr_t max_orig_segment; 1984 1985 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1986 if (segment >= max_orig_segment) { 1987 adj_seg = segment - max_orig_segment; 1988 line = adj_seg % un->un_segsincolumn; 1989 } else { 1990 line = segment / (un->un_origcolumncnt - 1); 1991 } 1992 return (line); 1993 } 1994 1995 uint_t 1996 raid_dcolumn(diskaddr_t segment, mr_unit_t *un) 1997 { 1998 diskaddr_t adj_seg; 1999 diskaddr_t line; 2000 diskaddr_t max_orig_segment; 2001 uint_t column; 2002 2003 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2004 if (segment >= max_orig_segment) { 2005 adj_seg = segment - max_orig_segment; 2006 column = un->un_origcolumncnt + 2007 (uint_t)(adj_seg / un->un_segsincolumn); 2008 } else { 2009 line = segment / (un->un_origcolumncnt - 1); 2010 column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line) 2011 % un->un_origcolumncnt); 2012 } 2013 return (column); 2014 } 2015 2016 uint_t 2017 raid_pcolumn(diskaddr_t segment, mr_unit_t *un) 2018 { 2019 diskaddr_t adj_seg; 2020 diskaddr_t line; 2021 diskaddr_t max_orig_segment; 2022 uint_t column; 2023 2024 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2025 if (segment >= max_orig_segment) { 2026 adj_seg = segment - max_orig_segment; 2027 line = adj_seg % un->un_segsincolumn; 2028 } else { 2029 line = segment / (un->un_origcolumncnt - 1); 2030 } 2031 column = (uint_t)((line + (un->un_origcolumncnt - 1)) 2032 % un->un_origcolumncnt); 2033 return (column); 2034 } 2035 2036 2037 /* 2038 * Is called in raid_iosetup to probe each column to insure 2039 * that all the columns are in 'okay' state and meet the 2040 * 'full line' requirement. If any column is in error, 2041 * we don't want to enable the 'full line' flag. Previously, 2042 * we would do so and disable it only when a error is 2043 * detected after the first 'full line' io which is too late 2044 * and leads to the potential data corruption. 2045 */ 2046 static int 2047 raid_check_cols(mr_unit_t *un) 2048 { 2049 buf_t bp; 2050 char *buf; 2051 mr_column_t *colptr; 2052 minor_t mnum = MD_SID(un); 2053 int i; 2054 int err = 0; 2055 2056 buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); 2057 2058 for (i = 0; i < un->un_totalcolumncnt; i++) { 2059 md_dev64_t tmpdev; 2060 2061 colptr = &un->un_column[i]; 2062 2063 tmpdev = colptr->un_dev; 2064 /* 2065 * Open by device id 2066 * If this device is hotspared 2067 * use the hotspare key 2068 */ 2069 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 
2070 colptr->un_hs_key : colptr->un_orig_key); 2071 2072 if (tmpdev == NODEV64) { 2073 err = 1; 2074 break; 2075 } 2076 2077 colptr->un_dev = tmpdev; 2078 2079 bzero((caddr_t)&bp, sizeof (buf_t)); 2080 bp.b_back = &bp; 2081 bp.b_forw = &bp; 2082 bp.b_flags = (B_READ | B_BUSY); 2083 sema_init(&bp.b_io, 0, NULL, 2084 SEMA_DEFAULT, NULL); 2085 sema_init(&bp.b_sem, 0, NULL, 2086 SEMA_DEFAULT, NULL); 2087 bp.b_edev = md_dev64_to_dev(colptr->un_dev); 2088 bp.b_lblkno = colptr->un_pwstart; 2089 bp.b_bcount = DEV_BSIZE; 2090 bp.b_bufsize = DEV_BSIZE; 2091 bp.b_un.b_addr = (caddr_t)buf; 2092 (void) md_call_strategy(&bp, 0, NULL); 2093 if (biowait(&bp)) { 2094 err = 1; 2095 break; 2096 } 2097 } 2098 2099 kmem_free(buf, DEV_BSIZE); 2100 return (err); 2101 } 2102 2103 /* 2104 * NAME: raid_iosetup 2105 * DESCRIPTION: RAID metadevice specific I/O set up routine which does 2106 * all the necessary calculations to determine the location 2107 * of the segement for the I/O. 2108 * PARAMETERS: mr_unit_t *un - unit number of RAID metadevice 2109 * diskaddr_t blkno - block number of the I/O attempt 2110 * size_t blkcnt - block count for this I/O 2111 * md_raidcs_t *cs - child structure for each segmented I/O 2112 * 2113 * NOTE: The following is an example of a raid disk layer out: 2114 * 2115 * Total Column = 5 2116 * Original Column = 4 2117 * Segment Per Column = 10 2118 * 2119 * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6 2120 * ------------------------------------------------------------- 2121 * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40 2122 * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31 2123 * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32 2124 * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33 2125 * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34 2126 * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35 2127 * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36 2128 * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37 2129 * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38 2130 * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39 2131 */ 2132 static size_t 2133 raid_iosetup( 2134 mr_unit_t *un, 2135 diskaddr_t blkno, 2136 size_t blkcnt, 2137 md_raidcs_t *cs 2138 ) 2139 { 2140 diskaddr_t segment; 2141 diskaddr_t segstart; 2142 diskaddr_t segoff; 2143 size_t leftover; 2144 diskaddr_t line; 2145 uint_t iosize; 2146 uint_t colcnt; 2147 2148 /* caculate the segment# and offset for the block */ 2149 segment = blkno / un->un_segsize; 2150 segstart = segment * un->un_segsize; 2151 segoff = blkno - segstart; 2152 iosize = un->un_iosize - 1; 2153 colcnt = un->un_totalcolumncnt - 1; 2154 line = raid_line(segment, un); 2155 cs->cs_dcolumn = raid_dcolumn(segment, un); 2156 cs->cs_pcolumn = raid_pcolumn(segment, un); 2157 cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags; 2158 cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags; 2159 cs->cs_line = line; 2160 2161 if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) && 2162 (UNIT_STATE(un) & RCS_OKAY) && 2163 (segoff == 0) && 2164 (un->un_totalcolumncnt == un->un_origcolumncnt) && 2165 (un->un_segsize < un->un_iosize) && 2166 (un->un_iosize <= un->un_maxio) && 2167 (blkno == line * un->un_segsize * colcnt) && 2168 (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) && 2169 (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) && 2170 (raid_check_cols(un) == 0)) { 2171 2172 md_raidcbuf_t **cbufp; 2173 md_raidcbuf_t *cbuf; 2174 int i, j; 2175 2176 STAT_INC(raid_full_line_writes); 2177 leftover = blkcnt - (un->un_segsize * colcnt); 2178 ASSERT(blkcnt >= (un->un_segsize * colcnt)); 2179 cs->cs_blkno = line * un->un_segsize; 2180 
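			/*
			 * Full-line write: this child covers one complete line,
			 * so each data column transfers exactly one segment and
			 * the new parity is generated from the new data alone in
			 * genlineparity(), with no read of old data or old
			 * parity needed first.
			 */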
cs->cs_blkcnt = un->un_segsize; 2181 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2182 cs->cs_bcount = dbtob(cs->cs_blkcnt); 2183 cs->cs_flags |= MD_RCS_LINE; 2184 2185 cbufp = &cs->cs_buflist; 2186 for (i = 0; i < un->un_totalcolumncnt; i++) { 2187 j = cs->cs_dcolumn + i; 2188 j = j % un->un_totalcolumncnt; 2189 2190 if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn)) 2191 continue; 2192 cbuf = kmem_cache_alloc(raid_cbuf_cache, 2193 MD_ALLOCFLAGS); 2194 raid_cbuf_init(cbuf); 2195 cbuf->cbuf_un = cs->cs_un; 2196 cbuf->cbuf_ps = cs->cs_ps; 2197 cbuf->cbuf_column = j; 2198 cbuf->cbuf_bcount = dbtob(un->un_segsize); 2199 *cbufp = cbuf; 2200 cbufp = &cbuf->cbuf_next; 2201 } 2202 return (leftover); 2203 } 2204 2205 leftover = blkcnt - (un->un_segsize - segoff); 2206 if (blkcnt > (un->un_segsize - segoff)) 2207 blkcnt -= leftover; 2208 else 2209 leftover = 0; 2210 2211 if (blkcnt > (size_t)iosize) { 2212 leftover += (blkcnt - iosize); 2213 blkcnt = iosize; 2214 } 2215 2216 /* calculate the line# and column# for the segment */ 2217 cs->cs_flags &= ~MD_RCS_LINE; 2218 cs->cs_blkno = line * un->un_segsize + segoff; 2219 cs->cs_blkcnt = (uint_t)blkcnt; 2220 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2221 cs->cs_bcount = dbtob((uint_t)blkcnt); 2222 return (leftover); 2223 } 2224 2225 /* 2226 * NAME: raid_done 2227 * DESCRIPTION: RAID metadevice I/O done interrupt routine 2228 * PARAMETERS: struct buf *bp - pointer to a buffer structure 2229 */ 2230 static void 2231 raid_done(struct buf *bp) 2232 { 2233 md_raidcs_t *cs; 2234 int flags, frags; 2235 2236 sema_v(&bp->b_io); 2237 cs = (md_raidcs_t *)bp->b_chain; 2238 2239 ASSERT(cs != NULL); 2240 2241 mutex_enter(&cs->cs_mx); 2242 if (bp->b_flags & B_ERROR) { 2243 cs->cs_flags |= MD_RCS_ERROR; 2244 cs->cs_flags &= ~(MD_RCS_ISCALL); 2245 } 2246 2247 flags = cs->cs_flags; 2248 frags = --cs->cs_frags; 2249 mutex_exit(&cs->cs_mx); 2250 if (frags != 0) { 2251 return; 2252 } 2253 2254 if (flags & MD_RCS_ERROR) { 2255 if (cs->cs_error_call) { 2256 daemon_request(&md_done_daemon, cs->cs_error_call, 2257 (daemon_queue_t *)cs, REQ_OLD); 2258 } 2259 return; 2260 } 2261 2262 if (flags & MD_RCS_ISCALL) { 2263 cs->cs_flags &= ~(MD_RCS_ISCALL); 2264 (*(cs->cs_call))(cs); 2265 return; 2266 } 2267 daemon_request(&md_done_daemon, cs->cs_call, 2268 (daemon_queue_t *)cs, REQ_OLD); 2269 } 2270 /* 2271 * the flag RIO_EXTRA is used when dealing with a column in the process 2272 * of being resynced. During the resync, writes may have to take place 2273 * on both the original component and a hotspare component. 
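 * In that case the same child issues the write twice: the normal buffer
 * goes to the original device, and the spare buffer (cs_hbuf, selected
 * with RIO_EXTRA) goes to the alternate device selected with RIO_ALT.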
2274 */ 2275 #define RIO_DATA 0x00100 /* use data buffer & data column */ 2276 #define RIO_PARITY 0x00200 /* use parity buffer & parity column */ 2277 #define RIO_WRITE 0x00400 /* issue a write */ 2278 #define RIO_READ 0x00800 /* issue a read */ 2279 #define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ 2280 #define RIO_ALT 0x02000 /* do write to alternate device */ 2281 #define RIO_EXTRA 0x04000 /* use extra buffer */ 2282 2283 #define RIO_COLMASK 0x000ff 2284 2285 #define RIO_PREWRITE RIO_WRITE | RIO_PWIO 2286 2287 /* 2288 * NAME: raidio 2289 * DESCRIPTION: RAID metadevice write routine 2290 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2291 */ 2292 static void 2293 raidio(md_raidcs_t *cs, int flags) 2294 { 2295 buf_t *bp; 2296 int column; 2297 int flag; 2298 void *private; 2299 mr_unit_t *un; 2300 int iosize; 2301 diskaddr_t pwstart; 2302 diskaddr_t devstart; 2303 md_dev64_t dev; 2304 2305 un = cs->cs_un; 2306 2307 ASSERT(IO_READER_HELD(un)); 2308 ASSERT(UNIT_READER_HELD(un)); 2309 2310 if (flags & RIO_DATA) { 2311 if (flags & RIO_EXTRA) 2312 bp = &cs->cs_hbuf; 2313 else 2314 bp = &cs->cs_dbuf; 2315 bp->b_un.b_addr = cs->cs_dbuffer; 2316 column = cs->cs_dcolumn; 2317 } else { 2318 if (flags & RIO_EXTRA) 2319 bp = &cs->cs_hbuf; 2320 else 2321 bp = &cs->cs_pbuf; 2322 bp->b_un.b_addr = cs->cs_pbuffer; 2323 column = cs->cs_pcolumn; 2324 } 2325 if (flags & RIO_COLMASK) 2326 column = (flags & RIO_COLMASK) - 1; 2327 2328 bp->b_bcount = cs->cs_bcount; 2329 bp->b_bufsize = cs->cs_bcount; 2330 iosize = un->un_iosize; 2331 2332 /* check if the hotspared device will be used */ 2333 if (flags & RIO_ALT && (flags & RIO_WRITE)) { 2334 pwstart = un->un_column[column].un_alt_pwstart; 2335 devstart = un->un_column[column].un_alt_devstart; 2336 dev = un->un_column[column].un_alt_dev; 2337 } else { 2338 pwstart = un->un_column[column].un_pwstart; 2339 devstart = un->un_column[column].un_devstart; 2340 dev = un->un_column[column].un_dev; 2341 } 2342 2343 /* if not writing to log skip log header */ 2344 if ((flags & RIO_PWIO) == 0) { 2345 bp->b_lblkno = devstart + cs->cs_blkno; 2346 bp->b_un.b_addr += DEV_BSIZE; 2347 } else { 2348 bp->b_bcount += DEV_BSIZE; 2349 bp->b_bufsize = bp->b_bcount; 2350 if (flags & RIO_DATA) { 2351 bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; 2352 } else { /* not DATA -> PARITY */ 2353 bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; 2354 } 2355 } 2356 2357 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); 2358 bp->b_flags |= B_BUSY; 2359 if (flags & RIO_READ) { 2360 bp->b_flags |= B_READ; 2361 } else { 2362 bp->b_flags |= B_WRITE; 2363 if ((nv_available && nv_parity && (flags & RIO_PARITY)) || 2364 (nv_available && nv_prewrite && (flags & RIO_PWIO))) 2365 bp->b_flags |= nv_available; 2366 } 2367 bp->b_iodone = (int (*)())raid_done; 2368 bp->b_edev = md_dev64_to_dev(dev); 2369 2370 ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); 2371 2372 private = cs->cs_strategy_private; 2373 flag = cs->cs_strategy_flag; 2374 2375 md_call_strategy(bp, flag, private); 2376 } 2377 2378 /* 2379 * NAME: genstandardparity 2380 * DESCRIPTION: This routine 2381 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2382 */ 2383 static void 2384 genstandardparity(md_raidcs_t *cs) 2385 { 2386 uint_t *dbuf, *pbuf; 2387 size_t wordcnt; 2388 uint_t dsum = 0; 2389 uint_t psum = 0; 2390 2391 ASSERT((cs->cs_bcount & 0x3) == 0); 2392 2393 wordcnt = cs->cs_bcount / sizeof (uint_t); 2394 2395 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2396 pbuf = 
(uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2397 2398 /* Word aligned */ 2399 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2400 uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); 2401 uint_t uval; 2402 2403 while (wordcnt--) { 2404 uval = *uwbuf++; 2405 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); 2406 ++pbuf; 2407 *dbuf = uval; 2408 dsum ^= uval; 2409 ++dbuf; 2410 } 2411 } else { 2412 uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); 2413 union { 2414 uint_t wb; 2415 uchar_t bb[4]; 2416 } cb; 2417 2418 while (wordcnt--) { 2419 cb.bb[0] = *ubbuf++; 2420 cb.bb[1] = *ubbuf++; 2421 cb.bb[2] = *ubbuf++; 2422 cb.bb[3] = *ubbuf++; 2423 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); 2424 ++pbuf; 2425 *dbuf = cb.wb; 2426 dsum ^= cb.wb; 2427 ++dbuf; 2428 } 2429 } 2430 2431 RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, 2432 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2433 2, cs->cs_dcolumn, RAID_PWMAGIC); 2434 2435 RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, 2436 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2437 2, cs->cs_pcolumn, RAID_PWMAGIC); 2438 } 2439 2440 static void 2441 genlineparity(md_raidcs_t *cs) 2442 { 2443 2444 mr_unit_t *un = cs->cs_un; 2445 md_raidcbuf_t *cbuf; 2446 uint_t *pbuf, *dbuf; 2447 uint_t *uwbuf; 2448 uchar_t *ubbuf; 2449 size_t wordcnt; 2450 uint_t psum = 0, dsum = 0; 2451 size_t count = un->un_segsize * DEV_BSIZE; 2452 uint_t col; 2453 buf_t *bp; 2454 2455 ASSERT((cs->cs_bcount & 0x3) == 0); 2456 2457 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2458 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2459 uwbuf = (uint_t *)(void *)(cs->cs_addr); 2460 ubbuf = (uchar_t *)(void *)(cs->cs_addr); 2461 2462 wordcnt = count / sizeof (uint_t); 2463 2464 /* Word aligned */ 2465 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2466 uint_t uval; 2467 2468 while (wordcnt--) { 2469 uval = *uwbuf++; 2470 *dbuf = uval; 2471 *pbuf = uval; 2472 dsum ^= uval; 2473 ++pbuf; 2474 ++dbuf; 2475 } 2476 } else { 2477 union { 2478 uint_t wb; 2479 uchar_t bb[4]; 2480 } cb; 2481 2482 while (wordcnt--) { 2483 cb.bb[0] = *ubbuf++; 2484 cb.bb[1] = *ubbuf++; 2485 cb.bb[2] = *ubbuf++; 2486 cb.bb[3] = *ubbuf++; 2487 *dbuf = cb.wb; 2488 *pbuf = cb.wb; 2489 dsum ^= cb.wb; 2490 ++pbuf; 2491 ++dbuf; 2492 } 2493 } 2494 2495 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, 2496 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2497 un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); 2498 2499 raidio(cs, RIO_PREWRITE | RIO_DATA); 2500 2501 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 2502 2503 dsum = 0; 2504 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2505 dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); 2506 2507 wordcnt = count / sizeof (uint_t); 2508 2509 col = cbuf->cbuf_column; 2510 2511 /* Word aligned */ 2512 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2513 uint_t uval; 2514 2515 /* 2516 * Only calculate psum when working on the last 2517 * data buffer. 2518 */ 2519 if (cbuf->cbuf_next == NULL) { 2520 psum = 0; 2521 while (wordcnt--) { 2522 uval = *uwbuf++; 2523 *dbuf = uval; 2524 psum ^= (*pbuf ^= uval); 2525 dsum ^= uval; 2526 ++dbuf; 2527 ++pbuf; 2528 } 2529 } else { 2530 while (wordcnt--) { 2531 uval = *uwbuf++; 2532 *dbuf = uval; 2533 *pbuf ^= uval; 2534 dsum ^= uval; 2535 ++dbuf; 2536 ++pbuf; 2537 } 2538 } 2539 } else { 2540 union { 2541 uint_t wb; 2542 uchar_t bb[4]; 2543 } cb; 2544 2545 /* 2546 * Only calculate psum when working on the last 2547 * data buffer. 
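			 * Each pass folds one more column's data into the
			 * parity buffer, so the parity (and its checksum
			 * psum) is complete only after the final cbuf.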
2548 */ 2549 if (cbuf->cbuf_next == NULL) { 2550 psum = 0; 2551 while (wordcnt--) { 2552 cb.bb[0] = *ubbuf++; 2553 cb.bb[1] = *ubbuf++; 2554 cb.bb[2] = *ubbuf++; 2555 cb.bb[3] = *ubbuf++; 2556 *dbuf = cb.wb; 2557 psum ^= (*pbuf ^= cb.wb); 2558 dsum ^= cb.wb; 2559 ++dbuf; 2560 ++pbuf; 2561 } 2562 } else { 2563 while (wordcnt--) { 2564 cb.bb[0] = *ubbuf++; 2565 cb.bb[1] = *ubbuf++; 2566 cb.bb[2] = *ubbuf++; 2567 cb.bb[3] = *ubbuf++; 2568 *dbuf = cb.wb; 2569 *pbuf ^= cb.wb; 2570 dsum ^= cb.wb; 2571 ++dbuf; 2572 ++pbuf; 2573 } 2574 } 2575 } 2576 RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, 2577 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2578 un->un_totalcolumncnt, col, RAID_PWMAGIC); 2579 2580 /* 2581 * fill in buffer for write to prewrite area 2582 */ 2583 bp = &cbuf->cbuf_bp; 2584 bp->b_un.b_addr = cbuf->cbuf_buffer; 2585 bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; 2586 bp->b_bufsize = bp->b_bcount; 2587 bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + 2588 un->un_column[col].un_pwstart; 2589 bp->b_flags = B_WRITE | B_BUSY; 2590 if (nv_available && nv_prewrite) 2591 bp->b_flags |= nv_available; 2592 bp->b_iodone = (int (*)())raid_done; 2593 bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); 2594 bp->b_chain = (struct buf *)cs; 2595 md_call_strategy(bp, 2596 cs->cs_strategy_flag, cs->cs_strategy_private); 2597 } 2598 2599 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, 2600 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2601 un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); 2602 2603 raidio(cs, RIO_PREWRITE | RIO_PARITY); 2604 } 2605 2606 /* 2607 * NAME: raid_readregenloop 2608 * DESCRIPTION: RAID metadevice write routine 2609 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2610 */ 2611 static void 2612 raid_readregenloop(md_raidcs_t *cs) 2613 { 2614 mr_unit_t *un; 2615 md_raidps_t *ps; 2616 uint_t *dbuf; 2617 uint_t *pbuf; 2618 size_t wordcnt; 2619 2620 un = cs->cs_un; 2621 2622 /* 2623 * XOR the parity with data bytes, must skip the 2624 * pre-write entry header in all data/parity buffers 2625 */ 2626 wordcnt = cs->cs_bcount / sizeof (uint_t); 2627 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2628 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2629 while (wordcnt--) 2630 *dbuf++ ^= *pbuf++; 2631 2632 /* bump up the loop count */ 2633 cs->cs_loop++; 2634 2635 /* skip the errored component */ 2636 if (cs->cs_loop == cs->cs_dcolumn) 2637 cs->cs_loop++; 2638 2639 if (cs->cs_loop != un->un_totalcolumncnt) { 2640 cs->cs_frags = 1; 2641 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2642 return; 2643 } 2644 /* reaching the end sof loop */ 2645 ps = cs->cs_ps; 2646 bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); 2647 raid_free_child(cs, 1); 2648 2649 /* decrement readfrags */ 2650 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2651 } 2652 2653 /* 2654 * NAME: raid_read_io 2655 * DESCRIPTION: RAID metadevice read I/O routine 2656 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2657 * md_raidcs_t *cs - pointer to a child structure 2658 */ 2659 static void 2660 raid_read_io(mr_unit_t *un, md_raidcs_t *cs) 2661 { 2662 int flag; 2663 void *private; 2664 buf_t *bp; 2665 buf_t *pb = cs->cs_ps->ps_bp; 2666 mr_column_t *column; 2667 2668 flag = cs->cs_strategy_flag; 2669 private = cs->cs_strategy_private; 2670 column = &un->un_column[cs->cs_dcolumn]; 2671 2672 /* 2673 * The component to be read is good, simply set up bp structure 2674 * and call low level md routine doing the read. 
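	 * A column in the last-erred state is also read directly here, as
	 * long as this is not a recovery request.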
2675 */ 2676 2677 if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || 2678 (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && 2679 (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { 2680 dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ 2681 ddi_dev = md_dev64_to_dev(column->un_dev); 2682 2683 bp = &cs->cs_dbuf; 2684 bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, 2685 column->un_devstart + cs->cs_blkno, 2686 (int (*)())raid_done, bp, KM_NOSLEEP); 2687 2688 bp->b_chain = (buf_t *)cs; 2689 2690 cs->cs_frags = 1; 2691 cs->cs_error_call = raid_read_error; 2692 cs->cs_retry_call = raid_read_retry; 2693 cs->cs_flags |= MD_RCS_ISCALL; 2694 cs->cs_stage = RAID_READ_DONE; 2695 cs->cs_call = raid_stage; 2696 2697 ASSERT(bp->b_edev != 0); 2698 2699 md_call_strategy(bp, flag, private); 2700 return; 2701 } 2702 2703 /* 2704 * The component to be read is bad, have to go through 2705 * raid specific method to read data from other members. 2706 */ 2707 cs->cs_loop = 0; 2708 /* 2709 * NOTE: always get dbuffer before pbuffer 2710 * and get both buffers before pwslot 2711 * otherwise a deadlock could be introduced. 2712 */ 2713 raid_mapin_buf(cs); 2714 getdbuffer(cs); 2715 getpbuffer(cs); 2716 if (cs->cs_loop == cs->cs_dcolumn) 2717 cs->cs_loop++; 2718 2719 /* zero out data buffer for use as a data sink */ 2720 bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); 2721 cs->cs_stage = RAID_NONE; 2722 cs->cs_call = raid_readregenloop; 2723 cs->cs_error_call = raid_read_error; 2724 cs->cs_retry_call = raid_read_no_retry; 2725 cs->cs_frags = 1; 2726 2727 /* use parity buffer to read other columns */ 2728 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2729 } 2730 2731 /* 2732 * NAME: raid_read 2733 * DESCRIPTION: RAID metadevice write routine 2734 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2735 * md_raidcs_t *cs - pointer to a child structure 2736 */ 2737 static int 2738 raid_read(mr_unit_t *un, md_raidcs_t *cs) 2739 { 2740 int error = 0; 2741 md_raidps_t *ps; 2742 mdi_unit_t *ui; 2743 minor_t mnum; 2744 2745 ASSERT(IO_READER_HELD(un)); 2746 ps = cs->cs_ps; 2747 ui = ps->ps_ui; 2748 raid_line_reader_lock(cs, 0); 2749 un = (mr_unit_t *)md_unit_readerlock(ui); 2750 ASSERT(UNIT_STATE(un) != RUS_INIT); 2751 mnum = MD_SID(un); 2752 cs->cs_un = un; 2753 2754 /* make sure the read doesn't go beyond the end of the column */ 2755 if (cs->cs_blkno + cs->cs_blkcnt > 2756 un->un_segsize * un->un_segsincolumn) { 2757 error = ENXIO; 2758 } 2759 if (error) 2760 goto rerror; 2761 2762 if (un->un_state & RUS_REGEN) { 2763 raid_regen_parity(cs); 2764 un = MD_UNIT(mnum); 2765 cs->cs_un = un; 2766 } 2767 2768 raid_read_io(un, cs); 2769 return (0); 2770 2771 rerror: 2772 raid_error_parent(ps, error); 2773 raid_free_child(cs, 1); 2774 /* decrement readfrags */ 2775 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2776 return (0); 2777 } 2778 2779 /* 2780 * NAME: raid_write_err_retry 2781 * DESCRIPTION: RAID metadevice write retry routine 2782 * write was for parity or data only; 2783 * complete write with error, no recovery possible 2784 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2785 * md_raidcs_t *cs - pointer to a child structure 2786 */ 2787 /*ARGSUSED*/ 2788 static void 2789 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) 2790 { 2791 md_raidps_t *ps = cs->cs_ps; 2792 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2793 2794 /* decrement pwfrags if needed, and frags */ 2795 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2796 flags |= RFP_DECR_PWFRAGS; 2797 raid_error_parent(ps, EIO); 2798 
raid_free_child(cs, 1); 2799 raid_free_parent(ps, flags); 2800 } 2801 2802 /* 2803 * NAME: raid_write_err_retry 2804 * DESCRIPTION: RAID metadevice write retry routine 2805 * write is too far along to retry and parent 2806 * has already been signaled with iodone. 2807 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2808 * md_raidcs_t *cs - pointer to a child structure 2809 */ 2810 /*ARGSUSED*/ 2811 static void 2812 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) 2813 { 2814 md_raidps_t *ps = cs->cs_ps; 2815 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2816 2817 /* decrement pwfrags if needed, and frags */ 2818 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2819 flags |= RFP_DECR_PWFRAGS; 2820 raid_free_child(cs, 1); 2821 raid_free_parent(ps, flags); 2822 } 2823 2824 /* 2825 * NAME: raid_write_retry 2826 * DESCRIPTION: RAID metadevice write retry routine 2827 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2828 * md_raidcs_t *cs - pointer to a child structure 2829 */ 2830 static void 2831 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) 2832 { 2833 md_raidps_t *ps; 2834 2835 ps = cs->cs_ps; 2836 2837 /* re-initialize the buf_t structure for raid_write() */ 2838 cs->cs_dbuf.b_chain = (struct buf *)cs; 2839 cs->cs_dbuf.b_back = &cs->cs_dbuf; 2840 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 2841 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 2842 cs->cs_dbuf.b_error = 0; /* initialize error */ 2843 cs->cs_dbuf.b_offset = -1; 2844 /* Initialize semaphores */ 2845 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 2846 SEMA_DEFAULT, NULL); 2847 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 2848 SEMA_DEFAULT, NULL); 2849 2850 cs->cs_pbuf.b_chain = (struct buf *)cs; 2851 cs->cs_pbuf.b_back = &cs->cs_pbuf; 2852 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 2853 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 2854 cs->cs_pbuf.b_error = 0; /* initialize error */ 2855 cs->cs_pbuf.b_offset = -1; 2856 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 2857 SEMA_DEFAULT, NULL); 2858 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 2859 SEMA_DEFAULT, NULL); 2860 2861 cs->cs_hbuf.b_chain = (struct buf *)cs; 2862 cs->cs_hbuf.b_back = &cs->cs_hbuf; 2863 cs->cs_hbuf.b_forw = &cs->cs_hbuf; 2864 cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ 2865 cs->cs_hbuf.b_error = 0; /* initialize error */ 2866 cs->cs_hbuf.b_offset = -1; 2867 sema_init(&cs->cs_hbuf.b_io, 0, NULL, 2868 SEMA_DEFAULT, NULL); 2869 sema_init(&cs->cs_hbuf.b_sem, 0, NULL, 2870 SEMA_DEFAULT, NULL); 2871 2872 cs->cs_flags &= ~(MD_RCS_ERROR); 2873 /* 2874 * If we have already done'ed the i/o but have done prewrite 2875 * on this child, then reset PWDONE flag and bump pwfrags before 2876 * restarting i/o. 2877 * If pwfrags is zero, we have already 'iodone'd the i/o so 2878 * leave things alone. We don't want to re-'done' it. 2879 */ 2880 mutex_enter(&ps->ps_mx); 2881 if (cs->cs_flags & MD_RCS_PWDONE) { 2882 cs->cs_flags &= ~MD_RCS_PWDONE; 2883 ps->ps_pwfrags++; 2884 } 2885 mutex_exit(&ps->ps_mx); 2886 raid_write_io(un, cs); 2887 } 2888 2889 /* 2890 * NAME: raid_wrerr 2891 * DESCRIPTION: RAID metadevice write routine 2892 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2893 * LOCKS: must obtain unit writer lock while calling raid_error_state 2894 * since a unit or column state transition may take place. 2895 * must obtain unit reader lock to retry I/O. 
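 * The writer lock is dropped before the reader lock is re-acquired, so
 * the retry routine runs with the same unit reader lock the original
 * I/O held.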
2896 */ 2897 static void 2898 raid_wrerr(md_raidcs_t *cs) 2899 { 2900 md_raidps_t *ps; 2901 mdi_unit_t *ui; 2902 mr_unit_t *un; 2903 md_raidcbuf_t *cbuf; 2904 2905 ps = cs->cs_ps; 2906 ui = ps->ps_ui; 2907 2908 un = (mr_unit_t *)md_unit_writerlock(ui); 2909 ASSERT(un != 0); 2910 2911 if (cs->cs_dbuf.b_flags & B_ERROR) 2912 (void) raid_error_state(un, &cs->cs_dbuf); 2913 if (cs->cs_pbuf.b_flags & B_ERROR) 2914 (void) raid_error_state(un, &cs->cs_pbuf); 2915 if (cs->cs_hbuf.b_flags & B_ERROR) 2916 (void) raid_error_state(un, &cs->cs_hbuf); 2917 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2918 if (cbuf->cbuf_bp.b_flags & B_ERROR) 2919 (void) raid_error_state(un, &cbuf->cbuf_bp); 2920 2921 md_unit_writerexit(ui); 2922 2923 ps->ps_flags |= MD_RPS_HSREQ; 2924 2925 un = (mr_unit_t *)md_unit_readerlock(ui); 2926 2927 /* now attempt the appropriate retry routine */ 2928 (*(cs->cs_retry_call))(un, cs); 2929 } 2930 /* 2931 * NAMES: raid_write_error 2932 * DESCRIPTION: I/O error handling routine for a RAID metadevice write 2933 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 2934 */ 2935 /*ARGSUSED*/ 2936 static void 2937 raid_write_error(md_raidcs_t *cs) 2938 { 2939 md_raidps_t *ps; 2940 mdi_unit_t *ui; 2941 mr_unit_t *un; 2942 md_raidcbuf_t *cbuf; 2943 set_t setno; 2944 2945 ps = cs->cs_ps; 2946 ui = ps->ps_ui; 2947 un = cs->cs_un; 2948 2949 setno = MD_UN2SET(un); 2950 2951 /* 2952 * locate each buf that is in error on this io and then 2953 * output an error message 2954 */ 2955 if ((cs->cs_dbuf.b_flags & B_ERROR) && 2956 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 2957 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 2958 cmn_err(CE_WARN, "md %s: write error on %s", 2959 md_shortname(MD_SID(un)), 2960 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 2961 2962 if ((cs->cs_pbuf.b_flags & B_ERROR) && 2963 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 2964 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 2965 cmn_err(CE_WARN, "md %s: write error on %s", 2966 md_shortname(MD_SID(un)), 2967 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 2968 2969 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2970 if ((cbuf->cbuf_bp.b_flags & B_ERROR) && 2971 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && 2972 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) 2973 cmn_err(CE_WARN, "md %s: write error on %s", 2974 md_shortname(MD_SID(un)), 2975 md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), 2976 NULL, 0)); 2977 2978 md_unit_readerexit(ui); 2979 2980 ASSERT(cs->cs_frags == 0); 2981 2982 /* now schedule processing for possible state change */ 2983 daemon_request(&md_mstr_daemon, raid_wrerr, 2984 (daemon_queue_t *)cs, REQ_OLD); 2985 2986 } 2987 2988 /* 2989 * NAME: raid_write_ponly 2990 * DESCRIPTION: RAID metadevice write routine 2991 * in the case where only the parity column can be written 2992 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2993 */ 2994 static void 2995 raid_write_ponly(md_raidcs_t *cs) 2996 { 2997 md_raidps_t *ps; 2998 mr_unit_t *un = cs->cs_un; 2999 3000 ps = cs->cs_ps; 3001 /* decrement pwfrags if needed, but not frags */ 3002 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3003 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3004 cs->cs_flags |= MD_RCS_PWDONE; 3005 cs->cs_frags = 1; 3006 cs->cs_stage = RAID_WRITE_PONLY_DONE; 3007 cs->cs_call = raid_stage; 3008 cs->cs_error_call = raid_write_error; 3009 cs->cs_retry_call = raid_write_no_retry; 3010 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3011 
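		/*
		 * WRITE_ALT: the parity column also has an alternate
		 * (hotspare) target, e.g. during resync, so the same
		 * parity write is issued to the alternate device as well.
		 */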
cs->cs_frags++; 3012 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); 3013 } 3014 raidio(cs, RIO_PARITY | RIO_WRITE); 3015 } 3016 3017 /* 3018 * NAME: raid_write_ploop 3019 * DESCRIPTION: RAID metadevice write routine, constructs parity from 3020 * data in other columns. 3021 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3022 */ 3023 static void 3024 raid_write_ploop(md_raidcs_t *cs) 3025 { 3026 mr_unit_t *un = cs->cs_un; 3027 uint_t *dbuf; 3028 uint_t *pbuf; 3029 size_t wordcnt; 3030 uint_t psum = 0; 3031 3032 wordcnt = cs->cs_bcount / sizeof (uint_t); 3033 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3034 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3035 while (wordcnt--) 3036 *pbuf++ ^= *dbuf++; 3037 cs->cs_loop++; 3038 3039 /* 3040 * build parity from scratch using new data, 3041 * skip reading the data and parity columns. 3042 */ 3043 while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) 3044 cs->cs_loop++; 3045 3046 if (cs->cs_loop != un->un_totalcolumncnt) { 3047 cs->cs_frags = 1; 3048 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3049 return; 3050 } 3051 3052 /* construct checksum for parity buffer */ 3053 wordcnt = cs->cs_bcount / sizeof (uint_t); 3054 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3055 while (wordcnt--) { 3056 psum ^= *pbuf; 3057 pbuf++; 3058 } 3059 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, 3060 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3061 1, cs->cs_pcolumn, RAID_PWMAGIC); 3062 3063 cs->cs_stage = RAID_NONE; 3064 cs->cs_call = raid_write_ponly; 3065 cs->cs_error_call = raid_write_error; 3066 cs->cs_retry_call = raid_write_err_retry; 3067 cs->cs_frags = 1; 3068 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3069 cs->cs_frags++; 3070 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3071 } 3072 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3073 } 3074 3075 /* 3076 * NAME: raid_write_donly 3077 * DESCRIPTION: RAID metadevice write routine 3078 * Completed writing data to prewrite entry 3079 * in the case where only the data column can be written 3080 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3081 */ 3082 static void 3083 raid_write_donly(md_raidcs_t *cs) 3084 { 3085 md_raidps_t *ps; 3086 mr_unit_t *un = cs->cs_un; 3087 3088 ps = cs->cs_ps; 3089 /* WARNING: don't release unit reader lock here... 
*/ 3090 /* decrement pwfrags if needed, but not frags */ 3091 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3092 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3093 cs->cs_flags |= MD_RCS_PWDONE; 3094 cs->cs_frags = 1; 3095 cs->cs_stage = RAID_WRITE_DONLY_DONE; 3096 cs->cs_call = raid_stage; 3097 cs->cs_error_call = raid_write_error; 3098 cs->cs_retry_call = raid_write_err_retry; 3099 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3100 cs->cs_frags++; 3101 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3102 } 3103 raidio(cs, RIO_DATA | RIO_WRITE); 3104 } 3105 3106 /* 3107 * NAME: raid_write_got_old 3108 * DESCRIPTION: RAID metadevice write routine 3109 * completed read of old data and old parity 3110 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3111 */ 3112 static void 3113 raid_write_got_old(md_raidcs_t *cs) 3114 { 3115 mr_unit_t *un = cs->cs_un; 3116 3117 ASSERT(IO_READER_HELD(cs->cs_un)); 3118 ASSERT(UNIT_READER_HELD(cs->cs_un)); 3119 3120 raid_mapin_buf(cs); 3121 genstandardparity(cs); 3122 cs->cs_frags = 2; 3123 cs->cs_call = raid_stage; 3124 cs->cs_stage = RAID_PREWRITE_DONE; 3125 cs->cs_error_call = raid_write_error; 3126 cs->cs_retry_call = raid_write_retry; 3127 3128 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3129 cs->cs_frags++; 3130 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); 3131 } 3132 3133 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3134 cs->cs_frags++; 3135 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3136 } 3137 ASSERT(cs->cs_frags < 4); 3138 raidio(cs, RIO_DATA | RIO_PREWRITE); 3139 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3140 } 3141 3142 /* 3143 * NAME: raid_write_io 3144 * DESCRIPTION: RAID metadevice write I/O routine 3145 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3146 * md_raidcs_t *cs - pointer to a child structure 3147 */ 3148 3149 /*ARGSUSED*/ 3150 static void 3151 raid_write_io(mr_unit_t *un, md_raidcs_t *cs) 3152 { 3153 md_raidps_t *ps = cs->cs_ps; 3154 uint_t *dbuf; 3155 uint_t *ubuf; 3156 size_t wordcnt; 3157 uint_t dsum = 0; 3158 int pcheck; 3159 int dcheck; 3160 3161 ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & 3162 RCS_INIT) == 0); 3163 ASSERT((un->un_column[cs->cs_dcolumn].un_devstate & 3164 RCS_INIT) == 0); 3165 ASSERT(IO_READER_HELD(un)); 3166 ASSERT(UNIT_READER_HELD(un)); 3167 ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); 3168 if (cs->cs_flags & MD_RCS_LINE) { 3169 3170 mr_unit_t *un = cs->cs_un; 3171 3172 ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); 3173 raid_mapin_buf(cs); 3174 cs->cs_frags = un->un_origcolumncnt; 3175 cs->cs_call = raid_stage; 3176 cs->cs_error_call = raid_write_error; 3177 cs->cs_retry_call = raid_write_no_retry; 3178 cs->cs_stage = RAID_LINE_PWDONE; 3179 genlineparity(cs); 3180 return; 3181 } 3182 3183 pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); 3184 dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); 3185 cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck; 3186 3187 if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { 3188 int err = EIO; 3189 3190 if ((un->un_column[cs->cs_pcolumn].un_devstate == 3191 RCS_LAST_ERRED) || 3192 (un->un_column[cs->cs_dcolumn].un_devstate == 3193 RCS_LAST_ERRED)) 3194 err = ENXIO; 3195 raid_error_parent(ps, err); 3196 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3197 raid_free_child(cs, 1); 3198 raid_free_parent(ps, RFP_DECR_FRAGS 3199 | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); 3200 return; 3201 } 3202 3203 if (pcheck & RCL_ERRED) { 3204 /* 3205 * handle case of only having data drive 3206 */ 3207 raid_mapin_buf(cs); 
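		/*
		 * Parity column is erred: copy the new user data straight
		 * into the data buffer, accumulating its checksum as we go,
		 * and write only the data column (raid_write_donly);
		 * parity cannot be maintained.
		 */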
3208 wordcnt = cs->cs_bcount / sizeof (uint_t); 3209 3210 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3211 ubuf = (uint_t *)(void *)(cs->cs_addr); 3212 3213 while (wordcnt--) { 3214 *dbuf = *ubuf; 3215 dsum ^= *ubuf; 3216 dbuf++; 3217 ubuf++; 3218 } 3219 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, 3220 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3221 1, cs->cs_dcolumn, RAID_PWMAGIC); 3222 cs->cs_frags = 1; 3223 cs->cs_stage = RAID_NONE; 3224 cs->cs_call = raid_write_donly; 3225 cs->cs_error_call = raid_write_error; 3226 cs->cs_retry_call = raid_write_err_retry; 3227 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3228 cs->cs_frags++; 3229 raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | 3230 RIO_PREWRITE); 3231 } 3232 raidio(cs, RIO_DATA | RIO_PREWRITE); 3233 return; 3234 } 3235 3236 if (dcheck & RCL_ERRED) { 3237 /* 3238 * handle case of only having parity drive 3239 * build parity from scratch using new data, 3240 * skip reading the data and parity columns. 3241 */ 3242 raid_mapin_buf(cs); 3243 cs->cs_loop = 0; 3244 while (cs->cs_loop == cs->cs_dcolumn || 3245 cs->cs_loop == cs->cs_pcolumn) 3246 cs->cs_loop++; 3247 3248 /* copy new data in to begin building parity */ 3249 bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); 3250 cs->cs_stage = RAID_NONE; 3251 cs->cs_call = raid_write_ploop; 3252 cs->cs_error_call = raid_write_error; 3253 cs->cs_retry_call = raid_write_err_retry; 3254 cs->cs_frags = 1; 3255 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3256 return; 3257 } 3258 /* 3259 * handle normal cases 3260 * read old data and old parity 3261 */ 3262 cs->cs_frags = 2; 3263 cs->cs_stage = RAID_NONE; 3264 cs->cs_call = raid_write_got_old; 3265 cs->cs_error_call = raid_write_error; 3266 cs->cs_retry_call = raid_write_retry; 3267 ASSERT(ps->ps_magic == RAID_PSMAGIC); 3268 raidio(cs, RIO_DATA | RIO_READ); 3269 raidio(cs, RIO_PARITY | RIO_READ); 3270 } 3271 3272 static void 3273 raid_enqueue(md_raidcs_t *cs) 3274 { 3275 mdi_unit_t *ui = cs->cs_ps->ps_ui; 3276 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 3277 md_raidcs_t *cs1; 3278 3279 mutex_enter(io_list_mutex); 3280 ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); 3281 if (ui->ui_io_lock->io_list_front == NULL) { 3282 ui->ui_io_lock->io_list_front = cs; 3283 ui->ui_io_lock->io_list_back = cs; 3284 } else { 3285 cs1 = ui->ui_io_lock->io_list_back; 3286 cs1->cs_linlck_next = cs; 3287 ui->ui_io_lock->io_list_back = cs; 3288 } 3289 STAT_INC(raid_write_waits); 3290 STAT_MAX(raid_max_write_q_length, raid_write_queue_length); 3291 cs->cs_linlck_next = NULL; 3292 mutex_exit(io_list_mutex); 3293 } 3294 3295 /* 3296 * NAME: raid_write 3297 * DESCRIPTION: RAID metadevice write routine 3298 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3299 * md_raidcs_t *cs - pointer to a child structure 3300 */ 3301 3302 /*ARGSUSED*/ 3303 static int 3304 raid_write(mr_unit_t *un, md_raidcs_t *cs) 3305 { 3306 int error = 0; 3307 md_raidps_t *ps; 3308 mdi_unit_t *ui; 3309 minor_t mnum; 3310 clock_t timeout; 3311 3312 ASSERT(IO_READER_HELD(un)); 3313 ps = cs->cs_ps; 3314 ui = ps->ps_ui; 3315 3316 ASSERT(UNIT_STATE(un) != RUS_INIT); 3317 if (UNIT_STATE(un) == RUS_LAST_ERRED) 3318 error = EIO; 3319 3320 /* make sure the write doesn't go beyond the column */ 3321 if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) 3322 error = ENXIO; 3323 if (error) 3324 goto werror; 3325 3326 getresources(cs); 3327 3328 /* 3329 * this is an advisory loop that keeps the waiting lists short 3330 * to reduce cpu time. 
Since there is a race introduced by not 3331 * aquiring all the correct mutexes, use a cv_timedwait to be 3332 * sure the write always will wake up and start. 3333 */ 3334 while (raid_check_pw(cs)) { 3335 mutex_enter(&un->un_mx); 3336 (void) drv_getparm(LBOLT, &timeout); 3337 timeout += md_wr_wait; 3338 un->un_rflags |= MD_RFLAG_NEEDPW; 3339 STAT_INC(raid_prewrite_waits); 3340 (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout); 3341 un->un_rflags &= ~MD_RFLAG_NEEDPW; 3342 mutex_exit(&un->un_mx); 3343 } 3344 3345 if (raid_line_writer_lock(cs, 1)) 3346 return (0); 3347 3348 un = (mr_unit_t *)md_unit_readerlock(ui); 3349 cs->cs_un = un; 3350 mnum = MD_SID(un); 3351 3352 if (un->un_state & RUS_REGEN) { 3353 raid_regen_parity(cs); 3354 un = MD_UNIT(mnum); 3355 cs->cs_un = un; 3356 } 3357 3358 raid_write_io(un, cs); 3359 return (0); 3360 werror: 3361 /* aquire unit reader lock sinc raid_free_child always drops it */ 3362 raid_error_parent(ps, error); 3363 raid_free_child(cs, 0); 3364 /* decrement both pwfrags and frags */ 3365 raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); 3366 return (0); 3367 } 3368 3369 3370 /* 3371 * NAMES: raid_stage 3372 * DESCRIPTION: post-processing routine for a RAID metadevice 3373 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 3374 */ 3375 static void 3376 raid_stage(md_raidcs_t *cs) 3377 { 3378 md_raidps_t *ps = cs->cs_ps; 3379 mr_unit_t *un = cs->cs_un; 3380 md_raidcbuf_t *cbuf; 3381 buf_t *bp; 3382 void *private; 3383 int flag; 3384 3385 switch (cs->cs_stage) { 3386 case RAID_READ_DONE: 3387 raid_free_child(cs, 1); 3388 /* decrement readfrags */ 3389 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 3390 return; 3391 3392 case RAID_WRITE_DONE: 3393 case RAID_WRITE_PONLY_DONE: 3394 case RAID_WRITE_DONLY_DONE: 3395 /* 3396 * Completed writing real parity and/or data. 3397 */ 3398 ASSERT(cs->cs_flags & MD_RCS_PWDONE); 3399 raid_free_child(cs, 1); 3400 /* decrement frags but not pwfrags */ 3401 raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); 3402 return; 3403 3404 case RAID_PREWRITE_DONE: 3405 /* 3406 * completed writing data and parity to prewrite entries 3407 */ 3408 /* 3409 * WARNING: don't release unit reader lock here.. 
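	 * the data and parity writes issued below via raidio() still
	 * assert UNIT_READER_HELD;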
3410 * decrement pwfrags but not frags 3411 */ 3412 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3413 cs->cs_flags |= MD_RCS_PWDONE; 3414 cs->cs_frags = 2; 3415 cs->cs_stage = RAID_WRITE_DONE; 3416 cs->cs_call = raid_stage; 3417 cs->cs_error_call = raid_write_error; 3418 cs->cs_retry_call = raid_write_no_retry; 3419 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3420 cs->cs_frags++; 3421 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | 3422 RIO_WRITE); 3423 } 3424 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3425 cs->cs_frags++; 3426 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3427 } 3428 ASSERT(cs->cs_frags < 4); 3429 raidio(cs, RIO_DATA | RIO_WRITE); 3430 raidio(cs, RIO_PARITY | RIO_WRITE); 3431 if (cs->cs_pw_inval_list) { 3432 raid_free_pwinvalidate(cs); 3433 } 3434 return; 3435 3436 case RAID_LINE_PWDONE: 3437 ASSERT(cs->cs_frags == 0); 3438 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3439 cs->cs_flags |= MD_RCS_PWDONE; 3440 cs->cs_frags = un->un_origcolumncnt; 3441 cs->cs_call = raid_stage; 3442 cs->cs_error_call = raid_write_error; 3443 cs->cs_retry_call = raid_write_no_retry; 3444 cs->cs_stage = RAID_WRITE_DONE; 3445 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 3446 /* 3447 * fill in buffer for write to prewrite area 3448 */ 3449 bp = &cbuf->cbuf_bp; 3450 bp->b_back = bp; 3451 bp->b_forw = bp; 3452 bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; 3453 bp->b_bcount = cbuf->cbuf_bcount; 3454 bp->b_bufsize = cbuf->cbuf_bcount; 3455 bp->b_lblkno = 3456 un->un_column[cbuf->cbuf_column].un_devstart + 3457 cs->cs_blkno; 3458 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); 3459 bp->b_flags &= ~nv_available; 3460 bp->b_flags |= B_WRITE | B_BUSY; 3461 bp->b_iodone = (int (*)())raid_done; 3462 bp->b_edev = md_dev64_to_dev( 3463 un->un_column[cbuf->cbuf_column].un_dev); 3464 bp->b_chain = (struct buf *)cs; 3465 private = cs->cs_strategy_private; 3466 flag = cs->cs_strategy_flag; 3467 md_call_strategy(bp, flag, private); 3468 } 3469 raidio(cs, RIO_DATA | RIO_WRITE); 3470 raidio(cs, RIO_PARITY | RIO_WRITE); 3471 if (cs->cs_pw_inval_list) { 3472 raid_free_pwinvalidate(cs); 3473 } 3474 return; 3475 3476 default: 3477 ASSERT(0); 3478 break; 3479 } 3480 } 3481 /* 3482 * NAME: md_raid_strategy 3483 * DESCRIPTION: RAID metadevice I/O oprations entry point. 3484 * PARAMETERS: buf_t *pb - pointer to a user I/O buffer 3485 * int flag - metadevice specific flag 3486 * void *private - carry over flag ?? 
3487 * 3488 */ 3489 3490 void 3491 md_raid_strategy(buf_t *pb, int flag, void *private) 3492 { 3493 md_raidps_t *ps; 3494 md_raidcs_t *cs; 3495 int doing_writes; 3496 int err; 3497 mr_unit_t *un; 3498 mdi_unit_t *ui; 3499 size_t count; 3500 diskaddr_t blkno; 3501 caddr_t addr; 3502 off_t offset; 3503 int colcnt; 3504 minor_t mnum; 3505 set_t setno; 3506 3507 ui = MDI_UNIT(getminor(pb->b_edev)); 3508 md_kstat_waitq_enter(ui); 3509 un = (mr_unit_t *)md_io_readerlock(ui); 3510 setno = MD_MIN2SET(getminor(pb->b_edev)); 3511 3512 if ((flag & MD_NOBLOCK) == 0) { 3513 if (md_inc_iocount(setno) != 0) { 3514 pb->b_flags |= B_ERROR; 3515 pb->b_error = ENXIO; 3516 pb->b_resid = pb->b_bcount; 3517 md_io_readerexit(ui); 3518 biodone(pb); 3519 return; 3520 } 3521 } else { 3522 md_inc_iocount_noblock(setno); 3523 } 3524 3525 mnum = MD_SID(un); 3526 colcnt = un->un_totalcolumncnt - 1; 3527 count = pb->b_bcount; 3528 3529 STAT_CHECK(raid_512, count == 512); 3530 STAT_CHECK(raid_1024, count == 1024); 3531 STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); 3532 STAT_CHECK(raid_8192, count == 8192); 3533 STAT_CHECK(raid_8192_bigger, count > 8192); 3534 3535 (void *) md_unit_readerlock(ui); 3536 if (!(flag & MD_STR_NOTTOP)) { 3537 err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ 3538 if (err != 0) { 3539 md_kstat_waitq_exit(ui); 3540 md_io_readerexit(ui); 3541 return; 3542 } 3543 } 3544 md_unit_readerexit(ui); 3545 3546 STAT_INC(raid_total_io); 3547 3548 /* allocate a parent structure for the user I/O */ 3549 ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); 3550 raid_parent_init(ps); 3551 3552 /* 3553 * Save essential information from the original buffhdr 3554 * in the md_save structure. 3555 */ 3556 ps->ps_un = un; 3557 ps->ps_ui = ui; 3558 ps->ps_bp = pb; 3559 ps->ps_addr = pb->b_un.b_addr; 3560 3561 if ((pb->b_flags & B_READ) == 0) { 3562 ps->ps_flags |= MD_RPS_WRITE; 3563 doing_writes = 1; 3564 STAT_INC(raid_writes); 3565 } else { 3566 ps->ps_flags |= MD_RPS_READ; 3567 doing_writes = 0; 3568 STAT_INC(raid_reads); 3569 } 3570 3571 count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ 3572 blkno = pb->b_lblkno; /* block number on device */ 3573 addr = 0; 3574 offset = 0; 3575 ps->ps_pwfrags = 1; 3576 ps->ps_frags = 1; 3577 md_kstat_waitq_to_runq(ui); 3578 3579 do { 3580 cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); 3581 raid_child_init(cs); 3582 cs->cs_ps = ps; 3583 cs->cs_un = un; 3584 cs->cs_mdunit = mnum; 3585 cs->cs_strategy_flag = flag; 3586 cs->cs_strategy_private = private; 3587 cs->cs_addr = addr; 3588 cs->cs_offset = offset; 3589 count = raid_iosetup(un, blkno, count, cs); 3590 if (cs->cs_flags & MD_RCS_LINE) { 3591 blkno += (cs->cs_blkcnt * colcnt); 3592 offset += (cs->cs_bcount * colcnt); 3593 } else { 3594 blkno += cs->cs_blkcnt; 3595 offset += cs->cs_bcount; 3596 } 3597 /* for each cs bump up the ps_pwfrags and ps_frags fields */ 3598 if (count) { 3599 mutex_enter(&ps->ps_mx); 3600 ps->ps_pwfrags++; 3601 ps->ps_frags++; 3602 mutex_exit(&ps->ps_mx); 3603 if (doing_writes) 3604 (void) raid_write(un, cs); 3605 else 3606 (void) raid_read(un, cs); 3607 } 3608 } while (count); 3609 if (doing_writes) { 3610 (void) raid_write(un, cs); 3611 } else 3612 (void) raid_read(un, cs); 3613 3614 if (! (flag & MD_STR_NOTTOP) && panicstr) { 3615 while (! 
(ps->ps_flags & MD_RPS_DONE)) { 3616 md_daemon(1, &md_done_daemon); 3617 drv_usecwait(10); 3618 } 3619 kmem_cache_free(raid_parent_cache, ps); 3620 } 3621 } 3622 3623 /* 3624 * NAMES: raid_snarf 3625 * DESCRIPTION: RAID metadevice SNARF entry point 3626 * PARAMETERS: md_snarfcmd_t cmd, 3627 * set_t setno 3628 * RETURNS: 3629 */ 3630 static int 3631 raid_snarf(md_snarfcmd_t cmd, set_t setno) 3632 { 3633 mr_unit_t *un; 3634 mddb_recid_t recid; 3635 int gotsomething; 3636 int all_raid_gotten; 3637 mddb_type_t typ1; 3638 uint_t ncol; 3639 mddb_de_ic_t *dep; 3640 mddb_rb32_t *rbp; 3641 size_t newreqsize; 3642 mr_unit_t *big_un; 3643 mr_unit32_od_t *small_un; 3644 3645 3646 if (cmd == MD_SNARF_CLEANUP) 3647 return (0); 3648 3649 all_raid_gotten = 1; 3650 gotsomething = 0; 3651 typ1 = (mddb_type_t)md_getshared_key(setno, 3652 raid_md_ops.md_driver.md_drivername); 3653 recid = mddb_makerecid(setno, 0); 3654 3655 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 3656 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { 3657 continue; 3658 } 3659 3660 dep = mddb_getrecdep(recid); 3661 dep->de_flags = MDDB_F_RAID; 3662 rbp = dep->de_rb; 3663 switch (rbp->rb_revision) { 3664 case MDDB_REV_RB: 3665 case MDDB_REV_RBFN: 3666 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 3667 /* 3668 * This means, we have an old and small record 3669 * and this record hasn't already been 3670 * converted. Before we create an incore 3671 * metadevice from this we have to convert it to 3672 * a big record. 3673 */ 3674 small_un = 3675 (mr_unit32_od_t *)mddb_getrecaddr(recid); 3676 ncol = small_un->un_totalcolumncnt; 3677 newreqsize = sizeof (mr_unit_t) + 3678 ((ncol - 1) * sizeof (mr_column_t)); 3679 big_un = (mr_unit_t *)kmem_zalloc(newreqsize, 3680 KM_SLEEP); 3681 raid_convert((caddr_t)small_un, (caddr_t)big_un, 3682 SMALL_2_BIG); 3683 kmem_free(small_un, dep->de_reqsize); 3684 dep->de_rb_userdata = big_un; 3685 dep->de_reqsize = newreqsize; 3686 un = big_un; 3687 rbp->rb_private |= MD_PRV_CONVD; 3688 } else { 3689 /* 3690 * Record has already been converted. Just 3691 * get its address. 3692 */ 3693 un = (mr_unit_t *)mddb_getrecaddr(recid); 3694 } 3695 un->c.un_revision &= ~MD_64BIT_META_DEV; 3696 break; 3697 case MDDB_REV_RB64: 3698 case MDDB_REV_RB64FN: 3699 /* Big device */ 3700 un = (mr_unit_t *)mddb_getrecaddr(recid); 3701 un->c.un_revision |= MD_64BIT_META_DEV; 3702 un->c.un_flag |= MD_EFILABEL; 3703 break; 3704 } 3705 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 3706 3707 /* 3708 * Create minor device node for snarfed entry. 
3709 */ 3710 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 3711 3712 if (MD_UNIT(MD_SID(un)) != NULL) { 3713 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3714 continue; 3715 } 3716 all_raid_gotten = 0; 3717 if (raid_build_incore((void *)un, 1) == 0) { 3718 mddb_setrecprivate(recid, MD_PRV_GOTIT); 3719 md_create_unit_incore(MD_SID(un), &raid_md_ops, 3720 1); 3721 gotsomething = 1; 3722 } else if (un->mr_ic) { 3723 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 3724 un->un_totalcolumncnt); 3725 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 3726 } 3727 } 3728 3729 if (!all_raid_gotten) { 3730 return (gotsomething); 3731 } 3732 3733 recid = mddb_makerecid(setno, 0); 3734 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) 3735 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 3736 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3737 3738 return (0); 3739 } 3740 3741 /* 3742 * NAMES: raid_halt 3743 * DESCRIPTION: RAID metadevice HALT entry point 3744 * PARAMETERS: md_haltcmd_t cmd - 3745 * set_t setno - 3746 * RETURNS: 3747 */ 3748 static int 3749 raid_halt(md_haltcmd_t cmd, set_t setno) 3750 { 3751 set_t i; 3752 mdi_unit_t *ui; 3753 minor_t mnum; 3754 3755 if (cmd == MD_HALT_CLOSE) 3756 return (0); 3757 3758 if (cmd == MD_HALT_OPEN) 3759 return (0); 3760 3761 if (cmd == MD_HALT_UNLOAD) 3762 return (0); 3763 3764 if (cmd == MD_HALT_CHECK) { 3765 for (i = 0; i < md_nunits; i++) { 3766 mnum = MD_MKMIN(setno, i); 3767 if ((ui = MDI_UNIT(mnum)) == NULL) 3768 continue; 3769 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3770 continue; 3771 if (md_unit_isopen(ui)) 3772 return (1); 3773 } 3774 return (0); 3775 } 3776 3777 if (cmd != MD_HALT_DOIT) 3778 return (1); 3779 3780 for (i = 0; i < md_nunits; i++) { 3781 mnum = MD_MKMIN(setno, i); 3782 if ((ui = MDI_UNIT(mnum)) == NULL) 3783 continue; 3784 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3785 continue; 3786 reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); 3787 } 3788 return (0); 3789 } 3790 3791 /* 3792 * NAMES: raid_close_all_devs 3793 * DESCRIPTION: Close all the devices of the unit. 3794 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3795 * RETURNS: 3796 */ 3797 void 3798 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) 3799 { 3800 int i; 3801 mr_column_t *device; 3802 3803 for (i = 0; i < un->un_totalcolumncnt; i++) { 3804 device = &un->un_column[i]; 3805 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3806 ASSERT((device->un_dev != (md_dev64_t)0) && 3807 (device->un_dev != NODEV64)); 3808 if ((device->un_devstate & RCS_OKAY) && init_pw) 3809 (void) init_pw_area(un, device->un_dev, 3810 device->un_pwstart, i); 3811 md_layered_close(device->un_dev, md_cflags); 3812 device->un_devflags &= ~MD_RAID_DEV_ISOPEN; 3813 } 3814 } 3815 } 3816 3817 /* 3818 * NAMES: raid_open_all_devs 3819 * DESCRIPTION: Open all the components (columns) of the device unit. 
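 *              At most one column may be errored or fail to open; a newly
 *              failed column is marked erred and the unit still runs.
 *              More than one failure marks the unit inaccessible.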
3820 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3821 * RETURNS: 3822 */ 3823 static int 3824 raid_open_all_devs(mr_unit_t *un, int md_oflags) 3825 { 3826 minor_t mnum = MD_SID(un); 3827 int i; 3828 int not_opened = 0; 3829 int commit = 0; 3830 int col = -1; 3831 mr_column_t *device; 3832 set_t setno = MD_MIN2SET(MD_SID(un)); 3833 side_t side = mddb_getsidenum(setno); 3834 mdkey_t key; 3835 mdi_unit_t *ui = MDI_UNIT(mnum); 3836 3837 ui->ui_tstate &= ~MD_INACCESSIBLE; 3838 3839 for (i = 0; i < un->un_totalcolumncnt; i++) { 3840 md_dev64_t tmpdev; 3841 3842 device = &un->un_column[i]; 3843 3844 if (COLUMN_STATE(un, i) & RCS_ERRED) { 3845 not_opened++; 3846 continue; 3847 } 3848 3849 if (device->un_devflags & MD_RAID_DEV_ISOPEN) 3850 continue; 3851 3852 tmpdev = device->un_dev; 3853 /* 3854 * Open by device id 3855 */ 3856 key = HOTSPARED(un, i) ? 3857 device->un_hs_key : device->un_orig_key; 3858 if ((md_getmajor(tmpdev) != md_major) && 3859 md_devid_found(setno, side, key) == 1) { 3860 tmpdev = md_resolve_bydevid(mnum, tmpdev, key); 3861 } 3862 if (md_layered_open(mnum, &tmpdev, md_oflags)) { 3863 device->un_dev = tmpdev; 3864 not_opened++; 3865 continue; 3866 } 3867 device->un_dev = tmpdev; 3868 device->un_devflags |= MD_RAID_DEV_ISOPEN; 3869 } 3870 3871 /* if open errors and errored devices are 1 then device can run */ 3872 if (not_opened > 1) { 3873 cmn_err(CE_WARN, 3874 "md: %s failed to open. open error on %s\n", 3875 md_shortname(MD_SID(un)), 3876 md_devname(MD_UN2SET(un), device->un_orig_dev, 3877 NULL, 0)); 3878 3879 ui->ui_tstate |= MD_INACCESSIBLE; 3880 3881 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3882 MD_UN2SET(un), MD_SID(un)); 3883 3884 return (not_opened > 1); 3885 } 3886 3887 for (i = 0; i < un->un_totalcolumncnt; i++) { 3888 device = &un->un_column[i]; 3889 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3890 if (device->un_devstate & RCS_LAST_ERRED) { 3891 /* 3892 * At this point in time there is a possibility 3893 * that errors were the result of a controller 3894 * failure with more than a single column on it 3895 * so clear out last errored columns and let errors 3896 * re-occur is necessary. 3897 */ 3898 raid_set_state(un, i, RCS_OKAY, 0); 3899 commit++; 3900 } 3901 continue; 3902 } 3903 ASSERT(col == -1); 3904 col = i; 3905 } 3906 3907 if (col != -1) { 3908 raid_set_state(un, col, RCS_ERRED, 0); 3909 commit++; 3910 } 3911 3912 if (commit) 3913 raid_commit(un, NULL); 3914 3915 if (col != -1) { 3916 if (COLUMN_STATE(un, col) & RCS_ERRED) { 3917 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 3918 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3919 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 3920 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 3921 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3922 } 3923 } 3924 3925 return (0); 3926 } 3927 3928 /* 3929 * NAMES: raid_internal_open 3930 * DESCRIPTION: Do the actual RAID open 3931 * PARAMETERS: minor_t mnum - minor number of the RAID device 3932 * int flag - 3933 * int otyp - 3934 * int md_oflags - RAID open flags 3935 * RETURNS: 0 if successful, nonzero otherwise 3936 */ 3937 int 3938 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) 3939 { 3940 mr_unit_t *un; 3941 mdi_unit_t *ui; 3942 int err = 0; 3943 int replay_error = 0; 3944 3945 ui = MDI_UNIT(mnum); 3946 ASSERT(ui != NULL); 3947 3948 un = (mr_unit_t *)md_unit_openclose_enter(ui); 3949 /* 3950 * this MUST be checked before md_unit_isopen is checked. 3951 * raid_init_columns sets md_unit_isopen to block reset, halt. 
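	 * A unit still in the INIT or DOI state can therefore only be opened
	 * by the init path itself (MD_OFLG_ISINIT); any other open returns
	 * EAGAIN.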
3952 */ 3953 if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && 3954 !(md_oflags & MD_OFLG_ISINIT)) { 3955 md_unit_openclose_exit(ui); 3956 return (EAGAIN); 3957 } 3958 3959 if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { 3960 err = md_unit_incopen(mnum, flag, otyp); 3961 goto out; 3962 } 3963 3964 md_unit_readerexit(ui); 3965 3966 un = (mr_unit_t *)md_unit_writerlock(ui); 3967 if (raid_open_all_devs(un, md_oflags) == 0) { 3968 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { 3969 md_unit_writerexit(ui); 3970 un = (mr_unit_t *)md_unit_readerlock(ui); 3971 raid_close_all_devs(un, 0, md_oflags); 3972 goto out; 3973 } 3974 } else { 3975 /* 3976 * if this unit contains more than two errored components 3977 * should return error and close all opened devices 3978 */ 3979 3980 md_unit_writerexit(ui); 3981 un = (mr_unit_t *)md_unit_readerlock(ui); 3982 raid_close_all_devs(un, 0, md_oflags); 3983 md_unit_openclose_exit(ui); 3984 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3985 MD_UN2SET(un), MD_SID(un)); 3986 return (ENXIO); 3987 } 3988 3989 if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { 3990 replay_error = raid_replay(un); 3991 MD_STATUS(un) |= MD_UN_REPLAYED; 3992 } 3993 3994 md_unit_writerexit(ui); 3995 un = (mr_unit_t *)md_unit_readerlock(ui); 3996 3997 if ((replay_error == RAID_RPLY_READONLY) && 3998 ((flag & (FREAD | FWRITE)) == FREAD)) { 3999 md_unit_openclose_exit(ui); 4000 return (0); 4001 } 4002 4003 /* allocate hotspare if possible */ 4004 (void) raid_hotspares(); 4005 4006 4007 out: 4008 md_unit_openclose_exit(ui); 4009 return (err); 4010 } 4011 /* 4012 * NAMES: raid_open 4013 * DESCRIPTION: RAID metadevice OPEN entry point 4014 * PARAMETERS: dev_t dev - 4015 * int flag - 4016 * int otyp - 4017 * cred_t * cred_p - 4018 * int md_oflags - 4019 * RETURNS: 4020 */ 4021 /*ARGSUSED1*/ 4022 static int 4023 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 4024 { 4025 int error = 0; 4026 4027 if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { 4028 return (error); 4029 } 4030 return (0); 4031 } 4032 4033 /* 4034 * NAMES: raid_internal_close 4035 * DESCRIPTION: RAID metadevice CLOSE actual implementation 4036 * PARAMETERS: minor_t - minor number of the RAID device 4037 * int otyp - 4038 * int init_pw - 4039 * int md_cflags - RAID close flags 4040 * RETURNS: 0 if successful, nonzero otherwise 4041 */ 4042 /*ARGSUSED*/ 4043 int 4044 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) 4045 { 4046 mdi_unit_t *ui = MDI_UNIT(mnum); 4047 mr_unit_t *un; 4048 int err = 0; 4049 4050 /* single thread */ 4051 un = (mr_unit_t *)md_unit_openclose_enter(ui); 4052 4053 /* count closed */ 4054 if ((err = md_unit_decopen(mnum, otyp)) != 0) 4055 goto out; 4056 /* close devices, if necessary */ 4057 if (! 

/*
 * NAMES:       raid_close
 * DESCRIPTION: RAID metadevice close entry point
 * PARAMETERS:  dev_t dev -
 *              int flag -
 *              int otyp -
 *              cred_t *cred_p -
 *              int md_cflags -
 * RETURNS:     0 if successful, nonzero otherwise
 */
/*ARGSUSED1*/
static int
raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
    int retval;

    (void) md_io_writerlock(MDI_UNIT(getminor(dev)));
    retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
    (void) md_io_writerexit(MDI_UNIT(getminor(dev)));
    return (retval);
}

/*
 * raid_probe_close_all_devs
 */
void
raid_probe_close_all_devs(mr_unit_t *un)
{
    int          i;
    mr_column_t  *device;

    for (i = 0; i < un->un_totalcolumncnt; i++) {
        device = &un->un_column[i];

        if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
            md_layered_close(device->un_dev,
                MD_OFLG_PROBEDEV);
            device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
        }
    }
}

/*
 * raid_probe_dev:
 *
 * On entry the unit writerlock is held
 */
static int
raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
    mr_unit_t    *un;
    int          i;
    int          not_opened = 0;
    int          commit = 0;
    int          col = -1;
    mr_column_t  *device;
    int          md_devopen = 0;

    if (md_unit_isopen(ui))
        md_devopen++;

    un = MD_UNIT(mnum);
    /*
     * If the state has been set to LAST_ERRED because
     * of an error when the raid device was open at some
     * point in the past, don't probe. We really don't want
     * to reset the state in this case.
     */
    if (UNIT_STATE(un) == RUS_LAST_ERRED)
        return (0);

    ui->ui_tstate &= ~MD_INACCESSIBLE;

    for (i = 0; i < un->un_totalcolumncnt; i++) {
        md_dev64_t tmpdev;

        device = &un->un_column[i];
        if (COLUMN_STATE(un, i) & RCS_ERRED) {
            not_opened++;
            continue;
        }

        tmpdev = device->un_dev;
        /*
         * Currently the flags passed are not needed since
         * there cannot be an underlying metadevice. However
         * they are kept here for consistency.
         *
         * Open by device id
         */
        tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
            device->un_hs_key : device->un_orig_key);
        if (md_layered_open(mnum, &tmpdev,
            MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
            device->un_dev = tmpdev;
            not_opened++;
            continue;
        }
        device->un_dev = tmpdev;

        device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
    }

    /*
     * The code below is careful about setting the LAST_ERRED state.
     *
     * If open errors occur and exactly one device has failed, we can run.
     * If more than one device fails, we have to figure out when to set
     * the LAST_ERRED state.  The rationale is to avoid unnecessary
     * resyncs, since they are painful and time consuming.
     *
     * When more than one component/column fails, there are two scenarios:
     *
     * 1. The metadevice has NOT been opened: the behavior mimics the
     *    open semantics, i.e., only one failed device is marked ERRED
     *    and LAST_ERRED is not set.
     *
     * 2. The metadevice has been opened: the read/write semantics are
     *    followed; the first failed device is ERRED, and on the next
     *    failed device LAST_ERRED is set.
     */

    if (not_opened > 1 && !md_devopen) {
        cmn_err(CE_WARN,
            "md: %s failed to open. open error on %s\n",
            md_shortname(MD_SID(un)),
            md_devname(MD_UN2SET(un), device->un_orig_dev,
            NULL, 0));
        SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
            MD_UN2SET(un), MD_SID(un));
        raid_probe_close_all_devs(un);
        ui->ui_tstate |= MD_INACCESSIBLE;
        return (not_opened > 1);
    }

    if (!md_devopen) {
        for (i = 0; i < un->un_totalcolumncnt; i++) {
            device = &un->un_column[i];
            if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
                if (device->un_devstate & RCS_LAST_ERRED) {
                    /*
                     * At this point in time there is a
                     * possibility that errors were the
                     * result of a controller failure with
                     * more than a single column on it, so
                     * clear out last errored columns and
                     * let errors re-occur if necessary.
                     */
                    raid_set_state(un, i, RCS_OKAY, 0);
                    commit++;
                }
                continue;
            }
            ASSERT(col == -1);
            /*
             * note that if multiple devices are failing, only
             * the last one is marked as errored
             */
            col = i;
        }

        if (col != -1) {
            raid_set_state(un, col, RCS_ERRED, 0);
            commit++;
        }

    } else {
        for (i = 0; i < un->un_totalcolumncnt; i++) {
            device = &un->un_column[i];

            /* if we have LAST_ERRED go ahead and commit. */
            if (un->un_state & RUS_LAST_ERRED)
                break;

            /*
             * could not open the component
             */
            if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
                col = i;
                raid_set_state(un, col, RCS_ERRED, 0);
                commit++;
            }
        }
    }

    if (commit)
        raid_commit(un, NULL);

    if (col != -1) {
        if (COLUMN_STATE(un, col) & RCS_ERRED) {
            SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
                SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
        } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
            SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
                SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
        }
    }

    raid_probe_close_all_devs(un);
    return (0);
}
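
/*
 * The scenario comment in raid_probe_dev() above states the failure
 * policy that applies while the metadevice is open: the first column
 * failure leaves the unit ERRED, a second failure drives it to
 * LAST_ERRED.  A minimal sketch of that state step follows; it is not
 * compiled into the driver, the enum and function names are illustrative
 * assumptions, and the not-open case and controller-failure cleanup are
 * deliberately left out.
 */
#if 0   /* illustrative sketch only */
enum example_unit_state { EX_OKAY, EX_ERRED, EX_LAST_ERRED };

/* read/write semantics: first failure -> ERRED, any further -> LAST_ERRED */
static enum example_unit_state
example_next_state_on_failure(enum example_unit_state cur)
{
    switch (cur) {
    case EX_OKAY:
        return (EX_ERRED);
    case EX_ERRED:
    case EX_LAST_ERRED:
    default:
        return (EX_LAST_ERRED);
    }
}
#endif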

static int
raid_imp_set(
    set_t setno
)
{
    mddb_recid_t    recid;
    int             i, gotsomething;
    mddb_type_t     typ1;
    mddb_de_ic_t    *dep;
    mddb_rb32_t     *rbp;
    mr_unit_t       *un64;
    mr_unit32_od_t  *un32;
    md_dev64_t      self_devt;
    minor_t         *self_id;   /* minor needs to be updated */
    md_parent_t     *parent_id; /* parent needs to be updated */
    mddb_recid_t    *record_id; /* record id needs to be updated */
    hsp_t           *hsp_id;

    gotsomething = 0;

    typ1 = (mddb_type_t)md_getshared_key(setno,
        raid_md_ops.md_driver.md_drivername);
    recid = mddb_makerecid(setno, 0);

    while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
        if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
            continue;

        dep = mddb_getrecdep(recid);
        rbp = dep->de_rb;

        switch (rbp->rb_revision) {
        case MDDB_REV_RB:
        case MDDB_REV_RBFN:
            /*
             * Small device
             */
            un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
            self_id = &(un32->c.un_self_id);
            parent_id = &(un32->c.un_parent);
            record_id = &(un32->c.un_record_id);
            hsp_id = &(un32->un_hsp_id);

            for (i = 0; i < un32->un_totalcolumncnt; i++) {
                mr_column32_od_t *device;

                device = &un32->un_column[i];
                if (!md_update_minor(setno, mddb_getsidenum
                    (setno), device->un_orig_key))
                    goto out;

                if (device->un_hs_id != 0)
                    device->un_hs_id = MAKERECID(
                        setno, device->un_hs_id);
            }
            break;
        case MDDB_REV_RB64:
        case MDDB_REV_RB64FN:
            un64 = (mr_unit_t *)mddb_getrecaddr(recid);
            self_id = &(un64->c.un_self_id);
            parent_id = &(un64->c.un_parent);
            record_id = &(un64->c.un_record_id);
            hsp_id = &(un64->un_hsp_id);

            for (i = 0; i < un64->un_totalcolumncnt; i++) {
                mr_column_t *device;

                device = &un64->un_column[i];
                if (!md_update_minor(setno, mddb_getsidenum
                    (setno), device->un_orig_key))
                    goto out;

                if (device->un_hs_id != 0)
                    device->un_hs_id = MAKERECID(
                        setno, device->un_hs_id);
            }
            break;
        }

        /*
         * If this is a top level and a friendly name metadevice,
         * update its minor in the namespace.
         */
        if ((*parent_id == MD_NO_PARENT) &&
            ((rbp->rb_revision == MDDB_REV_RBFN) ||
            (rbp->rb_revision == MDDB_REV_RB64FN))) {

            self_devt = md_makedevice(md_major, *self_id);
            if (!md_update_top_device_minor(setno,
                mddb_getsidenum(setno), self_devt))
                goto out;
        }

        /*
         * Update unit with the imported setno
         */
        mddb_setrecprivate(recid, MD_PRV_GOTIT);

        *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));

        if (*hsp_id != -1)
            *hsp_id = MAKERECID(setno, DBID(*hsp_id));

        if (*parent_id != MD_NO_PARENT)
            *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
        *record_id = MAKERECID(setno, DBID(*record_id));
        gotsomething = 1;
    }

out:
    return (gotsomething);
}
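
/*
 * raid_imp_set() above rewrites every id that encodes a set number (self
 * id, parent, hotspare pool id, record id) so the imported records refer
 * to their new set.  The standalone sketch below shows the general
 * split-and-recombine pattern of such a rewrite; it is not compiled into
 * the driver, and EXAMPLE_UNIT_BITS and example_rebase_minor() are
 * illustrative assumptions, not the real MD_MKMIN()/MAKERECID() encoding.
 */
#if 0   /* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_UNIT_BITS   10  /* assumed width of the unit field */
#define EXAMPLE_UNIT_MASK   ((1u << EXAMPLE_UNIT_BITS) - 1)

/* keep the unit part of a minor, replace its set part */
static uint32_t
example_rebase_minor(uint32_t old_minor, uint32_t new_set)
{
    return ((new_set << EXAMPLE_UNIT_BITS) |
        (old_minor & EXAMPLE_UNIT_MASK));
}

int
main(void)
{
    /* unit 7 was in set 3; after import it belongs to set 5 */
    printf("0x%x\n",
        example_rebase_minor((3u << EXAMPLE_UNIT_BITS) | 7u, 5u));
    return (0);
}
#endif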

static md_named_services_t raid_named_services[] = {
    {raid_hotspares,                        "poke hotspares"    },
    {raid_rename_check,                     MDRNM_CHECK         },
    {raid_rename_lock,                      MDRNM_LOCK          },
    {(intptr_t (*)()) raid_rename_unlock,   MDRNM_UNLOCK        },
    {(intptr_t (*)()) raid_probe_dev,       "probe open test"   },
    {NULL,                                  0                   }
};

md_ops_t raid_md_ops = {
    raid_open,          /* open */
    raid_close,         /* close */
    md_raid_strategy,   /* strategy */
    NULL,               /* print */
    NULL,               /* dump */
    NULL,               /* read */
    NULL,               /* write */
    md_raid_ioctl,      /* ioctl */
    raid_snarf,         /* raid_snarf */
    raid_halt,          /* raid_halt */
    NULL,               /* aread */
    NULL,               /* awrite */
    raid_imp_set,       /* import set */
    raid_named_services
};

static void
init_init()
{
    /* default to half a second */
    if (md_wr_wait == 0)
        md_wr_wait = md_hz >> 1;

    raid_parent_cache = kmem_cache_create("md_raid_parent",
        sizeof (md_raidps_t), 0, raid_parent_constructor,
        raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
    raid_child_cache = kmem_cache_create("md_raid_child",
        sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
        raid_child_constructor, raid_child_destructor,
        raid_run_queue, NULL, NULL, 0);
    raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
        sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
        raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
}

static void
fini_uninit()
{
    kmem_cache_destroy(raid_parent_cache);
    kmem_cache_destroy(raid_child_cache);
    kmem_cache_destroy(raid_cbuf_cache);
    raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())
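
/*
 * The kmem caches created in init_init() back the per-I/O parent, child
 * and column-buffer structures allocated on the I/O path.  A minimal
 * sketch of the usual kmem cache alloc/free pairing is shown below; it is
 * not compiled into the driver, and example_get_ps()/example_free_ps()
 * are hypothetical names, not functions in this file.
 */
#if 0   /* illustrative sketch only */
static md_raidps_t *
example_get_ps(void)
{
    /* KM_SLEEP: block until memory is available */
    return (kmem_cache_alloc(raid_parent_cache, KM_SLEEP));
}

static void
example_free_ps(md_raidps_t *ps)
{
    kmem_cache_free(raid_parent_cache, ps);
}
#endif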