1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * NAME: raid.c 30 * 31 * DESCRIPTION: Main RAID driver source file containing open, close and I/O 32 * operations. 33 * 34 * ROUTINES PROVIDED FOR EXTERNAL USE: 35 * raid_open() - open the RAID metadevice for access. 36 * raid_internal_open() - internal open routine of RAID metdevice. 37 * md_raid_strategy() - perform normal I/O operations, 38 * such as read and write. 39 * raid_close() - close the RAID metadevice. 40 * raid_internal_close() - internal close routine of RAID metadevice. 41 * raid_snarf() - initialize and clean up MDD records. 42 * raid_halt() - reset the RAID metadevice 43 * raid_line() - return the line # of this segment 44 * raid_dcolumn() - return the data column # of this segment 45 * raid_pcolumn() - return the parity column # of this segment 46 */ 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/conf.h> 51 #include <sys/file.h> 52 #include <sys/user.h> 53 #include <sys/uio.h> 54 #include <sys/t_lock.h> 55 #include <sys/buf.h> 56 #include <sys/dkio.h> 57 #include <sys/vtoc.h> 58 #include <sys/kmem.h> 59 #include <vm/page.h> 60 #include <sys/cmn_err.h> 61 #include <sys/sysmacros.h> 62 #include <sys/types.h> 63 #include <sys/mkdev.h> 64 #include <sys/stat.h> 65 #include <sys/open.h> 66 #include <sys/modctl.h> 67 #include <sys/ddi.h> 68 #include <sys/sunddi.h> 69 #include <sys/debug.h> 70 #include <sys/lvm/md_raid.h> 71 #include <sys/lvm/mdvar.h> 72 #include <sys/lvm/md_convert.h> 73 74 #include <sys/sysevent/eventdefs.h> 75 #include <sys/sysevent/svm.h> 76 77 md_ops_t raid_md_ops; 78 #ifndef lint 79 char _depends_on[] = "drv/md"; 80 md_ops_t *md_interface_ops = &raid_md_ops; 81 #endif /* lint */ 82 83 extern unit_t md_nunits; 84 extern unit_t md_nsets; 85 extern md_set_t md_set[]; 86 extern int md_status; 87 extern major_t md_major; 88 extern mdq_anchor_t md_done_daemon; 89 extern mdq_anchor_t md_mstr_daemon; 90 extern int md_sleep_for_test; 91 extern clock_t md_hz; 92 93 extern md_event_queue_t *md_event_queue; 94 95 96 int pchunks = 16; 97 int phigh = 1024; 98 int plow = 128; 99 int cchunks = 64; 100 int chigh = 1024; 101 int clow = 512; 102 int bchunks = 32; 103 int bhigh = 256; 104 int blow = 128; 105 106 int raid_total_io = 0; 107 int raid_reads = 0; 108 int raid_writes = 0; 109 int raid_no_bpmaps = 0; 110 int raid_512 = 0; 111 int raid_1024 = 0; 112 int raid_1024_8192 = 0; 113 int raid_8192 = 0; 114 int raid_8192_bigger = 0; 115 int raid_line_lock_wait = 0; 116 117 int data_buffer_waits = 0; 118 int parity_buffer_waits = 0; 119 120 /* writer 
line locks */ 121 int raid_writer_locks = 0; /* total writer locks */ 122 int raid_write_waits = 0; /* total writer locks that waited */ 123 int raid_full_line_writes = 0; /* total full line writes */ 124 int raid_write_queue_length = 0; /* wait queue length */ 125 int raid_max_write_q_length = 0; /* maximum queue length */ 126 int raid_write_locks_active = 0; /* writer locks at any time */ 127 int raid_max_write_locks = 0; /* maximum writer locks active */ 128 129 /* read line locks */ 130 int raid_reader_locks = 0; /* total reader locks held */ 131 int raid_reader_locks_active = 0; /* reader locks held */ 132 int raid_max_reader_locks = 0; /* maximum reader locks held in run */ 133 int raid_read_overlaps = 0; /* number of times 2 reads hit same line */ 134 int raid_read_waits = 0; /* times a reader waited on writer */ 135 136 /* prewrite stats */ 137 int raid_prewrite_waits = 0; /* number of waits for a pw slot */ 138 int raid_pw = 0; /* number of pw slots in use */ 139 int raid_prewrite_max = 0; /* maximum number of pw slots in use */ 140 int raid_pw_invalidates = 0; 141 142 static clock_t md_wr_wait = 0; 143 144 int nv_available = 0; /* presence of nv-ram support in device */ 145 int nv_prewrite = 1; /* mark prewrites with nv_available */ 146 int nv_parity = 1; /* mark parity with nv_available */ 147 148 kmem_cache_t *raid_parent_cache = NULL; 149 kmem_cache_t *raid_child_cache = NULL; 150 kmem_cache_t *raid_cbuf_cache = NULL; 151 152 int raid_internal_open(minor_t mnum, int flag, int otyp, 153 int md_oflags); 154 155 static void freebuffers(md_raidcs_t *cs); 156 static int raid_read(mr_unit_t *un, md_raidcs_t *cs); 157 static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); 158 static int raid_write(mr_unit_t *un, md_raidcs_t *cs); 159 static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); 160 static void raid_stage(md_raidcs_t *cs); 161 static void raid_enqueue(md_raidcs_t *cs); 162 static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); 163 uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); 164 static void getpbuffer(md_raidcs_t *cs); 165 static void getdbuffer(md_raidcs_t *cs); 166 static void raid_done(buf_t *bp); 167 static void raid_io_startup(mr_unit_t *un); 168 169 static rus_state_t 170 raid_col2unit(rcs_state_t state, rus_state_t unitstate) 171 { 172 switch (state) { 173 case RCS_INIT: 174 return (RUS_INIT); 175 case RCS_OKAY: 176 return (RUS_OKAY); 177 case RCS_RESYNC: 178 if (unitstate & RUS_LAST_ERRED) 179 return (RUS_LAST_ERRED); 180 else 181 return (RUS_ERRED); 182 case RCS_ERRED: 183 return (RUS_ERRED); 184 case RCS_LAST_ERRED: 185 return (RUS_ERRED); 186 default: 187 break; 188 } 189 panic("raid_col2unit"); 190 /*NOTREACHED*/ 191 } 192 193 void 194 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) 195 { 196 197 rus_state_t unitstate, origstate; 198 rcs_state_t colstate; 199 rcs_state_t orig_colstate; 200 int errcnt = 0, 201 okaycnt = 0, 202 resynccnt = 0; 203 int i; 204 char *devname; 205 206 ASSERT(un); 207 ASSERT(col < un->un_totalcolumncnt); 208 ASSERT(newstate & 209 (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 210 RCS_LAST_ERRED | RCS_REGEN)); 211 ASSERT((newstate & 212 ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 213 RCS_LAST_ERRED | RCS_REGEN)) 214 == 0); 215 216 ASSERT(MDI_UNIT(MD_SID(un)) ? 
UNIT_WRITER_HELD(un) : 1); 217 218 unitstate = un->un_state; 219 origstate = unitstate; 220 221 if (force) { 222 un->un_column[col].un_devstate = newstate; 223 un->un_state = raid_col2unit(newstate, unitstate); 224 uniqtime32(&un->un_column[col].un_devtimestamp); 225 uniqtime32(&un->un_timestamp); 226 return; 227 } 228 229 ASSERT(un->un_state & 230 (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | 231 RUS_REGEN)); 232 ASSERT((un->un_state & ~(RUS_INIT | 233 RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); 234 235 if (un->un_column[col].un_devstate == newstate) 236 return; 237 238 if (newstate == RCS_REGEN) { 239 if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) 240 return; 241 un->un_state = RUS_REGEN; 242 return; 243 } 244 245 orig_colstate = un->un_column[col].un_devstate; 246 247 /* 248 * if there is another column in the error state then this 249 * column should go to the last errored state 250 */ 251 for (i = 0; i < un->un_totalcolumncnt; i++) { 252 if (i == col) 253 colstate = newstate; 254 else 255 colstate = un->un_column[i].un_devstate; 256 if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) 257 errcnt++; 258 if (colstate & RCS_OKAY) 259 okaycnt++; 260 if (colstate & RCS_RESYNC) 261 resynccnt++; 262 } 263 ASSERT(resynccnt < 2); 264 265 if (okaycnt == un->un_totalcolumncnt) 266 unitstate = RUS_OKAY; 267 else if (errcnt > 1) { 268 unitstate = RUS_LAST_ERRED; 269 if (newstate & RCS_ERRED) 270 newstate = RCS_LAST_ERRED; 271 } else if (errcnt == 1) 272 if (!(unitstate & RUS_LAST_ERRED)) 273 unitstate = RUS_ERRED; 274 275 if (un->un_state == RUS_DOI) 276 unitstate = RUS_DOI; 277 278 un->un_column[col].un_devstate = newstate; 279 uniqtime32(&un->un_column[col].un_devtimestamp); 280 /* 281 * if there are last errored column being brought back online 282 * by open or snarf, then be sure to clear the RUS_LAST_ERRED 283 * bit to allow writes. If there is a real error then the 284 * column will go back into last erred. 285 */ 286 if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && 287 (raid_state_cnt(un, RCS_ERRED) == 1)) 288 unitstate = RUS_ERRED; 289 290 un->un_state = unitstate; 291 uniqtime32(&un->un_timestamp); 292 293 if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && 294 (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { 295 devname = md_devname(MD_UN2SET(un), 296 un->un_column[col].un_dev, NULL, 0); 297 298 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 299 md_shortname(MD_SID(un)), devname); 300 301 if (unitstate & RUS_LAST_ERRED) { 302 cmn_err(CE_WARN, "md: %s: %s last erred", 303 md_shortname(MD_SID(un)), devname); 304 305 } else if (un->un_column[col].un_devflags & 306 MD_RAID_DEV_ISOPEN) { 307 /* 308 * Close the broken device and clear the open flag on 309 * it. We have to check that the device is open, 310 * otherwise the first open on it has resulted in the 311 * error that is being processed and the actual un_dev 312 * will be NODEV64. 313 */ 314 md_layered_close(un->un_column[col].un_dev, 315 MD_OFLG_NULL); 316 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 317 } 318 } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && 319 un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { 320 /* 321 * Similar to logic above except no log messages since we 322 * are just transitioning from Last Erred to Erred. 
		 */
		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
	}

	/*
	 * If a resync has completed, see if there is a Last Erred
	 * component that we can change to the Erred state.
	 */
	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			if (i != col &&
			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
				raid_set_state(un, i, RCS_ERRED, 0);
				break;
			}
		}
	}
}

/*
 * NAME:	erred_check_line
 *
 * DESCRIPTION: Return the type of write to perform on an erred column based
 *		upon any resync activity.
 *
 *		If a column is being resynced and the write is above the
 *		resync point, the write may also have to go to the target
 *		being resynced.
 *
 *		If a column cannot be written directly, RCL_ERRED is
 *		returned and processing should proceed accordingly.
 *
 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
 *		md_raidcs_t *cs - child save structure
 *		mr_column_t *column - pointer to the column structure
 *
 * RETURNS:	RCL_OKAY, RCL_ERRED
 *
 * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
 *		across call.
 */

static int
erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
{

	ASSERT(un != NULL);
	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);

	if (column->un_devstate & RCS_OKAY)
		return (RCL_OKAY);

	if (column->un_devstate & RCS_ERRED)
		return (RCL_ERRED);	/* do not read from errored disk */

	/*
	 * For the last errored case there are two considerations.
	 * When the last errored column is the only errored column,
	 * treat it like a maintenance column and do no I/O from it.
	 * When there are other failures, just attempt to use it.
	 */
	if (column->un_devstate & RCS_LAST_ERRED)
		return (RCL_ERRED);

	ASSERT(column->un_devstate & RCS_RESYNC);

	/*
	 * When a resync from a hotspare is being done (copy resync)
	 * then always treat it as an OKAY column, since no regen
	 * is required.
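	 *
	 * For a regen resync the column is only usable when the line
	 * being written lies below the current resync point
	 * (un_resync_line_index, sampled under un_mx); lines at or above
	 * that point have not been rebuilt yet, so the column is treated
	 * as erred.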
398 */ 399 if (column->un_devflags & MD_RAID_COPY_RESYNC) { 400 return (RCL_OKAY); 401 } 402 403 mutex_enter(&un->un_mx); 404 if (cs->cs_line < un->un_resync_line_index) { 405 mutex_exit(&un->un_mx); 406 return (RCL_OKAY); 407 } 408 mutex_exit(&un->un_mx); 409 return (RCL_ERRED); 410 411 } 412 413 /* 414 * NAMES: raid_state_cnt 415 * 416 * DESCRIPTION: counts number of column in a specific state 417 * 418 * PARAMETERS: md_raid_t *un 419 * rcs_state state 420 */ 421 int 422 raid_state_cnt(mr_unit_t *un, rcs_state_t state) 423 { 424 int i, retval = 0; 425 426 for (i = 0; i < un->un_totalcolumncnt; i++) 427 if (un->un_column[i].un_devstate & state) 428 retval++; 429 return (retval); 430 } 431 432 /* 433 * NAMES: raid_io_overlaps 434 * 435 * DESCRIPTION: checkst for overlap of 2 child save structures 436 * 437 * PARAMETERS: md_raidcs_t cs1 438 * md_raidcs_t cs2 439 * 440 * RETURNS: 0 - no overlap 441 * 1 - overlap 442 */ 443 int 444 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2) 445 { 446 if (cs1->cs_blkno > cs2->cs_lastblk) 447 return (0); 448 if (cs1->cs_lastblk < cs2->cs_blkno) 449 return (0); 450 return (1); 451 } 452 453 /* 454 * NAMES: raid_parent_constructor 455 * DESCRIPTION: parent structure constructor routine 456 * PARAMETERS: 457 */ 458 /*ARGSUSED1*/ 459 static int 460 raid_parent_constructor(void *p, void *d1, int d2) 461 { 462 mutex_init(&((md_raidps_t *)p)->ps_mx, 463 NULL, MUTEX_DEFAULT, NULL); 464 mutex_init(&((md_raidps_t *)p)->ps_mapin_mx, 465 NULL, MUTEX_DEFAULT, NULL); 466 return (0); 467 } 468 469 void 470 raid_parent_init(md_raidps_t *ps) 471 { 472 bzero(ps, offsetof(md_raidps_t, ps_mx)); 473 ((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE; 474 ((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC; 475 } 476 477 /*ARGSUSED1*/ 478 static void 479 raid_parent_destructor(void *p, void *d) 480 { 481 mutex_destroy(&((md_raidps_t *)p)->ps_mx); 482 mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx); 483 } 484 485 /* 486 * NAMES: raid_child_constructor 487 * DESCRIPTION: child structure constructor routine 488 * PARAMETERS: 489 */ 490 /*ARGSUSED1*/ 491 static int 492 raid_child_constructor(void *p, void *d1, int d2) 493 { 494 md_raidcs_t *cs = (md_raidcs_t *)p; 495 mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL); 496 bioinit(&cs->cs_dbuf); 497 bioinit(&cs->cs_pbuf); 498 bioinit(&cs->cs_hbuf); 499 return (0); 500 } 501 502 void 503 raid_child_init(md_raidcs_t *cs) 504 { 505 bzero(cs, offsetof(md_raidcs_t, cs_mx)); 506 507 md_bioreset(&cs->cs_dbuf); 508 md_bioreset(&cs->cs_pbuf); 509 md_bioreset(&cs->cs_hbuf); 510 511 ((md_raidcs_t *)cs)->cs_dbuf.b_chain = 512 ((md_raidcs_t *)cs)->cs_pbuf.b_chain = 513 ((md_raidcs_t *)cs)->cs_hbuf.b_chain = 514 (struct buf *)(cs); 515 516 cs->cs_magic = RAID_CSMAGIC; 517 cs->cs_line = MD_DISKADDR_ERROR; 518 cs->cs_dpwslot = -1; 519 cs->cs_ppwslot = -1; 520 } 521 522 /*ARGSUSED1*/ 523 static void 524 raid_child_destructor(void *p, void *d) 525 { 526 biofini(&((md_raidcs_t *)p)->cs_dbuf); 527 biofini(&((md_raidcs_t *)p)->cs_hbuf); 528 biofini(&((md_raidcs_t *)p)->cs_pbuf); 529 mutex_destroy(&((md_raidcs_t *)p)->cs_mx); 530 } 531 532 /*ARGSUSED1*/ 533 static int 534 raid_cbuf_constructor(void *p, void *d1, int d2) 535 { 536 bioinit(&((md_raidcbuf_t *)p)->cbuf_bp); 537 return (0); 538 } 539 540 static void 541 raid_cbuf_init(md_raidcbuf_t *cb) 542 { 543 bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp)); 544 md_bioreset(&cb->cbuf_bp); 545 cb->cbuf_magic = RAID_BUFMAGIC; 546 cb->cbuf_pwslot = -1; 547 cb->cbuf_flags = CBUF_WRITE; 548 } 549 550 /*ARGSUSED1*/ 551 
static void 552 raid_cbuf_destructor(void *p, void *d) 553 { 554 biofini(&((md_raidcbuf_t *)p)->cbuf_bp); 555 } 556 557 /* 558 * NAMES: raid_run_queue 559 * DESCRIPTION: spawn a backend processing daemon for RAID metadevice. 560 * PARAMETERS: 561 */ 562 /*ARGSUSED*/ 563 static void 564 raid_run_queue(void *d) 565 { 566 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 567 md_daemon(1, &md_done_daemon); 568 } 569 570 /* 571 * NAME: raid_build_pwslot 572 * DESCRIPTION: builds mr_pw_reserve for the column 573 * PARAMETERS: un is the pointer to the unit structure 574 * colindex is the column to create the structure for 575 */ 576 int 577 raid_build_pw_reservation(mr_unit_t *un, int colindex) 578 { 579 mr_pw_reserve_t *pw; 580 mr_scoreboard_t *sb; 581 int i; 582 583 pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + 584 (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); 585 pw->pw_magic = RAID_PWMAGIC; 586 pw->pw_column = colindex; 587 pw->pw_free = un->un_pwcnt; 588 sb = &pw->pw_sb[0]; 589 for (i = 0; i < un->un_pwcnt; i++) { 590 sb[i].sb_column = colindex; 591 sb[i].sb_flags = SB_UNUSED; 592 sb[i].sb_start_blk = 0; 593 sb[i].sb_last_blk = 0; 594 sb[i].sb_cs = NULL; 595 } 596 un->un_column_ic[colindex].un_pw_reserve = pw; 597 return (0); 598 } 599 /* 600 * NAME: raid_free_pw_reservation 601 * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine 602 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 603 * int colindex - index of the column whose pre-write slot struct 604 * is to be destroyed. 605 */ 606 void 607 raid_free_pw_reservation(mr_unit_t *un, int colindex) 608 { 609 mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; 610 611 kmem_free(pw, sizeof (mr_pw_reserve_t) + 612 (sizeof (mr_scoreboard_t) * un->un_pwcnt)); 613 } 614 615 /* 616 * NAME: raid_cancel_pwslot 617 * DESCRIPTION: RAID metadevice write routine 618 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 619 */ 620 static void 621 raid_cancel_pwslot(md_raidcs_t *cs) 622 { 623 mr_unit_t *un = cs->cs_un; 624 mr_pw_reserve_t *pw; 625 mr_scoreboard_t *sb; 626 mr_column_ic_t *col; 627 md_raidcbuf_t *cbuf; 628 int broadcast = 0; 629 630 if (cs->cs_ps->ps_flags & MD_RPS_READ) 631 return; 632 if (cs->cs_dpwslot != -1) { 633 col = &un->un_column_ic[cs->cs_dcolumn]; 634 pw = col->un_pw_reserve; 635 sb = &pw->pw_sb[cs->cs_dpwslot]; 636 sb->sb_flags = SB_AVAIL; 637 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 638 broadcast++; 639 sb->sb_cs = NULL; 640 } 641 642 if (cs->cs_ppwslot != -1) { 643 col = &un->un_column_ic[cs->cs_pcolumn]; 644 pw = col->un_pw_reserve; 645 sb = &pw->pw_sb[cs->cs_ppwslot]; 646 sb->sb_flags = SB_AVAIL; 647 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 648 broadcast++; 649 sb->sb_cs = NULL; 650 } 651 652 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 653 if (cbuf->cbuf_pwslot == -1) 654 continue; 655 col = &un->un_column_ic[cbuf->cbuf_column]; 656 pw = col->un_pw_reserve; 657 sb = &pw->pw_sb[cbuf->cbuf_pwslot]; 658 sb->sb_flags = SB_AVAIL; 659 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 660 broadcast++; 661 sb->sb_cs = NULL; 662 } 663 if (broadcast) { 664 cv_broadcast(&un->un_cv); 665 return; 666 } 667 mutex_enter(&un->un_mx); 668 if (un->un_rflags & MD_RFLAG_NEEDPW) 669 cv_broadcast(&un->un_cv); 670 mutex_exit(&un->un_mx); 671 } 672 673 static void 674 raid_free_pwinvalidate(md_raidcs_t *cs) 675 { 676 md_raidcbuf_t *cbuf; 677 md_raidcbuf_t *cbuf_to_free; 678 mr_unit_t *un = cs->cs_un; 679 mdi_unit_t 
*ui = MDI_UNIT(MD_SID(un));
	mr_pw_reserve_t	*pw;
	mr_scoreboard_t	*sb;
	int		broadcast = 0;

	cbuf = cs->cs_pw_inval_list;
	ASSERT(cbuf);
	mutex_enter(&un->un_linlck_mx);
	while (cbuf) {
		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
		sb = &pw->pw_sb[0];
		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
			broadcast++;
		cbuf_to_free = cbuf;
		cbuf = cbuf->cbuf_next;
		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
	}
	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
	/*
	 * Now that there is a free prewrite slot, check to see if there
	 * are any I/O operations waiting: first start any queued I/Os via
	 * raid_io_startup(), then signal the processes waiting in
	 * raid_write.
	 */
	if (ui->ui_io_lock->io_list_front)
		raid_io_startup(un);
	mutex_exit(&un->un_linlck_mx);
	if (broadcast) {
		cv_broadcast(&un->un_cv);
		return;
	}
	mutex_enter(&un->un_mx);
	if (un->un_rflags & MD_RFLAG_NEEDPW)
		cv_broadcast(&un->un_cv);
	mutex_exit(&un->un_mx);
}


static int
raid_get_pwslot(md_raidcs_t *cs, int column)
{
	mr_scoreboard_t	*sb;
	mr_pw_reserve_t	*pw;
	mr_unit_t	*un = cs->cs_un;
	diskaddr_t	start_blk = cs->cs_blkno;
	diskaddr_t	last_blk = cs->cs_lastblk;
	int		i;
	int		pwcnt = un->un_pwcnt;
	int		avail = -1;
	int		use = -1;
	int		flags;


	/* start with the data column */
	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
	sb = &pw->pw_sb[0];
	ASSERT(pw->pw_free > 0);
	for (i = 0; i < pwcnt; i++) {
		flags = sb[i].sb_flags;
		if (flags & SB_INVAL_PEND)
			continue;

		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
			avail = i;

		if ((start_blk > sb[i].sb_last_blk) ||
		    (last_blk < sb[i].sb_start_blk))
			continue;

		/* OVERLAP */
		ASSERT(! (sb[i].sb_flags & SB_INUSE));

		/*
		 * raid_invalidate_pwslot attempts to zero out a prewrite
		 * entry in parallel with other disk reads/writes related
		 * to the current transaction.  However, cs_frags accounting
		 * for this case is broken because raid_write_io resets
		 * cs_frags, i.e. ignoring that it could have been set to a
		 * > 0 value by raid_invalidate_pwslot.  While this can be
		 * fixed, an additional problem is that we don't seem to
		 * handle correctly the case of getting a disk error for
		 * prewrite entry invalidation.
		 * It does not look like we really need
		 * to invalidate prewrite slots because raid_replay sorts
		 * prewrite id's in ascending order and during recovery the
		 * latest prewrite entry for the same block will be replayed
		 * last.  That's why I ifdef'd out the call to
		 * raid_invalidate_pwslot. --aguzovsk@east
		 */

		if (use == -1) {
			use = i;
		}
	}

	ASSERT(avail != -1);
	pw->pw_free--;
	if (use == -1)
		use = avail;

	ASSERT(! (sb[use].sb_flags & SB_INUSE));
	sb[use].sb_flags = SB_INUSE;
	sb[use].sb_cs = cs;
	sb[use].sb_start_blk = start_blk;
	sb[use].sb_last_blk = last_blk;
	ASSERT((use >= 0) && (use < un->un_pwcnt));
	return (use);
}

static int
raid_check_pw(md_raidcs_t *cs)
{

	mr_unit_t	*un = cs->cs_un;
	int		i;

	ASSERT(!
(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 799 /* 800 * check to be sure there is a prewrite slot available 801 * if not just return. 802 */ 803 if (cs->cs_flags & MD_RCS_LINE) { 804 for (i = 0; i < un->un_totalcolumncnt; i++) 805 if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0) 806 return (1); 807 return (0); 808 } 809 810 if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0) 811 return (1); 812 if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0) 813 return (1); 814 return (0); 815 } 816 static int 817 raid_alloc_pwslot(md_raidcs_t *cs) 818 { 819 mr_unit_t *un = cs->cs_un; 820 md_raidcbuf_t *cbuf; 821 822 ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 823 if (raid_check_pw(cs)) 824 return (1); 825 826 mutex_enter(&un->un_mx); 827 un->un_pwid++; 828 cs->cs_pwid = un->un_pwid; 829 mutex_exit(&un->un_mx); 830 831 cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn); 832 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 833 cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column); 834 } 835 cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn); 836 837 cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS; 838 839 return (0); 840 } 841 842 /* 843 * NAMES: raid_build_incore 844 * DESCRIPTION: RAID metadevice incore structure building routine 845 * PARAMETERS: void *p - pointer to a unit structure 846 * int snarfing - a flag to indicate snarfing is required 847 */ 848 int 849 raid_build_incore(void *p, int snarfing) 850 { 851 mr_unit_t *un = (mr_unit_t *)p; 852 minor_t mnum = MD_SID(un); 853 mddb_recid_t hs_recid = 0; 854 int i; 855 int preserve_flags; 856 mr_column_t *column; 857 int iosize; 858 md_dev64_t hs, dev; 859 int resync_cnt = 0, 860 error_cnt = 0; 861 862 hs = NODEV64; 863 dev = NODEV64; 864 865 /* clear out bogus pointer incase we return(1) prior to alloc */ 866 un->mr_ic = NULL; 867 868 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 869 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 870 return (1); 871 } 872 873 if (MD_UNIT(mnum) != NULL) 874 return (0); 875 876 if (snarfing) 877 MD_STATUS(un) = 0; 878 879 un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic), 880 KM_SLEEP); 881 882 un->un_column_ic = (mr_column_ic_t *) 883 kmem_zalloc(sizeof (mr_column_ic_t) * 884 un->un_totalcolumncnt, KM_SLEEP); 885 886 for (i = 0; i < un->un_totalcolumncnt; i++) { 887 888 column = &un->un_column[i]; 889 preserve_flags = column->un_devflags & 890 (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); 891 column->un_devflags &= 892 ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN | 893 MD_RAID_WRITE_ALT); 894 if (raid_build_pw_reservation(un, i) != 0) { 895 /* could not build pwslot */ 896 return (1); 897 } 898 899 if (snarfing) { 900 set_t setno = MD_MIN2SET(mnum); 901 dev = md_getdevnum(setno, mddb_getsidenum(setno), 902 column->un_orig_key, MD_NOTRUST_DEVT); 903 /* 904 * Comment out instead of remove so we have history 905 * In the pre-SVM releases stored devt is used so 906 * as long as there is one snarf is always happy 907 * even the component is powered off. This is not 908 * the case in current SVM implementation. NODEV64 909 * can be returned and in this case since we resolve 910 * the devt at 'open' time (first use of metadevice) 911 * we will allow snarf continue. 912 * 913 * if (dev == NODEV64) 914 * return (1); 915 */ 916 917 /* 918 * Setup un_orig_dev from device id info if the device 919 * is valid (not NODEV64). 
920 */ 921 if (dev != NODEV64) 922 column->un_orig_dev = dev; 923 924 if (column->un_devstate & RCS_RESYNC) 925 resync_cnt++; 926 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 927 error_cnt++; 928 929 if (HOTSPARED(un, i)) { 930 (void) md_hot_spare_ifc(HS_MKDEV, 931 0, 0, 0, &column->un_hs_id, NULL, 932 &hs, NULL); 933 /* 934 * Same here 935 * 936 * if (hs == NODEV64) 937 * return (1); 938 */ 939 } 940 941 if (HOTSPARED(un, i)) { 942 if (column->un_devstate & 943 (RCS_OKAY | RCS_LAST_ERRED)) { 944 column->un_dev = hs; 945 column->un_pwstart = 946 column->un_hs_pwstart; 947 column->un_devstart = 948 column->un_hs_devstart; 949 preserve_flags &= 950 ~(MD_RAID_COPY_RESYNC | 951 MD_RAID_REGEN_RESYNC); 952 } else if (column->un_devstate & RCS_RESYNC) { 953 /* 954 * if previous system was 4.0 set 955 * the direction flags 956 */ 957 if ((preserve_flags & 958 (MD_RAID_COPY_RESYNC | 959 MD_RAID_REGEN_RESYNC)) == 0) { 960 if (column->un_alt_dev != NODEV64) 961 preserve_flags |= 962 MD_RAID_COPY_RESYNC; 963 else 964 preserve_flags |= 965 MD_RAID_REGEN_RESYNC; 966 } 967 } 968 } else { /* no hot spares */ 969 column->un_dev = dev; 970 column->un_pwstart = column->un_orig_pwstart; 971 column->un_devstart = column->un_orig_devstart; 972 if (column->un_devstate & RCS_RESYNC) { 973 preserve_flags |= MD_RAID_REGEN_RESYNC; 974 preserve_flags &= ~MD_RAID_COPY_RESYNC; 975 } 976 } 977 if (! (column->un_devstate & RCS_RESYNC)) { 978 preserve_flags &= 979 ~(MD_RAID_REGEN_RESYNC | 980 MD_RAID_COPY_RESYNC); 981 } 982 983 column->un_devflags = preserve_flags; 984 column->un_alt_dev = NODEV64; 985 column->un_alt_pwstart = 0; 986 column->un_alt_devstart = 0; 987 un->un_resync_line_index = 0; 988 un->un_resync_index = 0; 989 un->un_percent_done = 0; 990 } 991 } 992 993 if (resync_cnt && error_cnt) { 994 for (i = 0; i < un->un_totalcolumncnt; i++) { 995 column = &un->un_column[i]; 996 if (HOTSPARED(un, i) && 997 (column->un_devstate & RCS_RESYNC) && 998 (column->un_devflags & MD_RAID_COPY_RESYNC)) 999 /* hotspare has data */ 1000 continue; 1001 1002 if (HOTSPARED(un, i) && 1003 (column->un_devstate & RCS_RESYNC)) { 1004 /* hotspare does not have data */ 1005 raid_hs_release(HS_FREE, un, &hs_recid, i); 1006 column->un_dev = column->un_orig_dev; 1007 column->un_pwstart = column->un_orig_pwstart; 1008 column->un_devstart = column->un_orig_devstart; 1009 mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); 1010 } 1011 1012 if (column->un_devstate & RCS_ERRED) 1013 column->un_devstate = RCS_LAST_ERRED; 1014 1015 if (column->un_devstate & RCS_RESYNC) 1016 column->un_devstate = RCS_ERRED; 1017 } 1018 } 1019 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); 1020 1021 un->un_pwid = 1; /* or some other possible value */ 1022 un->un_magic = RAID_UNMAGIC; 1023 iosize = un->un_iosize; 1024 un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1025 un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1026 mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); 1027 cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); 1028 un->un_linlck_chn = NULL; 1029 MD_UNIT(mnum) = un; 1030 1031 1032 return (0); 1033 } 1034 1035 /* 1036 * NAMES: reset_raid 1037 * DESCRIPTION: RAID metadevice reset routine 1038 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 1039 * minor_t mnum - RAID metadevice minor number 1040 * int removing - a flag to imply removing device name from 1041 * MDDB database. 
1042 */ 1043 void 1044 reset_raid(mr_unit_t *un, minor_t mnum, int removing) 1045 { 1046 int i, n = 0; 1047 sv_dev_t *sv; 1048 mr_column_t *column; 1049 int column_cnt = un->un_totalcolumncnt; 1050 mddb_recid_t *recids, vtoc_id; 1051 int hserr; 1052 1053 ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && 1054 (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); 1055 1056 md_destroy_unit_incore(mnum, &raid_md_ops); 1057 1058 MD_UNIT(mnum) = NULL; 1059 1060 if (un->un_pbuffer) { 1061 kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); 1062 un->un_pbuffer = NULL; 1063 } 1064 if (un->un_dbuffer) { 1065 kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); 1066 un->un_dbuffer = NULL; 1067 } 1068 1069 /* free all pre-write slots created during build incore */ 1070 for (i = 0; i < un->un_totalcolumncnt; i++) 1071 raid_free_pw_reservation(un, i); 1072 1073 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 1074 un->un_totalcolumncnt); 1075 1076 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 1077 1078 if (!removing) 1079 return; 1080 1081 sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), 1082 KM_SLEEP); 1083 1084 recids = (mddb_recid_t *) 1085 kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); 1086 1087 for (i = 0; i < column_cnt; i++) { 1088 md_unit_t *comp_un; 1089 md_dev64_t comp_dev; 1090 1091 column = &un->un_column[i]; 1092 sv[i].setno = MD_MIN2SET(mnum); 1093 sv[i].key = column->un_orig_key; 1094 if (HOTSPARED(un, i)) { 1095 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 1096 hserr = HS_BAD; 1097 else 1098 hserr = HS_FREE; 1099 raid_hs_release(hserr, un, &recids[n++], i); 1100 } 1101 /* 1102 * deparent any metadevices. 1103 * NOTE: currently soft partitions are the only metadevices 1104 * allowed in RAID metadevices. 1105 */ 1106 comp_dev = column->un_dev; 1107 if (md_getmajor(comp_dev) == md_major) { 1108 comp_un = MD_UNIT(md_getminor(comp_dev)); 1109 recids[n++] = MD_RECID(comp_un); 1110 md_reset_parent(comp_dev); 1111 } 1112 } 1113 /* decrement the reference count of the old hsp */ 1114 if (un->un_hsp_id != -1) 1115 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 1116 &recids[n++], NULL, NULL, NULL); 1117 recids[n] = 0; 1118 MD_STATUS(un) |= MD_UN_BEING_RESET; 1119 vtoc_id = un->c.un_vtoc_id; 1120 1121 raid_commit(un, recids); 1122 1123 1124 /* Remove the unit structure */ 1125 mddb_deleterec_wrapper(un->c.un_record_id); 1126 1127 /* Remove the vtoc, if present */ 1128 if (vtoc_id) 1129 mddb_deleterec_wrapper(vtoc_id); 1130 md_rem_names(sv, column_cnt); 1131 kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); 1132 kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); 1133 1134 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1135 MD_MIN2SET(mnum), mnum); 1136 } 1137 1138 /* 1139 * NAMES: raid_error_parent 1140 * DESCRIPTION: mark a parent structure in error 1141 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1142 * int error - error value to set 1143 * NOTE: (TBR) - this routine currently is not in use. 1144 */ 1145 static void 1146 raid_error_parent(md_raidps_t *ps, int error) 1147 { 1148 mutex_enter(&ps->ps_mx); 1149 ps->ps_flags |= MD_RPS_ERROR; 1150 ps->ps_error = error; 1151 mutex_exit(&ps->ps_mx); 1152 } 1153 1154 /* 1155 * The following defines tell raid_free_parent 1156 * RFP_RLS_LOCK release the unit reader lock when done. 
 * RFP_DECR_PWFRAGS	decrement ps_pwfrags
 * RFP_DECR_FRAGS	decrement ps_frags
 * RFP_DECR_READFRAGS	read keeps FRAGS and PWFRAGS in lockstep
 */
#define	RFP_RLS_LOCK		0x00001
#define	RFP_DECR_PWFRAGS	0x00002
#define	RFP_DECR_FRAGS		0x00004
#define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)

/*
 * NAMES:	raid_free_parent
 * DESCRIPTION: free a parent structure
 * PARAMETERS:	md_raidps_t *ps - pointer to a parent structure
 *		int todo - indicates what needs to be done
 */
static void
raid_free_parent(md_raidps_t *ps, int todo)
{
	mdi_unit_t	*ui = ps->ps_ui;

	ASSERT(ps->ps_magic == RAID_PSMAGIC);
	ASSERT(ps->ps_flags & MD_RPS_INUSE);
	mutex_enter(&ps->ps_mx);
	if (todo & RFP_DECR_PWFRAGS) {
		ASSERT(ps->ps_pwfrags);
		ps->ps_pwfrags--;
		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
			if (ps->ps_flags & MD_RPS_ERROR) {
				ps->ps_bp->b_flags |= B_ERROR;
				ps->ps_bp->b_error = ps->ps_error;
			}
			md_kstat_done(ui, ps->ps_bp, 0);
			biodone(ps->ps_bp);
			ps->ps_flags |= MD_RPS_IODONE;
		}
	}

	if (todo & RFP_DECR_FRAGS) {
		ASSERT(ps->ps_frags);
		ps->ps_frags--;
	}

	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		return;
	}

	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
	mutex_exit(&ps->ps_mx);

	if (todo & RFP_RLS_LOCK)
		md_io_readerexit(ui);

	if (panicstr) {
		ps->ps_flags |= MD_RPS_DONE;
		return;
	}

	if (ps->ps_flags & MD_RPS_HSREQ)
		(void) raid_hotspares();

	ASSERT(todo & RFP_RLS_LOCK);
	ps->ps_flags &= ~MD_RPS_INUSE;

	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));

	kmem_cache_free(raid_parent_cache, ps);
}

/*
 * NAMES:	raid_free_child
 * DESCRIPTION: free a child structure
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *		int drop_locks - 0 for no locks held
 */
static void
raid_free_child(md_raidcs_t *cs, int drop_locks)
{
	mr_unit_t	*un = cs->cs_un;
	md_raidcbuf_t	*cbuf, *cbuf1;

	if (cs->cs_pw_inval_list)
		raid_free_pwinvalidate(cs);

	if (drop_locks) {
		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
		raid_line_exit(cs);
	} else {
		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
	}

	freebuffers(cs);
	cbuf = cs->cs_buflist;
	while (cbuf) {
		cbuf1 = cbuf->cbuf_next;
		kmem_cache_free(raid_cbuf_cache, cbuf);
		cbuf = cbuf1;
	}
	if (cs->cs_dbuf.b_flags & B_REMAPPED)
		bp_mapout(&cs->cs_dbuf);
	kmem_cache_free(raid_child_cache, cs);
}

/*
 * NAME:	raid_regen_parity
 *
 * DESCRIPTION: This routine is used to regenerate the parity blocks
 *		for the entire raid device.  It is called from
 *		both the regen thread and the IO path.
 *
 *		On error the entire device is marked as in error by
 *		placing the erroring device in error and all other
 *		devices in last_errored.
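 *
 *		The parity for a line is rebuilt by reading the data from
 *		every non-parity column of the line, XORing it into a
 *		zeroed parity buffer, and then writing that buffer out to
 *		the parity column.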
1273 * 1274 * PARAMETERS: md_raidcs_t *cs 1275 */ 1276 void 1277 raid_regen_parity(md_raidcs_t *cs) 1278 { 1279 mr_unit_t *un = cs->cs_un; 1280 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1281 caddr_t buffer; 1282 caddr_t parity_buffer; 1283 buf_t *bp; 1284 uint_t *dbuf, *pbuf; 1285 uint_t colcnt = un->un_totalcolumncnt; 1286 int column; 1287 int parity_column = cs->cs_pcolumn; 1288 size_t bcount; 1289 int j; 1290 1291 /* 1292 * This routine uses the data and parity buffers allocated to a 1293 * write. In the case of a read the buffers are allocated and 1294 * freed at the end. 1295 */ 1296 1297 ASSERT(IO_READER_HELD(un)); 1298 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 1299 ASSERT(UNIT_READER_HELD(un)); 1300 1301 if (raid_state_cnt(un, RCS_OKAY) != colcnt) 1302 return; 1303 1304 if (cs->cs_flags & MD_RCS_READER) { 1305 getpbuffer(cs); 1306 getdbuffer(cs); 1307 } 1308 ASSERT(cs->cs_dbuffer && cs->cs_pbuffer); 1309 bcount = cs->cs_bcount; 1310 buffer = cs->cs_dbuffer; 1311 parity_buffer = cs->cs_pbuffer; 1312 bzero(parity_buffer, bcount); 1313 bp = &cs->cs_dbuf; 1314 for (column = 0; column < colcnt; column++) { 1315 if (column == parity_column) 1316 continue; 1317 reset_buf(bp, B_READ | B_BUSY, bcount); 1318 bp->b_un.b_addr = buffer; 1319 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev); 1320 bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart; 1321 bp->b_bcount = bcount; 1322 bp->b_bufsize = bcount; 1323 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1324 if (biowait(bp)) 1325 goto bail; 1326 pbuf = (uint_t *)(void *)parity_buffer; 1327 dbuf = (uint_t *)(void *)buffer; 1328 for (j = 0; j < (bcount / (sizeof (uint_t))); j++) { 1329 *pbuf = *pbuf ^ *dbuf; 1330 pbuf++; 1331 dbuf++; 1332 } 1333 } 1334 1335 reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount); 1336 bp->b_un.b_addr = parity_buffer; 1337 bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev); 1338 bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart; 1339 bp->b_bcount = bcount; 1340 bp->b_bufsize = bcount; 1341 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1342 if (biowait(bp)) 1343 goto bail; 1344 1345 if (cs->cs_flags & MD_RCS_READER) { 1346 freebuffers(cs); 1347 cs->cs_pbuffer = NULL; 1348 cs->cs_dbuffer = NULL; 1349 } 1350 bp->b_chain = (struct buf *)cs; 1351 return; 1352 bail: 1353 if (cs->cs_flags & MD_RCS_READER) { 1354 freebuffers(cs); 1355 cs->cs_pbuffer = NULL; 1356 cs->cs_dbuffer = NULL; 1357 } 1358 md_unit_readerexit(ui); 1359 un = md_unit_writerlock(ui); 1360 raid_set_state(un, column, RCS_ERRED, 0); 1361 for (column = 0; column < colcnt; column++) 1362 raid_set_state(un, column, RCS_ERRED, 0); 1363 raid_commit(un, NULL); 1364 md_unit_writerexit(ui); 1365 un = md_unit_readerlock(ui); 1366 bp->b_chain = (struct buf *)cs; 1367 } 1368 1369 /* 1370 * NAMES: raid_error_state 1371 * DESCRIPTION: check unit and column states' impact on I/O error 1372 * NOTE: the state now may not be the state when the 1373 * I/O completed due to race conditions. 
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 *		buf_t *bp - pointer to buffer structure
 */
static int
raid_error_state(mr_unit_t *un, buf_t *bp)
{
	int		column;
	int		i;

	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_WRITER_HELD(un));

	column = -1;
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
			column = i;
			break;
		}
		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
			column = i;
			break;
		}
	}

	/* in case a replace snuck in while waiting on unit writer lock */

	if (column == -1) {
		return (0);
	}

	(void) raid_set_state(un, column, RCS_ERRED, 0);
	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));

	raid_commit(un, NULL);
	if (un->un_state & RUS_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	} else if (un->un_state & RUS_LAST_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	return (EIO);
}

/*
 * NAME:	raid_mapin_buf
 * DESCRIPTION: wait for the input buffer header to be mapped in
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
static void
raid_mapin_buf(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	/*
	 * Check to see if the buffer is mapped in.  If it is, set the
	 * child's data address and return.  Since it is expensive to grab
	 * a mutex this is only done if the mapin is not complete.
	 * Once the mutex is acquired it is possible that the mapin was
	 * not done, so recheck and if necessary do the mapin.
	 */
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		return;
	}
	mutex_enter(&ps->ps_mapin_mx);
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		mutex_exit(&ps->ps_mapin_mx);
		return;
	}
	bp_mapin(ps->ps_bp);
	/*
	 * get the new b_addr out of the parent since bp_mapin just changed it
	 */
	ps->ps_addr = ps->ps_bp->b_un.b_addr;
	cs->cs_addr = ps->ps_addr + cs->cs_offset;
	ps->ps_mapin++;
	mutex_exit(&ps->ps_mapin_mx);
}

/*
 * NAMES:	raid_read_no_retry
 * DESCRIPTION: I/O retry routine for a RAID metadevice read;
 *		the read failed while attempting to regenerate the data,
 *		so no retry is possible; the error occurred in
 *		raid_raidregenloop().
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 */
/*ARGSUSED*/
static void
raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	raid_error_parent(ps, EIO);
	raid_free_child(cs, 1);

	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
}

/*
 * NAMES:	raid_read_retry
 * DESCRIPTION: I/O retry routine for a RAID metadevice read
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 */
static void
raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	/* re-initialize the buf_t structure for raid_read() */
	cs->cs_dbuf.b_chain = (struct buf *)cs;
	cs->cs_dbuf.b_back = &cs->cs_dbuf;
	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_dbuf.b_error = 0;	/* initialize error */
	cs->cs_dbuf.b_offset = -1;
	/* Initialize semaphores */
	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_pbuf.b_chain = (struct buf *)cs;
	cs->cs_pbuf.b_back = &cs->cs_pbuf;
	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_pbuf.b_error = 0;	/* initialize error */
	cs->cs_pbuf.b_offset = -1;
	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */

	/*
	 * Re-scheduling I/O with raid_read_io() is simpler: basically,
	 * raid_read_io() is invoked again with the same child structure.
	 * (NOTE: we aren't supposed to do any error recovery when an I/O
	 * error occurred in raid_raidregenloop().)
	 */
	raid_mapin_buf(cs);
	raid_read_io(un, cs);
}

/*
 * NAMES:	raid_rderr
 * DESCRIPTION: I/O error handling routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 * LOCKS:	must obtain unit writer lock while calling raid_error_state
 *		since a unit or column state transition may take place.
 *		must obtain unit reader lock to retry I/O.
1530 */ 1531 /*ARGSUSED*/ 1532 static void 1533 raid_rderr(md_raidcs_t *cs) 1534 { 1535 md_raidps_t *ps; 1536 mdi_unit_t *ui; 1537 mr_unit_t *un; 1538 int error = 0; 1539 1540 ps = cs->cs_ps; 1541 ui = ps->ps_ui; 1542 un = (mr_unit_t *)md_unit_writerlock(ui); 1543 ASSERT(un != 0); 1544 1545 if (cs->cs_dbuf.b_flags & B_ERROR) 1546 error = raid_error_state(un, &cs->cs_dbuf); 1547 if (cs->cs_pbuf.b_flags & B_ERROR) 1548 error |= raid_error_state(un, &cs->cs_pbuf); 1549 1550 md_unit_writerexit(ui); 1551 1552 ps->ps_flags |= MD_RPS_HSREQ; 1553 1554 un = (mr_unit_t *)md_unit_readerlock(ui); 1555 ASSERT(un != 0); 1556 /* now attempt the appropriate retry routine */ 1557 (*(cs->cs_retry_call))(un, cs); 1558 } 1559 1560 1561 /* 1562 * NAMES: raid_read_error 1563 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1564 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1565 */ 1566 /*ARGSUSED*/ 1567 static void 1568 raid_read_error(md_raidcs_t *cs) 1569 { 1570 md_raidps_t *ps; 1571 mdi_unit_t *ui; 1572 mr_unit_t *un; 1573 set_t setno; 1574 1575 ps = cs->cs_ps; 1576 ui = ps->ps_ui; 1577 un = cs->cs_un; 1578 1579 setno = MD_UN2SET(un); 1580 1581 if ((cs->cs_dbuf.b_flags & B_ERROR) && 1582 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 1583 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 1584 cmn_err(CE_WARN, "md %s: read error on %s", 1585 md_shortname(MD_SID(un)), 1586 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 1587 1588 if ((cs->cs_pbuf.b_flags & B_ERROR) && 1589 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 1590 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 1591 cmn_err(CE_WARN, "md %s: read error on %s", 1592 md_shortname(MD_SID(un)), 1593 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 1594 1595 md_unit_readerexit(ui); 1596 1597 ASSERT(cs->cs_frags == 0); 1598 1599 /* now schedule processing for possible state change */ 1600 daemon_request(&md_mstr_daemon, raid_rderr, 1601 (daemon_queue_t *)cs, REQ_OLD); 1602 1603 } 1604 1605 /* 1606 * NAMES: getdbuffer 1607 * DESCRIPTION: data buffer allocation for a child structure 1608 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1609 * 1610 * NOTE: always get dbuffer before pbuffer 1611 * and get both buffers before pwslot 1612 * otherwise a deadlock could be introduced. 1613 */ 1614 static void 1615 getdbuffer(md_raidcs_t *cs) 1616 { 1617 mr_unit_t *un; 1618 1619 cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1620 if (cs->cs_dbuffer != NULL) 1621 return; 1622 un = cs->cs_ps->ps_un; 1623 mutex_enter(&un->un_mx); 1624 while (un->un_dbuffer == NULL) { 1625 STAT_INC(data_buffer_waits); 1626 un->un_rflags |= MD_RFLAG_NEEDBUF; 1627 cv_wait(&un->un_cv, &un->un_mx); 1628 } 1629 cs->cs_dbuffer = un->un_dbuffer; 1630 cs->cs_flags |= MD_RCS_UNDBUF; 1631 un->un_dbuffer = NULL; 1632 mutex_exit(&un->un_mx); 1633 } 1634 1635 /* 1636 * NAMES: getpbuffer 1637 * DESCRIPTION: parity buffer allocation for a child structure 1638 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1639 * 1640 * NOTE: always get dbuffer before pbuffer 1641 * and get both buffers before pwslot 1642 * otherwise a deadlock could be introduced. 
1643 */ 1644 static void 1645 getpbuffer(md_raidcs_t *cs) 1646 { 1647 mr_unit_t *un; 1648 1649 cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1650 if (cs->cs_pbuffer != NULL) 1651 return; 1652 un = cs->cs_ps->ps_un; 1653 mutex_enter(&un->un_mx); 1654 while (un->un_pbuffer == NULL) { 1655 STAT_INC(parity_buffer_waits); 1656 un->un_rflags |= MD_RFLAG_NEEDBUF; 1657 cv_wait(&un->un_cv, &un->un_mx); 1658 } 1659 cs->cs_pbuffer = un->un_pbuffer; 1660 cs->cs_flags |= MD_RCS_UNPBUF; 1661 un->un_pbuffer = NULL; 1662 mutex_exit(&un->un_mx); 1663 } 1664 static void 1665 getresources(md_raidcs_t *cs) 1666 { 1667 md_raidcbuf_t *cbuf; 1668 /* 1669 * NOTE: always get dbuffer before pbuffer 1670 * and get both buffers before pwslot 1671 * otherwise a deadlock could be introduced. 1672 */ 1673 getdbuffer(cs); 1674 getpbuffer(cs); 1675 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 1676 cbuf->cbuf_buffer = 1677 kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); 1678 } 1679 /* 1680 * NAMES: freebuffers 1681 * DESCRIPTION: child structure buffer freeing routine 1682 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1683 */ 1684 static void 1685 freebuffers(md_raidcs_t *cs) 1686 { 1687 mr_unit_t *un; 1688 md_raidcbuf_t *cbuf; 1689 1690 /* free buffers used for full line write */ 1691 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 1692 if (cbuf->cbuf_buffer == NULL) 1693 continue; 1694 kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); 1695 cbuf->cbuf_buffer = NULL; 1696 cbuf->cbuf_bcount = 0; 1697 } 1698 1699 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1700 un = cs->cs_un; 1701 mutex_enter(&un->un_mx); 1702 } 1703 if (cs->cs_dbuffer) { 1704 if (cs->cs_flags & MD_RCS_UNDBUF) 1705 un->un_dbuffer = cs->cs_dbuffer; 1706 else 1707 kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); 1708 } 1709 if (cs->cs_pbuffer) { 1710 if (cs->cs_flags & MD_RCS_UNPBUF) 1711 un->un_pbuffer = cs->cs_pbuffer; 1712 else 1713 kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); 1714 } 1715 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1716 un->un_rflags &= ~MD_RFLAG_NEEDBUF; 1717 cv_broadcast(&un->un_cv); 1718 mutex_exit(&un->un_mx); 1719 } 1720 } 1721 1722 /* 1723 * NAMES: raid_line_reader_lock, raid_line_writer_lock 1724 * DESCRIPTION: RAID metadevice line reader and writer lock routines 1725 * data column # and parity column #. 
1726 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1727 */ 1728 1729 void 1730 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) 1731 { 1732 mr_unit_t *un; 1733 md_raidcs_t *cs1; 1734 1735 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1736 un = cs->cs_un; 1737 cs->cs_flags |= MD_RCS_READER; 1738 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1739 if (!panicstr) 1740 mutex_enter(&un->un_linlck_mx); 1741 cs1 = un->un_linlck_chn; 1742 while (cs1 != NULL) { 1743 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1744 if (raid_io_overlaps(cs, cs1) == 1) 1745 if (cs1->cs_flags & MD_RCS_WRITER) 1746 break; 1747 1748 if (cs1 != NULL) { 1749 if (panicstr) 1750 panic("md; raid line write lock held"); 1751 un->un_linlck_flg = 1; 1752 cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); 1753 STAT_INC(raid_read_waits); 1754 } 1755 } 1756 STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); 1757 STAT_INC(raid_reader_locks); 1758 cs1 = un->un_linlck_chn; 1759 if (cs1 != NULL) 1760 cs1->cs_linlck_prev = cs; 1761 cs->cs_linlck_next = cs1; 1762 cs->cs_linlck_prev = NULL; 1763 un->un_linlck_chn = cs; 1764 cs->cs_flags |= MD_RCS_LLOCKD; 1765 if (resync_thread) { 1766 diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 1767 diskaddr_t line = (lastblk + 1) / un->un_segsize; 1768 ASSERT(raid_state_cnt(un, RCS_RESYNC)); 1769 mutex_enter(&un->un_mx); 1770 un->un_resync_line_index = line; 1771 mutex_exit(&un->un_mx); 1772 } 1773 if (!panicstr) 1774 mutex_exit(&un->un_linlck_mx); 1775 } 1776 1777 int 1778 raid_line_writer_lock(md_raidcs_t *cs, int lock) 1779 { 1780 mr_unit_t *un; 1781 md_raidcs_t *cs1; 1782 1783 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1784 cs->cs_flags |= MD_RCS_WRITER; 1785 un = cs->cs_ps->ps_un; 1786 1787 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1788 if (lock && !panicstr) 1789 mutex_enter(&un->un_linlck_mx); 1790 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1791 1792 cs1 = un->un_linlck_chn; 1793 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1794 if (raid_io_overlaps(cs, cs1)) 1795 break; 1796 1797 if (cs1 != NULL) { 1798 if (panicstr) 1799 panic("md: line writer lock inaccessible"); 1800 goto no_lock_exit; 1801 } 1802 1803 if (raid_alloc_pwslot(cs)) { 1804 if (panicstr) 1805 panic("md: no prewrite slots"); 1806 STAT_INC(raid_prewrite_waits); 1807 goto no_lock_exit; 1808 } 1809 1810 cs1 = un->un_linlck_chn; 1811 if (cs1 != NULL) 1812 cs1->cs_linlck_prev = cs; 1813 cs->cs_linlck_next = cs1; 1814 cs->cs_linlck_prev = NULL; 1815 un->un_linlck_chn = cs; 1816 cs->cs_flags |= MD_RCS_LLOCKD; 1817 cs->cs_flags &= ~MD_RCS_WAITING; 1818 STAT_INC(raid_writer_locks); 1819 STAT_MAX(raid_max_write_locks, raid_write_locks_active); 1820 if (lock && !panicstr) 1821 mutex_exit(&un->un_linlck_mx); 1822 return (0); 1823 1824 no_lock_exit: 1825 /* if this is already queued then do not requeue it */ 1826 ASSERT(! 
(cs->cs_flags & MD_RCS_LLOCKD)); 1827 if (!lock || (cs->cs_flags & MD_RCS_WAITING)) 1828 return (1); 1829 cs->cs_flags |= MD_RCS_WAITING; 1830 cs->cs_un = un; 1831 raid_enqueue(cs); 1832 if (lock && !panicstr) 1833 mutex_exit(&un->un_linlck_mx); 1834 return (1); 1835 } 1836 1837 static void 1838 raid_startio(md_raidcs_t *cs) 1839 { 1840 mdi_unit_t *ui = cs->cs_ps->ps_ui; 1841 mr_unit_t *un = cs->cs_un; 1842 1843 un = md_unit_readerlock(ui); 1844 raid_write_io(un, cs); 1845 } 1846 1847 void 1848 raid_io_startup(mr_unit_t *un) 1849 { 1850 md_raidcs_t *waiting_list, *cs1; 1851 md_raidcs_t *previous = NULL, *next = NULL; 1852 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1853 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 1854 1855 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1856 mutex_enter(io_list_mutex); 1857 1858 /* 1859 * check to be sure there are no reader locks outstanding. If 1860 * there are not then pass on the writer lock. 1861 */ 1862 waiting_list = ui->ui_io_lock->io_list_front; 1863 while (waiting_list) { 1864 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1865 ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); 1866 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1867 if (raid_io_overlaps(waiting_list, cs1) == 1) 1868 break; 1869 /* 1870 * there was an IOs that overlaps this io so go onto 1871 * the next io in the waiting list 1872 */ 1873 if (cs1) { 1874 previous = waiting_list; 1875 waiting_list = waiting_list->cs_linlck_next; 1876 continue; 1877 } 1878 1879 /* 1880 * There are no IOs that overlap this, so remove it from 1881 * the waiting queue, and start it 1882 */ 1883 1884 if (raid_check_pw(waiting_list)) { 1885 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1886 previous = waiting_list; 1887 waiting_list = waiting_list->cs_linlck_next; 1888 continue; 1889 } 1890 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1891 1892 next = waiting_list->cs_linlck_next; 1893 if (previous) 1894 previous->cs_linlck_next = next; 1895 else 1896 ui->ui_io_lock->io_list_front = next; 1897 1898 if (ui->ui_io_lock->io_list_front == NULL) 1899 ui->ui_io_lock->io_list_back = NULL; 1900 1901 if (ui->ui_io_lock->io_list_back == waiting_list) 1902 ui->ui_io_lock->io_list_back = previous; 1903 1904 waiting_list->cs_linlck_next = NULL; 1905 waiting_list->cs_flags &= ~MD_RCS_WAITING; 1906 STAT_DEC(raid_write_queue_length); 1907 if (raid_line_writer_lock(waiting_list, 0)) 1908 panic("region locking corrupted"); 1909 1910 ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); 1911 daemon_request(&md_mstr_daemon, raid_startio, 1912 (daemon_queue_t *)waiting_list, REQ_OLD); 1913 waiting_list = next; 1914 1915 } 1916 mutex_exit(io_list_mutex); 1917 } 1918 1919 void 1920 raid_line_exit(md_raidcs_t *cs) 1921 { 1922 mr_unit_t *un; 1923 1924 un = cs->cs_ps->ps_un; 1925 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1926 mutex_enter(&un->un_linlck_mx); 1927 if (cs->cs_flags & MD_RCS_READER) 1928 STAT_DEC(raid_reader_locks_active); 1929 else 1930 STAT_DEC(raid_write_locks_active); 1931 1932 if (cs->cs_linlck_prev) 1933 cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; 1934 else 1935 un->un_linlck_chn = cs->cs_linlck_next; 1936 if (cs->cs_linlck_next) 1937 cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; 1938 1939 cs->cs_flags &= ~MD_RCS_LLOCKD; 1940 1941 if (un->un_linlck_flg) 1942 cv_broadcast(&un->un_linlck_cv); 1943 1944 un->un_linlck_flg = 0; 1945 cs->cs_line = MD_DISKADDR_ERROR; 1946 1947 raid_cancel_pwslot(cs); 1948 /* 1949 * now that the lock is droped go 
ahead and see if there are any 1950 * other writes that can be started up 1951 */ 1952 raid_io_startup(un); 1953 1954 mutex_exit(&un->un_linlck_mx); 1955 } 1956 1957 /* 1958 * NAMES: raid_line, raid_pcolumn, raid_dcolumn 1959 * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, 1960 * data column # and parity column #. 1961 * PARAMETERS: int segment - segment number 1962 * mr_unit_t *un - pointer to an unit structure 1963 * RETURNS: raid_line returns line # 1964 * raid_dcolumn returns data column # 1965 * raid_pcolumn returns parity column # 1966 */ 1967 static diskaddr_t 1968 raid_line(diskaddr_t segment, mr_unit_t *un) 1969 { 1970 diskaddr_t adj_seg; 1971 diskaddr_t line; 1972 diskaddr_t max_orig_segment; 1973 1974 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1975 if (segment >= max_orig_segment) { 1976 adj_seg = segment - max_orig_segment; 1977 line = adj_seg % un->un_segsincolumn; 1978 } else { 1979 line = segment / (un->un_origcolumncnt - 1); 1980 } 1981 return (line); 1982 } 1983 1984 uint_t 1985 raid_dcolumn(diskaddr_t segment, mr_unit_t *un) 1986 { 1987 diskaddr_t adj_seg; 1988 diskaddr_t line; 1989 diskaddr_t max_orig_segment; 1990 uint_t column; 1991 1992 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1993 if (segment >= max_orig_segment) { 1994 adj_seg = segment - max_orig_segment; 1995 column = un->un_origcolumncnt + 1996 (uint_t)(adj_seg / un->un_segsincolumn); 1997 } else { 1998 line = segment / (un->un_origcolumncnt - 1); 1999 column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line) 2000 % un->un_origcolumncnt); 2001 } 2002 return (column); 2003 } 2004 2005 uint_t 2006 raid_pcolumn(diskaddr_t segment, mr_unit_t *un) 2007 { 2008 diskaddr_t adj_seg; 2009 diskaddr_t line; 2010 diskaddr_t max_orig_segment; 2011 uint_t column; 2012 2013 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2014 if (segment >= max_orig_segment) { 2015 adj_seg = segment - max_orig_segment; 2016 line = adj_seg % un->un_segsincolumn; 2017 } else { 2018 line = segment / (un->un_origcolumncnt - 1); 2019 } 2020 column = (uint_t)((line + (un->un_origcolumncnt - 1)) 2021 % un->un_origcolumncnt); 2022 return (column); 2023 } 2024 2025 2026 /* 2027 * Is called in raid_iosetup to probe each column to insure 2028 * that all the columns are in 'okay' state and meet the 2029 * 'full line' requirement. If any column is in error, 2030 * we don't want to enable the 'full line' flag. Previously, 2031 * we would do so and disable it only when a error is 2032 * detected after the first 'full line' io which is too late 2033 * and leads to the potential data corruption. 2034 */ 2035 static int 2036 raid_check_cols(mr_unit_t *un) 2037 { 2038 buf_t bp; 2039 char *buf; 2040 mr_column_t *colptr; 2041 minor_t mnum = MD_SID(un); 2042 int i; 2043 int err = 0; 2044 2045 buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); 2046 2047 for (i = 0; i < un->un_totalcolumncnt; i++) { 2048 md_dev64_t tmpdev; 2049 2050 colptr = &un->un_column[i]; 2051 2052 tmpdev = colptr->un_dev; 2053 /* 2054 * Open by device id 2055 * If this device is hotspared 2056 * use the hotspare key 2057 */ 2058 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 
2059 colptr->un_hs_key : colptr->un_orig_key); 2060 2061 if (tmpdev == NODEV64) { 2062 err = 1; 2063 break; 2064 } 2065 2066 colptr->un_dev = tmpdev; 2067 2068 bzero((caddr_t)&bp, sizeof (buf_t)); 2069 bp.b_back = &bp; 2070 bp.b_forw = &bp; 2071 bp.b_flags = (B_READ | B_BUSY); 2072 sema_init(&bp.b_io, 0, NULL, 2073 SEMA_DEFAULT, NULL); 2074 sema_init(&bp.b_sem, 0, NULL, 2075 SEMA_DEFAULT, NULL); 2076 bp.b_edev = md_dev64_to_dev(colptr->un_dev); 2077 bp.b_lblkno = colptr->un_pwstart; 2078 bp.b_bcount = DEV_BSIZE; 2079 bp.b_bufsize = DEV_BSIZE; 2080 bp.b_un.b_addr = (caddr_t)buf; 2081 (void) md_call_strategy(&bp, 0, NULL); 2082 if (biowait(&bp)) { 2083 err = 1; 2084 break; 2085 } 2086 } 2087 2088 kmem_free(buf, DEV_BSIZE); 2089 return (err); 2090 } 2091 2092 /* 2093 * NAME: raid_iosetup 2094 * DESCRIPTION: RAID metadevice specific I/O set up routine which does 2095 * all the necessary calculations to determine the location 2096 * of the segement for the I/O. 2097 * PARAMETERS: mr_unit_t *un - unit number of RAID metadevice 2098 * diskaddr_t blkno - block number of the I/O attempt 2099 * size_t blkcnt - block count for this I/O 2100 * md_raidcs_t *cs - child structure for each segmented I/O 2101 * 2102 * NOTE: The following is an example of a raid disk layer out: 2103 * 2104 * Total Column = 5 2105 * Original Column = 4 2106 * Segment Per Column = 10 2107 * 2108 * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6 2109 * ------------------------------------------------------------- 2110 * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40 2111 * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31 2112 * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32 2113 * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33 2114 * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34 2115 * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35 2116 * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36 2117 * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37 2118 * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38 2119 * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39 2120 */ 2121 static size_t 2122 raid_iosetup( 2123 mr_unit_t *un, 2124 diskaddr_t blkno, 2125 size_t blkcnt, 2126 md_raidcs_t *cs 2127 ) 2128 { 2129 diskaddr_t segment; 2130 diskaddr_t segstart; 2131 diskaddr_t segoff; 2132 size_t leftover; 2133 diskaddr_t line; 2134 uint_t iosize; 2135 uint_t colcnt; 2136 2137 /* caculate the segment# and offset for the block */ 2138 segment = blkno / un->un_segsize; 2139 segstart = segment * un->un_segsize; 2140 segoff = blkno - segstart; 2141 iosize = un->un_iosize - 1; 2142 colcnt = un->un_totalcolumncnt - 1; 2143 line = raid_line(segment, un); 2144 cs->cs_dcolumn = raid_dcolumn(segment, un); 2145 cs->cs_pcolumn = raid_pcolumn(segment, un); 2146 cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags; 2147 cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags; 2148 cs->cs_line = line; 2149 2150 if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) && 2151 (UNIT_STATE(un) & RCS_OKAY) && 2152 (segoff == 0) && 2153 (un->un_totalcolumncnt == un->un_origcolumncnt) && 2154 (un->un_segsize < un->un_iosize) && 2155 (un->un_iosize <= un->un_maxio) && 2156 (blkno == line * un->un_segsize * colcnt) && 2157 (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) && 2158 (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) && 2159 (raid_check_cols(un) == 0)) { 2160 2161 md_raidcbuf_t **cbufp; 2162 md_raidcbuf_t *cbuf; 2163 int i, j; 2164 2165 STAT_INC(raid_full_line_writes); 2166 leftover = blkcnt - (un->un_segsize * colcnt); 2167 ASSERT(blkcnt >= (un->un_segsize * colcnt)); 2168 cs->cs_blkno = line * un->un_segsize; 2169 
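		/*
		 * For example (illustrative only): with 4 original columns
		 * and a segment size of S blocks, line n starts at logical
		 * block n * 3 * S, so a write of at least 3 * S blocks that
		 * begins exactly there passes the checks above and is
		 * handled as a single full-line write.
		 */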
cs->cs_blkcnt = un->un_segsize; 2170 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2171 cs->cs_bcount = dbtob(cs->cs_blkcnt); 2172 cs->cs_flags |= MD_RCS_LINE; 2173 2174 cbufp = &cs->cs_buflist; 2175 for (i = 0; i < un->un_totalcolumncnt; i++) { 2176 j = cs->cs_dcolumn + i; 2177 j = j % un->un_totalcolumncnt; 2178 2179 if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn)) 2180 continue; 2181 cbuf = kmem_cache_alloc(raid_cbuf_cache, 2182 MD_ALLOCFLAGS); 2183 raid_cbuf_init(cbuf); 2184 cbuf->cbuf_un = cs->cs_un; 2185 cbuf->cbuf_ps = cs->cs_ps; 2186 cbuf->cbuf_column = j; 2187 cbuf->cbuf_bcount = dbtob(un->un_segsize); 2188 *cbufp = cbuf; 2189 cbufp = &cbuf->cbuf_next; 2190 } 2191 return (leftover); 2192 } 2193 2194 leftover = blkcnt - (un->un_segsize - segoff); 2195 if (blkcnt > (un->un_segsize - segoff)) 2196 blkcnt -= leftover; 2197 else 2198 leftover = 0; 2199 2200 if (blkcnt > (size_t)iosize) { 2201 leftover += (blkcnt - iosize); 2202 blkcnt = iosize; 2203 } 2204 2205 /* calculate the line# and column# for the segment */ 2206 cs->cs_flags &= ~MD_RCS_LINE; 2207 cs->cs_blkno = line * un->un_segsize + segoff; 2208 cs->cs_blkcnt = (uint_t)blkcnt; 2209 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2210 cs->cs_bcount = dbtob((uint_t)blkcnt); 2211 return (leftover); 2212 } 2213 2214 /* 2215 * NAME: raid_done 2216 * DESCRIPTION: RAID metadevice I/O done interrupt routine 2217 * PARAMETERS: struct buf *bp - pointer to a buffer structure 2218 */ 2219 static void 2220 raid_done(struct buf *bp) 2221 { 2222 md_raidcs_t *cs; 2223 int flags, frags; 2224 2225 sema_v(&bp->b_io); 2226 cs = (md_raidcs_t *)bp->b_chain; 2227 2228 ASSERT(cs != NULL); 2229 2230 mutex_enter(&cs->cs_mx); 2231 if (bp->b_flags & B_ERROR) { 2232 cs->cs_flags |= MD_RCS_ERROR; 2233 cs->cs_flags &= ~(MD_RCS_ISCALL); 2234 } 2235 2236 flags = cs->cs_flags; 2237 frags = --cs->cs_frags; 2238 mutex_exit(&cs->cs_mx); 2239 if (frags != 0) { 2240 return; 2241 } 2242 2243 if (flags & MD_RCS_ERROR) { 2244 if (cs->cs_error_call) { 2245 daemon_request(&md_done_daemon, cs->cs_error_call, 2246 (daemon_queue_t *)cs, REQ_OLD); 2247 } 2248 return; 2249 } 2250 2251 if (flags & MD_RCS_ISCALL) { 2252 cs->cs_flags &= ~(MD_RCS_ISCALL); 2253 (*(cs->cs_call))(cs); 2254 return; 2255 } 2256 daemon_request(&md_done_daemon, cs->cs_call, 2257 (daemon_queue_t *)cs, REQ_OLD); 2258 } 2259 /* 2260 * the flag RIO_EXTRA is used when dealing with a column in the process 2261 * of being resynced. During the resync, writes may have to take place 2262 * on both the original component and a hotspare component. 
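 * The low byte of the flags word (RIO_COLMASK) may also carry an
 * explicit column number, stored as column + 1 so that a value of zero
 * means "use the data or parity column already selected by RIO_DATA or
 * RIO_PARITY"; raidio() subtracts one before using it.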
2263 */ 2264 #define RIO_DATA 0x00100 /* use data buffer & data column */ 2265 #define RIO_PARITY 0x00200 /* use parity buffer & parity column */ 2266 #define RIO_WRITE 0x00400 /* issue a write */ 2267 #define RIO_READ 0x00800 /* issue a read */ 2268 #define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ 2269 #define RIO_ALT 0x02000 /* do write to alternate device */ 2270 #define RIO_EXTRA 0x04000 /* use extra buffer */ 2271 2272 #define RIO_COLMASK 0x000ff 2273 2274 #define RIO_PREWRITE RIO_WRITE | RIO_PWIO 2275 2276 /* 2277 * NAME: raidio 2278 * DESCRIPTION: RAID metadevice write routine 2279 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2280 */ 2281 static void 2282 raidio(md_raidcs_t *cs, int flags) 2283 { 2284 buf_t *bp; 2285 int column; 2286 int flag; 2287 void *private; 2288 mr_unit_t *un; 2289 int iosize; 2290 diskaddr_t pwstart; 2291 diskaddr_t devstart; 2292 md_dev64_t dev; 2293 2294 un = cs->cs_un; 2295 2296 ASSERT(IO_READER_HELD(un)); 2297 ASSERT(UNIT_READER_HELD(un)); 2298 2299 if (flags & RIO_DATA) { 2300 if (flags & RIO_EXTRA) 2301 bp = &cs->cs_hbuf; 2302 else 2303 bp = &cs->cs_dbuf; 2304 bp->b_un.b_addr = cs->cs_dbuffer; 2305 column = cs->cs_dcolumn; 2306 } else { 2307 if (flags & RIO_EXTRA) 2308 bp = &cs->cs_hbuf; 2309 else 2310 bp = &cs->cs_pbuf; 2311 bp->b_un.b_addr = cs->cs_pbuffer; 2312 column = cs->cs_pcolumn; 2313 } 2314 if (flags & RIO_COLMASK) 2315 column = (flags & RIO_COLMASK) - 1; 2316 2317 bp->b_bcount = cs->cs_bcount; 2318 bp->b_bufsize = cs->cs_bcount; 2319 iosize = un->un_iosize; 2320 2321 /* check if the hotspared device will be used */ 2322 if (flags & RIO_ALT && (flags & RIO_WRITE)) { 2323 pwstart = un->un_column[column].un_alt_pwstart; 2324 devstart = un->un_column[column].un_alt_devstart; 2325 dev = un->un_column[column].un_alt_dev; 2326 } else { 2327 pwstart = un->un_column[column].un_pwstart; 2328 devstart = un->un_column[column].un_devstart; 2329 dev = un->un_column[column].un_dev; 2330 } 2331 2332 /* if not writing to log skip log header */ 2333 if ((flags & RIO_PWIO) == 0) { 2334 bp->b_lblkno = devstart + cs->cs_blkno; 2335 bp->b_un.b_addr += DEV_BSIZE; 2336 } else { 2337 bp->b_bcount += DEV_BSIZE; 2338 bp->b_bufsize = bp->b_bcount; 2339 if (flags & RIO_DATA) { 2340 bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; 2341 } else { /* not DATA -> PARITY */ 2342 bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; 2343 } 2344 } 2345 2346 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); 2347 bp->b_flags |= B_BUSY; 2348 if (flags & RIO_READ) { 2349 bp->b_flags |= B_READ; 2350 } else { 2351 bp->b_flags |= B_WRITE; 2352 if ((nv_available && nv_parity && (flags & RIO_PARITY)) || 2353 (nv_available && nv_prewrite && (flags & RIO_PWIO))) 2354 bp->b_flags |= nv_available; 2355 } 2356 bp->b_iodone = (int (*)())raid_done; 2357 bp->b_edev = md_dev64_to_dev(dev); 2358 2359 ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); 2360 2361 private = cs->cs_strategy_private; 2362 flag = cs->cs_strategy_flag; 2363 2364 md_call_strategy(bp, flag, private); 2365 } 2366 2367 /* 2368 * NAME: genstandardparity 2369 * DESCRIPTION: This routine 2370 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2371 */ 2372 static void 2373 genstandardparity(md_raidcs_t *cs) 2374 { 2375 uint_t *dbuf, *pbuf; 2376 size_t wordcnt; 2377 uint_t dsum = 0; 2378 uint_t psum = 0; 2379 2380 ASSERT((cs->cs_bcount & 0x3) == 0); 2381 2382 wordcnt = cs->cs_bcount / sizeof (uint_t); 2383 2384 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2385 pbuf = 
(uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2386 2387 /* Word aligned */ 2388 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2389 uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); 2390 uint_t uval; 2391 2392 while (wordcnt--) { 2393 uval = *uwbuf++; 2394 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); 2395 ++pbuf; 2396 *dbuf = uval; 2397 dsum ^= uval; 2398 ++dbuf; 2399 } 2400 } else { 2401 uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); 2402 union { 2403 uint_t wb; 2404 uchar_t bb[4]; 2405 } cb; 2406 2407 while (wordcnt--) { 2408 cb.bb[0] = *ubbuf++; 2409 cb.bb[1] = *ubbuf++; 2410 cb.bb[2] = *ubbuf++; 2411 cb.bb[3] = *ubbuf++; 2412 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); 2413 ++pbuf; 2414 *dbuf = cb.wb; 2415 dsum ^= cb.wb; 2416 ++dbuf; 2417 } 2418 } 2419 2420 RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, 2421 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2422 2, cs->cs_dcolumn, RAID_PWMAGIC); 2423 2424 RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, 2425 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2426 2, cs->cs_pcolumn, RAID_PWMAGIC); 2427 } 2428 2429 static void 2430 genlineparity(md_raidcs_t *cs) 2431 { 2432 2433 mr_unit_t *un = cs->cs_un; 2434 md_raidcbuf_t *cbuf; 2435 uint_t *pbuf, *dbuf; 2436 uint_t *uwbuf; 2437 uchar_t *ubbuf; 2438 size_t wordcnt; 2439 uint_t psum = 0, dsum = 0; 2440 size_t count = un->un_segsize * DEV_BSIZE; 2441 uint_t col; 2442 buf_t *bp; 2443 2444 ASSERT((cs->cs_bcount & 0x3) == 0); 2445 2446 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2447 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2448 uwbuf = (uint_t *)(void *)(cs->cs_addr); 2449 ubbuf = (uchar_t *)(void *)(cs->cs_addr); 2450 2451 wordcnt = count / sizeof (uint_t); 2452 2453 /* Word aligned */ 2454 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2455 uint_t uval; 2456 2457 while (wordcnt--) { 2458 uval = *uwbuf++; 2459 *dbuf = uval; 2460 *pbuf = uval; 2461 dsum ^= uval; 2462 ++pbuf; 2463 ++dbuf; 2464 } 2465 } else { 2466 union { 2467 uint_t wb; 2468 uchar_t bb[4]; 2469 } cb; 2470 2471 while (wordcnt--) { 2472 cb.bb[0] = *ubbuf++; 2473 cb.bb[1] = *ubbuf++; 2474 cb.bb[2] = *ubbuf++; 2475 cb.bb[3] = *ubbuf++; 2476 *dbuf = cb.wb; 2477 *pbuf = cb.wb; 2478 dsum ^= cb.wb; 2479 ++pbuf; 2480 ++dbuf; 2481 } 2482 } 2483 2484 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, 2485 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2486 un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); 2487 2488 raidio(cs, RIO_PREWRITE | RIO_DATA); 2489 2490 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 2491 2492 dsum = 0; 2493 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2494 dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); 2495 2496 wordcnt = count / sizeof (uint_t); 2497 2498 col = cbuf->cbuf_column; 2499 2500 /* Word aligned */ 2501 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2502 uint_t uval; 2503 2504 /* 2505 * Only calculate psum when working on the last 2506 * data buffer. 2507 */ 2508 if (cbuf->cbuf_next == NULL) { 2509 psum = 0; 2510 while (wordcnt--) { 2511 uval = *uwbuf++; 2512 *dbuf = uval; 2513 psum ^= (*pbuf ^= uval); 2514 dsum ^= uval; 2515 ++dbuf; 2516 ++pbuf; 2517 } 2518 } else { 2519 while (wordcnt--) { 2520 uval = *uwbuf++; 2521 *dbuf = uval; 2522 *pbuf ^= uval; 2523 dsum ^= uval; 2524 ++dbuf; 2525 ++pbuf; 2526 } 2527 } 2528 } else { 2529 union { 2530 uint_t wb; 2531 uchar_t bb[4]; 2532 } cb; 2533 2534 /* 2535 * Only calculate psum when working on the last 2536 * data buffer. 
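			 * psum then holds the checksum of the completed
			 * parity block, which is recorded in the parity
			 * buffer's pre-write header once every data column
			 * has been folded into pbuf.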
2537 */ 2538 if (cbuf->cbuf_next == NULL) { 2539 psum = 0; 2540 while (wordcnt--) { 2541 cb.bb[0] = *ubbuf++; 2542 cb.bb[1] = *ubbuf++; 2543 cb.bb[2] = *ubbuf++; 2544 cb.bb[3] = *ubbuf++; 2545 *dbuf = cb.wb; 2546 psum ^= (*pbuf ^= cb.wb); 2547 dsum ^= cb.wb; 2548 ++dbuf; 2549 ++pbuf; 2550 } 2551 } else { 2552 while (wordcnt--) { 2553 cb.bb[0] = *ubbuf++; 2554 cb.bb[1] = *ubbuf++; 2555 cb.bb[2] = *ubbuf++; 2556 cb.bb[3] = *ubbuf++; 2557 *dbuf = cb.wb; 2558 *pbuf ^= cb.wb; 2559 dsum ^= cb.wb; 2560 ++dbuf; 2561 ++pbuf; 2562 } 2563 } 2564 } 2565 RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, 2566 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2567 un->un_totalcolumncnt, col, RAID_PWMAGIC); 2568 2569 /* 2570 * fill in buffer for write to prewrite area 2571 */ 2572 bp = &cbuf->cbuf_bp; 2573 bp->b_un.b_addr = cbuf->cbuf_buffer; 2574 bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; 2575 bp->b_bufsize = bp->b_bcount; 2576 bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + 2577 un->un_column[col].un_pwstart; 2578 bp->b_flags = B_WRITE | B_BUSY; 2579 if (nv_available && nv_prewrite) 2580 bp->b_flags |= nv_available; 2581 bp->b_iodone = (int (*)())raid_done; 2582 bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); 2583 bp->b_chain = (struct buf *)cs; 2584 md_call_strategy(bp, 2585 cs->cs_strategy_flag, cs->cs_strategy_private); 2586 } 2587 2588 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, 2589 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2590 un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); 2591 2592 raidio(cs, RIO_PREWRITE | RIO_PARITY); 2593 } 2594 2595 /* 2596 * NAME: raid_readregenloop 2597 * DESCRIPTION: RAID metadevice write routine 2598 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2599 */ 2600 static void 2601 raid_readregenloop(md_raidcs_t *cs) 2602 { 2603 mr_unit_t *un; 2604 md_raidps_t *ps; 2605 uint_t *dbuf; 2606 uint_t *pbuf; 2607 size_t wordcnt; 2608 2609 un = cs->cs_un; 2610 2611 /* 2612 * XOR the parity with data bytes, must skip the 2613 * pre-write entry header in all data/parity buffers 2614 */ 2615 wordcnt = cs->cs_bcount / sizeof (uint_t); 2616 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2617 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2618 while (wordcnt--) 2619 *dbuf++ ^= *pbuf++; 2620 2621 /* bump up the loop count */ 2622 cs->cs_loop++; 2623 2624 /* skip the errored component */ 2625 if (cs->cs_loop == cs->cs_dcolumn) 2626 cs->cs_loop++; 2627 2628 if (cs->cs_loop != un->un_totalcolumncnt) { 2629 cs->cs_frags = 1; 2630 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2631 return; 2632 } 2633 /* reaching the end sof loop */ 2634 ps = cs->cs_ps; 2635 bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); 2636 raid_free_child(cs, 1); 2637 2638 /* decrement readfrags */ 2639 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2640 } 2641 2642 /* 2643 * NAME: raid_read_io 2644 * DESCRIPTION: RAID metadevice read I/O routine 2645 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2646 * md_raidcs_t *cs - pointer to a child structure 2647 */ 2648 static void 2649 raid_read_io(mr_unit_t *un, md_raidcs_t *cs) 2650 { 2651 int flag; 2652 void *private; 2653 buf_t *bp; 2654 buf_t *pb = cs->cs_ps->ps_bp; 2655 mr_column_t *column; 2656 2657 flag = cs->cs_strategy_flag; 2658 private = cs->cs_strategy_private; 2659 column = &un->un_column[cs->cs_dcolumn]; 2660 2661 /* 2662 * The component to be read is good, simply set up bp structure 2663 * and call low level md routine doing the read. 
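	 * The parent buffer is cloned directly onto the data column at the
	 * proper offset (no staging through the child's data buffer), with
	 * raid_done() as the completion routine.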
2664 */ 2665 2666 if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || 2667 (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && 2668 (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { 2669 dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ 2670 ddi_dev = md_dev64_to_dev(column->un_dev); 2671 2672 bp = &cs->cs_dbuf; 2673 bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, 2674 column->un_devstart + cs->cs_blkno, 2675 (int (*)())raid_done, bp, KM_NOSLEEP); 2676 2677 bp->b_chain = (buf_t *)cs; 2678 2679 cs->cs_frags = 1; 2680 cs->cs_error_call = raid_read_error; 2681 cs->cs_retry_call = raid_read_retry; 2682 cs->cs_flags |= MD_RCS_ISCALL; 2683 cs->cs_stage = RAID_READ_DONE; 2684 cs->cs_call = raid_stage; 2685 2686 ASSERT(bp->b_edev != 0); 2687 2688 md_call_strategy(bp, flag, private); 2689 return; 2690 } 2691 2692 /* 2693 * The component to be read is bad, have to go through 2694 * raid specific method to read data from other members. 2695 */ 2696 cs->cs_loop = 0; 2697 /* 2698 * NOTE: always get dbuffer before pbuffer 2699 * and get both buffers before pwslot 2700 * otherwise a deadlock could be introduced. 2701 */ 2702 raid_mapin_buf(cs); 2703 getdbuffer(cs); 2704 getpbuffer(cs); 2705 if (cs->cs_loop == cs->cs_dcolumn) 2706 cs->cs_loop++; 2707 2708 /* zero out data buffer for use as a data sink */ 2709 bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); 2710 cs->cs_stage = RAID_NONE; 2711 cs->cs_call = raid_readregenloop; 2712 cs->cs_error_call = raid_read_error; 2713 cs->cs_retry_call = raid_read_no_retry; 2714 cs->cs_frags = 1; 2715 2716 /* use parity buffer to read other columns */ 2717 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2718 } 2719 2720 /* 2721 * NAME: raid_read 2722 * DESCRIPTION: RAID metadevice write routine 2723 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2724 * md_raidcs_t *cs - pointer to a child structure 2725 */ 2726 static int 2727 raid_read(mr_unit_t *un, md_raidcs_t *cs) 2728 { 2729 int error = 0; 2730 md_raidps_t *ps; 2731 mdi_unit_t *ui; 2732 minor_t mnum; 2733 2734 ASSERT(IO_READER_HELD(un)); 2735 ps = cs->cs_ps; 2736 ui = ps->ps_ui; 2737 raid_line_reader_lock(cs, 0); 2738 un = (mr_unit_t *)md_unit_readerlock(ui); 2739 ASSERT(UNIT_STATE(un) != RUS_INIT); 2740 mnum = MD_SID(un); 2741 cs->cs_un = un; 2742 2743 /* make sure the read doesn't go beyond the end of the column */ 2744 if (cs->cs_blkno + cs->cs_blkcnt > 2745 un->un_segsize * un->un_segsincolumn) { 2746 error = ENXIO; 2747 } 2748 if (error) 2749 goto rerror; 2750 2751 if (un->un_state & RUS_REGEN) { 2752 raid_regen_parity(cs); 2753 un = MD_UNIT(mnum); 2754 cs->cs_un = un; 2755 } 2756 2757 raid_read_io(un, cs); 2758 return (0); 2759 2760 rerror: 2761 raid_error_parent(ps, error); 2762 raid_free_child(cs, 1); 2763 /* decrement readfrags */ 2764 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2765 return (0); 2766 } 2767 2768 /* 2769 * NAME: raid_write_err_retry 2770 * DESCRIPTION: RAID metadevice write retry routine 2771 * write was for parity or data only; 2772 * complete write with error, no recovery possible 2773 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2774 * md_raidcs_t *cs - pointer to a child structure 2775 */ 2776 /*ARGSUSED*/ 2777 static void 2778 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) 2779 { 2780 md_raidps_t *ps = cs->cs_ps; 2781 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2782 2783 /* decrement pwfrags if needed, and frags */ 2784 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2785 flags |= RFP_DECR_PWFRAGS; 2786 raid_error_parent(ps, EIO); 2787 
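	/* release the child, then the parent using the flags set up above */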
raid_free_child(cs, 1); 2788 raid_free_parent(ps, flags); 2789 } 2790 2791 /* 2792 * NAME: raid_write_err_retry 2793 * DESCRIPTION: RAID metadevice write retry routine 2794 * write is too far along to retry and parent 2795 * has already been signaled with iodone. 2796 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2797 * md_raidcs_t *cs - pointer to a child structure 2798 */ 2799 /*ARGSUSED*/ 2800 static void 2801 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) 2802 { 2803 md_raidps_t *ps = cs->cs_ps; 2804 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2805 2806 /* decrement pwfrags if needed, and frags */ 2807 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2808 flags |= RFP_DECR_PWFRAGS; 2809 raid_free_child(cs, 1); 2810 raid_free_parent(ps, flags); 2811 } 2812 2813 /* 2814 * NAME: raid_write_retry 2815 * DESCRIPTION: RAID metadevice write retry routine 2816 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2817 * md_raidcs_t *cs - pointer to a child structure 2818 */ 2819 static void 2820 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) 2821 { 2822 md_raidps_t *ps; 2823 2824 ps = cs->cs_ps; 2825 2826 /* re-initialize the buf_t structure for raid_write() */ 2827 cs->cs_dbuf.b_chain = (struct buf *)cs; 2828 cs->cs_dbuf.b_back = &cs->cs_dbuf; 2829 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 2830 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 2831 cs->cs_dbuf.b_error = 0; /* initialize error */ 2832 cs->cs_dbuf.b_offset = -1; 2833 /* Initialize semaphores */ 2834 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 2835 SEMA_DEFAULT, NULL); 2836 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 2837 SEMA_DEFAULT, NULL); 2838 2839 cs->cs_pbuf.b_chain = (struct buf *)cs; 2840 cs->cs_pbuf.b_back = &cs->cs_pbuf; 2841 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 2842 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 2843 cs->cs_pbuf.b_error = 0; /* initialize error */ 2844 cs->cs_pbuf.b_offset = -1; 2845 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 2846 SEMA_DEFAULT, NULL); 2847 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 2848 SEMA_DEFAULT, NULL); 2849 2850 cs->cs_hbuf.b_chain = (struct buf *)cs; 2851 cs->cs_hbuf.b_back = &cs->cs_hbuf; 2852 cs->cs_hbuf.b_forw = &cs->cs_hbuf; 2853 cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ 2854 cs->cs_hbuf.b_error = 0; /* initialize error */ 2855 cs->cs_hbuf.b_offset = -1; 2856 sema_init(&cs->cs_hbuf.b_io, 0, NULL, 2857 SEMA_DEFAULT, NULL); 2858 sema_init(&cs->cs_hbuf.b_sem, 0, NULL, 2859 SEMA_DEFAULT, NULL); 2860 2861 cs->cs_flags &= ~(MD_RCS_ERROR); 2862 /* 2863 * If we have already done'ed the i/o but have done prewrite 2864 * on this child, then reset PWDONE flag and bump pwfrags before 2865 * restarting i/o. 2866 * If pwfrags is zero, we have already 'iodone'd the i/o so 2867 * leave things alone. We don't want to re-'done' it. 2868 */ 2869 mutex_enter(&ps->ps_mx); 2870 if (cs->cs_flags & MD_RCS_PWDONE) { 2871 cs->cs_flags &= ~MD_RCS_PWDONE; 2872 ps->ps_pwfrags++; 2873 } 2874 mutex_exit(&ps->ps_mx); 2875 raid_write_io(un, cs); 2876 } 2877 2878 /* 2879 * NAME: raid_wrerr 2880 * DESCRIPTION: RAID metadevice write routine 2881 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2882 * LOCKS: must obtain unit writer lock while calling raid_error_state 2883 * since a unit or column state transition may take place. 2884 * must obtain unit reader lock to retry I/O. 
2885 */ 2886 static void 2887 raid_wrerr(md_raidcs_t *cs) 2888 { 2889 md_raidps_t *ps; 2890 mdi_unit_t *ui; 2891 mr_unit_t *un; 2892 md_raidcbuf_t *cbuf; 2893 2894 ps = cs->cs_ps; 2895 ui = ps->ps_ui; 2896 2897 un = (mr_unit_t *)md_unit_writerlock(ui); 2898 ASSERT(un != 0); 2899 2900 if (cs->cs_dbuf.b_flags & B_ERROR) 2901 (void) raid_error_state(un, &cs->cs_dbuf); 2902 if (cs->cs_pbuf.b_flags & B_ERROR) 2903 (void) raid_error_state(un, &cs->cs_pbuf); 2904 if (cs->cs_hbuf.b_flags & B_ERROR) 2905 (void) raid_error_state(un, &cs->cs_hbuf); 2906 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2907 if (cbuf->cbuf_bp.b_flags & B_ERROR) 2908 (void) raid_error_state(un, &cbuf->cbuf_bp); 2909 2910 md_unit_writerexit(ui); 2911 2912 ps->ps_flags |= MD_RPS_HSREQ; 2913 2914 un = (mr_unit_t *)md_unit_readerlock(ui); 2915 2916 /* now attempt the appropriate retry routine */ 2917 (*(cs->cs_retry_call))(un, cs); 2918 } 2919 /* 2920 * NAMES: raid_write_error 2921 * DESCRIPTION: I/O error handling routine for a RAID metadevice write 2922 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 2923 */ 2924 /*ARGSUSED*/ 2925 static void 2926 raid_write_error(md_raidcs_t *cs) 2927 { 2928 md_raidps_t *ps; 2929 mdi_unit_t *ui; 2930 mr_unit_t *un; 2931 md_raidcbuf_t *cbuf; 2932 set_t setno; 2933 2934 ps = cs->cs_ps; 2935 ui = ps->ps_ui; 2936 un = cs->cs_un; 2937 2938 setno = MD_UN2SET(un); 2939 2940 /* 2941 * locate each buf that is in error on this io and then 2942 * output an error message 2943 */ 2944 if ((cs->cs_dbuf.b_flags & B_ERROR) && 2945 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 2946 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 2947 cmn_err(CE_WARN, "md %s: write error on %s", 2948 md_shortname(MD_SID(un)), 2949 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 2950 2951 if ((cs->cs_pbuf.b_flags & B_ERROR) && 2952 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 2953 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 2954 cmn_err(CE_WARN, "md %s: write error on %s", 2955 md_shortname(MD_SID(un)), 2956 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 2957 2958 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2959 if ((cbuf->cbuf_bp.b_flags & B_ERROR) && 2960 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && 2961 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) 2962 cmn_err(CE_WARN, "md %s: write error on %s", 2963 md_shortname(MD_SID(un)), 2964 md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), 2965 NULL, 0)); 2966 2967 md_unit_readerexit(ui); 2968 2969 ASSERT(cs->cs_frags == 0); 2970 2971 /* now schedule processing for possible state change */ 2972 daemon_request(&md_mstr_daemon, raid_wrerr, 2973 (daemon_queue_t *)cs, REQ_OLD); 2974 2975 } 2976 2977 /* 2978 * NAME: raid_write_ponly 2979 * DESCRIPTION: RAID metadevice write routine 2980 * in the case where only the parity column can be written 2981 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2982 */ 2983 static void 2984 raid_write_ponly(md_raidcs_t *cs) 2985 { 2986 md_raidps_t *ps; 2987 mr_unit_t *un = cs->cs_un; 2988 2989 ps = cs->cs_ps; 2990 /* decrement pwfrags if needed, but not frags */ 2991 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 2992 raid_free_parent(ps, RFP_DECR_PWFRAGS); 2993 cs->cs_flags |= MD_RCS_PWDONE; 2994 cs->cs_frags = 1; 2995 cs->cs_stage = RAID_WRITE_PONLY_DONE; 2996 cs->cs_call = raid_stage; 2997 cs->cs_error_call = raid_write_error; 2998 cs->cs_retry_call = raid_write_no_retry; 2999 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3000 
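		/*
		 * the parity column also has an active alternate (hotspare)
		 * device, e.g. during a resync, so issue an extra write to
		 * it and account for the extra fragment
		 */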
cs->cs_frags++; 3001 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); 3002 } 3003 raidio(cs, RIO_PARITY | RIO_WRITE); 3004 } 3005 3006 /* 3007 * NAME: raid_write_ploop 3008 * DESCRIPTION: RAID metadevice write routine, constructs parity from 3009 * data in other columns. 3010 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3011 */ 3012 static void 3013 raid_write_ploop(md_raidcs_t *cs) 3014 { 3015 mr_unit_t *un = cs->cs_un; 3016 uint_t *dbuf; 3017 uint_t *pbuf; 3018 size_t wordcnt; 3019 uint_t psum = 0; 3020 3021 wordcnt = cs->cs_bcount / sizeof (uint_t); 3022 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3023 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3024 while (wordcnt--) 3025 *pbuf++ ^= *dbuf++; 3026 cs->cs_loop++; 3027 3028 /* 3029 * build parity from scratch using new data, 3030 * skip reading the data and parity columns. 3031 */ 3032 while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) 3033 cs->cs_loop++; 3034 3035 if (cs->cs_loop != un->un_totalcolumncnt) { 3036 cs->cs_frags = 1; 3037 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3038 return; 3039 } 3040 3041 /* construct checksum for parity buffer */ 3042 wordcnt = cs->cs_bcount / sizeof (uint_t); 3043 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3044 while (wordcnt--) { 3045 psum ^= *pbuf; 3046 pbuf++; 3047 } 3048 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, 3049 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3050 1, cs->cs_pcolumn, RAID_PWMAGIC); 3051 3052 cs->cs_stage = RAID_NONE; 3053 cs->cs_call = raid_write_ponly; 3054 cs->cs_error_call = raid_write_error; 3055 cs->cs_retry_call = raid_write_err_retry; 3056 cs->cs_frags = 1; 3057 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3058 cs->cs_frags++; 3059 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3060 } 3061 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3062 } 3063 3064 /* 3065 * NAME: raid_write_donly 3066 * DESCRIPTION: RAID metadevice write routine 3067 * Completed writing data to prewrite entry 3068 * in the case where only the data column can be written 3069 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3070 */ 3071 static void 3072 raid_write_donly(md_raidcs_t *cs) 3073 { 3074 md_raidps_t *ps; 3075 mr_unit_t *un = cs->cs_un; 3076 3077 ps = cs->cs_ps; 3078 /* WARNING: don't release unit reader lock here... 
*/ 3079 /* decrement pwfrags if needed, but not frags */ 3080 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3081 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3082 cs->cs_flags |= MD_RCS_PWDONE; 3083 cs->cs_frags = 1; 3084 cs->cs_stage = RAID_WRITE_DONLY_DONE; 3085 cs->cs_call = raid_stage; 3086 cs->cs_error_call = raid_write_error; 3087 cs->cs_retry_call = raid_write_err_retry; 3088 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3089 cs->cs_frags++; 3090 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3091 } 3092 raidio(cs, RIO_DATA | RIO_WRITE); 3093 } 3094 3095 /* 3096 * NAME: raid_write_got_old 3097 * DESCRIPTION: RAID metadevice write routine 3098 * completed read of old data and old parity 3099 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3100 */ 3101 static void 3102 raid_write_got_old(md_raidcs_t *cs) 3103 { 3104 mr_unit_t *un = cs->cs_un; 3105 3106 ASSERT(IO_READER_HELD(cs->cs_un)); 3107 ASSERT(UNIT_READER_HELD(cs->cs_un)); 3108 3109 raid_mapin_buf(cs); 3110 genstandardparity(cs); 3111 cs->cs_frags = 2; 3112 cs->cs_call = raid_stage; 3113 cs->cs_stage = RAID_PREWRITE_DONE; 3114 cs->cs_error_call = raid_write_error; 3115 cs->cs_retry_call = raid_write_retry; 3116 3117 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3118 cs->cs_frags++; 3119 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); 3120 } 3121 3122 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3123 cs->cs_frags++; 3124 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3125 } 3126 ASSERT(cs->cs_frags < 4); 3127 raidio(cs, RIO_DATA | RIO_PREWRITE); 3128 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3129 } 3130 3131 /* 3132 * NAME: raid_write_io 3133 * DESCRIPTION: RAID metadevice write I/O routine 3134 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3135 * md_raidcs_t *cs - pointer to a child structure 3136 */ 3137 3138 /*ARGSUSED*/ 3139 static void 3140 raid_write_io(mr_unit_t *un, md_raidcs_t *cs) 3141 { 3142 md_raidps_t *ps = cs->cs_ps; 3143 uint_t *dbuf; 3144 uint_t *ubuf; 3145 size_t wordcnt; 3146 uint_t dsum = 0; 3147 int pcheck; 3148 int dcheck; 3149 3150 ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & 3151 RCS_INIT) == 0); 3152 ASSERT((un->un_column[cs->cs_dcolumn].un_devstate & 3153 RCS_INIT) == 0); 3154 ASSERT(IO_READER_HELD(un)); 3155 ASSERT(UNIT_READER_HELD(un)); 3156 ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); 3157 if (cs->cs_flags & MD_RCS_LINE) { 3158 3159 mr_unit_t *un = cs->cs_un; 3160 3161 ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); 3162 raid_mapin_buf(cs); 3163 cs->cs_frags = un->un_origcolumncnt; 3164 cs->cs_call = raid_stage; 3165 cs->cs_error_call = raid_write_error; 3166 cs->cs_retry_call = raid_write_no_retry; 3167 cs->cs_stage = RAID_LINE_PWDONE; 3168 genlineparity(cs); 3169 return; 3170 } 3171 3172 pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); 3173 dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); 3174 cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck; 3175 3176 if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { 3177 int err = EIO; 3178 3179 if ((un->un_column[cs->cs_pcolumn].un_devstate == 3180 RCS_LAST_ERRED) || 3181 (un->un_column[cs->cs_dcolumn].un_devstate == 3182 RCS_LAST_ERRED)) 3183 err = ENXIO; 3184 raid_error_parent(ps, err); 3185 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3186 raid_free_child(cs, 1); 3187 raid_free_parent(ps, RFP_DECR_FRAGS 3188 | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); 3189 return; 3190 } 3191 3192 if (pcheck & RCL_ERRED) { 3193 /* 3194 * handle case of only having data drive 3195 */ 3196 raid_mapin_buf(cs); 
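		/*
		 * copy the new user data into the data buffer word by word,
		 * accumulating its checksum; only the data column is
		 * prewritten and raid_write_donly() completes the write
		 */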
3197 wordcnt = cs->cs_bcount / sizeof (uint_t); 3198 3199 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3200 ubuf = (uint_t *)(void *)(cs->cs_addr); 3201 3202 while (wordcnt--) { 3203 *dbuf = *ubuf; 3204 dsum ^= *ubuf; 3205 dbuf++; 3206 ubuf++; 3207 } 3208 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, 3209 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3210 1, cs->cs_dcolumn, RAID_PWMAGIC); 3211 cs->cs_frags = 1; 3212 cs->cs_stage = RAID_NONE; 3213 cs->cs_call = raid_write_donly; 3214 cs->cs_error_call = raid_write_error; 3215 cs->cs_retry_call = raid_write_err_retry; 3216 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3217 cs->cs_frags++; 3218 raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | 3219 RIO_PREWRITE); 3220 } 3221 raidio(cs, RIO_DATA | RIO_PREWRITE); 3222 return; 3223 } 3224 3225 if (dcheck & RCL_ERRED) { 3226 /* 3227 * handle case of only having parity drive 3228 * build parity from scratch using new data, 3229 * skip reading the data and parity columns. 3230 */ 3231 raid_mapin_buf(cs); 3232 cs->cs_loop = 0; 3233 while (cs->cs_loop == cs->cs_dcolumn || 3234 cs->cs_loop == cs->cs_pcolumn) 3235 cs->cs_loop++; 3236 3237 /* copy new data in to begin building parity */ 3238 bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); 3239 cs->cs_stage = RAID_NONE; 3240 cs->cs_call = raid_write_ploop; 3241 cs->cs_error_call = raid_write_error; 3242 cs->cs_retry_call = raid_write_err_retry; 3243 cs->cs_frags = 1; 3244 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3245 return; 3246 } 3247 /* 3248 * handle normal cases 3249 * read old data and old parity 3250 */ 3251 cs->cs_frags = 2; 3252 cs->cs_stage = RAID_NONE; 3253 cs->cs_call = raid_write_got_old; 3254 cs->cs_error_call = raid_write_error; 3255 cs->cs_retry_call = raid_write_retry; 3256 ASSERT(ps->ps_magic == RAID_PSMAGIC); 3257 raidio(cs, RIO_DATA | RIO_READ); 3258 raidio(cs, RIO_PARITY | RIO_READ); 3259 } 3260 3261 static void 3262 raid_enqueue(md_raidcs_t *cs) 3263 { 3264 mdi_unit_t *ui = cs->cs_ps->ps_ui; 3265 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 3266 md_raidcs_t *cs1; 3267 3268 mutex_enter(io_list_mutex); 3269 ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); 3270 if (ui->ui_io_lock->io_list_front == NULL) { 3271 ui->ui_io_lock->io_list_front = cs; 3272 ui->ui_io_lock->io_list_back = cs; 3273 } else { 3274 cs1 = ui->ui_io_lock->io_list_back; 3275 cs1->cs_linlck_next = cs; 3276 ui->ui_io_lock->io_list_back = cs; 3277 } 3278 STAT_INC(raid_write_waits); 3279 STAT_MAX(raid_max_write_q_length, raid_write_queue_length); 3280 cs->cs_linlck_next = NULL; 3281 mutex_exit(io_list_mutex); 3282 } 3283 3284 /* 3285 * NAME: raid_write 3286 * DESCRIPTION: RAID metadevice write routine 3287 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3288 * md_raidcs_t *cs - pointer to a child structure 3289 */ 3290 3291 /*ARGSUSED*/ 3292 static int 3293 raid_write(mr_unit_t *un, md_raidcs_t *cs) 3294 { 3295 int error = 0; 3296 md_raidps_t *ps; 3297 mdi_unit_t *ui; 3298 minor_t mnum; 3299 clock_t timeout; 3300 3301 ASSERT(IO_READER_HELD(un)); 3302 ps = cs->cs_ps; 3303 ui = ps->ps_ui; 3304 3305 ASSERT(UNIT_STATE(un) != RUS_INIT); 3306 if (UNIT_STATE(un) == RUS_LAST_ERRED) 3307 error = EIO; 3308 3309 /* make sure the write doesn't go beyond the column */ 3310 if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) 3311 error = ENXIO; 3312 if (error) 3313 goto werror; 3314 3315 getresources(cs); 3316 3317 /* 3318 * this is an advisory loop that keeps the waiting lists short 3319 * to reduce cpu time. 
Since there is a race introduced by not 3320 * aquiring all the correct mutexes, use a cv_timedwait to be 3321 * sure the write always will wake up and start. 3322 */ 3323 while (raid_check_pw(cs)) { 3324 mutex_enter(&un->un_mx); 3325 (void) drv_getparm(LBOLT, &timeout); 3326 timeout += md_wr_wait; 3327 un->un_rflags |= MD_RFLAG_NEEDPW; 3328 STAT_INC(raid_prewrite_waits); 3329 (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout); 3330 un->un_rflags &= ~MD_RFLAG_NEEDPW; 3331 mutex_exit(&un->un_mx); 3332 } 3333 3334 if (raid_line_writer_lock(cs, 1)) 3335 return (0); 3336 3337 un = (mr_unit_t *)md_unit_readerlock(ui); 3338 cs->cs_un = un; 3339 mnum = MD_SID(un); 3340 3341 if (un->un_state & RUS_REGEN) { 3342 raid_regen_parity(cs); 3343 un = MD_UNIT(mnum); 3344 cs->cs_un = un; 3345 } 3346 3347 raid_write_io(un, cs); 3348 return (0); 3349 werror: 3350 /* aquire unit reader lock sinc raid_free_child always drops it */ 3351 raid_error_parent(ps, error); 3352 raid_free_child(cs, 0); 3353 /* decrement both pwfrags and frags */ 3354 raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); 3355 return (0); 3356 } 3357 3358 3359 /* 3360 * NAMES: raid_stage 3361 * DESCRIPTION: post-processing routine for a RAID metadevice 3362 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 3363 */ 3364 static void 3365 raid_stage(md_raidcs_t *cs) 3366 { 3367 md_raidps_t *ps = cs->cs_ps; 3368 mr_unit_t *un = cs->cs_un; 3369 md_raidcbuf_t *cbuf; 3370 buf_t *bp; 3371 void *private; 3372 int flag; 3373 3374 switch (cs->cs_stage) { 3375 case RAID_READ_DONE: 3376 raid_free_child(cs, 1); 3377 /* decrement readfrags */ 3378 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 3379 return; 3380 3381 case RAID_WRITE_DONE: 3382 case RAID_WRITE_PONLY_DONE: 3383 case RAID_WRITE_DONLY_DONE: 3384 /* 3385 * Completed writing real parity and/or data. 3386 */ 3387 ASSERT(cs->cs_flags & MD_RCS_PWDONE); 3388 raid_free_child(cs, 1); 3389 /* decrement frags but not pwfrags */ 3390 raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); 3391 return; 3392 3393 case RAID_PREWRITE_DONE: 3394 /* 3395 * completed writing data and parity to prewrite entries 3396 */ 3397 /* 3398 * WARNING: don't release unit reader lock here.. 
3399 * decrement pwfrags but not frags 3400 */ 3401 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3402 cs->cs_flags |= MD_RCS_PWDONE; 3403 cs->cs_frags = 2; 3404 cs->cs_stage = RAID_WRITE_DONE; 3405 cs->cs_call = raid_stage; 3406 cs->cs_error_call = raid_write_error; 3407 cs->cs_retry_call = raid_write_no_retry; 3408 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3409 cs->cs_frags++; 3410 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | 3411 RIO_WRITE); 3412 } 3413 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3414 cs->cs_frags++; 3415 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3416 } 3417 ASSERT(cs->cs_frags < 4); 3418 raidio(cs, RIO_DATA | RIO_WRITE); 3419 raidio(cs, RIO_PARITY | RIO_WRITE); 3420 if (cs->cs_pw_inval_list) { 3421 raid_free_pwinvalidate(cs); 3422 } 3423 return; 3424 3425 case RAID_LINE_PWDONE: 3426 ASSERT(cs->cs_frags == 0); 3427 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3428 cs->cs_flags |= MD_RCS_PWDONE; 3429 cs->cs_frags = un->un_origcolumncnt; 3430 cs->cs_call = raid_stage; 3431 cs->cs_error_call = raid_write_error; 3432 cs->cs_retry_call = raid_write_no_retry; 3433 cs->cs_stage = RAID_WRITE_DONE; 3434 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 3435 /* 3436 * fill in buffer for write to prewrite area 3437 */ 3438 bp = &cbuf->cbuf_bp; 3439 bp->b_back = bp; 3440 bp->b_forw = bp; 3441 bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; 3442 bp->b_bcount = cbuf->cbuf_bcount; 3443 bp->b_bufsize = cbuf->cbuf_bcount; 3444 bp->b_lblkno = 3445 un->un_column[cbuf->cbuf_column].un_devstart + 3446 cs->cs_blkno; 3447 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); 3448 bp->b_flags &= ~nv_available; 3449 bp->b_flags |= B_WRITE | B_BUSY; 3450 bp->b_iodone = (int (*)())raid_done; 3451 bp->b_edev = md_dev64_to_dev( 3452 un->un_column[cbuf->cbuf_column].un_dev); 3453 bp->b_chain = (struct buf *)cs; 3454 private = cs->cs_strategy_private; 3455 flag = cs->cs_strategy_flag; 3456 md_call_strategy(bp, flag, private); 3457 } 3458 raidio(cs, RIO_DATA | RIO_WRITE); 3459 raidio(cs, RIO_PARITY | RIO_WRITE); 3460 if (cs->cs_pw_inval_list) { 3461 raid_free_pwinvalidate(cs); 3462 } 3463 return; 3464 3465 default: 3466 ASSERT(0); 3467 break; 3468 } 3469 } 3470 /* 3471 * NAME: md_raid_strategy 3472 * DESCRIPTION: RAID metadevice I/O oprations entry point. 3473 * PARAMETERS: buf_t *pb - pointer to a user I/O buffer 3474 * int flag - metadevice specific flag 3475 * void *private - carry over flag ?? 
3476 * 3477 */ 3478 3479 void 3480 md_raid_strategy(buf_t *pb, int flag, void *private) 3481 { 3482 md_raidps_t *ps; 3483 md_raidcs_t *cs; 3484 int doing_writes; 3485 int err; 3486 mr_unit_t *un; 3487 mdi_unit_t *ui; 3488 size_t count; 3489 diskaddr_t blkno; 3490 caddr_t addr; 3491 off_t offset; 3492 int colcnt; 3493 minor_t mnum; 3494 set_t setno; 3495 3496 ui = MDI_UNIT(getminor(pb->b_edev)); 3497 md_kstat_waitq_enter(ui); 3498 un = (mr_unit_t *)md_io_readerlock(ui); 3499 setno = MD_MIN2SET(getminor(pb->b_edev)); 3500 3501 if ((flag & MD_NOBLOCK) == 0) { 3502 if (md_inc_iocount(setno) != 0) { 3503 pb->b_flags |= B_ERROR; 3504 pb->b_error = ENXIO; 3505 pb->b_resid = pb->b_bcount; 3506 md_io_readerexit(ui); 3507 biodone(pb); 3508 return; 3509 } 3510 } else { 3511 md_inc_iocount_noblock(setno); 3512 } 3513 3514 mnum = MD_SID(un); 3515 colcnt = un->un_totalcolumncnt - 1; 3516 count = pb->b_bcount; 3517 3518 STAT_CHECK(raid_512, count == 512); 3519 STAT_CHECK(raid_1024, count == 1024); 3520 STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); 3521 STAT_CHECK(raid_8192, count == 8192); 3522 STAT_CHECK(raid_8192_bigger, count > 8192); 3523 3524 (void *) md_unit_readerlock(ui); 3525 if (!(flag & MD_STR_NOTTOP)) { 3526 err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ 3527 if (err != 0) { 3528 md_kstat_waitq_exit(ui); 3529 md_io_readerexit(ui); 3530 return; 3531 } 3532 } 3533 md_unit_readerexit(ui); 3534 3535 STAT_INC(raid_total_io); 3536 3537 /* allocate a parent structure for the user I/O */ 3538 ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); 3539 raid_parent_init(ps); 3540 3541 /* 3542 * Save essential information from the original buffhdr 3543 * in the md_save structure. 3544 */ 3545 ps->ps_un = un; 3546 ps->ps_ui = ui; 3547 ps->ps_bp = pb; 3548 ps->ps_addr = pb->b_un.b_addr; 3549 3550 if ((pb->b_flags & B_READ) == 0) { 3551 ps->ps_flags |= MD_RPS_WRITE; 3552 doing_writes = 1; 3553 STAT_INC(raid_writes); 3554 } else { 3555 ps->ps_flags |= MD_RPS_READ; 3556 doing_writes = 0; 3557 STAT_INC(raid_reads); 3558 } 3559 3560 count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ 3561 blkno = pb->b_lblkno; /* block number on device */ 3562 addr = 0; 3563 offset = 0; 3564 ps->ps_pwfrags = 1; 3565 ps->ps_frags = 1; 3566 md_kstat_waitq_to_runq(ui); 3567 3568 do { 3569 cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); 3570 raid_child_init(cs); 3571 cs->cs_ps = ps; 3572 cs->cs_un = un; 3573 cs->cs_mdunit = mnum; 3574 cs->cs_strategy_flag = flag; 3575 cs->cs_strategy_private = private; 3576 cs->cs_addr = addr; 3577 cs->cs_offset = offset; 3578 count = raid_iosetup(un, blkno, count, cs); 3579 if (cs->cs_flags & MD_RCS_LINE) { 3580 blkno += (cs->cs_blkcnt * colcnt); 3581 offset += (cs->cs_bcount * colcnt); 3582 } else { 3583 blkno += cs->cs_blkcnt; 3584 offset += cs->cs_bcount; 3585 } 3586 /* for each cs bump up the ps_pwfrags and ps_frags fields */ 3587 if (count) { 3588 mutex_enter(&ps->ps_mx); 3589 ps->ps_pwfrags++; 3590 ps->ps_frags++; 3591 mutex_exit(&ps->ps_mx); 3592 if (doing_writes) 3593 (void) raid_write(un, cs); 3594 else 3595 (void) raid_read(un, cs); 3596 } 3597 } while (count); 3598 if (doing_writes) { 3599 (void) raid_write(un, cs); 3600 } else 3601 (void) raid_read(un, cs); 3602 3603 if (! (flag & MD_STR_NOTTOP) && panicstr) { 3604 while (! 
(ps->ps_flags & MD_RPS_DONE)) { 3605 md_daemon(1, &md_done_daemon); 3606 drv_usecwait(10); 3607 } 3608 kmem_cache_free(raid_parent_cache, ps); 3609 } 3610 } 3611 3612 /* 3613 * NAMES: raid_snarf 3614 * DESCRIPTION: RAID metadevice SNARF entry point 3615 * PARAMETERS: md_snarfcmd_t cmd, 3616 * set_t setno 3617 * RETURNS: 3618 */ 3619 static int 3620 raid_snarf(md_snarfcmd_t cmd, set_t setno) 3621 { 3622 mr_unit_t *un; 3623 mddb_recid_t recid; 3624 int gotsomething; 3625 int all_raid_gotten; 3626 mddb_type_t typ1; 3627 uint_t ncol; 3628 mddb_de_ic_t *dep; 3629 mddb_rb32_t *rbp; 3630 size_t newreqsize; 3631 mr_unit_t *big_un; 3632 mr_unit32_od_t *small_un; 3633 3634 3635 if (cmd == MD_SNARF_CLEANUP) 3636 return (0); 3637 3638 all_raid_gotten = 1; 3639 gotsomething = 0; 3640 typ1 = (mddb_type_t)md_getshared_key(setno, 3641 raid_md_ops.md_driver.md_drivername); 3642 recid = mddb_makerecid(setno, 0); 3643 3644 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 3645 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { 3646 continue; 3647 } 3648 3649 dep = mddb_getrecdep(recid); 3650 dep->de_flags = MDDB_F_RAID; 3651 rbp = dep->de_rb; 3652 if ((rbp->rb_revision == MDDB_REV_RB) && 3653 ((rbp->rb_private & MD_PRV_CONVD) == 0)) { 3654 /* 3655 * This means, we have an old and small record 3656 * and this record hasn't already been converted. 3657 * Before we create an incore metadevice from this 3658 * we have to convert it to a big record. 3659 */ 3660 small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid); 3661 ncol = small_un->un_totalcolumncnt; 3662 newreqsize = sizeof (mr_unit_t) + 3663 ((ncol - 1) * sizeof (mr_column_t)); 3664 big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); 3665 raid_convert((caddr_t)small_un, (caddr_t)big_un, 3666 SMALL_2_BIG); 3667 kmem_free(small_un, dep->de_reqsize); 3668 dep->de_rb_userdata = big_un; 3669 dep->de_reqsize = newreqsize; 3670 un = big_un; 3671 rbp->rb_private |= MD_PRV_CONVD; 3672 } else { 3673 /* Big device */ 3674 un = (mr_unit_t *)mddb_getrecaddr(recid); 3675 } 3676 3677 /* Set revision and flag accordingly */ 3678 if (rbp->rb_revision == MDDB_REV_RB) { 3679 un->c.un_revision = MD_32BIT_META_DEV; 3680 } else { 3681 un->c.un_revision = MD_64BIT_META_DEV; 3682 un->c.un_flag |= MD_EFILABEL; 3683 } 3684 3685 /* 3686 * Create minor device node for snarfed entry. 
3687 */ 3688 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 3689 3690 if (MD_UNIT(MD_SID(un)) != NULL) { 3691 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3692 continue; 3693 } 3694 all_raid_gotten = 0; 3695 if (raid_build_incore((void *)un, 1) == 0) { 3696 mddb_setrecprivate(recid, MD_PRV_GOTIT); 3697 md_create_unit_incore(MD_SID(un), &raid_md_ops, 3698 1); 3699 gotsomething = 1; 3700 } else if (un->mr_ic) { 3701 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 3702 un->un_totalcolumncnt); 3703 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 3704 } 3705 } 3706 3707 if (!all_raid_gotten) { 3708 return (gotsomething); 3709 } 3710 3711 recid = mddb_makerecid(setno, 0); 3712 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) 3713 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 3714 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3715 3716 return (0); 3717 } 3718 3719 /* 3720 * NAMES: raid_halt 3721 * DESCRIPTION: RAID metadevice HALT entry point 3722 * PARAMETERS: md_haltcmd_t cmd - 3723 * set_t setno - 3724 * RETURNS: 3725 */ 3726 static int 3727 raid_halt(md_haltcmd_t cmd, set_t setno) 3728 { 3729 set_t i; 3730 mdi_unit_t *ui; 3731 minor_t mnum; 3732 3733 if (cmd == MD_HALT_CLOSE) 3734 return (0); 3735 3736 if (cmd == MD_HALT_OPEN) 3737 return (0); 3738 3739 if (cmd == MD_HALT_UNLOAD) 3740 return (0); 3741 3742 if (cmd == MD_HALT_CHECK) { 3743 for (i = 0; i < md_nunits; i++) { 3744 mnum = MD_MKMIN(setno, i); 3745 if ((ui = MDI_UNIT(mnum)) == NULL) 3746 continue; 3747 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3748 continue; 3749 if (md_unit_isopen(ui)) 3750 return (1); 3751 } 3752 return (0); 3753 } 3754 3755 if (cmd != MD_HALT_DOIT) 3756 return (1); 3757 3758 for (i = 0; i < md_nunits; i++) { 3759 mnum = MD_MKMIN(setno, i); 3760 if ((ui = MDI_UNIT(mnum)) == NULL) 3761 continue; 3762 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3763 continue; 3764 reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); 3765 } 3766 return (0); 3767 } 3768 3769 /* 3770 * NAMES: raid_close_all_devs 3771 * DESCRIPTION: Close all the devices of the unit. 3772 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3773 * RETURNS: 3774 */ 3775 void 3776 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) 3777 { 3778 int i; 3779 mr_column_t *device; 3780 3781 for (i = 0; i < un->un_totalcolumncnt; i++) { 3782 device = &un->un_column[i]; 3783 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3784 ASSERT((device->un_dev != (md_dev64_t)0) && 3785 (device->un_dev != NODEV64)); 3786 if ((device->un_devstate & RCS_OKAY) && init_pw) 3787 (void) init_pw_area(un, device->un_dev, 3788 device->un_pwstart, i); 3789 md_layered_close(device->un_dev, md_cflags); 3790 device->un_devflags &= ~MD_RAID_DEV_ISOPEN; 3791 } 3792 } 3793 } 3794 3795 /* 3796 * NAMES: raid_open_all_devs 3797 * DESCRIPTION: Open all the components (columns) of the device unit. 
3798 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3799 * RETURNS: 3800 */ 3801 static int 3802 raid_open_all_devs(mr_unit_t *un, int md_oflags) 3803 { 3804 minor_t mnum = MD_SID(un); 3805 int i; 3806 int not_opened = 0; 3807 int commit = 0; 3808 int col = -1; 3809 mr_column_t *device; 3810 set_t setno = MD_MIN2SET(MD_SID(un)); 3811 side_t side = mddb_getsidenum(setno); 3812 mdkey_t key; 3813 mdi_unit_t *ui = MDI_UNIT(mnum); 3814 3815 ui->ui_tstate &= ~MD_INACCESSIBLE; 3816 3817 for (i = 0; i < un->un_totalcolumncnt; i++) { 3818 md_dev64_t tmpdev; 3819 3820 device = &un->un_column[i]; 3821 3822 if (COLUMN_STATE(un, i) & RCS_ERRED) { 3823 not_opened++; 3824 continue; 3825 } 3826 3827 if (device->un_devflags & MD_RAID_DEV_ISOPEN) 3828 continue; 3829 3830 tmpdev = device->un_dev; 3831 /* 3832 * Open by device id 3833 */ 3834 key = HOTSPARED(un, i) ? 3835 device->un_hs_key : device->un_orig_key; 3836 if ((md_getmajor(tmpdev) != md_major) && 3837 md_devid_found(setno, side, key) == 1) { 3838 tmpdev = md_resolve_bydevid(mnum, tmpdev, key); 3839 } 3840 if (md_layered_open(mnum, &tmpdev, md_oflags)) { 3841 device->un_dev = tmpdev; 3842 not_opened++; 3843 continue; 3844 } 3845 device->un_dev = tmpdev; 3846 device->un_devflags |= MD_RAID_DEV_ISOPEN; 3847 } 3848 3849 /* if open errors and errored devices are 1 then device can run */ 3850 if (not_opened > 1) { 3851 cmn_err(CE_WARN, 3852 "md: %s failed to open. open error on %s\n", 3853 md_shortname(MD_SID(un)), 3854 md_devname(MD_UN2SET(un), device->un_orig_dev, 3855 NULL, 0)); 3856 3857 ui->ui_tstate |= MD_INACCESSIBLE; 3858 3859 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3860 MD_UN2SET(un), MD_SID(un)); 3861 3862 return (not_opened > 1); 3863 } 3864 3865 for (i = 0; i < un->un_totalcolumncnt; i++) { 3866 device = &un->un_column[i]; 3867 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3868 if (device->un_devstate & RCS_LAST_ERRED) { 3869 /* 3870 * At this point in time there is a possibility 3871 * that errors were the result of a controller 3872 * failure with more than a single column on it 3873 * so clear out last errored columns and let errors 3874 * re-occur is necessary. 3875 */ 3876 raid_set_state(un, i, RCS_OKAY, 0); 3877 commit++; 3878 } 3879 continue; 3880 } 3881 ASSERT(col == -1); 3882 col = i; 3883 } 3884 3885 if (col != -1) { 3886 raid_set_state(un, col, RCS_ERRED, 0); 3887 commit++; 3888 } 3889 3890 if (commit) 3891 raid_commit(un, NULL); 3892 3893 if (col != -1) { 3894 if (COLUMN_STATE(un, col) & RCS_ERRED) { 3895 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 3896 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3897 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 3898 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 3899 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3900 } 3901 } 3902 3903 return (0); 3904 } 3905 3906 /* 3907 * NAMES: raid_internal_open 3908 * DESCRIPTION: Do the actual RAID open 3909 * PARAMETERS: minor_t mnum - minor number of the RAID device 3910 * int flag - 3911 * int otyp - 3912 * int md_oflags - RAID open flags 3913 * RETURNS: 0 if successful, nonzero otherwise 3914 */ 3915 int 3916 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) 3917 { 3918 mr_unit_t *un; 3919 mdi_unit_t *ui; 3920 int err = 0; 3921 int replay_error = 0; 3922 3923 ui = MDI_UNIT(mnum); 3924 ASSERT(ui != NULL); 3925 3926 un = (mr_unit_t *)md_unit_openclose_enter(ui); 3927 /* 3928 * this MUST be checked before md_unit_isopen is checked. 3929 * raid_init_columns sets md_unit_isopen to block reset, halt. 
3930 */ 3931 if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && 3932 !(md_oflags & MD_OFLG_ISINIT)) { 3933 md_unit_openclose_exit(ui); 3934 return (EAGAIN); 3935 } 3936 3937 if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { 3938 err = md_unit_incopen(mnum, flag, otyp); 3939 goto out; 3940 } 3941 3942 md_unit_readerexit(ui); 3943 3944 un = (mr_unit_t *)md_unit_writerlock(ui); 3945 if (raid_open_all_devs(un, md_oflags) == 0) { 3946 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { 3947 md_unit_writerexit(ui); 3948 un = (mr_unit_t *)md_unit_readerlock(ui); 3949 raid_close_all_devs(un, 0, md_oflags); 3950 goto out; 3951 } 3952 } else { 3953 /* 3954 * if this unit contains more than two errored components 3955 * should return error and close all opened devices 3956 */ 3957 3958 md_unit_writerexit(ui); 3959 un = (mr_unit_t *)md_unit_readerlock(ui); 3960 raid_close_all_devs(un, 0, md_oflags); 3961 md_unit_openclose_exit(ui); 3962 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3963 MD_UN2SET(un), MD_SID(un)); 3964 return (ENXIO); 3965 } 3966 3967 if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { 3968 replay_error = raid_replay(un); 3969 MD_STATUS(un) |= MD_UN_REPLAYED; 3970 } 3971 3972 md_unit_writerexit(ui); 3973 un = (mr_unit_t *)md_unit_readerlock(ui); 3974 3975 if ((replay_error == RAID_RPLY_READONLY) && 3976 ((flag & (FREAD | FWRITE)) == FREAD)) { 3977 md_unit_openclose_exit(ui); 3978 return (0); 3979 } 3980 3981 /* allocate hotspare if possible */ 3982 (void) raid_hotspares(); 3983 3984 3985 out: 3986 md_unit_openclose_exit(ui); 3987 return (err); 3988 } 3989 /* 3990 * NAMES: raid_open 3991 * DESCRIPTION: RAID metadevice OPEN entry point 3992 * PARAMETERS: dev_t dev - 3993 * int flag - 3994 * int otyp - 3995 * cred_t * cred_p - 3996 * int md_oflags - 3997 * RETURNS: 3998 */ 3999 /*ARGSUSED1*/ 4000 static int 4001 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 4002 { 4003 int error = 0; 4004 4005 if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { 4006 return (error); 4007 } 4008 return (0); 4009 } 4010 4011 /* 4012 * NAMES: raid_internal_close 4013 * DESCRIPTION: RAID metadevice CLOSE actual implementation 4014 * PARAMETERS: minor_t - minor number of the RAID device 4015 * int otyp - 4016 * int init_pw - 4017 * int md_cflags - RAID close flags 4018 * RETURNS: 0 if successful, nonzero otherwise 4019 */ 4020 /*ARGSUSED*/ 4021 int 4022 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) 4023 { 4024 mdi_unit_t *ui = MDI_UNIT(mnum); 4025 mr_unit_t *un; 4026 int err = 0; 4027 4028 /* single thread */ 4029 un = (mr_unit_t *)md_unit_openclose_enter(ui); 4030 4031 /* count closed */ 4032 if ((err = md_unit_decopen(mnum, otyp)) != 0) 4033 goto out; 4034 /* close devices, if necessary */ 4035 if (! 
md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 4036 raid_close_all_devs(un, init_pw, md_cflags); 4037 } 4038 4039 /* unlock, return success */ 4040 out: 4041 md_unit_openclose_exit(ui); 4042 return (err); 4043 } 4044 4045 /* 4046 * NAMES: raid_close 4047 * DESCRIPTION: RAID metadevice close entry point 4048 * PARAMETERS: dev_t dev - 4049 * int flag - 4050 * int otyp - 4051 * cred_t * cred_p - 4052 * int md_oflags - 4053 * RETURNS: 4054 */ 4055 /*ARGSUSED1*/ 4056 static int 4057 raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 4058 { 4059 int retval; 4060 4061 (void) md_io_writerlock(MDI_UNIT(getminor(dev))); 4062 retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags); 4063 (void) md_io_writerexit(MDI_UNIT(getminor(dev))); 4064 return (retval); 4065 } 4066 4067 /* 4068 * raid_probe_close_all_devs 4069 */ 4070 void 4071 raid_probe_close_all_devs(mr_unit_t *un) 4072 { 4073 int i; 4074 mr_column_t *device; 4075 4076 for (i = 0; i < un->un_totalcolumncnt; i++) { 4077 device = &un->un_column[i]; 4078 4079 if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { 4080 md_layered_close(device->un_dev, 4081 MD_OFLG_PROBEDEV); 4082 device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN; 4083 } 4084 } 4085 } 4086 /* 4087 * Raid_probe_dev: 4088 * 4089 * On entry the unit writerlock is held 4090 */ 4091 static int 4092 raid_probe_dev(mdi_unit_t *ui, minor_t mnum) 4093 { 4094 mr_unit_t *un; 4095 int i; 4096 int not_opened = 0; 4097 int commit = 0; 4098 int col = -1; 4099 mr_column_t *device; 4100 int md_devopen = 0; 4101 4102 if (md_unit_isopen(ui)) 4103 md_devopen++; 4104 4105 un = MD_UNIT(mnum); 4106 /* 4107 * If the state has been set to LAST_ERRED because 4108 * of an error when the raid device was open at some 4109 * point in the past, don't probe. We really don't want 4110 * to reset the state in this case. 4111 */ 4112 if (UNIT_STATE(un) == RUS_LAST_ERRED) 4113 return (0); 4114 4115 ui->ui_tstate &= ~MD_INACCESSIBLE; 4116 4117 for (i = 0; i < un->un_totalcolumncnt; i++) { 4118 md_dev64_t tmpdev; 4119 4120 device = &un->un_column[i]; 4121 if (COLUMN_STATE(un, i) & RCS_ERRED) { 4122 not_opened++; 4123 continue; 4124 } 4125 4126 tmpdev = device->un_dev; 4127 /* 4128 * Currently the flags passed are not needed since 4129 * there cannot be an underlying metadevice. However 4130 * they are kept here for consistency. 4131 * 4132 * Open by device id 4133 */ 4134 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)? 4135 device->un_hs_key : device->un_orig_key); 4136 if (md_layered_open(mnum, &tmpdev, 4137 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) { 4138 device->un_dev = tmpdev; 4139 not_opened++; 4140 continue; 4141 } 4142 device->un_dev = tmpdev; 4143 4144 device->un_devflags |= MD_RAID_DEV_PROBEOPEN; 4145 } 4146 4147 /* 4148 * The code below is careful on setting the LAST_ERRED state. 4149 * 4150 * If open errors and exactly one device has failed we can run. 4151 * If more then one device fails we have to figure out when to set 4152 * LAST_ERRED state. The rationale is to avoid unnecessary resyncs 4153 * since they are painful and time consuming. 4154 * 4155 * When more than one component/column fails there are 2 scenerios. 4156 * 4157 * 1. Metadevice has NOT been opened: In this case, the behavior 4158 * mimics the open symantics. ie. Only the first failed device 4159 * is ERRED and LAST_ERRED is not set. 4160 * 4161 * 2. Metadevice has been opened: Here the read/write sematics are 4162 * followed. 
The first failed device is ERRED and on the next 4163 * failed device LAST_ERRED is set; an illustrative sketch of this decision is appended at the end of this listing. 4164 */ 4165 4166 if (not_opened > 1 && !md_devopen) { 4167 cmn_err(CE_WARN, 4168 "md: %s failed to open. open error on %s\n", 4169 md_shortname(MD_SID(un)), 4170 md_devname(MD_UN2SET(un), device->un_orig_dev, 4171 NULL, 0)); 4172 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 4173 MD_UN2SET(un), MD_SID(un)); 4174 raid_probe_close_all_devs(un); 4175 ui->ui_tstate |= MD_INACCESSIBLE; 4176 return (not_opened > 1); 4177 } 4178 4179 if (!md_devopen) { 4180 for (i = 0; i < un->un_totalcolumncnt; i++) { 4181 device = &un->un_column[i]; 4182 if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { 4183 if (device->un_devstate & RCS_LAST_ERRED) { 4184 /* 4185 * At this point in time there is a 4186 * possibility that errors were the 4187 * result of a controller failure with 4188 * more than a single column on it, so 4189 * clear out the last-erred columns and 4190 * let errors re-occur if necessary. 4191 */ 4192 raid_set_state(un, i, RCS_OKAY, 0); 4193 commit++; 4194 } 4195 continue; 4196 } 4197 ASSERT(col == -1); 4198 /* 4199 * Note: if multiple devices are failing, only 4200 * the last one is marked as errored. 4201 */ 4202 col = i; 4203 } 4204 4205 if (col != -1) { 4206 raid_set_state(un, col, RCS_ERRED, 0); 4207 commit++; 4208 } 4209 4210 } else { 4211 for (i = 0; i < un->un_totalcolumncnt; i++) { 4212 device = &un->un_column[i]; 4213 4214 /* if we have LAST_ERRED, go ahead and commit. */ 4215 if (un->un_state & RUS_LAST_ERRED) 4216 break; 4217 /* 4218 * Could not open the component. 4219 */ 4220 4221 if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) { 4222 col = i; 4223 raid_set_state(un, col, RCS_ERRED, 0); 4224 commit++; 4225 } 4226 } 4227 } 4228 4229 if (commit) 4230 raid_commit(un, NULL); 4231 4232 if (col != -1) { 4233 if (COLUMN_STATE(un, col) & RCS_ERRED) { 4234 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 4235 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 4236 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 4237 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 4238 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 4239 } 4240 } 4241 4242 raid_probe_close_all_devs(un); 4243 return (0); 4244 } 4245 4246 static int 4247 raid_imp_set( 4248 set_t setno 4249 ) 4250 { 4251 mddb_recid_t recid; 4252 int i, gotsomething; 4253 mddb_type_t typ1; 4254 mddb_de_ic_t *dep; 4255 mddb_rb32_t *rbp; 4256 mr_unit_t *un64; 4257 mr_unit32_od_t *un32; 4258 minor_t *self_id; /* minor needs to be updated */ 4259 md_parent_t *parent_id; /* parent needs to be updated */ 4260 mddb_recid_t *record_id; /* record id needs to be updated */ 4261 hsp_t *hsp_id; 4262 4263 gotsomething = 0; 4264 4265 typ1 = (mddb_type_t)md_getshared_key(setno, 4266 raid_md_ops.md_driver.md_drivername); 4267 recid = mddb_makerecid(setno, 0); 4268 4269 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 4270 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 4271 continue; 4272 4273 dep = mddb_getrecdep(recid); 4274 rbp = dep->de_rb; 4275 4276 if (rbp->rb_revision == MDDB_REV_RB) { 4277 /* 4278 * Small device 4279 */ 4280 un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid); 4281 self_id = &(un32->c.un_self_id); 4282 parent_id = &(un32->c.un_parent); 4283 record_id = &(un32->c.un_record_id); 4284 hsp_id = &(un32->un_hsp_id); 4285 4286 for (i = 0; i < un32->un_totalcolumncnt; i++) { 4287 mr_column32_od_t *device; 4288 4289 device = &un32->un_column[i]; 4290 if (!md_update_minor(setno, mddb_getsidenum 4291 (setno), device->un_orig_key)) 4292 goto out; 4293
4294 if (device->un_hs_id != 0) 4295 device->un_hs_id = MAKERECID( 4296 setno, device->un_hs_id); 4297 } 4298 } else { 4299 un64 = (mr_unit_t *)mddb_getrecaddr(recid); 4300 self_id = &(un64->c.un_self_id); 4301 parent_id = &(un64->c.un_parent); 4302 record_id = &(un64->c.un_record_id); 4303 hsp_id = &(un64->un_hsp_id); 4304 4305 for (i = 0; i < un64->un_totalcolumncnt; i++) { 4306 mr_column_t *device; 4307 4308 device = &un64->un_column[i]; 4309 if (!md_update_minor(setno, mddb_getsidenum 4310 (setno), device->un_orig_key)) 4311 goto out; 4312 4313 if (device->un_hs_id != 0) 4314 device->un_hs_id = MAKERECID( 4315 setno, device->un_hs_id); 4316 } 4317 } 4318 4319 /* 4320 * Update unit with the imported setno 4321 */ 4322 mddb_setrecprivate(recid, MD_PRV_GOTIT); 4323 4324 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 4325 4326 if (*hsp_id != -1) 4327 *hsp_id = MAKERECID(setno, DBID(*hsp_id)); 4328 4329 if (*parent_id != MD_NO_PARENT) 4330 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 4331 *record_id = MAKERECID(setno, DBID(*record_id)); 4332 gotsomething = 1; 4333 } 4334 4335 out: 4336 return (gotsomething); 4337 } 4338 4339 static md_named_services_t raid_named_services[] = { 4340 {raid_hotspares, "poke hotspares" }, 4341 {raid_rename_check, MDRNM_CHECK }, 4342 {raid_rename_lock, MDRNM_LOCK }, 4343 {(intptr_t (*)()) raid_rename_unlock, MDRNM_UNLOCK }, 4344 {(intptr_t (*)()) raid_probe_dev, "probe open test" }, 4345 {NULL, 0 } 4346 }; 4347 4348 md_ops_t raid_md_ops = { 4349 raid_open, /* open */ 4350 raid_close, /* close */ 4351 md_raid_strategy, /* strategy */ 4352 NULL, /* print */ 4353 NULL, /* dump */ 4354 NULL, /* read */ 4355 NULL, /* write */ 4356 md_raid_ioctl, /* ioctl */ 4357 raid_snarf, /* raid_snarf */ 4358 raid_halt, /* raid_halt */ 4359 NULL, /* aread */ 4360 NULL, /* awrite */ 4361 raid_imp_set, /* import set */ 4362 raid_named_services 4363 }; 4364 4365 static void 4366 init_init() 4367 { 4368 /* default to half a second */ 4369 if (md_wr_wait == 0) 4370 md_wr_wait = md_hz >> 1; 4371 4372 raid_parent_cache = kmem_cache_create("md_raid_parent", 4373 sizeof (md_raidps_t), 0, raid_parent_constructor, 4374 raid_parent_destructor, raid_run_queue, NULL, NULL, 0); 4375 raid_child_cache = kmem_cache_create("md_raid_child", 4376 sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0, 4377 raid_child_constructor, raid_child_destructor, 4378 raid_run_queue, NULL, NULL, 0); 4379 raid_cbuf_cache = kmem_cache_create("md_raid_cbufs", 4380 sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor, 4381 raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0); 4382 } 4383 4384 static void 4385 fini_uninit() 4386 { 4387 kmem_cache_destroy(raid_parent_cache); 4388 kmem_cache_destroy(raid_child_cache); 4389 kmem_cache_destroy(raid_cbuf_cache); 4390 raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL; 4391 } 4392 4393 /* define the module linkage */ 4394 MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit()) 4395
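/*
 * Editor's sketch (not part of the original driver): the LAST_ERRED
 * decision described in the comment above raid_probe_dev()'s state-setting
 * code, distilled into a standalone helper.  The function name and both
 * arguments are hypothetical; the real code derives the open state from
 * md_unit_isopen() and walks un_column[] directly.
 */
static rcs_state_t
probe_next_fail_state(int failures_so_far, int unit_has_been_opened)
{
	/* The first failure is always ERRED, whether or not the unit is open. */
	if (failures_so_far == 0)
		return (RCS_ERRED);

	/*
	 * Open semantics (metadevice never opened): further failures are
	 * not escalated.  Read/write semantics (metadevice opened): the
	 * next failure is escalated to LAST_ERRED.
	 */
	return (unit_has_been_opened ? RCS_LAST_ERRED : RCS_ERRED);
}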