1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * NAME: raid.c 31 * 32 * DESCRIPTION: Main RAID driver source file containing open, close and I/O 33 * operations. 34 * 35 * ROUTINES PROVIDED FOR EXTERNAL USE: 36 * raid_open() - open the RAID metadevice for access. 37 * raid_internal_open() - internal open routine of RAID metdevice. 38 * md_raid_strategy() - perform normal I/O operations, 39 * such as read and write. 40 * raid_close() - close the RAID metadevice. 41 * raid_internal_close() - internal close routine of RAID metadevice. 42 * raid_snarf() - initialize and clean up MDD records. 43 * raid_halt() - reset the RAID metadevice 44 * raid_line() - return the line # of this segment 45 * raid_dcolumn() - return the data column # of this segment 46 * raid_pcolumn() - return the parity column # of this segment 47 */ 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/conf.h> 52 #include <sys/file.h> 53 #include <sys/user.h> 54 #include <sys/uio.h> 55 #include <sys/t_lock.h> 56 #include <sys/buf.h> 57 #include <sys/dkio.h> 58 #include <sys/vtoc.h> 59 #include <sys/kmem.h> 60 #include <vm/page.h> 61 #include <sys/cmn_err.h> 62 #include <sys/sysmacros.h> 63 #include <sys/types.h> 64 #include <sys/mkdev.h> 65 #include <sys/stat.h> 66 #include <sys/open.h> 67 #include <sys/modctl.h> 68 #include <sys/ddi.h> 69 #include <sys/sunddi.h> 70 #include <sys/debug.h> 71 #include <sys/lvm/md_raid.h> 72 #include <sys/lvm/mdvar.h> 73 #include <sys/lvm/md_convert.h> 74 75 #include <sys/sysevent/eventdefs.h> 76 #include <sys/sysevent/svm.h> 77 78 md_ops_t raid_md_ops; 79 #ifndef lint 80 static char _depends_on[] = "drv/md"; 81 md_ops_t *md_interface_ops = &raid_md_ops; 82 #endif /* lint */ 83 84 extern unit_t md_nunits; 85 extern unit_t md_nsets; 86 extern md_set_t md_set[]; 87 extern int md_status; 88 extern major_t md_major; 89 extern mdq_anchor_t md_done_daemon; 90 extern mdq_anchor_t md_mstr_daemon; 91 extern int md_sleep_for_test; 92 extern clock_t md_hz; 93 94 extern md_event_queue_t *md_event_queue; 95 96 97 int pchunks = 16; 98 int phigh = 1024; 99 int plow = 128; 100 int cchunks = 64; 101 int chigh = 1024; 102 int clow = 512; 103 int bchunks = 32; 104 int bhigh = 256; 105 int blow = 128; 106 107 int raid_total_io = 0; 108 int raid_reads = 0; 109 int raid_writes = 0; 110 int raid_no_bpmaps = 0; 111 int raid_512 = 0; 112 int raid_1024 = 0; 113 int raid_1024_8192 = 0; 114 int raid_8192 = 0; 115 int raid_8192_bigger = 0; 116 int raid_line_lock_wait = 0; 117 118 int data_buffer_waits = 0; 119 int parity_buffer_waits = 0; 120 121 /* writer line locks */ 122 int raid_writer_locks = 0; /* total writer locks */ 123 int raid_write_waits = 0; /* total writer locks that waited */ 124 int raid_full_line_writes = 0; /* total full line writes */ 125 int raid_write_queue_length = 0; /* wait queue length */ 126 int raid_max_write_q_length = 0; /* maximum queue length */ 127 int raid_write_locks_active = 0; /* writer locks at any time */ 128 int raid_max_write_locks = 0; /* maximum writer locks active */ 129 130 /* read line locks */ 131 int raid_reader_locks = 0; /* total reader locks held */ 132 int raid_reader_locks_active = 0; /* reader locks held */ 133 int raid_max_reader_locks = 0; /* maximum reader locks held in run */ 134 int raid_read_overlaps = 0; /* number of times 2 reads hit same line */ 135 int raid_read_waits = 0; /* times a reader waited on writer */ 136 137 /* prewrite stats */ 138 int raid_prewrite_waits = 0; /* number of waits for a pw slot */ 139 int raid_pw = 0; /* number of pw slots in use */ 140 int raid_prewrite_max = 0; /* maximum number of pw slots in use */ 141 int raid_pw_invalidates = 0; 142 143 static clock_t md_wr_wait = 0; 144 145 int nv_available = 0; /* presence of nv-ram support in device */ 146 int nv_prewrite = 1; /* mark prewrites with nv_available */ 147 int nv_parity = 1; /* mark parity with nv_available */ 148 149 kmem_cache_t *raid_parent_cache = NULL; 150 kmem_cache_t *raid_child_cache = NULL; 151 kmem_cache_t *raid_cbuf_cache = NULL; 152 153 int raid_internal_open(minor_t mnum, int flag, int otyp, 154 int md_oflags); 155 156 static void freebuffers(md_raidcs_t *cs); 157 static int raid_read(mr_unit_t *un, md_raidcs_t *cs); 158 static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); 159 static int raid_write(mr_unit_t *un, md_raidcs_t *cs); 160 static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); 161 static void raid_stage(md_raidcs_t *cs); 162 static void raid_enqueue(md_raidcs_t *cs); 163 static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); 164 uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); 165 static void getpbuffer(md_raidcs_t *cs); 166 static void getdbuffer(md_raidcs_t *cs); 167 static void raid_done(buf_t *bp); 168 static void raid_io_startup(mr_unit_t *un); 169 170 static rus_state_t 171 raid_col2unit(rcs_state_t state, rus_state_t unitstate) 172 { 173 switch (state) { 174 case RCS_INIT: 175 return (RUS_INIT); 176 case RCS_OKAY: 177 return (RUS_OKAY); 178 case RCS_RESYNC: 179 if (unitstate & RUS_LAST_ERRED) 180 return (RUS_LAST_ERRED); 181 else 182 return (RUS_ERRED); 183 case RCS_ERRED: 184 return (RUS_ERRED); 185 case RCS_LAST_ERRED: 186 return (RUS_ERRED); 187 default: 188 break; 189 } 190 panic("raid_col2unit"); 191 /*NOTREACHED*/ 192 } 193 194 void 195 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) 196 { 197 198 rus_state_t unitstate, origstate; 199 rcs_state_t colstate; 200 rcs_state_t orig_colstate; 201 int errcnt = 0, 202 okaycnt = 0, 203 resynccnt = 0; 204 int i; 205 char *devname; 206 207 ASSERT(un); 208 ASSERT(col < un->un_totalcolumncnt); 209 ASSERT(newstate & 210 (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 211 RCS_LAST_ERRED | RCS_REGEN)); 212 ASSERT((newstate & 213 ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | 214 RCS_LAST_ERRED | RCS_REGEN)) 215 == 0); 216 217 ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1); 218 219 unitstate = un->un_state; 220 origstate = unitstate; 221 222 if (force) { 223 un->un_column[col].un_devstate = newstate; 224 un->un_state = raid_col2unit(newstate, unitstate); 225 uniqtime32(&un->un_column[col].un_devtimestamp); 226 uniqtime32(&un->un_timestamp); 227 return; 228 } 229 230 ASSERT(un->un_state & 231 (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | 232 RUS_REGEN)); 233 ASSERT((un->un_state & ~(RUS_INIT | 234 RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); 235 236 if (un->un_column[col].un_devstate == newstate) 237 return; 238 239 if (newstate == RCS_REGEN) { 240 if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) 241 return; 242 un->un_state = RUS_REGEN; 243 return; 244 } 245 246 orig_colstate = un->un_column[col].un_devstate; 247 248 /* 249 * if there is another column in the error state then this 250 * column should go to the last errored state 251 */ 252 for (i = 0; i < un->un_totalcolumncnt; i++) { 253 if (i == col) 254 colstate = newstate; 255 else 256 colstate = un->un_column[i].un_devstate; 257 if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) 258 errcnt++; 259 if (colstate & RCS_OKAY) 260 okaycnt++; 261 if (colstate & RCS_RESYNC) 262 resynccnt++; 263 } 264 ASSERT(resynccnt < 2); 265 266 if (okaycnt == un->un_totalcolumncnt) 267 unitstate = RUS_OKAY; 268 else if (errcnt > 1) { 269 unitstate = RUS_LAST_ERRED; 270 if (newstate & RCS_ERRED) 271 newstate = RCS_LAST_ERRED; 272 } else if (errcnt == 1) 273 if (!(unitstate & RUS_LAST_ERRED)) 274 unitstate = RUS_ERRED; 275 276 if (un->un_state == RUS_DOI) 277 unitstate = RUS_DOI; 278 279 un->un_column[col].un_devstate = newstate; 280 uniqtime32(&un->un_column[col].un_devtimestamp); 281 /* 282 * if there are last errored column being brought back online 283 * by open or snarf, then be sure to clear the RUS_LAST_ERRED 284 * bit to allow writes. If there is a real error then the 285 * column will go back into last erred. 286 */ 287 if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && 288 (raid_state_cnt(un, RCS_ERRED) == 1)) 289 unitstate = RUS_ERRED; 290 291 un->un_state = unitstate; 292 uniqtime32(&un->un_timestamp); 293 294 if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && 295 (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { 296 devname = md_devname(MD_UN2SET(un), 297 un->un_column[col].un_dev, NULL, 0); 298 299 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 300 md_shortname(MD_SID(un)), devname); 301 302 if (unitstate & RUS_LAST_ERRED) { 303 cmn_err(CE_WARN, "md: %s: %s last erred", 304 md_shortname(MD_SID(un)), devname); 305 306 } else if (un->un_column[col].un_devflags & 307 MD_RAID_DEV_ISOPEN) { 308 /* 309 * Close the broken device and clear the open flag on 310 * it. We have to check that the device is open, 311 * otherwise the first open on it has resulted in the 312 * error that is being processed and the actual un_dev 313 * will be NODEV64. 314 */ 315 md_layered_close(un->un_column[col].un_dev, 316 MD_OFLG_NULL); 317 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 318 } 319 } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && 320 un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { 321 /* 322 * Similar to logic above except no log messages since we 323 * are just transitioning from Last Erred to Erred. 324 */ 325 md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL); 326 un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; 327 } 328 329 /* 330 * If a resync has completed, see if there is a Last Erred 331 * component that we can change to the Erred state. 332 */ 333 if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) { 334 for (i = 0; i < un->un_totalcolumncnt; i++) { 335 if (i != col && 336 (un->un_column[i].un_devstate & RCS_LAST_ERRED)) { 337 raid_set_state(un, i, RCS_ERRED, 0); 338 break; 339 } 340 } 341 } 342 } 343 344 /* 345 * NAME: erred_check_line 346 * 347 * DESCRIPTION: Return the type of write to perform on an erred column based 348 * upon any resync activity. 349 * 350 * if a column is being resynced and the write is above the 351 * resync point may have to write to the target being resynced. 352 * 353 * Column state may make it impossible to do the write 354 * in which case RCL_EIO or RCL_ENXIO is returned. 355 * 356 * If a column cannot be written directly, RCL_ERRED is 357 * returned and processing should proceed accordingly. 358 * 359 * PARAMETERS: minor_t mnum - minor number identity of metadevice 360 * md_raidcs_t *cs - child save structure 361 * mr_column_t *dcolumn - pointer to data column structure 362 * mr_column_t *pcolumn - pointer to parity column structure 363 * 364 * RETURNS: RCL_OKAY, RCL_ERRED 365 * 366 * LOCKS: Expects Line Writer Lock and Unit Resource Lock to be held 367 * across call. 368 */ 369 370 static int 371 erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column) 372 { 373 374 ASSERT(un != NULL); 375 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 376 377 if (column->un_devstate & RCS_OKAY) 378 return (RCL_OKAY); 379 380 if (column->un_devstate & RCS_ERRED) 381 return (RCL_ERRED); /* do not read from errored disk */ 382 383 /* 384 * for the last errored case their are two considerations. 385 * When the last errored column is the only errored column then 386 * do treat it like a maintenance column, not doing I/O from 387 * it. When it there are other failures then just attempt 388 * to use it. 389 */ 390 if (column->un_devstate & RCS_LAST_ERRED) 391 return (RCL_ERRED); 392 393 ASSERT(column->un_devstate & RCS_RESYNC); 394 395 /* 396 * When a resync from a hotspare is being done (copy resync) 397 * then always treat it as an OKAY column, since no regen 398 * is required. 399 */ 400 if (column->un_devflags & MD_RAID_COPY_RESYNC) { 401 return (RCL_OKAY); 402 } 403 404 mutex_enter(&un->un_mx); 405 if (cs->cs_line < un->un_resync_line_index) { 406 mutex_exit(&un->un_mx); 407 return (RCL_OKAY); 408 } 409 mutex_exit(&un->un_mx); 410 return (RCL_ERRED); 411 412 } 413 414 /* 415 * NAMES: raid_state_cnt 416 * 417 * DESCRIPTION: counts number of column in a specific state 418 * 419 * PARAMETERS: md_raid_t *un 420 * rcs_state state 421 */ 422 int 423 raid_state_cnt(mr_unit_t *un, rcs_state_t state) 424 { 425 int i, retval = 0; 426 427 for (i = 0; i < un->un_totalcolumncnt; i++) 428 if (un->un_column[i].un_devstate & state) 429 retval++; 430 return (retval); 431 } 432 433 /* 434 * NAMES: raid_io_overlaps 435 * 436 * DESCRIPTION: checkst for overlap of 2 child save structures 437 * 438 * PARAMETERS: md_raidcs_t cs1 439 * md_raidcs_t cs2 440 * 441 * RETURNS: 0 - no overlap 442 * 1 - overlap 443 */ 444 int 445 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2) 446 { 447 if (cs1->cs_blkno > cs2->cs_lastblk) 448 return (0); 449 if (cs1->cs_lastblk < cs2->cs_blkno) 450 return (0); 451 return (1); 452 } 453 454 /* 455 * NAMES: raid_parent_constructor 456 * DESCRIPTION: parent structure constructor routine 457 * PARAMETERS: 458 */ 459 /*ARGSUSED1*/ 460 static int 461 raid_parent_constructor(void *p, void *d1, int d2) 462 { 463 mutex_init(&((md_raidps_t *)p)->ps_mx, 464 NULL, MUTEX_DEFAULT, NULL); 465 mutex_init(&((md_raidps_t *)p)->ps_mapin_mx, 466 NULL, MUTEX_DEFAULT, NULL); 467 return (0); 468 } 469 470 void 471 raid_parent_init(md_raidps_t *ps) 472 { 473 bzero(ps, offsetof(md_raidps_t, ps_mx)); 474 ((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE; 475 ((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC; 476 } 477 478 /*ARGSUSED1*/ 479 static void 480 raid_parent_destructor(void *p, void *d) 481 { 482 mutex_destroy(&((md_raidps_t *)p)->ps_mx); 483 mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx); 484 } 485 486 /* 487 * NAMES: raid_child_constructor 488 * DESCRIPTION: child structure constructor routine 489 * PARAMETERS: 490 */ 491 /*ARGSUSED1*/ 492 static int 493 raid_child_constructor(void *p, void *d1, int d2) 494 { 495 md_raidcs_t *cs = (md_raidcs_t *)p; 496 mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL); 497 bioinit(&cs->cs_dbuf); 498 bioinit(&cs->cs_pbuf); 499 bioinit(&cs->cs_hbuf); 500 return (0); 501 } 502 503 void 504 raid_child_init(md_raidcs_t *cs) 505 { 506 bzero(cs, offsetof(md_raidcs_t, cs_mx)); 507 508 md_bioreset(&cs->cs_dbuf); 509 md_bioreset(&cs->cs_pbuf); 510 md_bioreset(&cs->cs_hbuf); 511 512 ((md_raidcs_t *)cs)->cs_dbuf.b_chain = 513 ((md_raidcs_t *)cs)->cs_pbuf.b_chain = 514 ((md_raidcs_t *)cs)->cs_hbuf.b_chain = 515 (struct buf *)(cs); 516 517 cs->cs_magic = RAID_CSMAGIC; 518 cs->cs_line = MD_DISKADDR_ERROR; 519 cs->cs_dpwslot = -1; 520 cs->cs_ppwslot = -1; 521 } 522 523 /*ARGSUSED1*/ 524 static void 525 raid_child_destructor(void *p, void *d) 526 { 527 biofini(&((md_raidcs_t *)p)->cs_dbuf); 528 biofini(&((md_raidcs_t *)p)->cs_hbuf); 529 biofini(&((md_raidcs_t *)p)->cs_pbuf); 530 mutex_destroy(&((md_raidcs_t *)p)->cs_mx); 531 } 532 533 /*ARGSUSED1*/ 534 static int 535 raid_cbuf_constructor(void *p, void *d1, int d2) 536 { 537 bioinit(&((md_raidcbuf_t *)p)->cbuf_bp); 538 return (0); 539 } 540 541 static void 542 raid_cbuf_init(md_raidcbuf_t *cb) 543 { 544 bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp)); 545 md_bioreset(&cb->cbuf_bp); 546 cb->cbuf_magic = RAID_BUFMAGIC; 547 cb->cbuf_pwslot = -1; 548 cb->cbuf_flags = CBUF_WRITE; 549 } 550 551 /*ARGSUSED1*/ 552 static void 553 raid_cbuf_destructor(void *p, void *d) 554 { 555 biofini(&((md_raidcbuf_t *)p)->cbuf_bp); 556 } 557 558 /* 559 * NAMES: raid_run_queue 560 * DESCRIPTION: spawn a backend processing daemon for RAID metadevice. 561 * PARAMETERS: 562 */ 563 /*ARGSUSED*/ 564 static void 565 raid_run_queue(void *d) 566 { 567 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 568 md_daemon(1, &md_done_daemon); 569 } 570 571 /* 572 * NAME: raid_build_pwslot 573 * DESCRIPTION: builds mr_pw_reserve for the column 574 * PARAMETERS: un is the pointer to the unit structure 575 * colindex is the column to create the structure for 576 */ 577 int 578 raid_build_pw_reservation(mr_unit_t *un, int colindex) 579 { 580 mr_pw_reserve_t *pw; 581 mr_scoreboard_t *sb; 582 int i; 583 584 pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + 585 (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); 586 pw->pw_magic = RAID_PWMAGIC; 587 pw->pw_column = colindex; 588 pw->pw_free = un->un_pwcnt; 589 sb = &pw->pw_sb[0]; 590 for (i = 0; i < un->un_pwcnt; i++) { 591 sb[i].sb_column = colindex; 592 sb[i].sb_flags = SB_UNUSED; 593 sb[i].sb_start_blk = 0; 594 sb[i].sb_last_blk = 0; 595 sb[i].sb_cs = NULL; 596 } 597 un->un_column_ic[colindex].un_pw_reserve = pw; 598 return (0); 599 } 600 /* 601 * NAME: raid_free_pw_reservation 602 * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine 603 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 604 * int colindex - index of the column whose pre-write slot struct 605 * is to be destroyed. 606 */ 607 void 608 raid_free_pw_reservation(mr_unit_t *un, int colindex) 609 { 610 mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; 611 612 kmem_free(pw, sizeof (mr_pw_reserve_t) + 613 (sizeof (mr_scoreboard_t) * un->un_pwcnt)); 614 } 615 616 /* 617 * NAME: raid_cancel_pwslot 618 * DESCRIPTION: RAID metadevice write routine 619 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 620 */ 621 static void 622 raid_cancel_pwslot(md_raidcs_t *cs) 623 { 624 mr_unit_t *un = cs->cs_un; 625 mr_pw_reserve_t *pw; 626 mr_scoreboard_t *sb; 627 mr_column_ic_t *col; 628 md_raidcbuf_t *cbuf; 629 int broadcast = 0; 630 631 if (cs->cs_ps->ps_flags & MD_RPS_READ) 632 return; 633 if (cs->cs_dpwslot != -1) { 634 col = &un->un_column_ic[cs->cs_dcolumn]; 635 pw = col->un_pw_reserve; 636 sb = &pw->pw_sb[cs->cs_dpwslot]; 637 sb->sb_flags = SB_AVAIL; 638 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 639 broadcast++; 640 sb->sb_cs = NULL; 641 } 642 643 if (cs->cs_ppwslot != -1) { 644 col = &un->un_column_ic[cs->cs_pcolumn]; 645 pw = col->un_pw_reserve; 646 sb = &pw->pw_sb[cs->cs_ppwslot]; 647 sb->sb_flags = SB_AVAIL; 648 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 649 broadcast++; 650 sb->sb_cs = NULL; 651 } 652 653 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 654 if (cbuf->cbuf_pwslot == -1) 655 continue; 656 col = &un->un_column_ic[cbuf->cbuf_column]; 657 pw = col->un_pw_reserve; 658 sb = &pw->pw_sb[cbuf->cbuf_pwslot]; 659 sb->sb_flags = SB_AVAIL; 660 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 661 broadcast++; 662 sb->sb_cs = NULL; 663 } 664 if (broadcast) { 665 cv_broadcast(&un->un_cv); 666 return; 667 } 668 mutex_enter(&un->un_mx); 669 if (un->un_rflags & MD_RFLAG_NEEDPW) 670 cv_broadcast(&un->un_cv); 671 mutex_exit(&un->un_mx); 672 } 673 674 static void 675 raid_free_pwinvalidate(md_raidcs_t *cs) 676 { 677 md_raidcbuf_t *cbuf; 678 md_raidcbuf_t *cbuf_to_free; 679 mr_unit_t *un = cs->cs_un; 680 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 681 mr_pw_reserve_t *pw; 682 mr_scoreboard_t *sb; 683 int broadcast = 0; 684 685 cbuf = cs->cs_pw_inval_list; 686 ASSERT(cbuf); 687 mutex_enter(&un->un_linlck_mx); 688 while (cbuf) { 689 pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve; 690 sb = &pw->pw_sb[0]; 691 ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND); 692 sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED; 693 sb[cbuf->cbuf_pwslot].sb_cs = NULL; 694 if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) 695 broadcast++; 696 cbuf_to_free = cbuf; 697 cbuf = cbuf->cbuf_next; 698 kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize)); 699 kmem_cache_free(raid_cbuf_cache, cbuf_to_free); 700 } 701 cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL; 702 /* 703 * now that there is a free prewrite slot, check to see if there 704 * are any io operations waiting first wake up the raid_io_startup 705 * then signal the the processes waiting in raid_write. 706 */ 707 if (ui->ui_io_lock->io_list_front) 708 raid_io_startup(un); 709 mutex_exit(&un->un_linlck_mx); 710 if (broadcast) { 711 cv_broadcast(&un->un_cv); 712 return; 713 } 714 mutex_enter(&un->un_mx); 715 if (un->un_rflags & MD_RFLAG_NEEDPW) 716 cv_broadcast(&un->un_cv); 717 mutex_exit(&un->un_mx); 718 } 719 720 721 static int 722 raid_get_pwslot(md_raidcs_t *cs, int column) 723 { 724 mr_scoreboard_t *sb; 725 mr_pw_reserve_t *pw; 726 mr_unit_t *un = cs->cs_un; 727 diskaddr_t start_blk = cs->cs_blkno; 728 diskaddr_t last_blk = cs->cs_lastblk; 729 int i; 730 int pwcnt = un->un_pwcnt; 731 int avail = -1; 732 int use = -1; 733 int flags; 734 735 736 /* start with the data column */ 737 pw = cs->cs_un->un_column_ic[column].un_pw_reserve; 738 sb = &pw->pw_sb[0]; 739 ASSERT(pw->pw_free > 0); 740 for (i = 0; i < pwcnt; i++) { 741 flags = sb[i].sb_flags; 742 if (flags & SB_INVAL_PEND) 743 continue; 744 745 if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED))) 746 avail = i; 747 748 if ((start_blk > sb[i].sb_last_blk) || 749 (last_blk < sb[i].sb_start_blk)) 750 continue; 751 752 /* OVERLAP */ 753 ASSERT(! (sb[i].sb_flags & SB_INUSE)); 754 755 /* 756 * raid_invalidate_pwslot attempts to zero out prewrite entry 757 * in parallel with other disk reads/writes related to current 758 * transaction. however cs_frags accounting for this case is 759 * broken because raid_write_io resets cs_frags i.e. ignoring 760 * that it could have been been set to > 0 value by 761 * raid_invalidate_pwslot. While this can be fixed an 762 * additional problem is that we don't seem to handle 763 * correctly the case of getting a disk error for prewrite 764 * entry invalidation. 765 * It does not look like we really need 766 * to invalidate prewrite slots because raid_replay sorts 767 * prewrite id's in ascending order and during recovery the 768 * latest prewrite entry for the same block will be replay 769 * last. That's why i ifdef'd out the call to 770 * raid_invalidate_pwslot. --aguzovsk@east 771 */ 772 773 if (use == -1) { 774 use = i; 775 } 776 } 777 778 ASSERT(avail != -1); 779 pw->pw_free--; 780 if (use == -1) 781 use = avail; 782 783 ASSERT(! (sb[use].sb_flags & SB_INUSE)); 784 sb[use].sb_flags = SB_INUSE; 785 sb[use].sb_cs = cs; 786 sb[use].sb_start_blk = start_blk; 787 sb[use].sb_last_blk = last_blk; 788 ASSERT((use >= 0) && (use < un->un_pwcnt)); 789 return (use); 790 } 791 792 static int 793 raid_check_pw(md_raidcs_t *cs) 794 { 795 796 mr_unit_t *un = cs->cs_un; 797 int i; 798 799 ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 800 /* 801 * check to be sure there is a prewrite slot available 802 * if not just return. 803 */ 804 if (cs->cs_flags & MD_RCS_LINE) { 805 for (i = 0; i < un->un_totalcolumncnt; i++) 806 if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0) 807 return (1); 808 return (0); 809 } 810 811 if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0) 812 return (1); 813 if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0) 814 return (1); 815 return (0); 816 } 817 static int 818 raid_alloc_pwslot(md_raidcs_t *cs) 819 { 820 mr_unit_t *un = cs->cs_un; 821 md_raidcbuf_t *cbuf; 822 823 ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); 824 if (raid_check_pw(cs)) 825 return (1); 826 827 mutex_enter(&un->un_mx); 828 un->un_pwid++; 829 cs->cs_pwid = un->un_pwid; 830 mutex_exit(&un->un_mx); 831 832 cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn); 833 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 834 cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column); 835 } 836 cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn); 837 838 cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS; 839 840 return (0); 841 } 842 843 /* 844 * NAMES: raid_build_incore 845 * DESCRIPTION: RAID metadevice incore structure building routine 846 * PARAMETERS: void *p - pointer to a unit structure 847 * int snarfing - a flag to indicate snarfing is required 848 */ 849 int 850 raid_build_incore(void *p, int snarfing) 851 { 852 mr_unit_t *un = (mr_unit_t *)p; 853 minor_t mnum = MD_SID(un); 854 mddb_recid_t hs_recid = 0; 855 int i; 856 int preserve_flags; 857 mr_column_t *column; 858 int iosize; 859 md_dev64_t hs, dev; 860 int resync_cnt = 0, 861 error_cnt = 0; 862 863 hs = NODEV64; 864 dev = NODEV64; 865 866 /* clear out bogus pointer incase we return(1) prior to alloc */ 867 un->mr_ic = NULL; 868 869 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 870 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 871 return (1); 872 } 873 874 if (MD_UNIT(mnum) != NULL) 875 return (0); 876 877 if (snarfing) 878 MD_STATUS(un) = 0; 879 880 un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic), 881 KM_SLEEP); 882 883 un->un_column_ic = (mr_column_ic_t *) 884 kmem_zalloc(sizeof (mr_column_ic_t) * 885 un->un_totalcolumncnt, KM_SLEEP); 886 887 for (i = 0; i < un->un_totalcolumncnt; i++) { 888 889 column = &un->un_column[i]; 890 preserve_flags = column->un_devflags & 891 (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); 892 column->un_devflags &= 893 ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN | 894 MD_RAID_WRITE_ALT); 895 if (raid_build_pw_reservation(un, i) != 0) { 896 /* could not build pwslot */ 897 return (1); 898 } 899 900 if (snarfing) { 901 set_t setno = MD_MIN2SET(mnum); 902 dev = md_getdevnum(setno, mddb_getsidenum(setno), 903 column->un_orig_key, MD_NOTRUST_DEVT); 904 /* 905 * Comment out instead of remove so we have history 906 * In the pre-SVM releases stored devt is used so 907 * as long as there is one snarf is always happy 908 * even the component is powered off. This is not 909 * the case in current SVM implementation. NODEV64 910 * can be returned and in this case since we resolve 911 * the devt at 'open' time (first use of metadevice) 912 * we will allow snarf continue. 913 * 914 * if (dev == NODEV64) 915 * return (1); 916 */ 917 918 /* 919 * Setup un_orig_dev from device id info if the device 920 * is valid (not NODEV64). 921 */ 922 if (dev != NODEV64) 923 column->un_orig_dev = dev; 924 925 if (column->un_devstate & RCS_RESYNC) 926 resync_cnt++; 927 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 928 error_cnt++; 929 930 if (HOTSPARED(un, i)) { 931 (void) md_hot_spare_ifc(HS_MKDEV, 932 0, 0, 0, &column->un_hs_id, NULL, 933 &hs, NULL); 934 /* 935 * Same here 936 * 937 * if (hs == NODEV64) 938 * return (1); 939 */ 940 } 941 942 if (HOTSPARED(un, i)) { 943 if (column->un_devstate & 944 (RCS_OKAY | RCS_LAST_ERRED)) { 945 column->un_dev = hs; 946 column->un_pwstart = 947 column->un_hs_pwstart; 948 column->un_devstart = 949 column->un_hs_devstart; 950 preserve_flags &= 951 ~(MD_RAID_COPY_RESYNC | 952 MD_RAID_REGEN_RESYNC); 953 } else if (column->un_devstate & RCS_RESYNC) { 954 /* 955 * if previous system was 4.0 set 956 * the direction flags 957 */ 958 if ((preserve_flags & 959 (MD_RAID_COPY_RESYNC | 960 MD_RAID_REGEN_RESYNC)) == 0) { 961 if (column->un_alt_dev != NODEV64) 962 preserve_flags |= 963 MD_RAID_COPY_RESYNC; 964 else 965 preserve_flags |= 966 MD_RAID_REGEN_RESYNC; 967 } 968 } 969 } else { /* no hot spares */ 970 column->un_dev = dev; 971 column->un_pwstart = column->un_orig_pwstart; 972 column->un_devstart = column->un_orig_devstart; 973 if (column->un_devstate & RCS_RESYNC) { 974 preserve_flags |= MD_RAID_REGEN_RESYNC; 975 preserve_flags &= ~MD_RAID_COPY_RESYNC; 976 } 977 } 978 if (! (column->un_devstate & RCS_RESYNC)) { 979 preserve_flags &= 980 ~(MD_RAID_REGEN_RESYNC | 981 MD_RAID_COPY_RESYNC); 982 } 983 984 column->un_devflags = preserve_flags; 985 column->un_alt_dev = NODEV64; 986 column->un_alt_pwstart = 0; 987 column->un_alt_devstart = 0; 988 un->un_resync_line_index = 0; 989 un->un_resync_index = 0; 990 un->un_percent_done = 0; 991 } 992 } 993 994 if (resync_cnt && error_cnt) { 995 for (i = 0; i < un->un_totalcolumncnt; i++) { 996 column = &un->un_column[i]; 997 if (HOTSPARED(un, i) && 998 (column->un_devstate & RCS_RESYNC) && 999 (column->un_devflags & MD_RAID_COPY_RESYNC)) 1000 /* hotspare has data */ 1001 continue; 1002 1003 if (HOTSPARED(un, i) && 1004 (column->un_devstate & RCS_RESYNC)) { 1005 /* hotspare does not have data */ 1006 raid_hs_release(HS_FREE, un, &hs_recid, i); 1007 column->un_dev = column->un_orig_dev; 1008 column->un_pwstart = column->un_orig_pwstart; 1009 column->un_devstart = column->un_orig_devstart; 1010 mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); 1011 } 1012 1013 if (column->un_devstate & RCS_ERRED) 1014 column->un_devstate = RCS_LAST_ERRED; 1015 1016 if (column->un_devstate & RCS_RESYNC) 1017 column->un_devstate = RCS_ERRED; 1018 } 1019 } 1020 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); 1021 1022 un->un_pwid = 1; /* or some other possible value */ 1023 un->un_magic = RAID_UNMAGIC; 1024 iosize = un->un_iosize; 1025 un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1026 un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); 1027 mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); 1028 cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); 1029 un->un_linlck_chn = NULL; 1030 MD_UNIT(mnum) = un; 1031 1032 1033 return (0); 1034 } 1035 1036 /* 1037 * NAMES: reset_raid 1038 * DESCRIPTION: RAID metadevice reset routine 1039 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 1040 * minor_t mnum - RAID metadevice minor number 1041 * int removing - a flag to imply removing device name from 1042 * MDDB database. 1043 */ 1044 void 1045 reset_raid(mr_unit_t *un, minor_t mnum, int removing) 1046 { 1047 int i, n = 0; 1048 sv_dev_t *sv; 1049 mr_column_t *column; 1050 int column_cnt = un->un_totalcolumncnt; 1051 mddb_recid_t *recids, vtoc_id; 1052 int hserr; 1053 1054 ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && 1055 (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); 1056 1057 md_destroy_unit_incore(mnum, &raid_md_ops); 1058 1059 MD_UNIT(mnum) = NULL; 1060 1061 if (un->un_pbuffer) { 1062 kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); 1063 un->un_pbuffer = NULL; 1064 } 1065 if (un->un_dbuffer) { 1066 kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); 1067 un->un_dbuffer = NULL; 1068 } 1069 1070 /* free all pre-write slots created during build incore */ 1071 for (i = 0; i < un->un_totalcolumncnt; i++) 1072 raid_free_pw_reservation(un, i); 1073 1074 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 1075 un->un_totalcolumncnt); 1076 1077 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 1078 1079 if (!removing) 1080 return; 1081 1082 sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), 1083 KM_SLEEP); 1084 1085 recids = (mddb_recid_t *) 1086 kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); 1087 1088 for (i = 0; i < column_cnt; i++) { 1089 md_unit_t *comp_un; 1090 md_dev64_t comp_dev; 1091 1092 column = &un->un_column[i]; 1093 sv[i].setno = MD_MIN2SET(mnum); 1094 sv[i].key = column->un_orig_key; 1095 if (HOTSPARED(un, i)) { 1096 if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) 1097 hserr = HS_BAD; 1098 else 1099 hserr = HS_FREE; 1100 raid_hs_release(hserr, un, &recids[n++], i); 1101 } 1102 /* 1103 * deparent any metadevices. 1104 * NOTE: currently soft partitions are the only metadevices 1105 * allowed in RAID metadevices. 1106 */ 1107 comp_dev = column->un_dev; 1108 if (md_getmajor(comp_dev) == md_major) { 1109 comp_un = MD_UNIT(md_getminor(comp_dev)); 1110 recids[n++] = MD_RECID(comp_un); 1111 md_reset_parent(comp_dev); 1112 } 1113 } 1114 /* decrement the reference count of the old hsp */ 1115 if (un->un_hsp_id != -1) 1116 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 1117 &recids[n++], NULL, NULL, NULL); 1118 recids[n] = 0; 1119 MD_STATUS(un) |= MD_UN_BEING_RESET; 1120 vtoc_id = un->c.un_vtoc_id; 1121 1122 raid_commit(un, recids); 1123 1124 1125 /* Remove the unit structure */ 1126 mddb_deleterec_wrapper(un->c.un_record_id); 1127 1128 /* Remove the vtoc, if present */ 1129 if (vtoc_id) 1130 mddb_deleterec_wrapper(vtoc_id); 1131 md_rem_names(sv, column_cnt); 1132 kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); 1133 kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); 1134 1135 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1136 MD_MIN2SET(mnum), mnum); 1137 } 1138 1139 /* 1140 * NAMES: raid_error_parent 1141 * DESCRIPTION: mark a parent structure in error 1142 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1143 * int error - error value to set 1144 * NOTE: (TBR) - this routine currently is not in use. 1145 */ 1146 static void 1147 raid_error_parent(md_raidps_t *ps, int error) 1148 { 1149 mutex_enter(&ps->ps_mx); 1150 ps->ps_flags |= MD_RPS_ERROR; 1151 ps->ps_error = error; 1152 mutex_exit(&ps->ps_mx); 1153 } 1154 1155 /* 1156 * The following defines tell raid_free_parent 1157 * RFP_RLS_LOCK release the unit reader lock when done. 1158 * RFP_DECR_PWFRAGS decrement ps_pwfrags 1159 * RFP_DECR_FRAGS decrement ps_frags 1160 * RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep 1161 */ 1162 #define RFP_RLS_LOCK 0x00001 1163 #define RFP_DECR_PWFRAGS 0x00002 1164 #define RFP_DECR_FRAGS 0x00004 1165 #define RFP_DECR_READFRAGS (RFP_DECR_PWFRAGS | RFP_DECR_FRAGS) 1166 1167 /* 1168 * NAMES: raid_free_parent 1169 * DESCRIPTION: free a parent structure 1170 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1171 * int todo - indicates what needs to be done 1172 */ 1173 static void 1174 raid_free_parent(md_raidps_t *ps, int todo) 1175 { 1176 mdi_unit_t *ui = ps->ps_ui; 1177 1178 ASSERT(ps->ps_magic == RAID_PSMAGIC); 1179 ASSERT(ps->ps_flags & MD_RPS_INUSE); 1180 mutex_enter(&ps->ps_mx); 1181 if (todo & RFP_DECR_PWFRAGS) { 1182 ASSERT(ps->ps_pwfrags); 1183 ps->ps_pwfrags--; 1184 if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) { 1185 if (ps->ps_flags & MD_RPS_ERROR) { 1186 ps->ps_bp->b_flags |= B_ERROR; 1187 ps->ps_bp->b_error = ps->ps_error; 1188 } 1189 md_kstat_done(ui, ps->ps_bp, 0); 1190 biodone(ps->ps_bp); 1191 ps->ps_flags |= MD_RPS_IODONE; 1192 } 1193 } 1194 1195 if (todo & RFP_DECR_FRAGS) { 1196 ASSERT(ps->ps_frags); 1197 ps->ps_frags--; 1198 } 1199 1200 if (ps->ps_frags != 0) { 1201 mutex_exit(&ps->ps_mx); 1202 return; 1203 } 1204 1205 ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0)); 1206 mutex_exit(&ps->ps_mx); 1207 1208 if (todo & RFP_RLS_LOCK) 1209 md_io_readerexit(ui); 1210 1211 if (panicstr) { 1212 ps->ps_flags |= MD_RPS_DONE; 1213 return; 1214 } 1215 1216 if (ps->ps_flags & MD_RPS_HSREQ) 1217 (void) raid_hotspares(); 1218 1219 ASSERT(todo & RFP_RLS_LOCK); 1220 ps->ps_flags &= ~MD_RPS_INUSE; 1221 1222 md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id)); 1223 1224 kmem_cache_free(raid_parent_cache, ps); 1225 } 1226 1227 /* 1228 * NAMES: raid_free_child 1229 * DESCRIPTION: free a parent structure 1230 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1231 * int drop_locks - 0 for no locks held 1232 * NOTE: (TBR) - this routine currently is not in use. 1233 */ 1234 static void 1235 raid_free_child(md_raidcs_t *cs, int drop_locks) 1236 { 1237 mr_unit_t *un = cs->cs_un; 1238 md_raidcbuf_t *cbuf, *cbuf1; 1239 1240 if (cs->cs_pw_inval_list) 1241 raid_free_pwinvalidate(cs); 1242 1243 if (drop_locks) { 1244 ASSERT(cs->cs_flags & MD_RCS_LLOCKD && 1245 (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER))); 1246 md_unit_readerexit(MDI_UNIT(MD_SID(un))); 1247 raid_line_exit(cs); 1248 } else { 1249 ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD)); 1250 } 1251 1252 freebuffers(cs); 1253 cbuf = cs->cs_buflist; 1254 while (cbuf) { 1255 cbuf1 = cbuf->cbuf_next; 1256 kmem_cache_free(raid_cbuf_cache, cbuf); 1257 cbuf = cbuf1; 1258 } 1259 if (cs->cs_dbuf.b_flags & B_REMAPPED) 1260 bp_mapout(&cs->cs_dbuf); 1261 kmem_cache_free(raid_child_cache, cs); 1262 } 1263 1264 /* 1265 * NAME: raid_regen_parity 1266 * 1267 * DESCRIPTION: This routine is used to regenerate the parity blocks 1268 * for the entire raid device. It is called from 1269 * both the regen thread and the IO path. 1270 * 1271 * On error the entire device is marked as in error by 1272 * placing the erroring device in error and all other 1273 * devices in last_errored. 1274 * 1275 * PARAMETERS: md_raidcs_t *cs 1276 */ 1277 void 1278 raid_regen_parity(md_raidcs_t *cs) 1279 { 1280 mr_unit_t *un = cs->cs_un; 1281 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1282 caddr_t buffer; 1283 caddr_t parity_buffer; 1284 buf_t *bp; 1285 uint_t *dbuf, *pbuf; 1286 uint_t colcnt = un->un_totalcolumncnt; 1287 int column; 1288 int parity_column = cs->cs_pcolumn; 1289 size_t bcount; 1290 int j; 1291 1292 /* 1293 * This routine uses the data and parity buffers allocated to a 1294 * write. In the case of a read the buffers are allocated and 1295 * freed at the end. 1296 */ 1297 1298 ASSERT(IO_READER_HELD(un)); 1299 ASSERT(cs->cs_flags & MD_RCS_LLOCKD); 1300 ASSERT(UNIT_READER_HELD(un)); 1301 1302 if (raid_state_cnt(un, RCS_OKAY) != colcnt) 1303 return; 1304 1305 if (cs->cs_flags & MD_RCS_READER) { 1306 getpbuffer(cs); 1307 getdbuffer(cs); 1308 } 1309 ASSERT(cs->cs_dbuffer && cs->cs_pbuffer); 1310 bcount = cs->cs_bcount; 1311 buffer = cs->cs_dbuffer; 1312 parity_buffer = cs->cs_pbuffer; 1313 bzero(parity_buffer, bcount); 1314 bp = &cs->cs_dbuf; 1315 for (column = 0; column < colcnt; column++) { 1316 if (column == parity_column) 1317 continue; 1318 reset_buf(bp, B_READ | B_BUSY, bcount); 1319 bp->b_un.b_addr = buffer; 1320 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev); 1321 bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart; 1322 bp->b_bcount = bcount; 1323 bp->b_bufsize = bcount; 1324 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1325 if (biowait(bp)) 1326 goto bail; 1327 pbuf = (uint_t *)(void *)parity_buffer; 1328 dbuf = (uint_t *)(void *)buffer; 1329 for (j = 0; j < (bcount / (sizeof (uint_t))); j++) { 1330 *pbuf = *pbuf ^ *dbuf; 1331 pbuf++; 1332 dbuf++; 1333 } 1334 } 1335 1336 reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount); 1337 bp->b_un.b_addr = parity_buffer; 1338 bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev); 1339 bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart; 1340 bp->b_bcount = bcount; 1341 bp->b_bufsize = bcount; 1342 (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); 1343 if (biowait(bp)) 1344 goto bail; 1345 1346 if (cs->cs_flags & MD_RCS_READER) { 1347 freebuffers(cs); 1348 cs->cs_pbuffer = NULL; 1349 cs->cs_dbuffer = NULL; 1350 } 1351 bp->b_chain = (struct buf *)cs; 1352 return; 1353 bail: 1354 if (cs->cs_flags & MD_RCS_READER) { 1355 freebuffers(cs); 1356 cs->cs_pbuffer = NULL; 1357 cs->cs_dbuffer = NULL; 1358 } 1359 md_unit_readerexit(ui); 1360 un = md_unit_writerlock(ui); 1361 raid_set_state(un, column, RCS_ERRED, 0); 1362 for (column = 0; column < colcnt; column++) 1363 raid_set_state(un, column, RCS_ERRED, 0); 1364 raid_commit(un, NULL); 1365 md_unit_writerexit(ui); 1366 un = md_unit_readerlock(ui); 1367 bp->b_chain = (struct buf *)cs; 1368 } 1369 1370 /* 1371 * NAMES: raid_error_state 1372 * DESCRIPTION: check unit and column states' impact on I/O error 1373 * NOTE: the state now may not be the state when the 1374 * I/O completed due to race conditions. 1375 * PARAMETERS: mr_unit_t *un - pointer to raid unit structure 1376 * md_raidcs_t *cs - pointer to child structure 1377 * buf_t *bp - pointer to buffer structure 1378 */ 1379 static int 1380 raid_error_state(mr_unit_t *un, buf_t *bp) 1381 { 1382 int column; 1383 int i; 1384 1385 ASSERT(IO_READER_HELD(un)); 1386 ASSERT(UNIT_WRITER_HELD(un)); 1387 1388 column = -1; 1389 for (i = 0; i < un->un_totalcolumncnt; i++) { 1390 if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) { 1391 column = i; 1392 break; 1393 } 1394 if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) { 1395 column = i; 1396 break; 1397 } 1398 } 1399 1400 /* in case a replace snuck in while waiting on unit writer lock */ 1401 1402 if (column == -1) { 1403 return (0); 1404 } 1405 1406 (void) raid_set_state(un, column, RCS_ERRED, 0); 1407 ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED)); 1408 1409 raid_commit(un, NULL); 1410 if (un->un_state & RUS_ERRED) { 1411 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, 1412 MD_UN2SET(un), MD_SID(un)); 1413 } else if (un->un_state & RUS_LAST_ERRED) { 1414 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, 1415 MD_UN2SET(un), MD_SID(un)); 1416 } 1417 1418 return (EIO); 1419 } 1420 1421 /* 1422 * NAME: raid_mapin_buf 1423 * DESCRIPTION: wait for the input buffer header to be maped in 1424 * PARAMETERS: md_raidps_t *ps 1425 */ 1426 static void 1427 raid_mapin_buf(md_raidcs_t *cs) 1428 { 1429 md_raidps_t *ps = cs->cs_ps; 1430 1431 /* 1432 * check to see if the buffer is maped. If all is ok return the 1433 * offset of the data and return. Since it is expensive to grab 1434 * a mutex this is only done if the mapin is not complete. 1435 * Once the mutex is aquired it is possible that the mapin was 1436 * not done so recheck and if necessary do the mapin. 1437 */ 1438 if (ps->ps_mapin > 0) { 1439 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1440 return; 1441 } 1442 mutex_enter(&ps->ps_mapin_mx); 1443 if (ps->ps_mapin > 0) { 1444 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1445 mutex_exit(&ps->ps_mapin_mx); 1446 return; 1447 } 1448 bp_mapin(ps->ps_bp); 1449 /* 1450 * get the new b_addr out of the parent since bp_mapin just changed it 1451 */ 1452 ps->ps_addr = ps->ps_bp->b_un.b_addr; 1453 cs->cs_addr = ps->ps_addr + cs->cs_offset; 1454 ps->ps_mapin++; 1455 mutex_exit(&ps->ps_mapin_mx); 1456 } 1457 1458 /* 1459 * NAMES: raid_read_no_retry 1460 * DESCRIPTION: I/O retry routine for a RAID metadevice read 1461 * read failed attempting to regenerate the data, 1462 * no retry possible, error occured in raid_raidregenloop(). 1463 * PARAMETERS: mr_unit_t *un - pointer to raid unit structure 1464 * md_raidcs_t *cs - pointer to child structure 1465 */ 1466 /*ARGSUSED*/ 1467 static void 1468 raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs) 1469 { 1470 md_raidps_t *ps = cs->cs_ps; 1471 1472 raid_error_parent(ps, EIO); 1473 raid_free_child(cs, 1); 1474 1475 /* decrement readfrags */ 1476 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 1477 } 1478 1479 /* 1480 * NAMES: raid_read_retry 1481 * DESCRIPTION: I/O retry routine for a RAID metadevice read 1482 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1483 */ 1484 static void 1485 raid_read_retry(mr_unit_t *un, md_raidcs_t *cs) 1486 { 1487 /* re-initialize the buf_t structure for raid_read() */ 1488 cs->cs_dbuf.b_chain = (struct buf *)cs; 1489 cs->cs_dbuf.b_back = &cs->cs_dbuf; 1490 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 1491 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 1492 cs->cs_dbuf.b_error = 0; /* initialize error */ 1493 cs->cs_dbuf.b_offset = -1; 1494 /* Initialize semaphores */ 1495 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 1496 SEMA_DEFAULT, NULL); 1497 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 1498 SEMA_DEFAULT, NULL); 1499 1500 cs->cs_pbuf.b_chain = (struct buf *)cs; 1501 cs->cs_pbuf.b_back = &cs->cs_pbuf; 1502 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 1503 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 1504 cs->cs_pbuf.b_error = 0; /* initialize error */ 1505 cs->cs_pbuf.b_offset = -1; 1506 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 1507 SEMA_DEFAULT, NULL); 1508 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 1509 SEMA_DEFAULT, NULL); 1510 1511 cs->cs_flags &= ~MD_RCS_ERROR; /* reset child error flag */ 1512 cs->cs_flags |= MD_RCS_RECOVERY; /* set RECOVERY flag */ 1513 1514 /* 1515 * re-scheduling I/O with raid_read_io() is simpler. basically, 1516 * raid_read_io() is invoked again with same child structure. 1517 * (NOTE: we aren`t supposed to do any error recovery when an I/O 1518 * error occured in raid_raidregenloop(). 1519 */ 1520 raid_mapin_buf(cs); 1521 raid_read_io(un, cs); 1522 } 1523 1524 /* 1525 * NAMES: raid_rderr 1526 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1527 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1528 * LOCKS: must obtain unit writer lock while calling raid_error_state 1529 * since a unit or column state transition may take place. 1530 * must obtain unit reader lock to retry I/O. 1531 */ 1532 /*ARGSUSED*/ 1533 static void 1534 raid_rderr(md_raidcs_t *cs) 1535 { 1536 md_raidps_t *ps; 1537 mdi_unit_t *ui; 1538 mr_unit_t *un; 1539 int error = 0; 1540 1541 ps = cs->cs_ps; 1542 ui = ps->ps_ui; 1543 un = (mr_unit_t *)md_unit_writerlock(ui); 1544 ASSERT(un != 0); 1545 1546 if (cs->cs_dbuf.b_flags & B_ERROR) 1547 error = raid_error_state(un, &cs->cs_dbuf); 1548 if (cs->cs_pbuf.b_flags & B_ERROR) 1549 error |= raid_error_state(un, &cs->cs_pbuf); 1550 1551 md_unit_writerexit(ui); 1552 1553 ps->ps_flags |= MD_RPS_HSREQ; 1554 1555 un = (mr_unit_t *)md_unit_readerlock(ui); 1556 ASSERT(un != 0); 1557 /* now attempt the appropriate retry routine */ 1558 (*(cs->cs_retry_call))(un, cs); 1559 } 1560 1561 1562 /* 1563 * NAMES: raid_read_error 1564 * DESCRIPTION: I/O error handling routine for a RAID metadevice read 1565 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1566 */ 1567 /*ARGSUSED*/ 1568 static void 1569 raid_read_error(md_raidcs_t *cs) 1570 { 1571 md_raidps_t *ps; 1572 mdi_unit_t *ui; 1573 mr_unit_t *un; 1574 set_t setno; 1575 1576 ps = cs->cs_ps; 1577 ui = ps->ps_ui; 1578 un = cs->cs_un; 1579 1580 setno = MD_UN2SET(un); 1581 1582 if ((cs->cs_dbuf.b_flags & B_ERROR) && 1583 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 1584 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 1585 cmn_err(CE_WARN, "md %s: read error on %s", 1586 md_shortname(MD_SID(un)), 1587 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 1588 1589 if ((cs->cs_pbuf.b_flags & B_ERROR) && 1590 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 1591 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 1592 cmn_err(CE_WARN, "md %s: read error on %s", 1593 md_shortname(MD_SID(un)), 1594 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 1595 1596 md_unit_readerexit(ui); 1597 1598 ASSERT(cs->cs_frags == 0); 1599 1600 /* now schedule processing for possible state change */ 1601 daemon_request(&md_mstr_daemon, raid_rderr, 1602 (daemon_queue_t *)cs, REQ_OLD); 1603 1604 } 1605 1606 /* 1607 * NAMES: getdbuffer 1608 * DESCRIPTION: data buffer allocation for a child structure 1609 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1610 * 1611 * NOTE: always get dbuffer before pbuffer 1612 * and get both buffers before pwslot 1613 * otherwise a deadlock could be introduced. 1614 */ 1615 static void 1616 getdbuffer(md_raidcs_t *cs) 1617 { 1618 mr_unit_t *un; 1619 1620 cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1621 if (cs->cs_dbuffer != NULL) 1622 return; 1623 un = cs->cs_ps->ps_un; 1624 mutex_enter(&un->un_mx); 1625 while (un->un_dbuffer == NULL) { 1626 STAT_INC(data_buffer_waits); 1627 un->un_rflags |= MD_RFLAG_NEEDBUF; 1628 cv_wait(&un->un_cv, &un->un_mx); 1629 } 1630 cs->cs_dbuffer = un->un_dbuffer; 1631 cs->cs_flags |= MD_RCS_UNDBUF; 1632 un->un_dbuffer = NULL; 1633 mutex_exit(&un->un_mx); 1634 } 1635 1636 /* 1637 * NAMES: getpbuffer 1638 * DESCRIPTION: parity buffer allocation for a child structure 1639 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1640 * 1641 * NOTE: always get dbuffer before pbuffer 1642 * and get both buffers before pwslot 1643 * otherwise a deadlock could be introduced. 1644 */ 1645 static void 1646 getpbuffer(md_raidcs_t *cs) 1647 { 1648 mr_unit_t *un; 1649 1650 cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); 1651 if (cs->cs_pbuffer != NULL) 1652 return; 1653 un = cs->cs_ps->ps_un; 1654 mutex_enter(&un->un_mx); 1655 while (un->un_pbuffer == NULL) { 1656 STAT_INC(parity_buffer_waits); 1657 un->un_rflags |= MD_RFLAG_NEEDBUF; 1658 cv_wait(&un->un_cv, &un->un_mx); 1659 } 1660 cs->cs_pbuffer = un->un_pbuffer; 1661 cs->cs_flags |= MD_RCS_UNPBUF; 1662 un->un_pbuffer = NULL; 1663 mutex_exit(&un->un_mx); 1664 } 1665 static void 1666 getresources(md_raidcs_t *cs) 1667 { 1668 md_raidcbuf_t *cbuf; 1669 /* 1670 * NOTE: always get dbuffer before pbuffer 1671 * and get both buffers before pwslot 1672 * otherwise a deadlock could be introduced. 1673 */ 1674 getdbuffer(cs); 1675 getpbuffer(cs); 1676 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 1677 cbuf->cbuf_buffer = 1678 kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); 1679 } 1680 /* 1681 * NAMES: freebuffers 1682 * DESCRIPTION: child structure buffer freeing routine 1683 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1684 */ 1685 static void 1686 freebuffers(md_raidcs_t *cs) 1687 { 1688 mr_unit_t *un; 1689 md_raidcbuf_t *cbuf; 1690 1691 /* free buffers used for full line write */ 1692 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 1693 if (cbuf->cbuf_buffer == NULL) 1694 continue; 1695 kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); 1696 cbuf->cbuf_buffer = NULL; 1697 cbuf->cbuf_bcount = 0; 1698 } 1699 1700 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1701 un = cs->cs_un; 1702 mutex_enter(&un->un_mx); 1703 } 1704 if (cs->cs_dbuffer) { 1705 if (cs->cs_flags & MD_RCS_UNDBUF) 1706 un->un_dbuffer = cs->cs_dbuffer; 1707 else 1708 kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); 1709 } 1710 if (cs->cs_pbuffer) { 1711 if (cs->cs_flags & MD_RCS_UNPBUF) 1712 un->un_pbuffer = cs->cs_pbuffer; 1713 else 1714 kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); 1715 } 1716 if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { 1717 un->un_rflags &= ~MD_RFLAG_NEEDBUF; 1718 cv_broadcast(&un->un_cv); 1719 mutex_exit(&un->un_mx); 1720 } 1721 } 1722 1723 /* 1724 * NAMES: raid_line_reader_lock, raid_line_writer_lock 1725 * DESCRIPTION: RAID metadevice line reader and writer lock routines 1726 * data column # and parity column #. 1727 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 1728 */ 1729 1730 void 1731 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) 1732 { 1733 mr_unit_t *un; 1734 md_raidcs_t *cs1; 1735 1736 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1737 un = cs->cs_un; 1738 cs->cs_flags |= MD_RCS_READER; 1739 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1740 if (!panicstr) 1741 mutex_enter(&un->un_linlck_mx); 1742 cs1 = un->un_linlck_chn; 1743 while (cs1 != NULL) { 1744 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1745 if (raid_io_overlaps(cs, cs1) == 1) 1746 if (cs1->cs_flags & MD_RCS_WRITER) 1747 break; 1748 1749 if (cs1 != NULL) { 1750 if (panicstr) 1751 panic("md; raid line write lock held"); 1752 un->un_linlck_flg = 1; 1753 cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); 1754 STAT_INC(raid_read_waits); 1755 } 1756 } 1757 STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); 1758 STAT_INC(raid_reader_locks); 1759 cs1 = un->un_linlck_chn; 1760 if (cs1 != NULL) 1761 cs1->cs_linlck_prev = cs; 1762 cs->cs_linlck_next = cs1; 1763 cs->cs_linlck_prev = NULL; 1764 un->un_linlck_chn = cs; 1765 cs->cs_flags |= MD_RCS_LLOCKD; 1766 if (resync_thread) { 1767 diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 1768 diskaddr_t line = (lastblk + 1) / un->un_segsize; 1769 ASSERT(raid_state_cnt(un, RCS_RESYNC)); 1770 mutex_enter(&un->un_mx); 1771 un->un_resync_line_index = line; 1772 mutex_exit(&un->un_mx); 1773 } 1774 if (!panicstr) 1775 mutex_exit(&un->un_linlck_mx); 1776 } 1777 1778 int 1779 raid_line_writer_lock(md_raidcs_t *cs, int lock) 1780 { 1781 mr_unit_t *un; 1782 md_raidcs_t *cs1; 1783 1784 ASSERT(cs->cs_line != MD_DISKADDR_ERROR); 1785 cs->cs_flags |= MD_RCS_WRITER; 1786 un = cs->cs_ps->ps_un; 1787 1788 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1789 if (lock && !panicstr) 1790 mutex_enter(&un->un_linlck_mx); 1791 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1792 1793 cs1 = un->un_linlck_chn; 1794 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1795 if (raid_io_overlaps(cs, cs1)) 1796 break; 1797 1798 if (cs1 != NULL) { 1799 if (panicstr) 1800 panic("md: line writer lock inaccessible"); 1801 goto no_lock_exit; 1802 } 1803 1804 if (raid_alloc_pwslot(cs)) { 1805 if (panicstr) 1806 panic("md: no prewrite slots"); 1807 STAT_INC(raid_prewrite_waits); 1808 goto no_lock_exit; 1809 } 1810 1811 cs1 = un->un_linlck_chn; 1812 if (cs1 != NULL) 1813 cs1->cs_linlck_prev = cs; 1814 cs->cs_linlck_next = cs1; 1815 cs->cs_linlck_prev = NULL; 1816 un->un_linlck_chn = cs; 1817 cs->cs_flags |= MD_RCS_LLOCKD; 1818 cs->cs_flags &= ~MD_RCS_WAITING; 1819 STAT_INC(raid_writer_locks); 1820 STAT_MAX(raid_max_write_locks, raid_write_locks_active); 1821 if (lock && !panicstr) 1822 mutex_exit(&un->un_linlck_mx); 1823 return (0); 1824 1825 no_lock_exit: 1826 /* if this is already queued then do not requeue it */ 1827 ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); 1828 if (!lock || (cs->cs_flags & MD_RCS_WAITING)) 1829 return (1); 1830 cs->cs_flags |= MD_RCS_WAITING; 1831 cs->cs_un = un; 1832 raid_enqueue(cs); 1833 if (lock && !panicstr) 1834 mutex_exit(&un->un_linlck_mx); 1835 return (1); 1836 } 1837 1838 static void 1839 raid_startio(md_raidcs_t *cs) 1840 { 1841 mdi_unit_t *ui = cs->cs_ps->ps_ui; 1842 mr_unit_t *un = cs->cs_un; 1843 1844 un = md_unit_readerlock(ui); 1845 raid_write_io(un, cs); 1846 } 1847 1848 void 1849 raid_io_startup(mr_unit_t *un) 1850 { 1851 md_raidcs_t *waiting_list, *cs1; 1852 md_raidcs_t *previous = NULL, *next = NULL; 1853 mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); 1854 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 1855 1856 ASSERT(MUTEX_HELD(&un->un_linlck_mx)); 1857 mutex_enter(io_list_mutex); 1858 1859 /* 1860 * check to be sure there are no reader locks outstanding. If 1861 * there are not then pass on the writer lock. 1862 */ 1863 waiting_list = ui->ui_io_lock->io_list_front; 1864 while (waiting_list) { 1865 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1866 ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); 1867 for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) 1868 if (raid_io_overlaps(waiting_list, cs1) == 1) 1869 break; 1870 /* 1871 * there was an IOs that overlaps this io so go onto 1872 * the next io in the waiting list 1873 */ 1874 if (cs1) { 1875 previous = waiting_list; 1876 waiting_list = waiting_list->cs_linlck_next; 1877 continue; 1878 } 1879 1880 /* 1881 * There are no IOs that overlap this, so remove it from 1882 * the waiting queue, and start it 1883 */ 1884 1885 if (raid_check_pw(waiting_list)) { 1886 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1887 previous = waiting_list; 1888 waiting_list = waiting_list->cs_linlck_next; 1889 continue; 1890 } 1891 ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); 1892 1893 next = waiting_list->cs_linlck_next; 1894 if (previous) 1895 previous->cs_linlck_next = next; 1896 else 1897 ui->ui_io_lock->io_list_front = next; 1898 1899 if (ui->ui_io_lock->io_list_front == NULL) 1900 ui->ui_io_lock->io_list_back = NULL; 1901 1902 if (ui->ui_io_lock->io_list_back == waiting_list) 1903 ui->ui_io_lock->io_list_back = previous; 1904 1905 waiting_list->cs_linlck_next = NULL; 1906 waiting_list->cs_flags &= ~MD_RCS_WAITING; 1907 STAT_DEC(raid_write_queue_length); 1908 if (raid_line_writer_lock(waiting_list, 0)) 1909 panic("region locking corrupted"); 1910 1911 ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); 1912 daemon_request(&md_mstr_daemon, raid_startio, 1913 (daemon_queue_t *)waiting_list, REQ_OLD); 1914 waiting_list = next; 1915 1916 } 1917 mutex_exit(io_list_mutex); 1918 } 1919 1920 void 1921 raid_line_exit(md_raidcs_t *cs) 1922 { 1923 mr_unit_t *un; 1924 1925 un = cs->cs_ps->ps_un; 1926 STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); 1927 mutex_enter(&un->un_linlck_mx); 1928 if (cs->cs_flags & MD_RCS_READER) 1929 STAT_DEC(raid_reader_locks_active); 1930 else 1931 STAT_DEC(raid_write_locks_active); 1932 1933 if (cs->cs_linlck_prev) 1934 cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; 1935 else 1936 un->un_linlck_chn = cs->cs_linlck_next; 1937 if (cs->cs_linlck_next) 1938 cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; 1939 1940 cs->cs_flags &= ~MD_RCS_LLOCKD; 1941 1942 if (un->un_linlck_flg) 1943 cv_broadcast(&un->un_linlck_cv); 1944 1945 un->un_linlck_flg = 0; 1946 cs->cs_line = MD_DISKADDR_ERROR; 1947 1948 raid_cancel_pwslot(cs); 1949 /* 1950 * now that the lock is droped go ahead and see if there are any 1951 * other writes that can be started up 1952 */ 1953 raid_io_startup(un); 1954 1955 mutex_exit(&un->un_linlck_mx); 1956 } 1957 1958 /* 1959 * NAMES: raid_line, raid_pcolumn, raid_dcolumn 1960 * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, 1961 * data column # and parity column #. 1962 * PARAMETERS: int segment - segment number 1963 * mr_unit_t *un - pointer to an unit structure 1964 * RETURNS: raid_line returns line # 1965 * raid_dcolumn returns data column # 1966 * raid_pcolumn returns parity column # 1967 */ 1968 static diskaddr_t 1969 raid_line(diskaddr_t segment, mr_unit_t *un) 1970 { 1971 diskaddr_t adj_seg; 1972 diskaddr_t line; 1973 diskaddr_t max_orig_segment; 1974 1975 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1976 if (segment >= max_orig_segment) { 1977 adj_seg = segment - max_orig_segment; 1978 line = adj_seg % un->un_segsincolumn; 1979 } else { 1980 line = segment / (un->un_origcolumncnt - 1); 1981 } 1982 return (line); 1983 } 1984 1985 uint_t 1986 raid_dcolumn(diskaddr_t segment, mr_unit_t *un) 1987 { 1988 diskaddr_t adj_seg; 1989 diskaddr_t line; 1990 diskaddr_t max_orig_segment; 1991 uint_t column; 1992 1993 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 1994 if (segment >= max_orig_segment) { 1995 adj_seg = segment - max_orig_segment; 1996 column = un->un_origcolumncnt + 1997 (uint_t)(adj_seg / un->un_segsincolumn); 1998 } else { 1999 line = segment / (un->un_origcolumncnt - 1); 2000 column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line) 2001 % un->un_origcolumncnt); 2002 } 2003 return (column); 2004 } 2005 2006 uint_t 2007 raid_pcolumn(diskaddr_t segment, mr_unit_t *un) 2008 { 2009 diskaddr_t adj_seg; 2010 diskaddr_t line; 2011 diskaddr_t max_orig_segment; 2012 uint_t column; 2013 2014 max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; 2015 if (segment >= max_orig_segment) { 2016 adj_seg = segment - max_orig_segment; 2017 line = adj_seg % un->un_segsincolumn; 2018 } else { 2019 line = segment / (un->un_origcolumncnt - 1); 2020 } 2021 column = (uint_t)((line + (un->un_origcolumncnt - 1)) 2022 % un->un_origcolumncnt); 2023 return (column); 2024 } 2025 2026 2027 /* 2028 * Is called in raid_iosetup to probe each column to insure 2029 * that all the columns are in 'okay' state and meet the 2030 * 'full line' requirement. If any column is in error, 2031 * we don't want to enable the 'full line' flag. Previously, 2032 * we would do so and disable it only when a error is 2033 * detected after the first 'full line' io which is too late 2034 * and leads to the potential data corruption. 2035 */ 2036 static int 2037 raid_check_cols(mr_unit_t *un) 2038 { 2039 buf_t bp; 2040 char *buf; 2041 mr_column_t *colptr; 2042 minor_t mnum = MD_SID(un); 2043 int i; 2044 int err = 0; 2045 2046 buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); 2047 2048 for (i = 0; i < un->un_totalcolumncnt; i++) { 2049 md_dev64_t tmpdev; 2050 2051 colptr = &un->un_column[i]; 2052 2053 tmpdev = colptr->un_dev; 2054 /* 2055 * Open by device id 2056 * If this device is hotspared 2057 * use the hotspare key 2058 */ 2059 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 2060 colptr->un_hs_key : colptr->un_orig_key); 2061 2062 if (tmpdev == NODEV64) { 2063 err = 1; 2064 break; 2065 } 2066 2067 colptr->un_dev = tmpdev; 2068 2069 bzero((caddr_t)&bp, sizeof (buf_t)); 2070 bp.b_back = &bp; 2071 bp.b_forw = &bp; 2072 bp.b_flags = (B_READ | B_BUSY); 2073 sema_init(&bp.b_io, 0, NULL, 2074 SEMA_DEFAULT, NULL); 2075 sema_init(&bp.b_sem, 0, NULL, 2076 SEMA_DEFAULT, NULL); 2077 bp.b_edev = md_dev64_to_dev(colptr->un_dev); 2078 bp.b_lblkno = colptr->un_pwstart; 2079 bp.b_bcount = DEV_BSIZE; 2080 bp.b_bufsize = DEV_BSIZE; 2081 bp.b_un.b_addr = (caddr_t)buf; 2082 (void) md_call_strategy(&bp, 0, NULL); 2083 if (biowait(&bp)) { 2084 err = 1; 2085 break; 2086 } 2087 } 2088 2089 kmem_free(buf, DEV_BSIZE); 2090 return (err); 2091 } 2092 2093 /* 2094 * NAME: raid_iosetup 2095 * DESCRIPTION: RAID metadevice specific I/O set up routine which does 2096 * all the necessary calculations to determine the location 2097 * of the segement for the I/O. 2098 * PARAMETERS: mr_unit_t *un - unit number of RAID metadevice 2099 * diskaddr_t blkno - block number of the I/O attempt 2100 * size_t blkcnt - block count for this I/O 2101 * md_raidcs_t *cs - child structure for each segmented I/O 2102 * 2103 * NOTE: The following is an example of a raid disk layer out: 2104 * 2105 * Total Column = 5 2106 * Original Column = 4 2107 * Segment Per Column = 10 2108 * 2109 * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6 2110 * ------------------------------------------------------------- 2111 * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40 2112 * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31 2113 * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32 2114 * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33 2115 * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34 2116 * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35 2117 * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36 2118 * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37 2119 * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38 2120 * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39 2121 */ 2122 static size_t 2123 raid_iosetup( 2124 mr_unit_t *un, 2125 diskaddr_t blkno, 2126 size_t blkcnt, 2127 md_raidcs_t *cs 2128 ) 2129 { 2130 diskaddr_t segment; 2131 diskaddr_t segstart; 2132 diskaddr_t segoff; 2133 size_t leftover; 2134 diskaddr_t line; 2135 uint_t iosize; 2136 uint_t colcnt; 2137 2138 /* caculate the segment# and offset for the block */ 2139 segment = blkno / un->un_segsize; 2140 segstart = segment * un->un_segsize; 2141 segoff = blkno - segstart; 2142 iosize = un->un_iosize - 1; 2143 colcnt = un->un_totalcolumncnt - 1; 2144 line = raid_line(segment, un); 2145 cs->cs_dcolumn = raid_dcolumn(segment, un); 2146 cs->cs_pcolumn = raid_pcolumn(segment, un); 2147 cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags; 2148 cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags; 2149 cs->cs_line = line; 2150 2151 if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) && 2152 (UNIT_STATE(un) & RCS_OKAY) && 2153 (segoff == 0) && 2154 (un->un_totalcolumncnt == un->un_origcolumncnt) && 2155 (un->un_segsize < un->un_iosize) && 2156 (un->un_iosize <= un->un_maxio) && 2157 (blkno == line * un->un_segsize * colcnt) && 2158 (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) && 2159 (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) && 2160 (raid_check_cols(un) == 0)) { 2161 2162 md_raidcbuf_t **cbufp; 2163 md_raidcbuf_t *cbuf; 2164 int i, j; 2165 2166 STAT_INC(raid_full_line_writes); 2167 leftover = blkcnt - (un->un_segsize * colcnt); 2168 ASSERT(blkcnt >= (un->un_segsize * colcnt)); 2169 cs->cs_blkno = line * un->un_segsize; 2170 cs->cs_blkcnt = un->un_segsize; 2171 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2172 cs->cs_bcount = dbtob(cs->cs_blkcnt); 2173 cs->cs_flags |= MD_RCS_LINE; 2174 2175 cbufp = &cs->cs_buflist; 2176 for (i = 0; i < un->un_totalcolumncnt; i++) { 2177 j = cs->cs_dcolumn + i; 2178 j = j % un->un_totalcolumncnt; 2179 2180 if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn)) 2181 continue; 2182 cbuf = kmem_cache_alloc(raid_cbuf_cache, 2183 MD_ALLOCFLAGS); 2184 raid_cbuf_init(cbuf); 2185 cbuf->cbuf_un = cs->cs_un; 2186 cbuf->cbuf_ps = cs->cs_ps; 2187 cbuf->cbuf_column = j; 2188 cbuf->cbuf_bcount = dbtob(un->un_segsize); 2189 *cbufp = cbuf; 2190 cbufp = &cbuf->cbuf_next; 2191 } 2192 return (leftover); 2193 } 2194 2195 leftover = blkcnt - (un->un_segsize - segoff); 2196 if (blkcnt > (un->un_segsize - segoff)) 2197 blkcnt -= leftover; 2198 else 2199 leftover = 0; 2200 2201 if (blkcnt > (size_t)iosize) { 2202 leftover += (blkcnt - iosize); 2203 blkcnt = iosize; 2204 } 2205 2206 /* calculate the line# and column# for the segment */ 2207 cs->cs_flags &= ~MD_RCS_LINE; 2208 cs->cs_blkno = line * un->un_segsize + segoff; 2209 cs->cs_blkcnt = (uint_t)blkcnt; 2210 cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; 2211 cs->cs_bcount = dbtob((uint_t)blkcnt); 2212 return (leftover); 2213 } 2214 2215 /* 2216 * NAME: raid_done 2217 * DESCRIPTION: RAID metadevice I/O done interrupt routine 2218 * PARAMETERS: struct buf *bp - pointer to a buffer structure 2219 */ 2220 static void 2221 raid_done(struct buf *bp) 2222 { 2223 md_raidcs_t *cs; 2224 int flags, frags; 2225 2226 sema_v(&bp->b_io); 2227 cs = (md_raidcs_t *)bp->b_chain; 2228 2229 ASSERT(cs != NULL); 2230 2231 mutex_enter(&cs->cs_mx); 2232 if (bp->b_flags & B_ERROR) { 2233 cs->cs_flags |= MD_RCS_ERROR; 2234 cs->cs_flags &= ~(MD_RCS_ISCALL); 2235 } 2236 2237 flags = cs->cs_flags; 2238 frags = --cs->cs_frags; 2239 mutex_exit(&cs->cs_mx); 2240 if (frags != 0) { 2241 return; 2242 } 2243 2244 if (flags & MD_RCS_ERROR) { 2245 if (cs->cs_error_call) { 2246 daemon_request(&md_done_daemon, cs->cs_error_call, 2247 (daemon_queue_t *)cs, REQ_OLD); 2248 } 2249 return; 2250 } 2251 2252 if (flags & MD_RCS_ISCALL) { 2253 cs->cs_flags &= ~(MD_RCS_ISCALL); 2254 (*(cs->cs_call))(cs); 2255 return; 2256 } 2257 daemon_request(&md_done_daemon, cs->cs_call, 2258 (daemon_queue_t *)cs, REQ_OLD); 2259 } 2260 /* 2261 * the flag RIO_EXTRA is used when dealing with a column in the process 2262 * of being resynced. During the resync, writes may have to take place 2263 * on both the original component and a hotspare component. 2264 */ 2265 #define RIO_DATA 0x00100 /* use data buffer & data column */ 2266 #define RIO_PARITY 0x00200 /* use parity buffer & parity column */ 2267 #define RIO_WRITE 0x00400 /* issue a write */ 2268 #define RIO_READ 0x00800 /* issue a read */ 2269 #define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ 2270 #define RIO_ALT 0x02000 /* do write to alternate device */ 2271 #define RIO_EXTRA 0x04000 /* use extra buffer */ 2272 2273 #define RIO_COLMASK 0x000ff 2274 2275 #define RIO_PREWRITE RIO_WRITE | RIO_PWIO 2276 2277 /* 2278 * NAME: raidio 2279 * DESCRIPTION: RAID metadevice write routine 2280 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2281 */ 2282 static void 2283 raidio(md_raidcs_t *cs, int flags) 2284 { 2285 buf_t *bp; 2286 int column; 2287 int flag; 2288 void *private; 2289 mr_unit_t *un; 2290 int iosize; 2291 diskaddr_t pwstart; 2292 diskaddr_t devstart; 2293 md_dev64_t dev; 2294 2295 un = cs->cs_un; 2296 2297 ASSERT(IO_READER_HELD(un)); 2298 ASSERT(UNIT_READER_HELD(un)); 2299 2300 if (flags & RIO_DATA) { 2301 if (flags & RIO_EXTRA) 2302 bp = &cs->cs_hbuf; 2303 else 2304 bp = &cs->cs_dbuf; 2305 bp->b_un.b_addr = cs->cs_dbuffer; 2306 column = cs->cs_dcolumn; 2307 } else { 2308 if (flags & RIO_EXTRA) 2309 bp = &cs->cs_hbuf; 2310 else 2311 bp = &cs->cs_pbuf; 2312 bp->b_un.b_addr = cs->cs_pbuffer; 2313 column = cs->cs_pcolumn; 2314 } 2315 if (flags & RIO_COLMASK) 2316 column = (flags & RIO_COLMASK) - 1; 2317 2318 bp->b_bcount = cs->cs_bcount; 2319 bp->b_bufsize = cs->cs_bcount; 2320 iosize = un->un_iosize; 2321 2322 /* check if the hotspared device will be used */ 2323 if (flags & RIO_ALT && (flags & RIO_WRITE)) { 2324 pwstart = un->un_column[column].un_alt_pwstart; 2325 devstart = un->un_column[column].un_alt_devstart; 2326 dev = un->un_column[column].un_alt_dev; 2327 } else { 2328 pwstart = un->un_column[column].un_pwstart; 2329 devstart = un->un_column[column].un_devstart; 2330 dev = un->un_column[column].un_dev; 2331 } 2332 2333 /* if not writing to log skip log header */ 2334 if ((flags & RIO_PWIO) == 0) { 2335 bp->b_lblkno = devstart + cs->cs_blkno; 2336 bp->b_un.b_addr += DEV_BSIZE; 2337 } else { 2338 bp->b_bcount += DEV_BSIZE; 2339 bp->b_bufsize = bp->b_bcount; 2340 if (flags & RIO_DATA) { 2341 bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; 2342 } else { /* not DATA -> PARITY */ 2343 bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; 2344 } 2345 } 2346 2347 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); 2348 bp->b_flags |= B_BUSY; 2349 if (flags & RIO_READ) { 2350 bp->b_flags |= B_READ; 2351 } else { 2352 bp->b_flags |= B_WRITE; 2353 if ((nv_available && nv_parity && (flags & RIO_PARITY)) || 2354 (nv_available && nv_prewrite && (flags & RIO_PWIO))) 2355 bp->b_flags |= nv_available; 2356 } 2357 bp->b_iodone = (int (*)())raid_done; 2358 bp->b_edev = md_dev64_to_dev(dev); 2359 2360 ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); 2361 2362 private = cs->cs_strategy_private; 2363 flag = cs->cs_strategy_flag; 2364 2365 md_call_strategy(bp, flag, private); 2366 } 2367 2368 /* 2369 * NAME: genstandardparity 2370 * DESCRIPTION: This routine 2371 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2372 */ 2373 static void 2374 genstandardparity(md_raidcs_t *cs) 2375 { 2376 uint_t *dbuf, *pbuf; 2377 size_t wordcnt; 2378 uint_t dsum = 0; 2379 uint_t psum = 0; 2380 2381 ASSERT((cs->cs_bcount & 0x3) == 0); 2382 2383 wordcnt = cs->cs_bcount / sizeof (uint_t); 2384 2385 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2386 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2387 2388 /* Word aligned */ 2389 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2390 uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); 2391 uint_t uval; 2392 2393 while (wordcnt--) { 2394 uval = *uwbuf++; 2395 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); 2396 ++pbuf; 2397 *dbuf = uval; 2398 dsum ^= uval; 2399 ++dbuf; 2400 } 2401 } else { 2402 uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); 2403 union { 2404 uint_t wb; 2405 uchar_t bb[4]; 2406 } cb; 2407 2408 while (wordcnt--) { 2409 cb.bb[0] = *ubbuf++; 2410 cb.bb[1] = *ubbuf++; 2411 cb.bb[2] = *ubbuf++; 2412 cb.bb[3] = *ubbuf++; 2413 psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); 2414 ++pbuf; 2415 *dbuf = cb.wb; 2416 dsum ^= cb.wb; 2417 ++dbuf; 2418 } 2419 } 2420 2421 RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, 2422 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2423 2, cs->cs_dcolumn, RAID_PWMAGIC); 2424 2425 RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, 2426 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2427 2, cs->cs_pcolumn, RAID_PWMAGIC); 2428 } 2429 2430 static void 2431 genlineparity(md_raidcs_t *cs) 2432 { 2433 2434 mr_unit_t *un = cs->cs_un; 2435 md_raidcbuf_t *cbuf; 2436 uint_t *pbuf, *dbuf; 2437 uint_t *uwbuf; 2438 uchar_t *ubbuf; 2439 size_t wordcnt; 2440 uint_t psum = 0, dsum = 0; 2441 size_t count = un->un_segsize * DEV_BSIZE; 2442 uint_t col; 2443 buf_t *bp; 2444 2445 ASSERT((cs->cs_bcount & 0x3) == 0); 2446 2447 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2448 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2449 uwbuf = (uint_t *)(void *)(cs->cs_addr); 2450 ubbuf = (uchar_t *)(void *)(cs->cs_addr); 2451 2452 wordcnt = count / sizeof (uint_t); 2453 2454 /* Word aligned */ 2455 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2456 uint_t uval; 2457 2458 while (wordcnt--) { 2459 uval = *uwbuf++; 2460 *dbuf = uval; 2461 *pbuf = uval; 2462 dsum ^= uval; 2463 ++pbuf; 2464 ++dbuf; 2465 } 2466 } else { 2467 union { 2468 uint_t wb; 2469 uchar_t bb[4]; 2470 } cb; 2471 2472 while (wordcnt--) { 2473 cb.bb[0] = *ubbuf++; 2474 cb.bb[1] = *ubbuf++; 2475 cb.bb[2] = *ubbuf++; 2476 cb.bb[3] = *ubbuf++; 2477 *dbuf = cb.wb; 2478 *pbuf = cb.wb; 2479 dsum ^= cb.wb; 2480 ++pbuf; 2481 ++dbuf; 2482 } 2483 } 2484 2485 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, 2486 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2487 un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); 2488 2489 raidio(cs, RIO_PREWRITE | RIO_DATA); 2490 2491 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 2492 2493 dsum = 0; 2494 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2495 dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); 2496 2497 wordcnt = count / sizeof (uint_t); 2498 2499 col = cbuf->cbuf_column; 2500 2501 /* Word aligned */ 2502 if (((uintptr_t)cs->cs_addr & 0x3) == 0) { 2503 uint_t uval; 2504 2505 /* 2506 * Only calculate psum when working on the last 2507 * data buffer. 2508 */ 2509 if (cbuf->cbuf_next == NULL) { 2510 psum = 0; 2511 while (wordcnt--) { 2512 uval = *uwbuf++; 2513 *dbuf = uval; 2514 psum ^= (*pbuf ^= uval); 2515 dsum ^= uval; 2516 ++dbuf; 2517 ++pbuf; 2518 } 2519 } else { 2520 while (wordcnt--) { 2521 uval = *uwbuf++; 2522 *dbuf = uval; 2523 *pbuf ^= uval; 2524 dsum ^= uval; 2525 ++dbuf; 2526 ++pbuf; 2527 } 2528 } 2529 } else { 2530 union { 2531 uint_t wb; 2532 uchar_t bb[4]; 2533 } cb; 2534 2535 /* 2536 * Only calculate psum when working on the last 2537 * data buffer. 2538 */ 2539 if (cbuf->cbuf_next == NULL) { 2540 psum = 0; 2541 while (wordcnt--) { 2542 cb.bb[0] = *ubbuf++; 2543 cb.bb[1] = *ubbuf++; 2544 cb.bb[2] = *ubbuf++; 2545 cb.bb[3] = *ubbuf++; 2546 *dbuf = cb.wb; 2547 psum ^= (*pbuf ^= cb.wb); 2548 dsum ^= cb.wb; 2549 ++dbuf; 2550 ++pbuf; 2551 } 2552 } else { 2553 while (wordcnt--) { 2554 cb.bb[0] = *ubbuf++; 2555 cb.bb[1] = *ubbuf++; 2556 cb.bb[2] = *ubbuf++; 2557 cb.bb[3] = *ubbuf++; 2558 *dbuf = cb.wb; 2559 *pbuf ^= cb.wb; 2560 dsum ^= cb.wb; 2561 ++dbuf; 2562 ++pbuf; 2563 } 2564 } 2565 } 2566 RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, 2567 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2568 un->un_totalcolumncnt, col, RAID_PWMAGIC); 2569 2570 /* 2571 * fill in buffer for write to prewrite area 2572 */ 2573 bp = &cbuf->cbuf_bp; 2574 bp->b_un.b_addr = cbuf->cbuf_buffer; 2575 bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; 2576 bp->b_bufsize = bp->b_bcount; 2577 bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + 2578 un->un_column[col].un_pwstart; 2579 bp->b_flags = B_WRITE | B_BUSY; 2580 if (nv_available && nv_prewrite) 2581 bp->b_flags |= nv_available; 2582 bp->b_iodone = (int (*)())raid_done; 2583 bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); 2584 bp->b_chain = (struct buf *)cs; 2585 md_call_strategy(bp, 2586 cs->cs_strategy_flag, cs->cs_strategy_private); 2587 } 2588 2589 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, 2590 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2591 un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); 2592 2593 raidio(cs, RIO_PREWRITE | RIO_PARITY); 2594 } 2595 2596 /* 2597 * NAME: raid_readregenloop 2598 * DESCRIPTION: RAID metadevice write routine 2599 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2600 */ 2601 static void 2602 raid_readregenloop(md_raidcs_t *cs) 2603 { 2604 mr_unit_t *un; 2605 md_raidps_t *ps; 2606 uint_t *dbuf; 2607 uint_t *pbuf; 2608 size_t wordcnt; 2609 2610 un = cs->cs_un; 2611 2612 /* 2613 * XOR the parity with data bytes, must skip the 2614 * pre-write entry header in all data/parity buffers 2615 */ 2616 wordcnt = cs->cs_bcount / sizeof (uint_t); 2617 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 2618 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 2619 while (wordcnt--) 2620 *dbuf++ ^= *pbuf++; 2621 2622 /* bump up the loop count */ 2623 cs->cs_loop++; 2624 2625 /* skip the errored component */ 2626 if (cs->cs_loop == cs->cs_dcolumn) 2627 cs->cs_loop++; 2628 2629 if (cs->cs_loop != un->un_totalcolumncnt) { 2630 cs->cs_frags = 1; 2631 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2632 return; 2633 } 2634 /* reaching the end sof loop */ 2635 ps = cs->cs_ps; 2636 bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); 2637 raid_free_child(cs, 1); 2638 2639 /* decrement readfrags */ 2640 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2641 } 2642 2643 /* 2644 * NAME: raid_read_io 2645 * DESCRIPTION: RAID metadevice read I/O routine 2646 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2647 * md_raidcs_t *cs - pointer to a child structure 2648 */ 2649 static void 2650 raid_read_io(mr_unit_t *un, md_raidcs_t *cs) 2651 { 2652 int flag; 2653 void *private; 2654 buf_t *bp; 2655 buf_t *pb = cs->cs_ps->ps_bp; 2656 mr_column_t *column; 2657 2658 flag = cs->cs_strategy_flag; 2659 private = cs->cs_strategy_private; 2660 column = &un->un_column[cs->cs_dcolumn]; 2661 2662 /* 2663 * The component to be read is good, simply set up bp structure 2664 * and call low level md routine doing the read. 2665 */ 2666 2667 if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || 2668 (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && 2669 (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { 2670 dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ 2671 ddi_dev = md_dev64_to_dev(column->un_dev); 2672 2673 bp = &cs->cs_dbuf; 2674 bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, 2675 column->un_devstart + cs->cs_blkno, 2676 (int (*)())raid_done, bp, KM_NOSLEEP); 2677 2678 bp->b_chain = (buf_t *)cs; 2679 2680 cs->cs_frags = 1; 2681 cs->cs_error_call = raid_read_error; 2682 cs->cs_retry_call = raid_read_retry; 2683 cs->cs_flags |= MD_RCS_ISCALL; 2684 cs->cs_stage = RAID_READ_DONE; 2685 cs->cs_call = raid_stage; 2686 2687 ASSERT(bp->b_edev != 0); 2688 2689 md_call_strategy(bp, flag, private); 2690 return; 2691 } 2692 2693 /* 2694 * The component to be read is bad, have to go through 2695 * raid specific method to read data from other members. 2696 */ 2697 cs->cs_loop = 0; 2698 /* 2699 * NOTE: always get dbuffer before pbuffer 2700 * and get both buffers before pwslot 2701 * otherwise a deadlock could be introduced. 2702 */ 2703 raid_mapin_buf(cs); 2704 getdbuffer(cs); 2705 getpbuffer(cs); 2706 if (cs->cs_loop == cs->cs_dcolumn) 2707 cs->cs_loop++; 2708 2709 /* zero out data buffer for use as a data sink */ 2710 bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); 2711 cs->cs_stage = RAID_NONE; 2712 cs->cs_call = raid_readregenloop; 2713 cs->cs_error_call = raid_read_error; 2714 cs->cs_retry_call = raid_read_no_retry; 2715 cs->cs_frags = 1; 2716 2717 /* use parity buffer to read other columns */ 2718 raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); 2719 } 2720 2721 /* 2722 * NAME: raid_read 2723 * DESCRIPTION: RAID metadevice write routine 2724 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2725 * md_raidcs_t *cs - pointer to a child structure 2726 */ 2727 static int 2728 raid_read(mr_unit_t *un, md_raidcs_t *cs) 2729 { 2730 int error = 0; 2731 md_raidps_t *ps; 2732 mdi_unit_t *ui; 2733 minor_t mnum; 2734 2735 ASSERT(IO_READER_HELD(un)); 2736 ps = cs->cs_ps; 2737 ui = ps->ps_ui; 2738 raid_line_reader_lock(cs, 0); 2739 un = (mr_unit_t *)md_unit_readerlock(ui); 2740 ASSERT(UNIT_STATE(un) != RUS_INIT); 2741 mnum = MD_SID(un); 2742 cs->cs_un = un; 2743 2744 /* make sure the read doesn't go beyond the end of the column */ 2745 if (cs->cs_blkno + cs->cs_blkcnt > 2746 un->un_segsize * un->un_segsincolumn) { 2747 error = ENXIO; 2748 } 2749 if (error) 2750 goto rerror; 2751 2752 if (un->un_state & RUS_REGEN) { 2753 raid_regen_parity(cs); 2754 un = MD_UNIT(mnum); 2755 cs->cs_un = un; 2756 } 2757 2758 raid_read_io(un, cs); 2759 return (0); 2760 2761 rerror: 2762 raid_error_parent(ps, error); 2763 raid_free_child(cs, 1); 2764 /* decrement readfrags */ 2765 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 2766 return (0); 2767 } 2768 2769 /* 2770 * NAME: raid_write_err_retry 2771 * DESCRIPTION: RAID metadevice write retry routine 2772 * write was for parity or data only; 2773 * complete write with error, no recovery possible 2774 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2775 * md_raidcs_t *cs - pointer to a child structure 2776 */ 2777 /*ARGSUSED*/ 2778 static void 2779 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) 2780 { 2781 md_raidps_t *ps = cs->cs_ps; 2782 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2783 2784 /* decrement pwfrags if needed, and frags */ 2785 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2786 flags |= RFP_DECR_PWFRAGS; 2787 raid_error_parent(ps, EIO); 2788 raid_free_child(cs, 1); 2789 raid_free_parent(ps, flags); 2790 } 2791 2792 /* 2793 * NAME: raid_write_err_retry 2794 * DESCRIPTION: RAID metadevice write retry routine 2795 * write is too far along to retry and parent 2796 * has already been signaled with iodone. 2797 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2798 * md_raidcs_t *cs - pointer to a child structure 2799 */ 2800 /*ARGSUSED*/ 2801 static void 2802 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) 2803 { 2804 md_raidps_t *ps = cs->cs_ps; 2805 int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; 2806 2807 /* decrement pwfrags if needed, and frags */ 2808 if (!(cs->cs_flags & MD_RCS_PWDONE)) 2809 flags |= RFP_DECR_PWFRAGS; 2810 raid_free_child(cs, 1); 2811 raid_free_parent(ps, flags); 2812 } 2813 2814 /* 2815 * NAME: raid_write_retry 2816 * DESCRIPTION: RAID metadevice write retry routine 2817 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 2818 * md_raidcs_t *cs - pointer to a child structure 2819 */ 2820 static void 2821 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) 2822 { 2823 md_raidps_t *ps; 2824 2825 ps = cs->cs_ps; 2826 2827 /* re-initialize the buf_t structure for raid_write() */ 2828 cs->cs_dbuf.b_chain = (struct buf *)cs; 2829 cs->cs_dbuf.b_back = &cs->cs_dbuf; 2830 cs->cs_dbuf.b_forw = &cs->cs_dbuf; 2831 cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ 2832 cs->cs_dbuf.b_error = 0; /* initialize error */ 2833 cs->cs_dbuf.b_offset = -1; 2834 /* Initialize semaphores */ 2835 sema_init(&cs->cs_dbuf.b_io, 0, NULL, 2836 SEMA_DEFAULT, NULL); 2837 sema_init(&cs->cs_dbuf.b_sem, 0, NULL, 2838 SEMA_DEFAULT, NULL); 2839 2840 cs->cs_pbuf.b_chain = (struct buf *)cs; 2841 cs->cs_pbuf.b_back = &cs->cs_pbuf; 2842 cs->cs_pbuf.b_forw = &cs->cs_pbuf; 2843 cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ 2844 cs->cs_pbuf.b_error = 0; /* initialize error */ 2845 cs->cs_pbuf.b_offset = -1; 2846 sema_init(&cs->cs_pbuf.b_io, 0, NULL, 2847 SEMA_DEFAULT, NULL); 2848 sema_init(&cs->cs_pbuf.b_sem, 0, NULL, 2849 SEMA_DEFAULT, NULL); 2850 2851 cs->cs_hbuf.b_chain = (struct buf *)cs; 2852 cs->cs_hbuf.b_back = &cs->cs_hbuf; 2853 cs->cs_hbuf.b_forw = &cs->cs_hbuf; 2854 cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ 2855 cs->cs_hbuf.b_error = 0; /* initialize error */ 2856 cs->cs_hbuf.b_offset = -1; 2857 sema_init(&cs->cs_hbuf.b_io, 0, NULL, 2858 SEMA_DEFAULT, NULL); 2859 sema_init(&cs->cs_hbuf.b_sem, 0, NULL, 2860 SEMA_DEFAULT, NULL); 2861 2862 cs->cs_flags &= ~(MD_RCS_ERROR); 2863 /* 2864 * If we have already done'ed the i/o but have done prewrite 2865 * on this child, then reset PWDONE flag and bump pwfrags before 2866 * restarting i/o. 2867 * If pwfrags is zero, we have already 'iodone'd the i/o so 2868 * leave things alone. We don't want to re-'done' it. 2869 */ 2870 mutex_enter(&ps->ps_mx); 2871 if (cs->cs_flags & MD_RCS_PWDONE) { 2872 cs->cs_flags &= ~MD_RCS_PWDONE; 2873 ps->ps_pwfrags++; 2874 } 2875 mutex_exit(&ps->ps_mx); 2876 raid_write_io(un, cs); 2877 } 2878 2879 /* 2880 * NAME: raid_wrerr 2881 * DESCRIPTION: RAID metadevice write routine 2882 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2883 * LOCKS: must obtain unit writer lock while calling raid_error_state 2884 * since a unit or column state transition may take place. 2885 * must obtain unit reader lock to retry I/O. 2886 */ 2887 static void 2888 raid_wrerr(md_raidcs_t *cs) 2889 { 2890 md_raidps_t *ps; 2891 mdi_unit_t *ui; 2892 mr_unit_t *un; 2893 md_raidcbuf_t *cbuf; 2894 2895 ps = cs->cs_ps; 2896 ui = ps->ps_ui; 2897 2898 un = (mr_unit_t *)md_unit_writerlock(ui); 2899 ASSERT(un != 0); 2900 2901 if (cs->cs_dbuf.b_flags & B_ERROR) 2902 (void) raid_error_state(un, &cs->cs_dbuf); 2903 if (cs->cs_pbuf.b_flags & B_ERROR) 2904 (void) raid_error_state(un, &cs->cs_pbuf); 2905 if (cs->cs_hbuf.b_flags & B_ERROR) 2906 (void) raid_error_state(un, &cs->cs_hbuf); 2907 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2908 if (cbuf->cbuf_bp.b_flags & B_ERROR) 2909 (void) raid_error_state(un, &cbuf->cbuf_bp); 2910 2911 md_unit_writerexit(ui); 2912 2913 ps->ps_flags |= MD_RPS_HSREQ; 2914 2915 un = (mr_unit_t *)md_unit_readerlock(ui); 2916 2917 /* now attempt the appropriate retry routine */ 2918 (*(cs->cs_retry_call))(un, cs); 2919 } 2920 /* 2921 * NAMES: raid_write_error 2922 * DESCRIPTION: I/O error handling routine for a RAID metadevice write 2923 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 2924 */ 2925 /*ARGSUSED*/ 2926 static void 2927 raid_write_error(md_raidcs_t *cs) 2928 { 2929 md_raidps_t *ps; 2930 mdi_unit_t *ui; 2931 mr_unit_t *un; 2932 md_raidcbuf_t *cbuf; 2933 set_t setno; 2934 2935 ps = cs->cs_ps; 2936 ui = ps->ps_ui; 2937 un = cs->cs_un; 2938 2939 setno = MD_UN2SET(un); 2940 2941 /* 2942 * locate each buf that is in error on this io and then 2943 * output an error message 2944 */ 2945 if ((cs->cs_dbuf.b_flags & B_ERROR) && 2946 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && 2947 (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) 2948 cmn_err(CE_WARN, "md %s: write error on %s", 2949 md_shortname(MD_SID(un)), 2950 md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); 2951 2952 if ((cs->cs_pbuf.b_flags & B_ERROR) && 2953 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && 2954 (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) 2955 cmn_err(CE_WARN, "md %s: write error on %s", 2956 md_shortname(MD_SID(un)), 2957 md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); 2958 2959 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) 2960 if ((cbuf->cbuf_bp.b_flags & B_ERROR) && 2961 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && 2962 (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) 2963 cmn_err(CE_WARN, "md %s: write error on %s", 2964 md_shortname(MD_SID(un)), 2965 md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), 2966 NULL, 0)); 2967 2968 md_unit_readerexit(ui); 2969 2970 ASSERT(cs->cs_frags == 0); 2971 2972 /* now schedule processing for possible state change */ 2973 daemon_request(&md_mstr_daemon, raid_wrerr, 2974 (daemon_queue_t *)cs, REQ_OLD); 2975 2976 } 2977 2978 /* 2979 * NAME: raid_write_ponly 2980 * DESCRIPTION: RAID metadevice write routine 2981 * in the case where only the parity column can be written 2982 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 2983 */ 2984 static void 2985 raid_write_ponly(md_raidcs_t *cs) 2986 { 2987 md_raidps_t *ps; 2988 mr_unit_t *un = cs->cs_un; 2989 2990 ps = cs->cs_ps; 2991 /* decrement pwfrags if needed, but not frags */ 2992 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 2993 raid_free_parent(ps, RFP_DECR_PWFRAGS); 2994 cs->cs_flags |= MD_RCS_PWDONE; 2995 cs->cs_frags = 1; 2996 cs->cs_stage = RAID_WRITE_PONLY_DONE; 2997 cs->cs_call = raid_stage; 2998 cs->cs_error_call = raid_write_error; 2999 cs->cs_retry_call = raid_write_no_retry; 3000 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3001 cs->cs_frags++; 3002 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); 3003 } 3004 raidio(cs, RIO_PARITY | RIO_WRITE); 3005 } 3006 3007 /* 3008 * NAME: raid_write_ploop 3009 * DESCRIPTION: RAID metadevice write routine, constructs parity from 3010 * data in other columns. 3011 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3012 */ 3013 static void 3014 raid_write_ploop(md_raidcs_t *cs) 3015 { 3016 mr_unit_t *un = cs->cs_un; 3017 uint_t *dbuf; 3018 uint_t *pbuf; 3019 size_t wordcnt; 3020 uint_t psum = 0; 3021 3022 wordcnt = cs->cs_bcount / sizeof (uint_t); 3023 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3024 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3025 while (wordcnt--) 3026 *pbuf++ ^= *dbuf++; 3027 cs->cs_loop++; 3028 3029 /* 3030 * build parity from scratch using new data, 3031 * skip reading the data and parity columns. 3032 */ 3033 while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) 3034 cs->cs_loop++; 3035 3036 if (cs->cs_loop != un->un_totalcolumncnt) { 3037 cs->cs_frags = 1; 3038 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3039 return; 3040 } 3041 3042 /* construct checksum for parity buffer */ 3043 wordcnt = cs->cs_bcount / sizeof (uint_t); 3044 pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); 3045 while (wordcnt--) { 3046 psum ^= *pbuf; 3047 pbuf++; 3048 } 3049 RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, 3050 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3051 1, cs->cs_pcolumn, RAID_PWMAGIC); 3052 3053 cs->cs_stage = RAID_NONE; 3054 cs->cs_call = raid_write_ponly; 3055 cs->cs_error_call = raid_write_error; 3056 cs->cs_retry_call = raid_write_err_retry; 3057 cs->cs_frags = 1; 3058 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3059 cs->cs_frags++; 3060 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3061 } 3062 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3063 } 3064 3065 /* 3066 * NAME: raid_write_donly 3067 * DESCRIPTION: RAID metadevice write routine 3068 * Completed writing data to prewrite entry 3069 * in the case where only the data column can be written 3070 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3071 */ 3072 static void 3073 raid_write_donly(md_raidcs_t *cs) 3074 { 3075 md_raidps_t *ps; 3076 mr_unit_t *un = cs->cs_un; 3077 3078 ps = cs->cs_ps; 3079 /* WARNING: don't release unit reader lock here... */ 3080 /* decrement pwfrags if needed, but not frags */ 3081 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3082 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3083 cs->cs_flags |= MD_RCS_PWDONE; 3084 cs->cs_frags = 1; 3085 cs->cs_stage = RAID_WRITE_DONLY_DONE; 3086 cs->cs_call = raid_stage; 3087 cs->cs_error_call = raid_write_error; 3088 cs->cs_retry_call = raid_write_err_retry; 3089 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3090 cs->cs_frags++; 3091 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3092 } 3093 raidio(cs, RIO_DATA | RIO_WRITE); 3094 } 3095 3096 /* 3097 * NAME: raid_write_got_old 3098 * DESCRIPTION: RAID metadevice write routine 3099 * completed read of old data and old parity 3100 * PARAMETERS: md_raidcs_t *cs - pointer to a child structure 3101 */ 3102 static void 3103 raid_write_got_old(md_raidcs_t *cs) 3104 { 3105 mr_unit_t *un = cs->cs_un; 3106 3107 ASSERT(IO_READER_HELD(cs->cs_un)); 3108 ASSERT(UNIT_READER_HELD(cs->cs_un)); 3109 3110 raid_mapin_buf(cs); 3111 genstandardparity(cs); 3112 cs->cs_frags = 2; 3113 cs->cs_call = raid_stage; 3114 cs->cs_stage = RAID_PREWRITE_DONE; 3115 cs->cs_error_call = raid_write_error; 3116 cs->cs_retry_call = raid_write_retry; 3117 3118 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3119 cs->cs_frags++; 3120 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); 3121 } 3122 3123 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3124 cs->cs_frags++; 3125 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); 3126 } 3127 ASSERT(cs->cs_frags < 4); 3128 raidio(cs, RIO_DATA | RIO_PREWRITE); 3129 raidio(cs, RIO_PARITY | RIO_PREWRITE); 3130 } 3131 3132 /* 3133 * NAME: raid_write_io 3134 * DESCRIPTION: RAID metadevice write I/O routine 3135 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3136 * md_raidcs_t *cs - pointer to a child structure 3137 */ 3138 3139 /*ARGSUSED*/ 3140 static void 3141 raid_write_io(mr_unit_t *un, md_raidcs_t *cs) 3142 { 3143 md_raidps_t *ps = cs->cs_ps; 3144 uint_t *dbuf; 3145 uint_t *ubuf; 3146 size_t wordcnt; 3147 uint_t dsum = 0; 3148 int pcheck; 3149 int dcheck; 3150 3151 ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & 3152 RCS_INIT) == 0); 3153 ASSERT((un->un_column[cs->cs_dcolumn].un_devstate & 3154 RCS_INIT) == 0); 3155 ASSERT(IO_READER_HELD(un)); 3156 ASSERT(UNIT_READER_HELD(un)); 3157 ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); 3158 if (cs->cs_flags & MD_RCS_LINE) { 3159 3160 mr_unit_t *un = cs->cs_un; 3161 3162 ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); 3163 raid_mapin_buf(cs); 3164 cs->cs_frags = un->un_origcolumncnt; 3165 cs->cs_call = raid_stage; 3166 cs->cs_error_call = raid_write_error; 3167 cs->cs_retry_call = raid_write_no_retry; 3168 cs->cs_stage = RAID_LINE_PWDONE; 3169 genlineparity(cs); 3170 return; 3171 } 3172 3173 pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); 3174 dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); 3175 cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck; 3176 3177 if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { 3178 int err = EIO; 3179 3180 if ((un->un_column[cs->cs_pcolumn].un_devstate == 3181 RCS_LAST_ERRED) || 3182 (un->un_column[cs->cs_dcolumn].un_devstate == 3183 RCS_LAST_ERRED)) 3184 err = ENXIO; 3185 raid_error_parent(ps, err); 3186 ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); 3187 raid_free_child(cs, 1); 3188 raid_free_parent(ps, RFP_DECR_FRAGS 3189 | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); 3190 return; 3191 } 3192 3193 if (pcheck & RCL_ERRED) { 3194 /* 3195 * handle case of only having data drive 3196 */ 3197 raid_mapin_buf(cs); 3198 wordcnt = cs->cs_bcount / sizeof (uint_t); 3199 3200 dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); 3201 ubuf = (uint_t *)(void *)(cs->cs_addr); 3202 3203 while (wordcnt--) { 3204 *dbuf = *ubuf; 3205 dsum ^= *ubuf; 3206 dbuf++; 3207 ubuf++; 3208 } 3209 RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, 3210 cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 3211 1, cs->cs_dcolumn, RAID_PWMAGIC); 3212 cs->cs_frags = 1; 3213 cs->cs_stage = RAID_NONE; 3214 cs->cs_call = raid_write_donly; 3215 cs->cs_error_call = raid_write_error; 3216 cs->cs_retry_call = raid_write_err_retry; 3217 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3218 cs->cs_frags++; 3219 raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | 3220 RIO_PREWRITE); 3221 } 3222 raidio(cs, RIO_DATA | RIO_PREWRITE); 3223 return; 3224 } 3225 3226 if (dcheck & RCL_ERRED) { 3227 /* 3228 * handle case of only having parity drive 3229 * build parity from scratch using new data, 3230 * skip reading the data and parity columns. 3231 */ 3232 raid_mapin_buf(cs); 3233 cs->cs_loop = 0; 3234 while (cs->cs_loop == cs->cs_dcolumn || 3235 cs->cs_loop == cs->cs_pcolumn) 3236 cs->cs_loop++; 3237 3238 /* copy new data in to begin building parity */ 3239 bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); 3240 cs->cs_stage = RAID_NONE; 3241 cs->cs_call = raid_write_ploop; 3242 cs->cs_error_call = raid_write_error; 3243 cs->cs_retry_call = raid_write_err_retry; 3244 cs->cs_frags = 1; 3245 raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); 3246 return; 3247 } 3248 /* 3249 * handle normal cases 3250 * read old data and old parity 3251 */ 3252 cs->cs_frags = 2; 3253 cs->cs_stage = RAID_NONE; 3254 cs->cs_call = raid_write_got_old; 3255 cs->cs_error_call = raid_write_error; 3256 cs->cs_retry_call = raid_write_retry; 3257 ASSERT(ps->ps_magic == RAID_PSMAGIC); 3258 raidio(cs, RIO_DATA | RIO_READ); 3259 raidio(cs, RIO_PARITY | RIO_READ); 3260 } 3261 3262 static void 3263 raid_enqueue(md_raidcs_t *cs) 3264 { 3265 mdi_unit_t *ui = cs->cs_ps->ps_ui; 3266 kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; 3267 md_raidcs_t *cs1; 3268 3269 mutex_enter(io_list_mutex); 3270 ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); 3271 if (ui->ui_io_lock->io_list_front == NULL) { 3272 ui->ui_io_lock->io_list_front = cs; 3273 ui->ui_io_lock->io_list_back = cs; 3274 } else { 3275 cs1 = ui->ui_io_lock->io_list_back; 3276 cs1->cs_linlck_next = cs; 3277 ui->ui_io_lock->io_list_back = cs; 3278 } 3279 STAT_INC(raid_write_waits); 3280 STAT_MAX(raid_max_write_q_length, raid_write_queue_length); 3281 cs->cs_linlck_next = NULL; 3282 mutex_exit(io_list_mutex); 3283 } 3284 3285 /* 3286 * NAME: raid_write 3287 * DESCRIPTION: RAID metadevice write routine 3288 * PARAMETERS: mr_unit_t *un - pointer to a unit structure 3289 * md_raidcs_t *cs - pointer to a child structure 3290 */ 3291 3292 /*ARGSUSED*/ 3293 static int 3294 raid_write(mr_unit_t *un, md_raidcs_t *cs) 3295 { 3296 int error = 0; 3297 md_raidps_t *ps; 3298 mdi_unit_t *ui; 3299 minor_t mnum; 3300 clock_t timeout; 3301 3302 ASSERT(IO_READER_HELD(un)); 3303 ps = cs->cs_ps; 3304 ui = ps->ps_ui; 3305 3306 ASSERT(UNIT_STATE(un) != RUS_INIT); 3307 if (UNIT_STATE(un) == RUS_LAST_ERRED) 3308 error = EIO; 3309 3310 /* make sure the write doesn't go beyond the column */ 3311 if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) 3312 error = ENXIO; 3313 if (error) 3314 goto werror; 3315 3316 getresources(cs); 3317 3318 /* 3319 * this is an advisory loop that keeps the waiting lists short 3320 * to reduce cpu time. Since there is a race introduced by not 3321 * aquiring all the correct mutexes, use a cv_timedwait to be 3322 * sure the write always will wake up and start. 3323 */ 3324 while (raid_check_pw(cs)) { 3325 mutex_enter(&un->un_mx); 3326 (void) drv_getparm(LBOLT, &timeout); 3327 timeout += md_wr_wait; 3328 un->un_rflags |= MD_RFLAG_NEEDPW; 3329 STAT_INC(raid_prewrite_waits); 3330 (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout); 3331 un->un_rflags &= ~MD_RFLAG_NEEDPW; 3332 mutex_exit(&un->un_mx); 3333 } 3334 3335 if (raid_line_writer_lock(cs, 1)) 3336 return (0); 3337 3338 un = (mr_unit_t *)md_unit_readerlock(ui); 3339 cs->cs_un = un; 3340 mnum = MD_SID(un); 3341 3342 if (un->un_state & RUS_REGEN) { 3343 raid_regen_parity(cs); 3344 un = MD_UNIT(mnum); 3345 cs->cs_un = un; 3346 } 3347 3348 raid_write_io(un, cs); 3349 return (0); 3350 werror: 3351 /* aquire unit reader lock sinc raid_free_child always drops it */ 3352 raid_error_parent(ps, error); 3353 raid_free_child(cs, 0); 3354 /* decrement both pwfrags and frags */ 3355 raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); 3356 return (0); 3357 } 3358 3359 3360 /* 3361 * NAMES: raid_stage 3362 * DESCRIPTION: post-processing routine for a RAID metadevice 3363 * PARAMETERS: md_raidcs_t *cs - pointer to child structure 3364 */ 3365 static void 3366 raid_stage(md_raidcs_t *cs) 3367 { 3368 md_raidps_t *ps = cs->cs_ps; 3369 mr_unit_t *un = cs->cs_un; 3370 md_raidcbuf_t *cbuf; 3371 buf_t *bp; 3372 void *private; 3373 int flag; 3374 3375 switch (cs->cs_stage) { 3376 case RAID_READ_DONE: 3377 raid_free_child(cs, 1); 3378 /* decrement readfrags */ 3379 raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); 3380 return; 3381 3382 case RAID_WRITE_DONE: 3383 case RAID_WRITE_PONLY_DONE: 3384 case RAID_WRITE_DONLY_DONE: 3385 /* 3386 * Completed writing real parity and/or data. 3387 */ 3388 ASSERT(cs->cs_flags & MD_RCS_PWDONE); 3389 raid_free_child(cs, 1); 3390 /* decrement frags but not pwfrags */ 3391 raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); 3392 return; 3393 3394 case RAID_PREWRITE_DONE: 3395 /* 3396 * completed writing data and parity to prewrite entries 3397 */ 3398 /* 3399 * WARNING: don't release unit reader lock here.. 3400 * decrement pwfrags but not frags 3401 */ 3402 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3403 cs->cs_flags |= MD_RCS_PWDONE; 3404 cs->cs_frags = 2; 3405 cs->cs_stage = RAID_WRITE_DONE; 3406 cs->cs_call = raid_stage; 3407 cs->cs_error_call = raid_write_error; 3408 cs->cs_retry_call = raid_write_no_retry; 3409 if (WRITE_ALT(un, cs->cs_pcolumn)) { 3410 cs->cs_frags++; 3411 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | 3412 RIO_WRITE); 3413 } 3414 if (WRITE_ALT(un, cs->cs_dcolumn)) { 3415 cs->cs_frags++; 3416 raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); 3417 } 3418 ASSERT(cs->cs_frags < 4); 3419 raidio(cs, RIO_DATA | RIO_WRITE); 3420 raidio(cs, RIO_PARITY | RIO_WRITE); 3421 if (cs->cs_pw_inval_list) { 3422 raid_free_pwinvalidate(cs); 3423 } 3424 return; 3425 3426 case RAID_LINE_PWDONE: 3427 ASSERT(cs->cs_frags == 0); 3428 raid_free_parent(ps, RFP_DECR_PWFRAGS); 3429 cs->cs_flags |= MD_RCS_PWDONE; 3430 cs->cs_frags = un->un_origcolumncnt; 3431 cs->cs_call = raid_stage; 3432 cs->cs_error_call = raid_write_error; 3433 cs->cs_retry_call = raid_write_no_retry; 3434 cs->cs_stage = RAID_WRITE_DONE; 3435 for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { 3436 /* 3437 * fill in buffer for write to prewrite area 3438 */ 3439 bp = &cbuf->cbuf_bp; 3440 bp->b_back = bp; 3441 bp->b_forw = bp; 3442 bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; 3443 bp->b_bcount = cbuf->cbuf_bcount; 3444 bp->b_bufsize = cbuf->cbuf_bcount; 3445 bp->b_lblkno = 3446 un->un_column[cbuf->cbuf_column].un_devstart + 3447 cs->cs_blkno; 3448 bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); 3449 bp->b_flags &= ~nv_available; 3450 bp->b_flags |= B_WRITE | B_BUSY; 3451 bp->b_iodone = (int (*)())raid_done; 3452 bp->b_edev = md_dev64_to_dev( 3453 un->un_column[cbuf->cbuf_column].un_dev); 3454 bp->b_chain = (struct buf *)cs; 3455 private = cs->cs_strategy_private; 3456 flag = cs->cs_strategy_flag; 3457 md_call_strategy(bp, flag, private); 3458 } 3459 raidio(cs, RIO_DATA | RIO_WRITE); 3460 raidio(cs, RIO_PARITY | RIO_WRITE); 3461 if (cs->cs_pw_inval_list) { 3462 raid_free_pwinvalidate(cs); 3463 } 3464 return; 3465 3466 default: 3467 ASSERT(0); 3468 break; 3469 } 3470 } 3471 /* 3472 * NAME: md_raid_strategy 3473 * DESCRIPTION: RAID metadevice I/O oprations entry point. 3474 * PARAMETERS: buf_t *pb - pointer to a user I/O buffer 3475 * int flag - metadevice specific flag 3476 * void *private - carry over flag ?? 3477 * 3478 */ 3479 3480 void 3481 md_raid_strategy(buf_t *pb, int flag, void *private) 3482 { 3483 md_raidps_t *ps; 3484 md_raidcs_t *cs; 3485 int doing_writes; 3486 int err; 3487 mr_unit_t *un; 3488 mdi_unit_t *ui; 3489 size_t count; 3490 diskaddr_t blkno; 3491 caddr_t addr; 3492 off_t offset; 3493 int colcnt; 3494 minor_t mnum; 3495 set_t setno; 3496 3497 ui = MDI_UNIT(getminor(pb->b_edev)); 3498 md_kstat_waitq_enter(ui); 3499 un = (mr_unit_t *)md_io_readerlock(ui); 3500 setno = MD_MIN2SET(getminor(pb->b_edev)); 3501 3502 if ((flag & MD_NOBLOCK) == 0) { 3503 if (md_inc_iocount(setno) != 0) { 3504 pb->b_flags |= B_ERROR; 3505 pb->b_error = ENXIO; 3506 pb->b_resid = pb->b_bcount; 3507 md_io_readerexit(ui); 3508 biodone(pb); 3509 return; 3510 } 3511 } else { 3512 md_inc_iocount_noblock(setno); 3513 } 3514 3515 mnum = MD_SID(un); 3516 colcnt = un->un_totalcolumncnt - 1; 3517 count = pb->b_bcount; 3518 3519 STAT_CHECK(raid_512, count == 512); 3520 STAT_CHECK(raid_1024, count == 1024); 3521 STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); 3522 STAT_CHECK(raid_8192, count == 8192); 3523 STAT_CHECK(raid_8192_bigger, count > 8192); 3524 3525 (void *) md_unit_readerlock(ui); 3526 if (!(flag & MD_STR_NOTTOP)) { 3527 err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ 3528 if (err != 0) { 3529 md_kstat_waitq_exit(ui); 3530 md_io_readerexit(ui); 3531 return; 3532 } 3533 } 3534 md_unit_readerexit(ui); 3535 3536 STAT_INC(raid_total_io); 3537 3538 /* allocate a parent structure for the user I/O */ 3539 ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); 3540 raid_parent_init(ps); 3541 3542 /* 3543 * Save essential information from the original buffhdr 3544 * in the md_save structure. 3545 */ 3546 ps->ps_un = un; 3547 ps->ps_ui = ui; 3548 ps->ps_bp = pb; 3549 ps->ps_addr = pb->b_un.b_addr; 3550 3551 if ((pb->b_flags & B_READ) == 0) { 3552 ps->ps_flags |= MD_RPS_WRITE; 3553 doing_writes = 1; 3554 STAT_INC(raid_writes); 3555 } else { 3556 ps->ps_flags |= MD_RPS_READ; 3557 doing_writes = 0; 3558 STAT_INC(raid_reads); 3559 } 3560 3561 count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ 3562 blkno = pb->b_lblkno; /* block number on device */ 3563 addr = 0; 3564 offset = 0; 3565 ps->ps_pwfrags = 1; 3566 ps->ps_frags = 1; 3567 md_kstat_waitq_to_runq(ui); 3568 3569 do { 3570 cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); 3571 raid_child_init(cs); 3572 cs->cs_ps = ps; 3573 cs->cs_un = un; 3574 cs->cs_mdunit = mnum; 3575 cs->cs_strategy_flag = flag; 3576 cs->cs_strategy_private = private; 3577 cs->cs_addr = addr; 3578 cs->cs_offset = offset; 3579 count = raid_iosetup(un, blkno, count, cs); 3580 if (cs->cs_flags & MD_RCS_LINE) { 3581 blkno += (cs->cs_blkcnt * colcnt); 3582 offset += (cs->cs_bcount * colcnt); 3583 } else { 3584 blkno += cs->cs_blkcnt; 3585 offset += cs->cs_bcount; 3586 } 3587 /* for each cs bump up the ps_pwfrags and ps_frags fields */ 3588 if (count) { 3589 mutex_enter(&ps->ps_mx); 3590 ps->ps_pwfrags++; 3591 ps->ps_frags++; 3592 mutex_exit(&ps->ps_mx); 3593 if (doing_writes) 3594 (void) raid_write(un, cs); 3595 else 3596 (void) raid_read(un, cs); 3597 } 3598 } while (count); 3599 if (doing_writes) { 3600 (void) raid_write(un, cs); 3601 } else 3602 (void) raid_read(un, cs); 3603 3604 if (! (flag & MD_STR_NOTTOP) && panicstr) { 3605 while (! (ps->ps_flags & MD_RPS_DONE)) { 3606 md_daemon(1, &md_done_daemon); 3607 drv_usecwait(10); 3608 } 3609 kmem_cache_free(raid_parent_cache, ps); 3610 } 3611 } 3612 3613 /* 3614 * NAMES: raid_snarf 3615 * DESCRIPTION: RAID metadevice SNARF entry point 3616 * PARAMETERS: md_snarfcmd_t cmd, 3617 * set_t setno 3618 * RETURNS: 3619 */ 3620 static int 3621 raid_snarf(md_snarfcmd_t cmd, set_t setno) 3622 { 3623 mr_unit_t *un; 3624 mddb_recid_t recid; 3625 int gotsomething; 3626 int all_raid_gotten; 3627 mddb_type_t typ1; 3628 uint_t ncol; 3629 mddb_de_ic_t *dep; 3630 mddb_rb32_t *rbp; 3631 size_t newreqsize; 3632 mr_unit_t *big_un; 3633 mr_unit32_od_t *small_un; 3634 3635 3636 if (cmd == MD_SNARF_CLEANUP) 3637 return (0); 3638 3639 all_raid_gotten = 1; 3640 gotsomething = 0; 3641 typ1 = (mddb_type_t)md_getshared_key(setno, 3642 raid_md_ops.md_driver.md_drivername); 3643 recid = mddb_makerecid(setno, 0); 3644 3645 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 3646 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { 3647 continue; 3648 } 3649 3650 dep = mddb_getrecdep(recid); 3651 dep->de_flags = MDDB_F_RAID; 3652 rbp = dep->de_rb; 3653 if ((rbp->rb_revision == MDDB_REV_RB) && 3654 ((rbp->rb_private & MD_PRV_CONVD) == 0)) { 3655 /* 3656 * This means, we have an old and small record 3657 * and this record hasn't already been converted. 3658 * Before we create an incore metadevice from this 3659 * we have to convert it to a big record. 3660 */ 3661 small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid); 3662 ncol = small_un->un_totalcolumncnt; 3663 newreqsize = sizeof (mr_unit_t) + 3664 ((ncol - 1) * sizeof (mr_column_t)); 3665 big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); 3666 raid_convert((caddr_t)small_un, (caddr_t)big_un, 3667 SMALL_2_BIG); 3668 kmem_free(small_un, dep->de_reqsize); 3669 dep->de_rb_userdata = big_un; 3670 dep->de_reqsize = newreqsize; 3671 un = big_un; 3672 rbp->rb_private |= MD_PRV_CONVD; 3673 } else { 3674 /* Big device */ 3675 un = (mr_unit_t *)mddb_getrecaddr(recid); 3676 } 3677 3678 /* Set revision and flag accordingly */ 3679 if (rbp->rb_revision == MDDB_REV_RB) { 3680 un->c.un_revision = MD_32BIT_META_DEV; 3681 } else { 3682 un->c.un_revision = MD_64BIT_META_DEV; 3683 un->c.un_flag |= MD_EFILABEL; 3684 } 3685 3686 /* 3687 * Create minor device node for snarfed entry. 3688 */ 3689 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 3690 3691 if (MD_UNIT(MD_SID(un)) != NULL) { 3692 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3693 continue; 3694 } 3695 all_raid_gotten = 0; 3696 if (raid_build_incore((void *)un, 1) == 0) { 3697 mddb_setrecprivate(recid, MD_PRV_GOTIT); 3698 md_create_unit_incore(MD_SID(un), &raid_md_ops, 3699 1); 3700 gotsomething = 1; 3701 } else if (un->mr_ic) { 3702 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 3703 un->un_totalcolumncnt); 3704 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 3705 } 3706 } 3707 3708 if (!all_raid_gotten) { 3709 return (gotsomething); 3710 } 3711 3712 recid = mddb_makerecid(setno, 0); 3713 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) 3714 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 3715 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 3716 3717 return (0); 3718 } 3719 3720 /* 3721 * NAMES: raid_halt 3722 * DESCRIPTION: RAID metadevice HALT entry point 3723 * PARAMETERS: md_haltcmd_t cmd - 3724 * set_t setno - 3725 * RETURNS: 3726 */ 3727 static int 3728 raid_halt(md_haltcmd_t cmd, set_t setno) 3729 { 3730 set_t i; 3731 mdi_unit_t *ui; 3732 minor_t mnum; 3733 3734 if (cmd == MD_HALT_CLOSE) 3735 return (0); 3736 3737 if (cmd == MD_HALT_OPEN) 3738 return (0); 3739 3740 if (cmd == MD_HALT_UNLOAD) 3741 return (0); 3742 3743 if (cmd == MD_HALT_CHECK) { 3744 for (i = 0; i < md_nunits; i++) { 3745 mnum = MD_MKMIN(setno, i); 3746 if ((ui = MDI_UNIT(mnum)) == NULL) 3747 continue; 3748 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3749 continue; 3750 if (md_unit_isopen(ui)) 3751 return (1); 3752 } 3753 return (0); 3754 } 3755 3756 if (cmd != MD_HALT_DOIT) 3757 return (1); 3758 3759 for (i = 0; i < md_nunits; i++) { 3760 mnum = MD_MKMIN(setno, i); 3761 if ((ui = MDI_UNIT(mnum)) == NULL) 3762 continue; 3763 if (ui->ui_opsindex != raid_md_ops.md_selfindex) 3764 continue; 3765 reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); 3766 } 3767 return (0); 3768 } 3769 3770 /* 3771 * NAMES: raid_close_all_devs 3772 * DESCRIPTION: Close all the devices of the unit. 3773 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3774 * RETURNS: 3775 */ 3776 void 3777 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) 3778 { 3779 int i; 3780 mr_column_t *device; 3781 3782 for (i = 0; i < un->un_totalcolumncnt; i++) { 3783 device = &un->un_column[i]; 3784 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3785 ASSERT((device->un_dev != (md_dev64_t)0) && 3786 (device->un_dev != NODEV64)); 3787 if ((device->un_devstate & RCS_OKAY) && init_pw) 3788 (void) init_pw_area(un, device->un_dev, 3789 device->un_pwstart, i); 3790 md_layered_close(device->un_dev, md_cflags); 3791 device->un_devflags &= ~MD_RAID_DEV_ISOPEN; 3792 } 3793 } 3794 } 3795 3796 /* 3797 * NAMES: raid_open_all_devs 3798 * DESCRIPTION: Open all the components (columns) of the device unit. 3799 * PARAMETERS: mr_unit_t *un - pointer to unit structure 3800 * RETURNS: 3801 */ 3802 static int 3803 raid_open_all_devs(mr_unit_t *un, int md_oflags) 3804 { 3805 minor_t mnum = MD_SID(un); 3806 int i; 3807 int not_opened = 0; 3808 int commit = 0; 3809 int col = -1; 3810 mr_column_t *device; 3811 set_t setno = MD_MIN2SET(MD_SID(un)); 3812 side_t side = mddb_getsidenum(setno); 3813 mdkey_t key; 3814 mdi_unit_t *ui = MDI_UNIT(mnum); 3815 3816 ui->ui_tstate &= ~MD_INACCESSIBLE; 3817 3818 for (i = 0; i < un->un_totalcolumncnt; i++) { 3819 md_dev64_t tmpdev; 3820 3821 device = &un->un_column[i]; 3822 3823 if (COLUMN_STATE(un, i) & RCS_ERRED) { 3824 not_opened++; 3825 continue; 3826 } 3827 3828 if (device->un_devflags & MD_RAID_DEV_ISOPEN) 3829 continue; 3830 3831 tmpdev = device->un_dev; 3832 /* 3833 * Open by device id 3834 */ 3835 key = HOTSPARED(un, i) ? 3836 device->un_hs_key : device->un_orig_key; 3837 if ((md_getmajor(tmpdev) != md_major) && 3838 md_devid_found(setno, side, key) == 1) { 3839 tmpdev = md_resolve_bydevid(mnum, tmpdev, key); 3840 } 3841 if (md_layered_open(mnum, &tmpdev, md_oflags)) { 3842 device->un_dev = tmpdev; 3843 not_opened++; 3844 continue; 3845 } 3846 device->un_dev = tmpdev; 3847 device->un_devflags |= MD_RAID_DEV_ISOPEN; 3848 } 3849 3850 /* if open errors and errored devices are 1 then device can run */ 3851 if (not_opened > 1) { 3852 cmn_err(CE_WARN, 3853 "md: %s failed to open. open error on %s\n", 3854 md_shortname(MD_SID(un)), 3855 md_devname(MD_UN2SET(un), device->un_orig_dev, 3856 NULL, 0)); 3857 3858 ui->ui_tstate |= MD_INACCESSIBLE; 3859 3860 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3861 MD_UN2SET(un), MD_SID(un)); 3862 3863 return (not_opened > 1); 3864 } 3865 3866 for (i = 0; i < un->un_totalcolumncnt; i++) { 3867 device = &un->un_column[i]; 3868 if (device->un_devflags & MD_RAID_DEV_ISOPEN) { 3869 if (device->un_devstate & RCS_LAST_ERRED) { 3870 /* 3871 * At this point in time there is a possibility 3872 * that errors were the result of a controller 3873 * failure with more than a single column on it 3874 * so clear out last errored columns and let errors 3875 * re-occur is necessary. 3876 */ 3877 raid_set_state(un, i, RCS_OKAY, 0); 3878 commit++; 3879 } 3880 continue; 3881 } 3882 ASSERT(col == -1); 3883 col = i; 3884 } 3885 3886 if (col != -1) { 3887 raid_set_state(un, col, RCS_ERRED, 0); 3888 commit++; 3889 } 3890 3891 if (commit) 3892 raid_commit(un, NULL); 3893 3894 if (col != -1) { 3895 if (COLUMN_STATE(un, col) & RCS_ERRED) { 3896 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 3897 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3898 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 3899 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 3900 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 3901 } 3902 } 3903 3904 return (0); 3905 } 3906 3907 /* 3908 * NAMES: raid_internal_open 3909 * DESCRIPTION: Do the actual RAID open 3910 * PARAMETERS: minor_t mnum - minor number of the RAID device 3911 * int flag - 3912 * int otyp - 3913 * int md_oflags - RAID open flags 3914 * RETURNS: 0 if successful, nonzero otherwise 3915 */ 3916 int 3917 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) 3918 { 3919 mr_unit_t *un; 3920 mdi_unit_t *ui; 3921 int err = 0; 3922 int replay_error = 0; 3923 3924 ui = MDI_UNIT(mnum); 3925 ASSERT(ui != NULL); 3926 3927 un = (mr_unit_t *)md_unit_openclose_enter(ui); 3928 /* 3929 * this MUST be checked before md_unit_isopen is checked. 3930 * raid_init_columns sets md_unit_isopen to block reset, halt. 3931 */ 3932 if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && 3933 !(md_oflags & MD_OFLG_ISINIT)) { 3934 md_unit_openclose_exit(ui); 3935 return (EAGAIN); 3936 } 3937 3938 if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { 3939 err = md_unit_incopen(mnum, flag, otyp); 3940 goto out; 3941 } 3942 3943 md_unit_readerexit(ui); 3944 3945 un = (mr_unit_t *)md_unit_writerlock(ui); 3946 if (raid_open_all_devs(un, md_oflags) == 0) { 3947 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { 3948 md_unit_writerexit(ui); 3949 un = (mr_unit_t *)md_unit_readerlock(ui); 3950 raid_close_all_devs(un, 0, md_oflags); 3951 goto out; 3952 } 3953 } else { 3954 /* 3955 * if this unit contains more than two errored components 3956 * should return error and close all opened devices 3957 */ 3958 3959 md_unit_writerexit(ui); 3960 un = (mr_unit_t *)md_unit_readerlock(ui); 3961 raid_close_all_devs(un, 0, md_oflags); 3962 md_unit_openclose_exit(ui); 3963 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 3964 MD_UN2SET(un), MD_SID(un)); 3965 return (ENXIO); 3966 } 3967 3968 if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { 3969 replay_error = raid_replay(un); 3970 MD_STATUS(un) |= MD_UN_REPLAYED; 3971 } 3972 3973 md_unit_writerexit(ui); 3974 un = (mr_unit_t *)md_unit_readerlock(ui); 3975 3976 if ((replay_error == RAID_RPLY_READONLY) && 3977 ((flag & (FREAD | FWRITE)) == FREAD)) { 3978 md_unit_openclose_exit(ui); 3979 return (0); 3980 } 3981 3982 /* allocate hotspare if possible */ 3983 (void) raid_hotspares(); 3984 3985 3986 out: 3987 md_unit_openclose_exit(ui); 3988 return (err); 3989 } 3990 /* 3991 * NAMES: raid_open 3992 * DESCRIPTION: RAID metadevice OPEN entry point 3993 * PARAMETERS: dev_t dev - 3994 * int flag - 3995 * int otyp - 3996 * cred_t * cred_p - 3997 * int md_oflags - 3998 * RETURNS: 3999 */ 4000 /*ARGSUSED1*/ 4001 static int 4002 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 4003 { 4004 int error = 0; 4005 4006 if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { 4007 return (error); 4008 } 4009 return (0); 4010 } 4011 4012 /* 4013 * NAMES: raid_internal_close 4014 * DESCRIPTION: RAID metadevice CLOSE actual implementation 4015 * PARAMETERS: minor_t - minor number of the RAID device 4016 * int otyp - 4017 * int init_pw - 4018 * int md_cflags - RAID close flags 4019 * RETURNS: 0 if successful, nonzero otherwise 4020 */ 4021 /*ARGSUSED*/ 4022 int 4023 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) 4024 { 4025 mdi_unit_t *ui = MDI_UNIT(mnum); 4026 mr_unit_t *un; 4027 int err = 0; 4028 4029 /* single thread */ 4030 un = (mr_unit_t *)md_unit_openclose_enter(ui); 4031 4032 /* count closed */ 4033 if ((err = md_unit_decopen(mnum, otyp)) != 0) 4034 goto out; 4035 /* close devices, if necessary */ 4036 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 4037 raid_close_all_devs(un, init_pw, md_cflags); 4038 } 4039 4040 /* unlock, return success */ 4041 out: 4042 md_unit_openclose_exit(ui); 4043 return (err); 4044 } 4045 4046 /* 4047 * NAMES: raid_close 4048 * DESCRIPTION: RAID metadevice close entry point 4049 * PARAMETERS: dev_t dev - 4050 * int flag - 4051 * int otyp - 4052 * cred_t * cred_p - 4053 * int md_oflags - 4054 * RETURNS: 4055 */ 4056 /*ARGSUSED1*/ 4057 static int 4058 raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 4059 { 4060 int retval; 4061 4062 (void) md_io_writerlock(MDI_UNIT(getminor(dev))); 4063 retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags); 4064 (void) md_io_writerexit(MDI_UNIT(getminor(dev))); 4065 return (retval); 4066 } 4067 4068 /* 4069 * raid_probe_close_all_devs 4070 */ 4071 void 4072 raid_probe_close_all_devs(mr_unit_t *un) 4073 { 4074 int i; 4075 mr_column_t *device; 4076 4077 for (i = 0; i < un->un_totalcolumncnt; i++) { 4078 device = &un->un_column[i]; 4079 4080 if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { 4081 md_layered_close(device->un_dev, 4082 MD_OFLG_PROBEDEV); 4083 device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN; 4084 } 4085 } 4086 } 4087 /* 4088 * Raid_probe_dev: 4089 * 4090 * On entry the unit writerlock is held 4091 */ 4092 static int 4093 raid_probe_dev(mdi_unit_t *ui, minor_t mnum) 4094 { 4095 mr_unit_t *un; 4096 int i; 4097 int not_opened = 0; 4098 int commit = 0; 4099 int col = -1; 4100 mr_column_t *device; 4101 int md_devopen = 0; 4102 4103 if (md_unit_isopen(ui)) 4104 md_devopen++; 4105 4106 un = MD_UNIT(mnum); 4107 /* 4108 * If the state has been set to LAST_ERRED because 4109 * of an error when the raid device was open at some 4110 * point in the past, don't probe. We really don't want 4111 * to reset the state in this case. 4112 */ 4113 if (UNIT_STATE(un) == RUS_LAST_ERRED) 4114 return (0); 4115 4116 ui->ui_tstate &= ~MD_INACCESSIBLE; 4117 4118 for (i = 0; i < un->un_totalcolumncnt; i++) { 4119 md_dev64_t tmpdev; 4120 4121 device = &un->un_column[i]; 4122 if (COLUMN_STATE(un, i) & RCS_ERRED) { 4123 not_opened++; 4124 continue; 4125 } 4126 4127 tmpdev = device->un_dev; 4128 /* 4129 * Currently the flags passed are not needed since 4130 * there cannot be an underlying metadevice. However 4131 * they are kept here for consistency. 4132 * 4133 * Open by device id 4134 */ 4135 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)? 4136 device->un_hs_key : device->un_orig_key); 4137 if (md_layered_open(mnum, &tmpdev, 4138 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) { 4139 device->un_dev = tmpdev; 4140 not_opened++; 4141 continue; 4142 } 4143 device->un_dev = tmpdev; 4144 4145 device->un_devflags |= MD_RAID_DEV_PROBEOPEN; 4146 } 4147 4148 /* 4149 * The code below is careful on setting the LAST_ERRED state. 4150 * 4151 * If open errors and exactly one device has failed we can run. 4152 * If more then one device fails we have to figure out when to set 4153 * LAST_ERRED state. The rationale is to avoid unnecessary resyncs 4154 * since they are painful and time consuming. 4155 * 4156 * When more than one component/column fails there are 2 scenerios. 4157 * 4158 * 1. Metadevice has NOT been opened: In this case, the behavior 4159 * mimics the open symantics. ie. Only the first failed device 4160 * is ERRED and LAST_ERRED is not set. 4161 * 4162 * 2. Metadevice has been opened: Here the read/write sematics are 4163 * followed. The first failed devicce is ERRED and on the next 4164 * failed device LAST_ERRED is set. 4165 */ 4166 4167 if (not_opened > 1 && !md_devopen) { 4168 cmn_err(CE_WARN, 4169 "md: %s failed to open. open error on %s\n", 4170 md_shortname(MD_SID(un)), 4171 md_devname(MD_UN2SET(un), device->un_orig_dev, 4172 NULL, 0)); 4173 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 4174 MD_UN2SET(un), MD_SID(un)); 4175 raid_probe_close_all_devs(un); 4176 ui->ui_tstate |= MD_INACCESSIBLE; 4177 return (not_opened > 1); 4178 } 4179 4180 if (!md_devopen) { 4181 for (i = 0; i < un->un_totalcolumncnt; i++) { 4182 device = &un->un_column[i]; 4183 if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { 4184 if (device->un_devstate & RCS_LAST_ERRED) { 4185 /* 4186 * At this point in time there is a 4187 * possibility that errors were the 4188 * result of a controller failure with 4189 * more than a single column on it so 4190 * clear out last errored columns and 4191 * let errors re-occur is necessary. 4192 */ 4193 raid_set_state(un, i, RCS_OKAY, 0); 4194 commit++; 4195 } 4196 continue; 4197 } 4198 ASSERT(col == -1); 4199 /* 4200 * note if multiple devices are failing then only 4201 * the last one is marked as error 4202 */ 4203 col = i; 4204 } 4205 4206 if (col != -1) { 4207 raid_set_state(un, col, RCS_ERRED, 0); 4208 commit++; 4209 } 4210 4211 } else { 4212 for (i = 0; i < un->un_totalcolumncnt; i++) { 4213 device = &un->un_column[i]; 4214 4215 /* if we have LAST_ERRED go ahead and commit. */ 4216 if (un->un_state & RUS_LAST_ERRED) 4217 break; 4218 /* 4219 * could not open the component 4220 */ 4221 4222 if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) { 4223 col = i; 4224 raid_set_state(un, col, RCS_ERRED, 0); 4225 commit++; 4226 } 4227 } 4228 } 4229 4230 if (commit) 4231 raid_commit(un, NULL); 4232 4233 if (col != -1) { 4234 if (COLUMN_STATE(un, col) & RCS_ERRED) { 4235 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 4236 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 4237 } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { 4238 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 4239 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 4240 } 4241 } 4242 4243 raid_probe_close_all_devs(un); 4244 return (0); 4245 } 4246 4247 static int 4248 raid_imp_set( 4249 set_t setno 4250 ) 4251 { 4252 mddb_recid_t recid; 4253 int i, gotsomething; 4254 mddb_type_t typ1; 4255 mddb_de_ic_t *dep; 4256 mddb_rb32_t *rbp; 4257 mr_unit_t *un64; 4258 mr_unit32_od_t *un32; 4259 minor_t *self_id; /* minor needs to be updated */ 4260 md_parent_t *parent_id; /* parent needs to be updated */ 4261 mddb_recid_t *record_id; /* record id needs to be updated */ 4262 hsp_t *hsp_id; 4263 4264 gotsomething = 0; 4265 4266 typ1 = (mddb_type_t)md_getshared_key(setno, 4267 raid_md_ops.md_driver.md_drivername); 4268 recid = mddb_makerecid(setno, 0); 4269 4270 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 4271 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 4272 continue; 4273 4274 dep = mddb_getrecdep(recid); 4275 rbp = dep->de_rb; 4276 4277 if (rbp->rb_revision == MDDB_REV_RB) { 4278 /* 4279 * Small device 4280 */ 4281 un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid); 4282 self_id = &(un32->c.un_self_id); 4283 parent_id = &(un32->c.un_parent); 4284 record_id = &(un32->c.un_record_id); 4285 hsp_id = &(un32->un_hsp_id); 4286 4287 for (i = 0; i < un32->un_totalcolumncnt; i++) { 4288 mr_column32_od_t *device; 4289 4290 device = &un32->un_column[i]; 4291 if (!md_update_minor(setno, mddb_getsidenum 4292 (setno), device->un_orig_key)) 4293 goto out; 4294 4295 if (device->un_hs_id != 0) 4296 device->un_hs_id = MAKERECID( 4297 setno, device->un_hs_id); 4298 } 4299 } else { 4300 un64 = (mr_unit_t *)mddb_getrecaddr(recid); 4301 self_id = &(un64->c.un_self_id); 4302 parent_id = &(un64->c.un_parent); 4303 record_id = &(un64->c.un_record_id); 4304 hsp_id = &(un64->un_hsp_id); 4305 4306 for (i = 0; i < un64->un_totalcolumncnt; i++) { 4307 mr_column_t *device; 4308 4309 device = &un64->un_column[i]; 4310 if (!md_update_minor(setno, mddb_getsidenum 4311 (setno), device->un_orig_key)) 4312 goto out; 4313 4314 if (device->un_hs_id != 0) 4315 device->un_hs_id = MAKERECID( 4316 setno, device->un_hs_id); 4317 } 4318 } 4319 4320 /* 4321 * Update unit with the imported setno 4322 */ 4323 mddb_setrecprivate(recid, MD_PRV_GOTIT); 4324 4325 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 4326 4327 if (*hsp_id != -1) 4328 *hsp_id = MAKERECID(setno, DBID(*hsp_id)); 4329 4330 if (*parent_id != MD_NO_PARENT) 4331 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 4332 *record_id = MAKERECID(setno, DBID(*record_id)); 4333 gotsomething = 1; 4334 } 4335 4336 out: 4337 return (gotsomething); 4338 } 4339 4340 static md_named_services_t raid_named_services[] = { 4341 {raid_hotspares, "poke hotspares" }, 4342 {raid_rename_check, MDRNM_CHECK }, 4343 {raid_rename_lock, MDRNM_LOCK }, 4344 {(intptr_t (*)()) raid_rename_unlock, MDRNM_UNLOCK }, 4345 {(intptr_t (*)()) raid_probe_dev, "probe open test" }, 4346 {NULL, 0 } 4347 }; 4348 4349 md_ops_t raid_md_ops = { 4350 raid_open, /* open */ 4351 raid_close, /* close */ 4352 md_raid_strategy, /* strategy */ 4353 NULL, /* print */ 4354 NULL, /* dump */ 4355 NULL, /* read */ 4356 NULL, /* write */ 4357 md_raid_ioctl, /* ioctl, */ 4358 raid_snarf, /* raid_snarf */ 4359 raid_halt, /* raid_halt */ 4360 NULL, /* aread */ 4361 NULL, /* awrite */ 4362 raid_imp_set, /* import set */ 4363 raid_named_services 4364 }; 4365 4366 static void 4367 init_init() 4368 { 4369 /* default to a second */ 4370 if (md_wr_wait == 0) 4371 md_wr_wait = md_hz >> 1; 4372 4373 raid_parent_cache = kmem_cache_create("md_raid_parent", 4374 sizeof (md_raidps_t), 0, raid_parent_constructor, 4375 raid_parent_destructor, raid_run_queue, NULL, NULL, 0); 4376 raid_child_cache = kmem_cache_create("md_raid_child", 4377 sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0, 4378 raid_child_constructor, raid_child_destructor, 4379 raid_run_queue, NULL, NULL, 0); 4380 raid_cbuf_cache = kmem_cache_create("md_raid_cbufs", 4381 sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor, 4382 raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0); 4383 } 4384 4385 static void 4386 fini_uninit() 4387 { 4388 kmem_cache_destroy(raid_parent_cache); 4389 kmem_cache_destroy(raid_child_cache); 4390 kmem_cache_destroy(raid_cbuf_cache); 4391 raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL; 4392 } 4393 4394 /* define the module linkage */ 4395 MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit()) 4396