/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on
 *	  which the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close( ).
 */
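/*
 * Illustrative example (not part of the on-disk format): consider a soft
 * partition built from two extents,
 *
 *	un_ext[0]: un_voff = 0,  un_poff = 100,  un_len = 50
 *	un_ext[1]: un_voff = 50, un_poff = 1000, un_len = 50
 *
 * Virtual block 60 falls in extent 1 and maps to physical block
 * 1000 + (60 - 50) = 1010 on the underlying device, while an I/O
 * spanning virtual blocks 40-59 crosses the extent boundary and must be
 * fragmented by sp_mapbuf( ).
 */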
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t sp_md_ops;
#ifndef lint
char _depends_on[] = "drv/md";
md_ops_t *md_interface_ops = &sp_md_ops;
#endif

extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

extern int md_status;
extern major_t md_major;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_sp_daemon;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t md_unit_array_rw;

static kmem_cache_t *sp_parent_cache = NULL;
static kmem_cache_t *sp_child_cache = NULL;
static void sp_send_stat_ok(mp_unit_t *);
static void sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION: sp_parent_constructor()
 * INPUT: p - ptr to the parent save structure.
 * OUTPUT: ps - parent save structure initialized.
 * RETURNS: int - 0 (always).
 * PURPOSE: one-time initialization of a cached parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION: sp_child_constructor()
 * INPUT: p - ptr to the child save structure.
 * OUTPUT: cs - child save structure initialized.
 * RETURNS: int - 0 (always).
 * PURPOSE: one-time initialization of a cached child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION: sp_run_queue()
 * INPUT: none.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: run the md_daemon to clean up the memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
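/*
 * Note (descriptive only): the kmem cache constructors above initialize
 * the persistent parts of a cached object exactly once (the ps_mx mutex
 * of a parent, the embedded cs_buf of a child), while sp_parent_init()
 * and sp_child_init() re-initialize the remaining fields every time an
 * object is taken from the cache.
 */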
/*
 * FUNCTION: sp_build_incore()
 * INPUT: p - ptr to unit structure.
 *	snarfing - flag to tell us we are snarfing.
 * OUTPUT: none.
 * RETURNS: int - 0 (always).
 * PURPOSE: place unit structure into the in-core unit array (keyed from
 *	the minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t *un = (mp_unit_t *)p;
	minor_t mnum;
	set_t setno;
	md_dev64_t tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place unit in in-core array */
	MD_UNIT(mnum) = un;
	return (0);
}

/*
 * FUNCTION: reset_sp()
 * INPUT: un - unit structure to be reset/removed.
 *	mnum - minor number to be reset/removed.
 *	removing - flag to tell us if we are removing
 *	permanently or just resetting in-core structures.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: used to either simply reset in-core structures or to
 *	permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t *sv;
	mddb_recid_t vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	MD_UNIT(mnum) = NULL;

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}
/*
 * FUNCTION: sp_send_stat_msg
 * INPUT: un - unit reference
 *	status - status to be sent to the master node
 *		MD_SP_OK - soft-partition is now OK
 *		MD_SP_ERR - soft-partition is errored
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a soft-partition status change to the master node.  If the
 *	message succeeds we simply return.  If it fails we panic as the
 *	cluster-wide view of the metadevices is now inconsistent.
 * CALLING CONTEXT:
 *	Blockable.  No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t sp_msg;
	md_mn_kresult_t *kres;
	set_t setno = MD_UN2SET(un);
	int rval;
	const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");

		/*
		 * Panic as we are now in an inconsistent state.
		 */
		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION: sp_finish_error
 * INPUT: ps - parent save structure for the errored I/O.
 *	lock_held - set if the unit readerlock is held
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf *pb = ps->ps_bp;
	mdi_unit_t *ui = ps->ps_ui;
	md_dev64_t un_dev;			/* underlying device */
	md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */
	char *str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}

	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}

/*
 * FUNCTION: sp_xmit_ok
 * INPUT: dq - daemon queue referencing the ps structure
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a message to the master node in a multi-owner diskset to
 *	update all attached nodes' view of the soft-part to be MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t *ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted the OK state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION: sp_xmit_error
 * INPUT: dq - daemon queue referencing the failing ps structure
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a message to the master node in a multi-owner diskset to
 *	update all attached nodes' view of the soft-part to be MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t *ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted the error state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t mnum = MD_SID(un);
	md_spps_t *ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t mnum = MD_SID(un);
	md_spps_t *ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}
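/*
 * Note (descriptive only): sp_send_stat_ok() and sp_send_stat_err()
 * above are thin wrappers that allocate a fresh parent save structure
 * and queue sp_xmit_ok()/sp_xmit_error() on md_sp_daemon, so that the
 * blockable message transmission to the master node happens in daemon
 * context rather than in the caller's context.
 */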
/*
 * FUNCTION: sp_error()
 * INPUT: ps - parent save structure for the errored I/O.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread.  We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request.  That way we can service any messages
	 * that require a writerlock to be held.  Additionally, to avoid
	 * deadlocking when at the bottom of a metadevice stack and a higher
	 * level mirror has multiple requests outstanding on this soft-part,
	 * we clone the ps that failed and pass the error back up the stack
	 * to release the reference that this i/o may have in the
	 * higher-level metadevice.  The other nodes in the cluster just
	 * have to modify the soft-part status and we do not need to block
	 * the i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t *err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);
}

/*
 * FUNCTION: sp_mapbuf()
 * INPUT: un - unit structure for the soft partition we are doing
 *	I/O on.
 *	voff - virtual offset in the soft partition to map.
 *	bcount - # of blocks in the I/O.
 * OUTPUT: bp - translated buffer to be passed down to the next layer.
 * RETURNS: 1 - request must be fragmented, more work to do,
 *	0 - request satisfied, no more work to do
 *	-1 - error
 * PURPOSE: Map the virtual offset in the soft partition (passed
 *	in via voff) to the "physical" offset on whatever the soft
 *	partition is built on top of.  We do this by doing a binary
 *	search of the extent array in the soft partition unit
 *	structure.  Once the current extent is found, we do the
 *	translation, determine if the I/O will cross extent
 *	boundaries (if so, we have to fragment the I/O), then
 *	fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t *un,
	sp_ext_offset_t voff,
	sp_ext_length_t bcount,
	buf_t *bp
)
{
	int lo, mid, hi, found, more;
	size_t new_bcount;
	sp_ext_offset_t new_blkno;
	sp_ext_offset_t new_offset;
	sp_ext_offset_t ext_endblk;
	md_dev64_t new_edev;
	extern unsigned md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff >= un_ext[mid].un_voff + un_ext[mid].un_len */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}
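/*
 * A minimal sketch (mirroring the real callers md_sp_strategy(),
 * sp_directed_read() and sp_dump() below) of how sp_mapbuf() is driven
 * until a request is fully mapped:
 *
 *	do {
 *		more = sp_mapbuf(un, blkno, count, bp);
 *		if (more == -1)
 *			return (error);
 *		... issue bp, then advance past this fragment ...
 *		count -= bp->b_bcount;
 *		blkno += btodb(bp->b_bcount);
 *	} while (more);
 */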
/*
 * FUNCTION: sp_validate()
 * INPUT: un - unit structure to be validated.
 * OUTPUT: none.
 * RETURNS: 0 - soft partition ok.
 *	-1 - error.
 * PURPOSE: called on open to sanity check the soft partition.  In
 *	order to open a soft partition:
 *	- it must have at least one extent
 *	- the extent info in core and on disk must match
 *	- it may not be in an intermediate state (which would
 *	  imply that a two-phase commit was interrupted)
 *
 *	If the extent checking fails (B_ERROR returned from the read
 *	strategy call) _and_ we're a multi-owner diskset, we send a
 *	message to the master so that all nodes inherit the same view
 *	of the soft partition.
 *	If we are checking a soft-part that is marked as in error, and
 *	we can actually read and validate the watermarks, we send a
 *	message to the master node to clear the error.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t ext;
	struct buf *buf;
	sp_ext_length_t len;
	mp_watermark_t *wm;
	set_t setno;
	int reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node.  This ensures a
			 * consistent view of the soft-part across the
			 * cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should
	 * clear the error condition on all nodes in the set.  Use
	 * SP_SETSTAT2 with MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}
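/*
 * Summary (descriptive only) of the per-extent watermark checks made by
 * sp_validate() above: each watermark lives in the block immediately
 * preceding its extent (un_poff - 1) and must have a valid checksum, the
 * MD_SP_MAGIC magic number, a sequence number equal to the extent index,
 * a length matching the in-core extent length, and type EXTTYP_ALLOC
 * (in-use).
 */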
/*
 * FUNCTION: sp_done()
 * INPUT: child_buf - buffer attached to the child save structure.
 *	this is the buffer on which I/O has just completed.
 * OUTPUT: none.
 * RETURNS: 0 - I/O complete, parent buffer released.
 *	1 - error, or more child fragments outstanding.
 * PURPOSE: called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf *parent_buf;
	mdi_unit_t *ui;
	md_spps_t *ps;
	md_spcs_t *cs;

	/*
	 * find the child save structure to which this buffer belongs;
	 * cs_buf is the last member of md_spcs_t, so back up from the
	 * buf address by the offset of cs_buf within the structure.
	 */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION: md_sp_strategy()
 * INPUT: parent_buf - parent buffer
 *	flag - flags
 *	private - private data
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: Soft partitioning I/O strategy.  Performs the main work
 *	needed to do I/O to a soft partition.  The basic
 *	algorithm is as follows:
 *	- Allocate a child save structure to keep track
 *	  of the I/O we are going to pass down.
 *	- Map the I/O to the correct extent in the soft
 *	  partition (see sp_mapbuf()).
 *	- bioclone() the buffer and pass it down the
 *	  stack using md_call_strategy.
 *	- If the I/O needs to split across extents,
 *	  repeat the above steps until all fragments
 *	  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf;
	set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if the set is
	 * halted.  We do this check without the needed lock held, for
	 * performance reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION: sp_directed_read()
 * INPUT: mnum - minor number
 *	vdr - vol_directed_rd_t from user
 *	mode - access mode for copying data out.
 * OUTPUT: none.
 * RETURNS: 0 - success
 *	Exxxxx - failure error-code
 * PURPOSE: Construct the necessary sub-device i/o requests to perform the
 *	directed read as requested by the user.  This is essentially the
 *	same as md_sp_strategy() with the exception being that the
 *	underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf, *parent_buf;
	void *kbuffer;
	vol_directed_rd_t cvdr;
	caddr_t userbuf;
	offset_t useroff;
	int ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */
	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION: sp_snarf()
 * INPUT: cmd - snarf cmd.
 *	setno - set number.
 * OUTPUT: none.
 * RETURNS: 1 - soft partitions were snarfed.
 *	0 - no soft partitions were snarfed.
 * PURPOSE: Snarf soft partition metadb records into their in-core
 *	structures.  This routine is called at "snarf time" when
 *	md loads and gets all metadevices records into memory.
 *	The basic algorithm is simply to walk the soft partition
 *	records in the metadb and call the soft partitioning
 *	build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t *un;
	mddb_recid_t recid;
	int gotsomething;
	int all_sp_gotten;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *big_un;
	mp_unit32_od_t *small_un;
	size_t newreqsize;

	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		if ((rbp->rb_revision == MDDB_REV_RB) &&
		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
			/*
			 * This means we have an old, small record that
			 * hasn't already been converted.  Before we create
			 * an incore metadevice from this we have to convert
			 * it to a big record.
			 */
			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			newreqsize = sizeof (mp_unit_t) +
			    ((small_un->un_numexts - 1) *
			    sizeof (struct mp_ext));
			big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
			    KM_SLEEP);
			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
			    SMALL_2_BIG);
			kmem_free(small_un, dep->de_reqsize);
			dep->de_rb_userdata = big_un;
			dep->de_reqsize = newreqsize;
			rbp->rb_private |= MD_PRV_CONVD;
			un = big_un;
		} else {
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
		}

		/* Set revision and flag accordingly */
		if (rbp->rb_revision == MDDB_REV_RB) {
			un->c.un_revision = MD_32BIT_META_DEV;
		} else {
			un->c.un_revision = MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
		}

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);

	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION: sp_halt()
 * INPUT: cmd - halt cmd.
 *	setno - set number.
 * RETURNS: 0 - success.
 *	1 - err.
 * PURPOSE: Perform driver halt operations.  As with stripe, we
 *	support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *	does a check to see if halting can be done safely
 *	(no open soft partitions), the second cleans up and
 *	shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int i;
	mdi_unit_t *ui;
	minor_t mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION: sp_open_dev()
 * INPUT: un - unit structure.
 *	oflags - open flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *	non-zero - err.
 * PURPOSE: open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t mnum = MD_SID(un);
	int err;
	md_dev64_t tmpdev;
	set_t setno = MD_MIN2SET(MD_SID(un));
	side_t side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if the underlying device is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION: sp_open()
 * INPUT: dev - device to open.
 *	flag - pass-through flag.
 *	otyp - pass-through open type.
 *	cred_p - credentials.
 *	md_oflags - open flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *	non-zero - err.
 * PURPOSE: open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t *dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_oflags
)
{
	minor_t mnum = getminor(*dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;
	set_t setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if
	 * this node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the open
	 * (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while the
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop the openclose lock,
				 * call sp_validate, reacquire the openclose
				 * lock, and close the device.  If sp_validate
				 * succeeds, then the device will be
				 * re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev,
					    md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to
			 * ensure that all nodes have the same idea of the
			 * status.  sp_validate() will mark the device as
			 * errored (if it cannot read the watermark) or ok
			 * (if it was previously errored but the watermark
			 * is now valid).  This code-path is only entered on
			 * the non-probe open so we will maintain the errored
			 * state during a probe call.  This means the
			 * sys-admin must metarecover -m to reset the
			 * soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION: sp_close()
 * INPUT: dev - device to close.
 *	flag - pass-through flag.
 *	otyp - pass-through type.
 *	cred_p - credentials.
 *	md_cflags - close flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *	non-zero - err.
 * PURPOSE: close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_cflags
)
{
	minor_t mnum = getminor(dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster.
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION: sp_dump()
 * INPUT: dev - device to dump to.
 *	addr - address to dump.
 *	blkno - blkno on device.
 *	nblk - number of blocks to dump.
 * OUTPUT: none.
 * RETURNS: result from bdev_dump.
 * PURPOSE: This routine dumps memory to the disk.  It assumes that
 *	the memory has already been mapped into mainbus space.
 *	It is called at disk interrupt priority when the system
 *	is in trouble.
 * NOTE: this function is defined using 32-bit arguments,
 *	but soft partitioning is internally 64-bit.  Arguments
 *	are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t *un;
	buf_t *bp;
	sp_ext_length_t nb;
	daddr_t mapblk;
	int result;
	int more;
	int saveresult = 0;

	/*
	 * Don't need to grab the unit lock because nothing else is
	 * supposed to be happening; also, dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}
/*
 * FUNCTION: sp_imp_set()
 * INPUT: setno - set number of the set being imported.
 * OUTPUT: none.
 * RETURNS: 1 - soft partition records were updated for the import.
 *	0 - no soft partition records were updated.
 * PURPOSE: update the soft partition records (self id, parent and
 *	record id) with the set/minor numbers of the imported set.
 */
static int
sp_imp_set(
	set_t setno
)
{
	mddb_recid_t recid;
	int gotsomething;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *un64;
	mp_unit32_od_t *un32;
	minor_t *self_id;	/* minor needs to be updated */
	md_parent_t *parent_id;	/* parent needs to be updated */
	mddb_recid_t *record_id; /* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		if (rbp->rb_revision == MDDB_REV_RB) {
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
		} else {
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())