/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close( ).
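 *
 * For illustration only (hypothetical numbers, not taken from any real
 * configuration): consider a soft partition made up of two extents,
 *
 *	ext	un_voff		un_poff		un_len
 *	 0	    0		  100		 1000
 *	 1	 1000		 5000		 1000
 *
 * Virtual block 500 falls in extent 0 and maps to physical block 600
 * (un_poff 100 + offset 500) on the underlying device.  A 32-block I/O
 * starting at virtual block 990 crosses into extent 1 at virtual block
 * 1000, so sp_mapbuf( ) fragments it into a 10-block I/O at physical
 * block 1090 followed by a 22-block I/O at physical block 5000.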
 *
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		sp_md_ops;
#ifndef	lint
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;
extern clock_t		md_hz;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	none.
 * OUTPUT:	ps	- parent save structure initialized.
 * RETURNS:	void *	- ptr to initialized parent save structure.
 * PURPOSE:	initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	void *	- ptr to initialized child save structure.
 * PURPOSE:	initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}


/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p	- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t	*un = (mp_unit_t *)p;
	minor_t		mnum;
	set_t		setno;
	md_dev64_t	tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
	MD_UNIT(mnum) = un;

	return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un	- unit structure to be reset/removed.
 *		mnum	- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *				  permanently or just resetting in-core
 *				  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t	*sv;
	mddb_recid_t	vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			  MD_SP_OK - soft-partition is now OK
 *			  MD_SP_ERR	"	"	errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node.  If
 *		the message succeeds we simply return.  If it fails we panic
 *		as the cluster-wide view of the metadevices is now
 *		inconsistent.
 * CALLING CONTEXT:
 *	Blockable.  No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t	sp_msg;
	md_mn_kresult_t		*kres;
	set_t			setno = MD_UN2SET(un);
	int			rval;
	const char		*str = (status == MD_SP_ERR) ?
	    "MD_SP_ERR" : "MD_SP_OK";
	int			nretries = 0;

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

spss_msg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    0, (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
		/* If we're shutting down already, pause things here. */
		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd is available again.  Retry the message once.
			 * If it fails we panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto spss_msg;
		}
		/*
		 * Panic as we are now in an inconsistent state.
		 */
		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION:	sp_finish_error
 * INPUT:	ps	- parent save structure for errored I/O.
 *		lock_held	- set if the unit readerlock is held
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf	*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	md_dev64_t	un_dev;			/* underlying device */
	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
	char		*str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}


	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}


/*
 * FUNCTION:	sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted error state to all nodes, now release this
	 * parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION:	sp_xmit_error
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted error state to all nodes, now release this
	 * parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}


/*
 * FUNCTION:	sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t		setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread.  We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request.  That way we can service any messages
	 * that require a writerlock to be held.  Additionally, to avoid
	 * deadlocking when at the bottom of a metadevice stack and a higher
	 * level mirror has multiple requests outstanding on this soft-part,
	 * we clone the ps that failed and pass the error back up the stack
	 * to release the reference that this i/o may have in the
	 * higher-level metadevice.  The other nodes in the cluster just have
	 * to modify the soft-part status and we do not need to block the
	 * i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t	*err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);

}

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of blocks in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t	*un,
	sp_ext_offset_t	voff,
	sp_ext_length_t	bcount,
	buf_t		*bp
)
{
	int		lo, mid, hi, found, more;
	size_t		new_bcount;
	sp_ext_offset_t	new_blkno;
	sp_ext_offset_t	new_offset;
	sp_ext_offset_t	ext_endblk;
	md_dev64_t	new_edev;
	extern unsigned	md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff >= end of the mid extent */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}

/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks we send a
 *		message to clear the error to the master node.
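 *
 *		On-disk layout, for illustration: each extent is preceded
 *		by its watermark block, which is read below from block
 *		un_ext[ext].un_poff - 1:
 *
 *		    ... | wm 0 | extent 0 | ... | wm 1 | extent 1 | ...
 *
 *		For each watermark we verify, in order: the checksum, the
 *		magic number (MD_SP_MAGIC), the sequence number (which must
 *		equal the extent index), the length (which must match
 *		un_ext[ext].un_len), and the type (which must be
 *		EXTTYP_ALLOC).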
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t		ext;
	struct buf	*buf;
	sp_ext_length_t	len;
	mp_watermark_t	*wm;
	set_t		setno;
	int		reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components ?? */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node.  This ensures a
			 * consistent view of the soft-part across the
			 * cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should
	 * clear the error condition on all nodes in the set.  Use
	 * SP_SETSTAT2 with MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}

/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *				  this is the buffer on which I/O has just
 *				  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		1	- error.
 * PURPOSE:	called on I/O completion.
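 *
 *		For illustration: the child save structure is recovered
 *		from the completed buf by address arithmetic.  cs_buf is
 *		the final member of md_spcs_t (the child cache is sized
 *		as sizeof (md_spcs_t) - sizeof (buf_t) + biosize()), so
 *
 *		cs = (md_spcs_t *)((caddr_t)child_buf -
 *		    (sizeof (md_spcs_t) - sizeof (buf_t)));
 *
 *		yields the enclosing child save structure.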
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf	*parent_buf;
	mdi_unit_t	*ui;
	md_spps_t	*ps;
	md_spcs_t	*cs;

	/* find the child save structure to which this buffer belongs */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *			- Allocate a child save structure to keep track
 *			  of the I/O we are going to pass down.
 *			- Map the I/O to the correct extent in the soft
 *			  partition (see sp_mapbuf()).
 *			- bioclone() the buffer and pass it down the
 *			  stack using md_call_strategy.
 *			- If the I/O needs to split across extents,
 *			  repeat the above steps until all fragments
 *			  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf;
	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int		strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform
 *		the directed read as requested by the user.  This is
 *		essentially the same as md_sp_strategy() with the exception
 *		being that the underlying 'md_call_strategy' is replaced
 *		with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf, *parent_buf;
	void		*kbuffer;
	vol_directed_rd_t	cvdr;
	caddr_t		userbuf;
	offset_t	useroff;
	int		ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;


	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
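	 * unlike md_sp_strategy(), each fragment is issued synchronously
	 * with md_call_ioctl(DKIOCDMR) rather than a strategy call, its
	 * result is copied out to the user buffer, and the child is freed
	 * by hand since sp_done() will never run for it.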
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevices records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
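 *
 *		Note (summarizing the conversion logic in the body below):
 *		records with revision MDDB_REV_RB/MDDB_REV_RBFN are old
 *		32-bit ("small") records and are converted into the current
 *		64-bit unit structure with softpart_convert() the first time
 *		they are seen; MDDB_REV_RB64/MDDB_REV_RB64FN records are
 *		already large and are used as-is.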
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_sp_gotten;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*big_un;
	mp_unit32_od_t	*small_un;
	size_t		newreqsize;


	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;


		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means we have an old, small record
				 * that hasn't been converted yet.  Before
				 * we create an incore metadevice from it,
				 * we have to convert it to a big record.
				 */
				small_un =
				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mp_unit_t) +
				    ((small_un->un_numexts - 1) *
				    sizeof (struct mp_ext));
				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				softpart_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				rbp->rb_private |= MD_PRV_CONVD;
				un = big_un;
			} else {
				/* Record has already been converted */
				un = (mp_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.
 *		The first does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and shuts
 *		down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int		i;
	mdi_unit_t	*ui;
	minor_t		mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t		mnum = MD_SID(un);
	int		err;
	md_dev64_t	tmpdev;
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if underlying is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev	- device to open.
 *		flag	- pass-through flag.
 *		otyp	- pass-through open type.
 *		cred_p	- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if
	 * this node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (!md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev,
					    md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe
			 * open so we will maintain the errored state during
			 * a probe call.  This means the sys-admin must
			 * metarecover -m to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev	- device to close.
 *		flag	- pass-through flag.
 *		otyp	- pass-through type.
 *		cred_p	- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_cflags
)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (!md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf	dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 * NOTE:	this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t	*un;
	buf_t		*bp;
	sp_ext_length_t	nb;
	daddr_t		mapblk;
	int		result;
	int		more;
	int		saveresult = 0;

	/*
	 * Don't need to grab the unit lock, because nothing else is
	 * supposed to be happening.  Also, dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

static int
sp_imp_set(
	set_t	setno
)
{
	mddb_recid_t	recid;
	int		gotsomething;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*un64;
	mp_unit32_od_t	*un32;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
			break;

		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 *
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())