/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close.
 *
 */
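
/*
 * Illustrative sketch (editor's addition, not part of the driver): how
 * the extent array maps a virtual offset to a physical offset.  Field
 * names follow the mp_ext entries used by sp_mapbuf() below; the numbers
 * are invented for illustration.
 *
 *	Extent	un_voff	un_poff	un_len
 *	  0	    0	  100	  50	(virtual blocks 0..49)
 *	  1	   50	  400	 100	(virtual blocks 50..149)
 *
 * A request starting at virtual block 70 falls within extent 1, so the
 * physical start block is:
 *
 *	new_blkno = un_poff + (voff - un_voff) = 400 + (70 - 50) = 420
 *
 * If the request runs past virtual block 149 it crosses an extent
 * boundary and must be fragmented into multiple child buffers.
 */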

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t sp_md_ops;
#ifndef lint
char _depends_on[] = "drv/md";
md_ops_t *md_interface_ops = &sp_md_ops;
#endif

extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

extern int md_status;
extern major_t md_major;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_sp_daemon;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t md_unit_array_rw;
extern clock_t md_hz;

static kmem_cache_t *sp_parent_cache = NULL;
static kmem_cache_t *sp_child_cache = NULL;
static void sp_send_stat_ok(mp_unit_t *);
static void sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	none.
 * OUTPUT:	ps	- parent save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}

/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p	- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t *un = (mp_unit_t *)p;
	minor_t mnum;
	set_t setno;
	md_dev64_t tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
	MD_UNIT(mnum) = un;

	return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un	- unit structure to be reset/removed.
 *		mnum	- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *			  permanently or just resetting in-core
 *			  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t *sv;
	mddb_recid_t vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			  MD_SP_OK - soft-partition is now OK
 *			  MD_SP_ERR	"	"	errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node.  If
 *		the message succeeds we simply return.  If it fails we panic
 *		as the cluster-wide view of the metadevices is now
 *		inconsistent.
 * CALLING CONTEXT:
 *	Blockable.  No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t sp_msg;
	md_mn_kresult_t *kres;
	set_t setno = MD_UN2SET(un);
	int rval;
"MD_SP_ERR" : "MD_SP_OK"; 338 339 sp_msg.sp_setstat_mnum = MD_SID(un); 340 sp_msg.sp_setstat_status = status; 341 342 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 343 344 rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, 345 0, (char *)&sp_msg, sizeof (sp_msg), kres); 346 347 if (!MDMN_KSEND_MSG_OK(rval, kres)) { 348 mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); 349 /* If we're shutting down already, pause things here. */ 350 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) { 351 while (!md_mn_is_commd_present()) { 352 delay(md_hz); 353 } 354 } 355 /* 356 * Panic as we are now in an inconsistent state. 357 */ 358 cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", 359 md_shortname(MD_SID(un)), str); 360 } 361 362 kmem_free(kres, sizeof (md_mn_kresult_t)); 363 } 364 365 /* 366 * FUNCTION: sp_finish_error 367 * INPUT: ps - parent save structure for error-ed I/O. 368 * lock_held - set if the unit readerlock is held 369 * OUTPUT: none. 370 * RETURNS: void. 371 * PURPOSE: report a driver error 372 */ 373 static void 374 sp_finish_error(md_spps_t *ps, int lock_held) 375 { 376 struct buf *pb = ps->ps_bp; 377 mdi_unit_t *ui = ps->ps_ui; 378 md_dev64_t un_dev; /* underlying device */ 379 md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */ 380 char *str; 381 382 un_dev = md_expldev(ps->ps_un->un_dev); 383 /* set error type */ 384 if (pb->b_flags & B_READ) { 385 str = "read"; 386 } else { 387 str = "write"; 388 } 389 390 391 SPPS_FREE(sp_parent_cache, ps); 392 pb->b_flags |= B_ERROR; 393 394 md_kstat_done(ui, pb, 0); 395 396 if (lock_held) { 397 md_unit_readerexit(ui); 398 } 399 md_biodone(pb); 400 401 cmn_err(CE_WARN, "md: %s: %s error on %s", 402 md_shortname(md_getminor(md_dev)), str, 403 md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0)); 404 } 405 406 407 /* 408 * FUNCTION: sp_xmit_ok 409 * INPUT: dq - daemon queue referencing failing ps structure 410 * OUTPUT: none. 411 * RETURNS: void. 412 * PURPOSE: send a message to the master node in a multi-owner diskset to 413 * update all attached nodes view of the soft-part to be MD_SP_OK. 414 * CALLING CONTEXT: 415 * Blockable. No unit lock held. 416 */ 417 static void 418 sp_xmit_ok(daemon_queue_t *dq) 419 { 420 md_spps_t *ps = (md_spps_t *)dq; 421 422 /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 423 sp_send_stat_msg(ps->ps_un, MD_SP_OK); 424 425 /* 426 * Successfully transmitted error state to all nodes, now release this 427 * parent structure. 428 */ 429 SPPS_FREE(sp_parent_cache, ps); 430 } 431 432 /* 433 * FUNCTION: sp_xmit_error 434 * INPUT: dq - daemon queue referencing failing ps structure 435 * OUTPUT: none. 436 * RETURNS: void. 437 * PURPOSE: send a message to the master node in a multi-owner diskset to 438 * update all attached nodes view of the soft-part to be MD_SP_ERR. 439 * CALLING CONTEXT: 440 * Blockable. No unit lock held. 441 */ 442 static void 443 sp_xmit_error(daemon_queue_t *dq) 444 { 445 md_spps_t *ps = (md_spps_t *)dq; 446 447 /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 448 sp_send_stat_msg(ps->ps_un, MD_SP_ERR); 449 450 /* 451 * Successfully transmitted error state to all nodes, now release this 452 * parent structure. 
453 */ 454 SPPS_FREE(sp_parent_cache, ps); 455 } 456 static void 457 sp_send_stat_ok(mp_unit_t *un) 458 { 459 minor_t mnum = MD_SID(un); 460 md_spps_t *ps; 461 462 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 463 sp_parent_init(ps); 464 ps->ps_un = un; 465 ps->ps_ui = MDI_UNIT(mnum); 466 467 daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps, 468 REQ_OLD); 469 } 470 471 static void 472 sp_send_stat_err(mp_unit_t *un) 473 { 474 minor_t mnum = MD_SID(un); 475 md_spps_t *ps; 476 477 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 478 sp_parent_init(ps); 479 ps->ps_un = un; 480 ps->ps_ui = MDI_UNIT(mnum); 481 482 daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps, 483 REQ_OLD); 484 } 485 486 487 /* 488 * FUNCTION: sp_error() 489 * INPUT: ps - parent save structure for error-ed I/O. 490 * OUTPUT: none. 491 * RETURNS: void. 492 * PURPOSE: report a driver error. 493 * CALLING CONTEXT: 494 * Interrupt - non-blockable 495 */ 496 static void 497 sp_error(md_spps_t *ps) 498 { 499 set_t setno = MD_UN2SET(ps->ps_un); 500 501 /* 502 * Drop the mutex associated with this request before (potentially) 503 * enqueuing the free onto a separate thread. We have to release the 504 * mutex before destroying the parent structure. 505 */ 506 if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 507 if (MUTEX_HELD(&ps->ps_mx)) { 508 mutex_exit(&ps->ps_mx); 509 } 510 } else { 511 /* 512 * this should only ever happen if we are panicking, 513 * since DONTFREE is only set on the parent if panicstr 514 * is non-NULL. 515 */ 516 ASSERT(panicstr); 517 } 518 519 /* 520 * For a multi-owner set we need to send a message to the master so that 521 * all nodes get the errored status when we first encounter it. To avoid 522 * deadlocking when multiple soft-partitions encounter an error on one 523 * physical unit we drop the unit readerlock before enqueueing the 524 * request. That way we can service any messages that require a 525 * writerlock to be held. Additionally, to avoid deadlocking when at 526 * the bottom of a metadevice stack and a higher level mirror has 527 * multiple requests outstanding on this soft-part, we clone the ps 528 * that failed and pass the error back up the stack to release the 529 * reference that this i/o may have in the higher-level metadevice. 530 * The other nodes in the cluster just have to modify the soft-part 531 * status and we do not need to block the i/o completion for this. 532 */ 533 if (MD_MNSET_SETNO(setno)) { 534 md_spps_t *err_ps; 535 err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 536 sp_parent_init(err_ps); 537 538 err_ps->ps_un = ps->ps_un; 539 err_ps->ps_ui = ps->ps_ui; 540 541 md_unit_readerexit(ps->ps_ui); 542 543 daemon_request(&md_sp_daemon, sp_xmit_error, 544 (daemon_queue_t *)err_ps, REQ_OLD); 545 546 sp_finish_error(ps, 0); 547 548 return; 549 } else { 550 ps->ps_un->un_status = MD_SP_ERR; 551 } 552 553 /* Flag the error */ 554 sp_finish_error(ps, 1); 555 556 } 557 558 /* 559 * FUNCTION: sp_mapbuf() 560 * INPUT: un - unit structure for soft partition we are doing 561 * I/O on. 562 * voff - virtual offset in soft partition to map. 563 * bcount - # of blocks in the I/O. 564 * OUTPUT: bp - translated buffer to be passed down to next layer. 

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of blocks in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t *un,
	sp_ext_offset_t voff,
	sp_ext_length_t bcount,
	buf_t *bp
)
{
	int lo, mid, hi, found, more;
	size_t new_bcount;
	sp_ext_offset_t new_blkno;
	sp_ext_offset_t new_offset;
	sp_ext_offset_t ext_endblk;
	md_dev64_t new_edev;
	extern unsigned md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff >= end of the mid extent */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}
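
/*
 * Worked example for sp_mapbuf() fragmentation (editor's sketch; the
 * numbers are invented).  Note that voff is in blocks while bcount is in
 * bytes, hence the btodb()/dbtob() conversions above.  Suppose the extent
 * found by the binary search covers virtual blocks [100, 150) and the
 * caller asks for voff = 140, bcount = dbtob(20) (20 blocks):
 *
 *	ext_endblk = 150
 *	voff + btodb(bcount) = 160 > 150
 *
 * so the I/O crosses the extent boundary: new_bcount is trimmed to
 * dbtob(150 - 140) = dbtob(10) and sp_mapbuf() returns 1 ("more"); the
 * caller must then issue a further child I/O for the remaining 10
 * blocks, which will map into the next extent.
 */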

/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks we send a
 *		message to clear the error to the master node.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t ext;
	struct buf *buf;
	sp_ext_length_t len;
	mp_watermark_t *wm;
	set_t setno;
	int reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components ?? */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node.  This ensures a
			 * consistent view of the soft-part across the
			 * cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should
	 * clear the error condition on all nodes in the set.  Use
	 * SP_SETSTAT2 with MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}
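
/*
 * On-disk layout assumed by the watermark checks above (editor's sketch,
 * inferred from the un_poff - 1 read in sp_validate()): each allocated
 * extent is immediately preceded by a one-block watermark, so for extent
 * i the watermark lives at physical block un_ext[i].un_poff - 1 and
 * describes the extent that follows it:
 *
 *	... | watermark i | extent i data ... | watermark i+1 | ...
 *
 * sp_validate() cross-checks wm_magic, wm_checksum, wm_seq (== i),
 * wm_length (== un_ext[i].un_len) and wm_type (EXTTYP_ALLOC) against the
 * in-core unit structure before allowing the open to proceed.
 */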

/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *			  this is the buffer on which I/O has just
 *			  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		1	- error.
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf *parent_buf;
	mdi_unit_t *ui;
	md_spps_t *ps;
	md_spcs_t *cs;

	/*
	 * find the child save structure to which this buffer belongs.
	 * cs_buf is the last member of md_spcs_t, so backing up from the
	 * buf address by the offset of cs_buf within the structure
	 * recovers the enclosing child save structure.
	 */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *		- Allocate a child save structure to keep track
 *		  of the I/O we are going to pass down.
 *		- Map the I/O to the correct extent in the soft
 *		  partition (see sp_mapbuf()).
 *		- bioclone() the buffer and pass it down the
 *		  stack using md_call_strategy.
 *		- If the I/O needs to split across extents,
 *		  repeat the above steps until all fragments
 *		  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf;
	set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}
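
/*
 * Fragment accounting sketch (editor's note): in the strategy path,
 * ps_frags counts the outstanding child buffers for a parent request.
 * It is set to 1 before the first child is issued and incremented
 * (under ps_mx) for each additional fragment; sp_done() decrements it
 * as each child completes, and only the child that drops the count to
 * zero completes the parent via md_biodone() and releases the parent
 * save structure, making out-of-order fragment completion safe.
 * sp_directed_read() below reuses the same structures but never issues
 * a strategy call, so it frees each child itself and releases the
 * parent explicitly.
 */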

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user.  This is essentially
 *		the same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf, *parent_buf;
	void *kbuffer;
	vol_directed_rd_t cvdr;
	caddr_t userbuf;
	offset_t useroff;
	int ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevices records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t *un;
	mddb_recid_t recid;
	int gotsomething;
	int all_sp_gotten;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *big_un;
	mp_unit32_od_t *small_un;
	size_t newreqsize;

	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means we have an old, small record
				 * that hasn't already been converted.
				 * Before we create an incore metadevice
				 * from this record we have to convert it
				 * to a big record.
				 */
				small_un =
				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mp_unit_t) +
				    ((small_un->un_numexts - 1) *
				    sizeof (struct mp_ext));
				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				softpart_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				rbp->rb_private |= MD_PRV_CONVD;
				un = big_un;
			} else {
				/* Record has already been converted */
				un = (mp_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *		does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and
 *		shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int i;
	mdi_unit_t *ui;
	minor_t mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t mnum = MD_SID(un);
	int err;
	md_dev64_t tmpdev;
	set_t setno = MD_MIN2SET(MD_SID(un));
	side_t side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if underlying is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev	- device to open.
 *		flag	- pass-through flag.
 *		otyp	- pass-through open type.
 *		cred_p	- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t *dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_oflags
)
{
	minor_t mnum = getminor(*dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;
	set_t setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if
	 * this node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the open
	 * (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (!md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while the
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev,
					    md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call.  This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev	- device to close.
 *		flag	- pass-through flag.
 *		otyp	- pass-through type.
 *		cred_p	- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_cflags
)
{
	minor_t mnum = getminor(dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (!md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 * NOTE:	this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t *un;
	buf_t *bp;
	sp_ext_length_t nb;
	daddr_t mapblk;
	int result;
	int more;
	int saveresult = 0;

	/*
	 * Don't need to grab the unit lock, because nothing else is
	 * supposed to be happening.  Also, dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

static int
sp_imp_set(
	set_t setno
)
{
	mddb_recid_t recid;
	int gotsomething;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *un64;
	mp_unit32_od_t *un32;
	md_dev64_t self_devt;
	minor_t *self_id;	/* minor needs to be updated */
	md_parent_t *parent_id;	/* parent needs to be updated */
	mddb_recid_t *record_id; /* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
			break;

		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl, */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	/*
	 * The child cache makes room for the variable-sized buf at the
	 * end of md_spcs_t: biosize() replaces the compiled-in
	 * sizeof (buf_t) so the embedded cs_buf is always large enough.
	 */
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())