/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of
 * the metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close( ).
 */
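/*
 * A concrete illustration of the extent mapping described above (the
 * numbers are hypothetical): a 30-block soft partition built from two
 * non-contiguous extents might carry this extent array:
 *
 *	ext	un_voff	un_poff	un_len
 *	 0	      0	    100	    10
 *	 1	     10	    500	    20
 *
 * A read of virtual blocks 5..14 resolves block 5 to physical block
 * 105 via extent 0, but only blocks 5..9 fit in that extent, so the
 * I/O is fragmented: blocks 10..14 are issued as a second child I/O at
 * physical block 500 via extent 1.  sp_mapbuf( ) below implements this
 * translation and fragmentation decision.
 */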
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		sp_md_ops;
#ifndef	lint
static char		_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	none.
 * OUTPUT:	ps	- parent save structure initialized.
 * RETURNS:	void *	- ptr to initialized parent save structure.
 * PURPOSE:	initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	void *	- ptr to initialized child save structure.
 * PURPOSE:	initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
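/*
 * A note on the constructor/init split above (an observation, not part
 * of the original commentary): the kmem cache constructors set up only
 * the pieces that survive kmem_cache_free( )/kmem_cache_alloc( ) cycles
 * (the parent's mutex, the child's buf), while the cheap per-use fields
 * are (re)initialized by sp_parent_init( )/sp_child_init( ), which
 * callers invoke after every allocation:
 *
 *	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
 *	sp_parent_init(ps);
 */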
/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p	- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t	*un = (mp_unit_t *)p;
	minor_t		mnum;
	set_t		setno;
	md_dev64_t	tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place unit in in-core array */
	MD_UNIT(mnum) = un;
	return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un	- unit structure to be reset/removed.
 *		mnum	- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *			  permanently or just resetting in-core
 *			  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t	*sv;
	mddb_recid_t	vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	MD_UNIT(mnum) = NULL;

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			  MD_SP_OK  - soft-partition is now OK
 *			  MD_SP_ERR - soft-partition is now errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node.  If
 *		the message succeeds we simply return.  If it fails we panic
 *		as the cluster-wide view of the metadevices is now
 *		inconsistent.
 * CALLING CONTEXT:
 *	Blockable.  No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t	sp_msg;
	md_mn_kresult_t		*kres;
	set_t			setno = MD_UN2SET(un);
	int			rval;
	const char		*str =
	    (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
		/*
		 * Panic as we are now in an inconsistent state.
		 */
		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION:	sp_finish_error
 * INPUT:	ps	- parent save structure for errored I/O.
 *		lock_held	- set if the unit readerlock is held
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf	*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	md_dev64_t	un_dev;			/* underlying device */
	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
	char		*str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}

	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}

/*
 * FUNCTION:	sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted the OK state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION:	sp_xmit_error
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted the error state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}
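/*
 * A note on the two wrappers above (editorial): sp_error( ) runs in
 * interrupt context and must not block, while sp_send_stat_msg( ) is
 * blockable and must not hold locks.  sp_send_stat_ok( ) and
 * sp_send_stat_err( ) bridge the two contexts: they allocate a fresh
 * parent structure and queue sp_xmit_ok( )/sp_xmit_error( ) onto
 * md_sp_daemon via daemon_request( ), so the cluster message is sent
 * later from a daemon thread.
 */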
/*
 * FUNCTION:	sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t	setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread.  We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request.  That way we can service any messages
	 * that require a writerlock to be held.  Additionally, to avoid
	 * deadlocking when at the bottom of a metadevice stack and a higher
	 * level mirror has multiple requests outstanding on this soft-part,
	 * we clone the ps that failed and pass the error back up the stack
	 * to release the reference that this i/o may have in the
	 * higher-level metadevice.  The other nodes in the cluster just
	 * have to modify the soft-part status and we do not need to block
	 * the i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t	*err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);
}

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of blocks in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t	*un,
	sp_ext_offset_t	voff,
	sp_ext_length_t	bcount,
	buf_t		*bp
)
{
	int		lo, mid, hi, found, more;
	size_t		new_bcount;
	sp_ext_offset_t	new_blkno;
	sp_ext_offset_t	new_offset;
	sp_ext_offset_t	ext_endblk;
	md_dev64_t	new_edev;
	extern unsigned	md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff >= un_voff + un_len of the mid extent */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}
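/*
 * Worked example for sp_mapbuf( ), using the hypothetical two-extent
 * layout from the comment near the top of this file (extent 0:
 * un_voff 0, un_poff 100, un_len 10; extent 1: un_voff 10, un_poff 500,
 * un_len 20).  A call with voff == 5 and bcount == dbtob(10):
 *
 *	- the binary search lands on extent 0 (5 lies in [0, 10));
 *	- new_blkno = 100 + (5 - 0) = 105;
 *	- ext_endblk = 0 + 10 = 10; since 5 + 10 > 10 the request is
 *	  clipped to new_bcount = dbtob(10 - 5) and 1 is returned
 *	  ("more work to do");
 *	- the caller re-invokes sp_mapbuf( ) with voff == 10 for the
 *	  remaining 5 blocks, which map to extent 1 at physical block
 *	  500, returning 0.
 */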
/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks, we send a
 *		message to the master node to clear the error.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t		ext;
	struct buf	*buf;
	sp_ext_length_t	len;
	mp_watermark_t	*wm;
	set_t		setno;
	int		reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node.  This ensures a
			 * consistent view of the soft-part across the
			 * cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should
	 * clear the error condition on all nodes in the set.  Use
	 * SP_SETSTAT2 with MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}
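/*
 * A sketch of the on-disk arrangement the checks above rely on
 * (inferred from this routine, not a normative format description):
 * each allocated extent is preceded by a one-block watermark, which is
 * why the read above targets un_poff - 1:
 *
 *	... | watermark | extent data (un_len blocks) | ...
 *
 * The watermark carries wm_magic (MD_SP_MAGIC), wm_seq (the extent's
 * index within the soft partition), wm_length (the extent's un_len),
 * wm_type (EXTTYP_ALLOC for in-use extents), and a checksum over the
 * whole mp_watermark_t.
 */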
/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *				  this is the buffer on which I/O has just
 *				  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		1	- error.
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf	*parent_buf;
	mdi_unit_t	*ui;
	md_spps_t	*ps;
	md_spcs_t	*cs;

	/*
	 * find the child save structure to which this buffer belongs.
	 * cs_buf is the final member of md_spcs_t (see the cache size
	 * computation in init_init()), so backing up from the buf by
	 * (sizeof (md_spcs_t) - sizeof (buf_t)) recovers the enclosing
	 * structure.
	 */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *		- Allocate a child save structure to keep track
 *		  of the I/O we are going to pass down.
 *		- Map the I/O to the correct extent in the soft
 *		  partition (see sp_mapbuf()).
 *		- bioclone() the buffer and pass it down the
 *		  stack using md_call_strategy.
 *		- If the I/O needs to split across extents,
 *		  repeat the above steps until all fragments
 *		  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf;
	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int		strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}
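/*
 * Fragment accounting in md_sp_strategy( ) (an editorial note): ps_frags
 * starts at 1 for the first child and is bumped under ps_mx before each
 * additional fragment is issued, so an I/O spanning three extents drives
 * the count 1 -> 2 -> 3 as children are launched.  Each sp_done( )
 * completion decrements it, and only the completion that reaches zero
 * tears down the parent and calls md_biodone( ) on the original buf.
 */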
/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user.  This is essentially
 *		the same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf, *parent_buf;
	void		*kbuffer;
	vol_directed_rd_t	cvdr;
	caddr_t		userbuf;
	offset_t	useroff;
	int		ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */
	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}
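/*
 * Flow summary for sp_directed_read( ) (a recap, nothing new): the
 * user's request is split per-extent exactly as in md_sp_strategy( ),
 * but each fragment is issued synchronously as a DKIOCDMR ioctl on the
 * underlying device and the result is copied straight out to the
 * caller, with per-fragment vdr_bytesread values accumulated into the
 * user's vol_directed_rd_t.
 */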
/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevices records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_sp_gotten;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*big_un;
	mp_unit32_od_t	*small_un;
	size_t		newreqsize;

	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		if ((rbp->rb_revision == MDDB_REV_RB) &&
		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
			/*
			 * This means we have an old, small record that
			 * has not yet been converted.  Before we create
			 * an incore metadevice from it we have to convert
			 * it to a big record.
			 */
			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			newreqsize = sizeof (mp_unit_t) +
			    ((small_un->un_numexts - 1) *
			    sizeof (struct mp_ext));
			big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
			    KM_SLEEP);
			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
			    SMALL_2_BIG);
			kmem_free(small_un, dep->de_reqsize);
			dep->de_rb_userdata = big_un;
			dep->de_reqsize = newreqsize;
			rbp->rb_private |= MD_PRV_CONVD;
			un = big_un;
		} else {
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
		}

		/* Set revision and flag accordingly */
		if (rbp->rb_revision == MDDB_REV_RB) {
			un->c.un_revision = MD_32BIT_META_DEV;
		} else {
			un->c.un_revision = MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
		}

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}
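/*
 * Sizing note for the conversion in sp_snarf( ) (an explanatory aside):
 * the allocation
 *
 *	newreqsize = sizeof (mp_unit_t) +
 *	    ((small_un->un_numexts - 1) * sizeof (struct mp_ext));
 *
 * implies that mp_unit_t declares a one-element extent array as its
 * final member, so a unit with N extents needs N - 1 additional
 * struct mp_ext entries; e.g. under that assumption a 4-extent soft
 * partition is allocated as sizeof (mp_unit_t) + 3 * sizeof (struct
 * mp_ext).
 */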
/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *		does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and
 *		shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int		i;
	mdi_unit_t	*ui;
	minor_t		mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t		mnum = MD_SID(un);
	int		err;
	md_dev64_t	tmpdev;
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if underlying is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev		- device to open.
 *		flag		- pass-through flag.
 *		otyp		- pass-through open type.
 *		cred_p		- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if
	 * this node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the open
	 * (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while the
				 * unit_openclose lock is held.
				 * So, actually open the device, drop the
				 * openclose lock, call sp_validate,
				 * reacquire the openclose lock, and close
				 * the device.  If sp_validate succeeds,
				 * the device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev,
					    md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe
			 * open so we will maintain the errored state during
			 * a probe call.  This means the sys-admin must
			 * metarecover -m to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}
/*
 * FUNCTION:	sp_close()
 * INPUT:	dev		- device to close.
 *		flag		- pass-through flag.
 *		otyp		- pass-through type.
 *		cred_p		- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_cflags
)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (e.g. ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 *		NOTE: this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t	*un;
	buf_t		*bp;
	sp_ext_length_t	nb;
	daddr_t		mapblk;
	int		result;
	int		more;
	int		saveresult = 0;

	/*
	 * We don't need to grab the unit lock because nothing else is
	 * supposed to be happening, and dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

static int
sp_imp_set(
	set_t	setno
)
{
	mddb_recid_t	recid;
	int		gotsomething;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*un64;
	mp_unit32_od_t	*un32;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		if (rbp->rb_revision == MDDB_REV_RB) {
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
		} else {
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		/* re-key the minor, parent, and record ids to the new set */
		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl, */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())