1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Soft partitioning metadevice driver (md_sp). 30 * 31 * This file contains the primary operations of the soft partitioning 32 * metadevice driver. This includes all routines for normal operation 33 * (open/close/read/write). Please see mdvar.h for a definition of 34 * metadevice operations vector (md_ops_t). This driver is loosely 35 * based on the stripe driver (md_stripe). 36 * 37 * All metadevice administration is done through the use of ioctl's. 38 * As such, all administrative routines appear in sp_ioctl.c. 39 * 40 * Soft partitions are represented both in-core and in the metadb with a 41 * unit structure. The soft partition-specific information in the unit 42 * structure includes the following information: 43 * - Device information (md_dev64_t & md key) about the device on which 44 * the soft partition is built. 45 * - Soft partition status information. 46 * - The size of the soft partition and number of extents used to 47 * make up that size. 48 * - An array of exents which define virtual/physical offset 49 * mappings and lengths for each extent. 50 * 51 * Typical soft partition operation proceeds as follows: 52 * - The unit structure is fetched from the metadb and placed into 53 * an in-core array (as with other metadevices). This operation 54 * is performed via sp_build_incore( ) and takes place during 55 * "snarfing" (when all metadevices are brought in-core at 56 * once) and when a new soft partition is created. 57 * - A soft partition is opened via sp_open( ). At open time the 58 * the soft partition unit structure is verified with the soft 59 * partition on-disk structures. Additionally, the soft partition 60 * status is checked (only soft partitions in the OK state may be 61 * opened). 62 * - Soft partition I/O is performed via sp_strategy( ) which relies on 63 * a support routine, sp_mapbuf( ), to do most of the work. 64 * sp_mapbuf( ) maps a buffer to a particular extent via a binary 65 * search of the extent array in the soft partition unit structure. 66 * Once a translation has been performed, the I/O is passed down 67 * to the next layer, which may be another metadevice or a physical 68 * disk. Since a soft partition may contain multiple, non-contiguous 69 * extents, a single I/O may have to be fragmented. 70 * - Soft partitions are closed using sp_close. 71 * 72 */ 73 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/conf.h> 77 #include <sys/file.h> 78 #include <sys/user.h> 79 #include <sys/uio.h> 80 #include <sys/t_lock.h> 81 #include <sys/buf.h> 82 #include <sys/dkio.h> 83 #include <sys/vtoc.h> 84 #include <sys/kmem.h> 85 #include <vm/page.h> 86 #include <sys/cmn_err.h> 87 #include <sys/sysmacros.h> 88 #include <sys/types.h> 89 #include <sys/mkdev.h> 90 #include <sys/stat.h> 91 #include <sys/open.h> 92 #include <sys/lvm/mdvar.h> 93 #include <sys/lvm/md_sp.h> 94 #include <sys/lvm/md_convert.h> 95 #include <sys/lvm/md_notify.h> 96 #include <sys/lvm/md_crc.h> 97 #include <sys/modctl.h> 98 #include <sys/ddi.h> 99 #include <sys/sunddi.h> 100 #include <sys/debug.h> 101 102 #include <sys/sysevent/eventdefs.h> 103 #include <sys/sysevent/svm.h> 104 105 md_ops_t sp_md_ops; 106 #ifndef lint 107 char _depends_on[] = "drv/md"; 108 md_ops_t *md_interface_ops = &sp_md_ops; 109 #endif 110 111 extern unit_t md_nunits; 112 extern set_t md_nsets; 113 extern md_set_t md_set[]; 114 115 extern int md_status; 116 extern major_t md_major; 117 extern mdq_anchor_t md_done_daemon; 118 extern mdq_anchor_t md_sp_daemon; 119 extern kmutex_t md_mx; 120 extern kcondvar_t md_cv; 121 extern md_krwlock_t md_unit_array_rw; 122 123 static kmem_cache_t *sp_parent_cache = NULL; 124 static kmem_cache_t *sp_child_cache = NULL; 125 static void sp_send_stat_ok(mp_unit_t *); 126 static void sp_send_stat_err(mp_unit_t *); 127 128 /* 129 * FUNCTION: sp_parent_constructor() 130 * INPUT: none. 131 * OUTPUT: ps - parent save structure initialized. 132 * RETURNS: void * - ptr to initialized parent save structure. 133 * PURPOSE: initialize parent save structure. 134 */ 135 /*ARGSUSED1*/ 136 static int 137 sp_parent_constructor(void *p, void *d1, int d2) 138 { 139 mutex_init(&((md_spps_t *)p)->ps_mx, 140 NULL, MUTEX_DEFAULT, NULL); 141 return (0); 142 } 143 144 static void 145 sp_parent_init(md_spps_t *ps) 146 { 147 bzero(ps, offsetof(md_spps_t, ps_mx)); 148 } 149 150 /*ARGSUSED1*/ 151 static void 152 sp_parent_destructor(void *p, void *d) 153 { 154 mutex_destroy(&((md_spps_t *)p)->ps_mx); 155 } 156 157 /* 158 * FUNCTION: sp_child_constructor() 159 * INPUT: none. 160 * OUTPUT: cs - child save structure initialized. 161 * RETURNS: void * - ptr to initialized child save structure. 162 * PURPOSE: initialize child save structure. 163 */ 164 /*ARGSUSED1*/ 165 static int 166 sp_child_constructor(void *p, void *d1, int d2) 167 { 168 bioinit(&((md_spcs_t *)p)->cs_buf); 169 return (0); 170 } 171 172 static void 173 sp_child_init(md_spcs_t *cs) 174 { 175 cs->cs_mdunit = 0; 176 cs->cs_ps = NULL; 177 md_bioreset(&cs->cs_buf); 178 } 179 180 /*ARGSUSED1*/ 181 static void 182 sp_child_destructor(void *p, void *d) 183 { 184 biofini(&((md_spcs_t *)p)->cs_buf); 185 } 186 187 /* 188 * FUNCTION: sp_run_queue() 189 * INPUT: none. 190 * OUTPUT: none. 191 * RETURNS: void. 192 * PURPOSE: run the md_daemon to clean up memory pool. 193 */ 194 /*ARGSUSED*/ 195 static void 196 sp_run_queue(void *d) 197 { 198 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 199 md_daemon(1, &md_done_daemon); 200 } 201 202 203 /* 204 * FUNCTION: sp_build_incore() 205 * INPUT: p - ptr to unit structure. 206 * snarfing - flag to tell us we are snarfing. 207 * OUTPUT: non. 208 * RETURNS: int - 0 (always). 209 * PURPOSE: place unit structure into in-core unit array (keyed from 210 * minor number). 211 */ 212 int 213 sp_build_incore(void *p, int snarfing) 214 { 215 mp_unit_t *un = (mp_unit_t *)p; 216 minor_t mnum; 217 set_t setno; 218 md_dev64_t tmpdev; 219 220 mnum = MD_SID(un); 221 222 if (MD_UNIT(mnum) != NULL) 223 return (0); 224 225 MD_STATUS(un) = 0; 226 227 if (snarfing) { 228 /* 229 * if we are snarfing, we get the device information 230 * from the metadb record (using the metadb key for 231 * that device). 232 */ 233 setno = MD_MIN2SET(mnum); 234 235 tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), 236 un->un_key, MD_NOTRUST_DEVT); 237 un->un_dev = tmpdev; 238 } 239 240 /* place unit in in-core array */ 241 MD_UNIT(mnum) = un; 242 return (0); 243 } 244 245 /* 246 * FUNCTION: reset_sp() 247 * INPUT: un - unit structure to be reset/removed. 248 * mnum - minor number to be reset/removed. 249 * removing - flag to tell us if we are removing 250 * permanently or just reseting in-core 251 * structures. 252 * OUTPUT: none. 253 * RETURNS: void. 254 * PURPOSE: used to either simply reset in-core structures or to 255 * permanently remove metadevices from the metadb. 256 */ 257 void 258 reset_sp(mp_unit_t *un, minor_t mnum, int removing) 259 { 260 sv_dev_t *sv; 261 mddb_recid_t vtoc_id; 262 263 /* clean up in-core structures */ 264 md_destroy_unit_incore(mnum, &sp_md_ops); 265 266 MD_UNIT(mnum) = NULL; 267 268 /* 269 * Attempt release of minor node 270 */ 271 md_remove_minor_node(mnum); 272 273 if (!removing) 274 return; 275 276 /* we are removing the soft partition from the metadb */ 277 278 /* 279 * Save off device information so we can get to 280 * it after we do the mddb_deleterec(). 281 */ 282 sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); 283 sv->setno = MD_MIN2SET(mnum); 284 sv->key = un->un_key; 285 vtoc_id = un->c.un_vtoc_id; 286 287 /* 288 * Remove self from the namespace 289 */ 290 if (un->c.un_revision & MD_FN_META_DEV) { 291 (void) md_rem_selfname(un->c.un_self_id); 292 } 293 294 /* Remove the unit structure */ 295 mddb_deleterec_wrapper(un->c.un_record_id); 296 297 if (vtoc_id) 298 mddb_deleterec_wrapper(vtoc_id); 299 300 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE, 301 MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); 302 303 /* 304 * remove the underlying device name from the metadb. if other 305 * soft partitions are built on this device, this will simply 306 * decrease the reference count for this device. otherwise the 307 * name record for this device will be removed from the metadb. 308 */ 309 md_rem_names(sv, 1); 310 kmem_free(sv, sizeof (sv_dev_t)); 311 } 312 313 /* 314 * FUNCTION: sp_send_stat_msg 315 * INPUT: un - unit reference 316 * status - status to be sent to master node 317 * MD_SP_OK - soft-partition is now OK 318 * MD_SP_ERR " " errored 319 * OUTPUT: none. 320 * RETURNS: void. 321 * PURPOSE: send a soft-partition status change to the master node. If the 322 * message succeeds we simply return. If it fails we panic as the 323 * cluster-wide view of the metadevices is now inconsistent. 324 * CALLING CONTEXT: 325 * Blockable. No locks can be held. 326 */ 327 static void 328 sp_send_stat_msg(mp_unit_t *un, sp_status_t status) 329 { 330 md_mn_msg_sp_setstat_t sp_msg; 331 md_mn_kresult_t *kres; 332 set_t setno = MD_UN2SET(un); 333 int rval; 334 const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK"; 335 336 sp_msg.sp_setstat_mnum = MD_SID(un); 337 sp_msg.sp_setstat_status = status; 338 339 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 340 341 rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, 342 (char *)&sp_msg, sizeof (sp_msg), kres); 343 344 if (!MDMN_KSEND_MSG_OK(rval, kres)) { 345 mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); 346 347 /* 348 * Panic as we are now in an inconsistent state. 349 */ 350 351 cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", 352 md_shortname(MD_SID(un)), str); 353 } 354 355 kmem_free(kres, sizeof (md_mn_kresult_t)); 356 } 357 358 /* 359 * FUNCTION: sp_finish_error 360 * INPUT: ps - parent save structure for error-ed I/O. 361 * lock_held - set if the unit readerlock is held 362 * OUTPUT: none. 363 * RETURNS: void. 364 * PURPOSE: report a driver error 365 */ 366 static void 367 sp_finish_error(md_spps_t *ps, int lock_held) 368 { 369 struct buf *pb = ps->ps_bp; 370 mdi_unit_t *ui = ps->ps_ui; 371 md_dev64_t un_dev; /* underlying device */ 372 md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */ 373 char *str; 374 375 un_dev = md_expldev(ps->ps_un->un_dev); 376 /* set error type */ 377 if (pb->b_flags & B_READ) { 378 str = "read"; 379 } else { 380 str = "write"; 381 } 382 383 384 SPPS_FREE(sp_parent_cache, ps); 385 pb->b_flags |= B_ERROR; 386 387 md_kstat_done(ui, pb, 0); 388 389 if (lock_held) { 390 md_unit_readerexit(ui); 391 } 392 md_biodone(pb); 393 394 cmn_err(CE_WARN, "md: %s: %s error on %s", 395 md_shortname(md_getminor(md_dev)), str, 396 md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0)); 397 } 398 399 400 /* 401 * FUNCTION: sp_xmit_ok 402 * INPUT: dq - daemon queue referencing failing ps structure 403 * OUTPUT: none. 404 * RETURNS: void. 405 * PURPOSE: send a message to the master node in a multi-owner diskset to 406 * update all attached nodes view of the soft-part to be MD_SP_OK. 407 * CALLING CONTEXT: 408 * Blockable. No unit lock held. 409 */ 410 static void 411 sp_xmit_ok(daemon_queue_t *dq) 412 { 413 md_spps_t *ps = (md_spps_t *)dq; 414 415 /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 416 sp_send_stat_msg(ps->ps_un, MD_SP_OK); 417 418 /* 419 * Successfully transmitted error state to all nodes, now release this 420 * parent structure. 421 */ 422 SPPS_FREE(sp_parent_cache, ps); 423 } 424 425 /* 426 * FUNCTION: sp_xmit_error 427 * INPUT: dq - daemon queue referencing failing ps structure 428 * OUTPUT: none. 429 * RETURNS: void. 430 * PURPOSE: send a message to the master node in a multi-owner diskset to 431 * update all attached nodes view of the soft-part to be MD_SP_ERR. 432 * CALLING CONTEXT: 433 * Blockable. No unit lock held. 434 */ 435 static void 436 sp_xmit_error(daemon_queue_t *dq) 437 { 438 md_spps_t *ps = (md_spps_t *)dq; 439 440 /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 441 sp_send_stat_msg(ps->ps_un, MD_SP_ERR); 442 443 /* 444 * Successfully transmitted error state to all nodes, now release this 445 * parent structure. 446 */ 447 SPPS_FREE(sp_parent_cache, ps); 448 } 449 static void 450 sp_send_stat_ok(mp_unit_t *un) 451 { 452 minor_t mnum = MD_SID(un); 453 md_spps_t *ps; 454 455 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 456 sp_parent_init(ps); 457 ps->ps_un = un; 458 ps->ps_ui = MDI_UNIT(mnum); 459 460 daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps, 461 REQ_OLD); 462 } 463 464 static void 465 sp_send_stat_err(mp_unit_t *un) 466 { 467 minor_t mnum = MD_SID(un); 468 md_spps_t *ps; 469 470 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 471 sp_parent_init(ps); 472 ps->ps_un = un; 473 ps->ps_ui = MDI_UNIT(mnum); 474 475 daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps, 476 REQ_OLD); 477 } 478 479 480 /* 481 * FUNCTION: sp_error() 482 * INPUT: ps - parent save structure for error-ed I/O. 483 * OUTPUT: none. 484 * RETURNS: void. 485 * PURPOSE: report a driver error. 486 * CALLING CONTEXT: 487 * Interrupt - non-blockable 488 */ 489 static void 490 sp_error(md_spps_t *ps) 491 { 492 set_t setno = MD_UN2SET(ps->ps_un); 493 494 /* 495 * Drop the mutex associated with this request before (potentially) 496 * enqueuing the free onto a separate thread. We have to release the 497 * mutex before destroying the parent structure. 498 */ 499 if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 500 if (MUTEX_HELD(&ps->ps_mx)) { 501 mutex_exit(&ps->ps_mx); 502 } 503 } else { 504 /* 505 * this should only ever happen if we are panicking, 506 * since DONTFREE is only set on the parent if panicstr 507 * is non-NULL. 508 */ 509 ASSERT(panicstr); 510 } 511 512 /* 513 * For a multi-owner set we need to send a message to the master so that 514 * all nodes get the errored status when we first encounter it. To avoid 515 * deadlocking when multiple soft-partitions encounter an error on one 516 * physical unit we drop the unit readerlock before enqueueing the 517 * request. That way we can service any messages that require a 518 * writerlock to be held. Additionally, to avoid deadlocking when at 519 * the bottom of a metadevice stack and a higher level mirror has 520 * multiple requests outstanding on this soft-part, we clone the ps 521 * that failed and pass the error back up the stack to release the 522 * reference that this i/o may have in the higher-level metadevice. 523 * The other nodes in the cluster just have to modify the soft-part 524 * status and we do not need to block the i/o completion for this. 525 */ 526 if (MD_MNSET_SETNO(setno)) { 527 md_spps_t *err_ps; 528 err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 529 sp_parent_init(err_ps); 530 531 err_ps->ps_un = ps->ps_un; 532 err_ps->ps_ui = ps->ps_ui; 533 534 md_unit_readerexit(ps->ps_ui); 535 536 daemon_request(&md_sp_daemon, sp_xmit_error, 537 (daemon_queue_t *)err_ps, REQ_OLD); 538 539 sp_finish_error(ps, 0); 540 541 return; 542 } else { 543 ps->ps_un->un_status = MD_SP_ERR; 544 } 545 546 /* Flag the error */ 547 sp_finish_error(ps, 1); 548 549 } 550 551 /* 552 * FUNCTION: sp_mapbuf() 553 * INPUT: un - unit structure for soft partition we are doing 554 * I/O on. 555 * voff - virtual offset in soft partition to map. 556 * bcount - # of blocks in the I/O. 557 * OUTPUT: bp - translated buffer to be passed down to next layer. 558 * RETURNS: 1 - request must be fragmented, more work to do, 559 * 0 - request satisified, no more work to do 560 * -1 - error 561 * PURPOSE: Map the the virtual offset in the soft partition (passed 562 * in via voff) to the "physical" offset on whatever the soft 563 * partition is built on top of. We do this by doing a binary 564 * search of the extent array in the soft partition unit 565 * structure. Once the current extent is found, we do the 566 * translation, determine if the I/O will cross extent 567 * boundaries (if so, we have to fragment the I/O), then 568 * fill in the buf structure to be passed down to the next layer. 569 */ 570 static int 571 sp_mapbuf( 572 mp_unit_t *un, 573 sp_ext_offset_t voff, 574 sp_ext_length_t bcount, 575 buf_t *bp 576 ) 577 { 578 int lo, mid, hi, found, more; 579 size_t new_bcount; 580 sp_ext_offset_t new_blkno; 581 sp_ext_offset_t new_offset; 582 sp_ext_offset_t ext_endblk; 583 md_dev64_t new_edev; 584 extern unsigned md_maxphys; 585 586 found = 0; 587 lo = 0; 588 hi = un->un_numexts - 1; 589 590 /* 591 * do a binary search to find the extent that contains the 592 * starting offset. after this loop, mid contains the index 593 * of the correct extent. 594 */ 595 while (lo <= hi && !found) { 596 mid = (lo + hi) / 2; 597 /* is the starting offset contained within the mid-ext? */ 598 if (voff >= un->un_ext[mid].un_voff && 599 voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len) 600 found = 1; 601 else if (voff < un->un_ext[mid].un_voff) 602 hi = mid - 1; 603 else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */ 604 lo = mid + 1; 605 } 606 607 if (!found) { 608 cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff); 609 return (-1); 610 } 611 612 /* translate to underlying physical offset/device */ 613 new_offset = voff - un->un_ext[mid].un_voff; 614 new_blkno = un->un_ext[mid].un_poff + new_offset; 615 new_edev = un->un_dev; 616 617 /* determine if we need to break the I/O into fragments */ 618 ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len; 619 if (voff + btodb(bcount) > ext_endblk) { 620 new_bcount = dbtob(ext_endblk - voff); 621 more = 1; 622 } else { 623 new_bcount = bcount; 624 more = 0; 625 } 626 627 /* only break up the I/O if we're not built on another metadevice */ 628 if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) { 629 new_bcount = md_maxphys; 630 more = 1; 631 } 632 if (bp != (buf_t *)NULL) { 633 /* do bp updates */ 634 bp->b_bcount = new_bcount; 635 bp->b_lblkno = new_blkno; 636 bp->b_edev = md_dev64_to_dev(new_edev); 637 } 638 return (more); 639 } 640 641 /* 642 * FUNCTION: sp_validate() 643 * INPUT: un - unit structure to be validated. 644 * OUTPUT: none. 645 * RETURNS: 0 - soft partition ok. 646 * -1 - error. 647 * PURPOSE: called on open to sanity check the soft partition. In 648 * order to open a soft partition: 649 * - it must have at least one extent 650 * - the extent info in core and on disk must match 651 * - it may not be in an intermediate state (which would 652 * imply that a two-phase commit was interrupted) 653 * 654 * If the extent checking fails (B_ERROR returned from the read 655 * strategy call) _and_ we're a multi-owner diskset, we send a 656 * message to the master so that all nodes inherit the same view 657 * of the soft partition. 658 * If we are checking a soft-part that is marked as in error, and 659 * we can actually read and validate the watermarks we send a 660 * message to clear the error to the master node. 661 */ 662 static int 663 sp_validate(mp_unit_t *un) 664 { 665 uint_t ext; 666 struct buf *buf; 667 sp_ext_length_t len; 668 mp_watermark_t *wm; 669 set_t setno; 670 int reset_error = 0; 671 672 setno = MD_UN2SET(un); 673 674 /* sanity check unit structure components ?? */ 675 if (un->un_status != MD_SP_OK) { 676 if (un->un_status != MD_SP_ERR) { 677 cmn_err(CE_WARN, "md: %s: open failed, soft partition " 678 "status is %u.", 679 md_shortname(MD_SID(un)), 680 un->un_status); 681 return (-1); 682 } else { 683 cmn_err(CE_WARN, "md: %s: open of soft partition " 684 "in Errored state.", 685 md_shortname(MD_SID(un))); 686 reset_error = 1; 687 } 688 } 689 690 if (un->un_numexts == 0) { 691 cmn_err(CE_WARN, "md: %s: open failed, soft partition does " 692 "not have any extents.", md_shortname(MD_SID(un))); 693 return (-1); 694 } 695 696 len = 0LL; 697 for (ext = 0; ext < un->un_numexts; ext++) { 698 699 /* tally extent lengths to check total size */ 700 len += un->un_ext[ext].un_len; 701 702 /* allocate buffer for watermark */ 703 buf = getrbuf(KM_SLEEP); 704 705 /* read watermark */ 706 buf->b_flags = B_READ; 707 buf->b_edev = md_dev64_to_dev(un->un_dev); 708 buf->b_iodone = NULL; 709 buf->b_proc = NULL; 710 buf->b_bcount = sizeof (mp_watermark_t); 711 buf->b_lblkno = un->un_ext[ext].un_poff - 1; 712 buf->b_bufsize = sizeof (mp_watermark_t); 713 buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t), 714 KM_SLEEP); 715 716 /* 717 * make the call non-blocking so that it is not affected 718 * by a set take. 719 */ 720 md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL); 721 (void) biowait(buf); 722 723 if (buf->b_flags & B_ERROR) { 724 cmn_err(CE_WARN, "md: %s: open failed, could not " 725 "read watermark at block %llu for extent %u, " 726 "error %d.", md_shortname(MD_SID(un)), 727 buf->b_lblkno, ext, buf->b_error); 728 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 729 freerbuf(buf); 730 731 /* 732 * If we're a multi-owner diskset we send a message 733 * indicating that this soft-part has an invalid 734 * extent to the master node. This ensures a consistent 735 * view of the soft-part across the cluster. 736 */ 737 if (MD_MNSET_SETNO(setno)) { 738 sp_send_stat_err(un); 739 } 740 return (-1); 741 } 742 743 wm = (mp_watermark_t *)buf->b_un.b_addr; 744 745 /* make sure the checksum is correct first */ 746 if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum, 747 (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) { 748 cmn_err(CE_WARN, "md: %s: open failed, watermark " 749 "at block %llu for extent %u does not have a " 750 "valid checksum 0x%08x.", md_shortname(MD_SID(un)), 751 buf->b_lblkno, ext, wm->wm_checksum); 752 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 753 freerbuf(buf); 754 return (-1); 755 } 756 757 if (wm->wm_magic != MD_SP_MAGIC) { 758 cmn_err(CE_WARN, "md: %s: open failed, watermark " 759 "at block %llu for extent %u does not have a " 760 "valid watermark magic number, expected 0x%x, " 761 "found 0x%x.", md_shortname(MD_SID(un)), 762 buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic); 763 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 764 freerbuf(buf); 765 return (-1); 766 } 767 768 /* make sure sequence number matches the current extent */ 769 if (wm->wm_seq != ext) { 770 cmn_err(CE_WARN, "md: %s: open failed, watermark " 771 "at block %llu for extent %u has invalid " 772 "sequence number %u.", md_shortname(MD_SID(un)), 773 buf->b_lblkno, ext, wm->wm_seq); 774 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 775 freerbuf(buf); 776 return (-1); 777 } 778 779 /* make sure watermark length matches unit structure */ 780 if (wm->wm_length != un->un_ext[ext].un_len) { 781 cmn_err(CE_WARN, "md: %s: open failed, watermark " 782 "at block %llu for extent %u has inconsistent " 783 "length, expected %llu, found %llu.", 784 md_shortname(MD_SID(un)), buf->b_lblkno, 785 ext, un->un_ext[ext].un_len, 786 (u_longlong_t)wm->wm_length); 787 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 788 freerbuf(buf); 789 return (-1); 790 } 791 792 /* 793 * make sure the type is a valid soft partition and not 794 * a free extent or the end. 795 */ 796 if (wm->wm_type != EXTTYP_ALLOC) { 797 cmn_err(CE_WARN, "md: %s: open failed, watermark " 798 "at block %llu for extent %u is not marked " 799 "as in-use, type = %u.", md_shortname(MD_SID(un)), 800 buf->b_lblkno, ext, wm->wm_type); 801 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 802 freerbuf(buf); 803 return (-1); 804 } 805 /* free up buffer */ 806 kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 807 freerbuf(buf); 808 } 809 810 if (len != un->un_length) { 811 cmn_err(CE_WARN, "md: %s: open failed, computed length " 812 "%llu != expected length %llu.", md_shortname(MD_SID(un)), 813 len, un->un_length); 814 return (-1); 815 } 816 817 /* 818 * If we're a multi-owner set _and_ reset_error is set, we should clear 819 * the error condition on all nodes in the set. Use SP_SETSTAT2 with 820 * MD_SP_OK. 821 */ 822 if (MD_MNSET_SETNO(setno) && reset_error) { 823 sp_send_stat_ok(un); 824 } 825 return (0); 826 } 827 828 /* 829 * FUNCTION: sp_done() 830 * INPUT: child_buf - buffer attached to child save structure. 831 * this is the buffer on which I/O has just 832 * completed. 833 * OUTPUT: none. 834 * RETURNS: 0 - success. 835 * 1 - error. 836 * PURPOSE: called on I/O completion. 837 */ 838 static int 839 sp_done(struct buf *child_buf) 840 { 841 struct buf *parent_buf; 842 mdi_unit_t *ui; 843 md_spps_t *ps; 844 md_spcs_t *cs; 845 846 /* find the child save structure to which this buffer belongs */ 847 cs = (md_spcs_t *)((caddr_t)child_buf - 848 (sizeof (md_spcs_t) - sizeof (buf_t))); 849 /* now get the parent save structure */ 850 ps = cs->cs_ps; 851 parent_buf = ps->ps_bp; 852 853 mutex_enter(&ps->ps_mx); 854 /* pass any errors back up to the parent */ 855 if (child_buf->b_flags & B_ERROR) { 856 ps->ps_flags |= MD_SPPS_ERROR; 857 parent_buf->b_error = child_buf->b_error; 858 } 859 /* mapout, if needed */ 860 if (child_buf->b_flags & B_REMAPPED) 861 bp_mapout(child_buf); 862 863 ps->ps_frags--; 864 if (ps->ps_frags != 0) { 865 /* 866 * if this parent has more children, we just free the 867 * child and return. 868 */ 869 kmem_cache_free(sp_child_cache, cs); 870 mutex_exit(&ps->ps_mx); 871 return (1); 872 } 873 /* there are no more children */ 874 kmem_cache_free(sp_child_cache, cs); 875 if (ps->ps_flags & MD_SPPS_ERROR) { 876 sp_error(ps); 877 return (1); 878 } 879 ui = ps->ps_ui; 880 if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 881 mutex_exit(&ps->ps_mx); 882 } else { 883 /* 884 * this should only ever happen if we are panicking, 885 * since DONTFREE is only set on the parent if panicstr 886 * is non-NULL. 887 */ 888 ASSERT(panicstr); 889 } 890 SPPS_FREE(sp_parent_cache, ps); 891 md_kstat_done(ui, parent_buf, 0); 892 md_unit_readerexit(ui); 893 md_biodone(parent_buf); 894 return (0); 895 } 896 897 /* 898 * FUNCTION: md_sp_strategy() 899 * INPUT: parent_buf - parent buffer 900 * flag - flags 901 * private - private data 902 * OUTPUT: none. 903 * RETURNS: void. 904 * PURPOSE: Soft partitioning I/O strategy. Performs the main work 905 * needed to do I/O to a soft partition. The basic 906 * algorithm is as follows: 907 * - Allocate a child save structure to keep track 908 * of the I/O we are going to pass down. 909 * - Map the I/O to the correct extent in the soft 910 * partition (see sp_mapbuf()). 911 * - bioclone() the buffer and pass it down the 912 * stack using md_call_strategy. 913 * - If the I/O needs to split across extents, 914 * repeat the above steps until all fragments 915 * are finished. 916 */ 917 static void 918 md_sp_strategy(buf_t *parent_buf, int flag, void *private) 919 { 920 md_spps_t *ps; 921 md_spcs_t *cs; 922 int more; 923 mp_unit_t *un; 924 mdi_unit_t *ui; 925 size_t current_count; 926 off_t current_offset; 927 sp_ext_offset_t current_blkno; 928 buf_t *child_buf; 929 set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev)); 930 int strat_flag = flag; 931 932 /* 933 * When doing IO to a multi owner meta device, check if set is halted. 934 * We do this check without the needed lock held, for performance 935 * reasons. 936 * If an IO just slips through while the set is locked via an 937 * MD_MN_SUSPEND_SET, we don't care about it. 938 * Only check for suspension if we are a top-level i/o request 939 * (MD_STR_NOTTOP is cleared in 'flag'); 940 */ 941 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 942 (MD_SET_HALTED | MD_SET_MNSET)) { 943 if ((flag & MD_STR_NOTTOP) == 0) { 944 mutex_enter(&md_mx); 945 /* Here we loop until the set is no longer halted */ 946 while (md_set[setno].s_status & MD_SET_HALTED) { 947 cv_wait(&md_cv, &md_mx); 948 } 949 mutex_exit(&md_mx); 950 } 951 } 952 953 ui = MDI_UNIT(getminor(parent_buf->b_edev)); 954 955 md_kstat_waitq_enter(ui); 956 957 un = (mp_unit_t *)md_unit_readerlock(ui); 958 959 if ((flag & MD_NOBLOCK) == 0) { 960 if (md_inc_iocount(setno) != 0) { 961 parent_buf->b_flags |= B_ERROR; 962 parent_buf->b_error = ENXIO; 963 parent_buf->b_resid = parent_buf->b_bcount; 964 md_kstat_waitq_exit(ui); 965 md_unit_readerexit(ui); 966 biodone(parent_buf); 967 return; 968 } 969 } else { 970 md_inc_iocount_noblock(setno); 971 } 972 973 if (!(flag & MD_STR_NOTTOP)) { 974 if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) { 975 md_kstat_waitq_exit(ui); 976 return; 977 } 978 } 979 980 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 981 sp_parent_init(ps); 982 983 /* 984 * Save essential information from the original buffhdr 985 * in the parent. 986 */ 987 ps->ps_un = un; 988 ps->ps_ui = ui; 989 ps->ps_bp = parent_buf; 990 ps->ps_addr = parent_buf->b_un.b_addr; 991 992 current_count = parent_buf->b_bcount; 993 current_blkno = (sp_ext_offset_t)parent_buf->b_blkno; 994 current_offset = 0; 995 996 /* 997 * if we are at the top and we are panicking, 998 * we don't free in order to save state. 999 */ 1000 if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) 1001 ps->ps_flags |= MD_SPPS_DONTFREE; 1002 1003 md_kstat_waitq_to_runq(ui); 1004 1005 ps->ps_frags++; 1006 1007 /* 1008 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this 1009 * metadevice. 1010 */ 1011 if (ui->ui_tstate & MD_ABR_CAP) 1012 strat_flag |= MD_STR_ABR; 1013 1014 /* 1015 * this loop does the main work of an I/O. we allocate a 1016 * a child save for each buf, do the logical to physical 1017 * mapping, decide if we need to frag the I/O, clone the 1018 * new I/O to pass down the stack. repeat until we've 1019 * taken care of the entire buf that was passed to us. 1020 */ 1021 do { 1022 cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 1023 sp_child_init(cs); 1024 child_buf = &cs->cs_buf; 1025 cs->cs_ps = ps; 1026 1027 more = sp_mapbuf(un, current_blkno, current_count, child_buf); 1028 if (more == -1) { 1029 parent_buf->b_flags |= B_ERROR; 1030 parent_buf->b_error = EIO; 1031 md_kstat_done(ui, parent_buf, 0); 1032 md_unit_readerexit(ui); 1033 md_biodone(parent_buf); 1034 kmem_cache_free(sp_parent_cache, ps); 1035 return; 1036 } 1037 1038 child_buf = md_bioclone(parent_buf, current_offset, 1039 child_buf->b_bcount, child_buf->b_edev, 1040 child_buf->b_blkno, sp_done, child_buf, 1041 KM_NOSLEEP); 1042 /* calculate new offset, counts, etc... */ 1043 current_offset += child_buf->b_bcount; 1044 current_count -= child_buf->b_bcount; 1045 current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 1046 1047 if (more) { 1048 mutex_enter(&ps->ps_mx); 1049 ps->ps_frags++; 1050 mutex_exit(&ps->ps_mx); 1051 } 1052 1053 md_call_strategy(child_buf, strat_flag, private); 1054 } while (more); 1055 1056 if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) { 1057 while (!(ps->ps_flags & MD_SPPS_DONE)) { 1058 md_daemon(1, &md_done_daemon); 1059 } 1060 kmem_cache_free(sp_parent_cache, ps); 1061 } 1062 } 1063 1064 /* 1065 * FUNCTION: sp_directed_read() 1066 * INPUT: mnum - minor number 1067 * vdr - vol_directed_rd_t from user 1068 * mode - access mode for copying data out. 1069 * OUTPUT: none. 1070 * RETURNS: 0 - success 1071 * Exxxxx - failure error-code 1072 * PURPOSE: Construct the necessary sub-device i/o requests to perform the 1073 * directed read as requested by the user. This is essentially the 1074 * same as md_sp_strategy() with the exception being that the 1075 * underlying 'md_call_strategy' is replaced with an ioctl call. 1076 */ 1077 int 1078 sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode) 1079 { 1080 md_spps_t *ps; 1081 md_spcs_t *cs; 1082 int more; 1083 mp_unit_t *un; 1084 mdi_unit_t *ui; 1085 size_t current_count; 1086 off_t current_offset; 1087 sp_ext_offset_t current_blkno; 1088 buf_t *child_buf, *parent_buf; 1089 void *kbuffer; 1090 vol_directed_rd_t cvdr; 1091 caddr_t userbuf; 1092 offset_t useroff; 1093 int ret = 0; 1094 1095 ui = MDI_UNIT(mnum); 1096 1097 md_kstat_waitq_enter(ui); 1098 1099 bzero(&cvdr, sizeof (cvdr)); 1100 1101 un = (mp_unit_t *)md_unit_readerlock(ui); 1102 1103 /* 1104 * Construct a parent_buf header which reflects the user-supplied 1105 * request. 1106 */ 1107 1108 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 1109 if (kbuffer == NULL) { 1110 vdr->vdr_flags |= DKV_DMR_ERROR; 1111 md_kstat_waitq_exit(ui); 1112 md_unit_readerexit(ui); 1113 return (ENOMEM); 1114 } 1115 1116 parent_buf = getrbuf(KM_NOSLEEP); 1117 if (parent_buf == NULL) { 1118 vdr->vdr_flags |= DKV_DMR_ERROR; 1119 md_kstat_waitq_exit(ui); 1120 md_unit_readerexit(ui); 1121 kmem_free(kbuffer, vdr->vdr_nbytes); 1122 return (ENOMEM); 1123 } 1124 parent_buf->b_un.b_addr = kbuffer; 1125 parent_buf->b_flags = B_READ; 1126 parent_buf->b_bcount = vdr->vdr_nbytes; 1127 parent_buf->b_lblkno = lbtodb(vdr->vdr_offset); 1128 parent_buf->b_edev = un->un_dev; 1129 1130 1131 ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 1132 sp_parent_init(ps); 1133 1134 /* 1135 * Save essential information from the original buffhdr 1136 * in the parent. 1137 */ 1138 ps->ps_un = un; 1139 ps->ps_ui = ui; 1140 ps->ps_bp = parent_buf; 1141 ps->ps_addr = parent_buf->b_un.b_addr; 1142 1143 current_count = parent_buf->b_bcount; 1144 current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno; 1145 current_offset = 0; 1146 1147 md_kstat_waitq_to_runq(ui); 1148 1149 ps->ps_frags++; 1150 vdr->vdr_bytesread = 0; 1151 1152 /* 1153 * this loop does the main work of an I/O. we allocate a 1154 * a child save for each buf, do the logical to physical 1155 * mapping, decide if we need to frag the I/O, clone the 1156 * new I/O to pass down the stack. repeat until we've 1157 * taken care of the entire buf that was passed to us. 1158 */ 1159 do { 1160 cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 1161 sp_child_init(cs); 1162 child_buf = &cs->cs_buf; 1163 cs->cs_ps = ps; 1164 1165 more = sp_mapbuf(un, current_blkno, current_count, child_buf); 1166 if (more == -1) { 1167 ret = EIO; 1168 vdr->vdr_flags |= DKV_DMR_SHORT; 1169 kmem_cache_free(sp_child_cache, cs); 1170 goto err_out; 1171 } 1172 1173 cvdr.vdr_flags = vdr->vdr_flags; 1174 cvdr.vdr_side = vdr->vdr_side; 1175 cvdr.vdr_nbytes = child_buf->b_bcount; 1176 cvdr.vdr_offset = ldbtob(child_buf->b_lblkno); 1177 /* Work out where we are in the allocated buffer */ 1178 useroff = (offset_t)(uintptr_t)kbuffer; 1179 useroff = useroff + (offset_t)current_offset; 1180 cvdr.vdr_data = (void *)(uintptr_t)useroff; 1181 child_buf = md_bioclone(parent_buf, current_offset, 1182 child_buf->b_bcount, child_buf->b_edev, 1183 child_buf->b_blkno, NULL, 1184 child_buf, KM_NOSLEEP); 1185 /* calculate new offset, counts, etc... */ 1186 current_offset += child_buf->b_bcount; 1187 current_count -= child_buf->b_bcount; 1188 current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 1189 1190 if (more) { 1191 mutex_enter(&ps->ps_mx); 1192 ps->ps_frags++; 1193 mutex_exit(&ps->ps_mx); 1194 } 1195 1196 ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr, 1197 (mode | FKIOCTL), NULL); 1198 1199 /* 1200 * Free the child structure as we've finished with it. 1201 * Normally this would be done by sp_done() but we're just 1202 * using md_bioclone() to segment the transfer and we never 1203 * issue a strategy request so the iodone will not be called. 1204 */ 1205 kmem_cache_free(sp_child_cache, cs); 1206 if (ret == 0) { 1207 /* copyout the returned data to vdr_data + offset */ 1208 userbuf = (caddr_t)kbuffer; 1209 userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer; 1210 if (ddi_copyout(userbuf, vdr->vdr_data, 1211 cvdr.vdr_bytesread, mode)) { 1212 ret = EFAULT; 1213 goto err_out; 1214 } 1215 vdr->vdr_bytesread += cvdr.vdr_bytesread; 1216 } else { 1217 goto err_out; 1218 } 1219 } while (more); 1220 1221 /* 1222 * Update the user-supplied vol_directed_rd_t structure with the 1223 * contents of the last issued child request. 1224 */ 1225 vdr->vdr_flags = cvdr.vdr_flags; 1226 vdr->vdr_side = cvdr.vdr_side; 1227 bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME); 1228 1229 err_out: 1230 if (ret != 0) { 1231 vdr->vdr_flags |= DKV_DMR_ERROR; 1232 } 1233 if (vdr->vdr_bytesread != vdr->vdr_nbytes) { 1234 vdr->vdr_flags |= DKV_DMR_SHORT; 1235 } 1236 kmem_cache_free(sp_parent_cache, ps); 1237 kmem_free(kbuffer, vdr->vdr_nbytes); 1238 freerbuf(parent_buf); 1239 md_unit_readerexit(ui); 1240 return (ret); 1241 } 1242 1243 /* 1244 * FUNCTION: sp_snarf() 1245 * INPUT: cmd - snarf cmd. 1246 * setno - set number. 1247 * OUTPUT: none. 1248 * RETURNS: 1 - soft partitions were snarfed. 1249 * 0 - no soft partitions were snarfed. 1250 * PURPOSE: Snarf soft partition metadb records into their in-core 1251 * structures. This routine is called at "snarf time" when 1252 * md loads and gets all metadevices records into memory. 1253 * The basic algorithm is simply to walk the soft partition 1254 * records in the metadb and call the soft partitioning 1255 * build_incore routine to set up the in-core structures. 1256 */ 1257 static int 1258 sp_snarf(md_snarfcmd_t cmd, set_t setno) 1259 { 1260 mp_unit_t *un; 1261 mddb_recid_t recid; 1262 int gotsomething; 1263 int all_sp_gotten; 1264 mddb_type_t rec_type; 1265 mddb_de_ic_t *dep; 1266 mddb_rb32_t *rbp; 1267 mp_unit_t *big_un; 1268 mp_unit32_od_t *small_un; 1269 size_t newreqsize; 1270 1271 1272 if (cmd == MD_SNARF_CLEANUP) 1273 return (0); 1274 1275 all_sp_gotten = 1; 1276 gotsomething = 0; 1277 1278 /* get the record type */ 1279 rec_type = (mddb_type_t)md_getshared_key(setno, 1280 sp_md_ops.md_driver.md_drivername); 1281 recid = mddb_makerecid(setno, 0); 1282 1283 /* 1284 * walk soft partition records in the metadb and call 1285 * sp_build_incore to build in-core structures. 1286 */ 1287 while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 1288 /* if we've already gotten this record, go to the next one */ 1289 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 1290 continue; 1291 1292 1293 dep = mddb_getrecdep(recid); 1294 dep->de_flags = MDDB_F_SOFTPART; 1295 rbp = dep->de_rb; 1296 1297 switch (rbp->rb_revision) { 1298 case MDDB_REV_RB: 1299 case MDDB_REV_RBFN: 1300 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 1301 /* 1302 * This means, we have an old and small record. 1303 * And this record hasn't already been converted 1304 * :-o before we create an incore metadevice 1305 * from this we have to convert it to a big 1306 * record. 1307 */ 1308 small_un = 1309 (mp_unit32_od_t *)mddb_getrecaddr(recid); 1310 newreqsize = sizeof (mp_unit_t) + 1311 ((small_un->un_numexts - 1) * 1312 sizeof (struct mp_ext)); 1313 big_un = (mp_unit_t *)kmem_zalloc(newreqsize, 1314 KM_SLEEP); 1315 softpart_convert((caddr_t)small_un, 1316 (caddr_t)big_un, SMALL_2_BIG); 1317 kmem_free(small_un, dep->de_reqsize); 1318 dep->de_rb_userdata = big_un; 1319 dep->de_reqsize = newreqsize; 1320 rbp->rb_private |= MD_PRV_CONVD; 1321 un = big_un; 1322 } else { 1323 /* Record has already been converted */ 1324 un = (mp_unit_t *)mddb_getrecaddr(recid); 1325 } 1326 un->c.un_revision &= ~MD_64BIT_META_DEV; 1327 break; 1328 case MDDB_REV_RB64: 1329 case MDDB_REV_RB64FN: 1330 /* Large device */ 1331 un = (mp_unit_t *)mddb_getrecaddr(recid); 1332 un->c.un_revision |= MD_64BIT_META_DEV; 1333 un->c.un_flag |= MD_EFILABEL; 1334 break; 1335 } 1336 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 1337 1338 /* 1339 * Create minor node for snarfed entry. 1340 */ 1341 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 1342 1343 if (MD_UNIT(MD_SID(un)) != NULL) { 1344 /* unit is already in-core */ 1345 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 1346 continue; 1347 } 1348 all_sp_gotten = 0; 1349 if (sp_build_incore((void *)un, 1) == 0) { 1350 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1351 md_create_unit_incore(MD_SID(un), &sp_md_ops, 0); 1352 gotsomething = 1; 1353 } 1354 } 1355 1356 if (!all_sp_gotten) 1357 return (gotsomething); 1358 /* double-check records */ 1359 recid = mddb_makerecid(setno, 0); 1360 while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) 1361 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 1362 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 1363 1364 return (0); 1365 } 1366 1367 /* 1368 * FUNCTION: sp_halt() 1369 * INPUT: cmd - halt cmd. 1370 * setno - set number. 1371 * RETURNS: 0 - success. 1372 * 1 - err. 1373 * PURPOSE: Perform driver halt operations. As with stripe, we 1374 * support MD_HALT_CHECK and MD_HALT_DOIT. The first 1375 * does a check to see if halting can be done safely 1376 * (no open soft partitions), the second cleans up and 1377 * shuts down the driver. 1378 */ 1379 static int 1380 sp_halt(md_haltcmd_t cmd, set_t setno) 1381 { 1382 int i; 1383 mdi_unit_t *ui; 1384 minor_t mnum; 1385 1386 if (cmd == MD_HALT_CLOSE) 1387 return (0); 1388 1389 if (cmd == MD_HALT_OPEN) 1390 return (0); 1391 1392 if (cmd == MD_HALT_UNLOAD) 1393 return (0); 1394 1395 if (cmd == MD_HALT_CHECK) { 1396 for (i = 0; i < md_nunits; i++) { 1397 mnum = MD_MKMIN(setno, i); 1398 if ((ui = MDI_UNIT(mnum)) == NULL) 1399 continue; 1400 if (ui->ui_opsindex != sp_md_ops.md_selfindex) 1401 continue; 1402 if (md_unit_isopen(ui)) 1403 return (1); 1404 } 1405 return (0); 1406 } 1407 1408 if (cmd != MD_HALT_DOIT) 1409 return (1); 1410 1411 for (i = 0; i < md_nunits; i++) { 1412 mnum = MD_MKMIN(setno, i); 1413 if ((ui = MDI_UNIT(mnum)) == NULL) 1414 continue; 1415 if (ui->ui_opsindex != sp_md_ops.md_selfindex) 1416 continue; 1417 reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0); 1418 } 1419 1420 return (0); 1421 } 1422 1423 /* 1424 * FUNCTION: sp_open_dev() 1425 * INPUT: un - unit structure. 1426 * oflags - open flags. 1427 * OUTPUT: none. 1428 * RETURNS: 0 - success. 1429 * non-zero - err. 1430 * PURPOSE: open underlying device via md_layered_open. 1431 */ 1432 static int 1433 sp_open_dev(mp_unit_t *un, int oflags) 1434 { 1435 minor_t mnum = MD_SID(un); 1436 int err; 1437 md_dev64_t tmpdev; 1438 set_t setno = MD_MIN2SET(MD_SID(un)); 1439 side_t side = mddb_getsidenum(setno); 1440 1441 tmpdev = un->un_dev; 1442 /* 1443 * Do the open by device id if underlying is regular 1444 */ 1445 if ((md_getmajor(tmpdev) != md_major) && 1446 md_devid_found(setno, side, un->un_key) == 1) { 1447 tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key); 1448 } 1449 err = md_layered_open(mnum, &tmpdev, oflags); 1450 un->un_dev = tmpdev; 1451 1452 if (err) 1453 return (ENXIO); 1454 1455 return (0); 1456 } 1457 1458 /* 1459 * FUNCTION: sp_open() 1460 * INPUT: dev - device to open. 1461 * flag - pass-through flag. 1462 * otyp - pass-through open type. 1463 * cred_p - credentials. 1464 * md_oflags - open flags. 1465 * OUTPUT: none. 1466 * RETURNS: 0 - success. 1467 * non-zero - err. 1468 * PURPOSE: open a soft partition. 1469 */ 1470 /* ARGSUSED */ 1471 static int 1472 sp_open( 1473 dev_t *dev, 1474 int flag, 1475 int otyp, 1476 cred_t *cred_p, 1477 int md_oflags 1478 ) 1479 { 1480 minor_t mnum = getminor(*dev); 1481 mdi_unit_t *ui = MDI_UNIT(mnum); 1482 mp_unit_t *un; 1483 int err = 0; 1484 set_t setno; 1485 1486 /* 1487 * When doing an open of a multi owner metadevice, check to see if this 1488 * node is a starting node and if a reconfig cycle is underway. 1489 * If so, the system isn't sufficiently set up enough to handle the 1490 * open (which involves I/O during sp_validate), so fail with ENXIO. 1491 */ 1492 setno = MD_MIN2SET(mnum); 1493 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 1494 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 1495 return (ENXIO); 1496 } 1497 1498 /* grab necessary locks */ 1499 un = (mp_unit_t *)md_unit_openclose_enter(ui); 1500 setno = MD_UN2SET(un); 1501 1502 /* open underlying device, if necessary */ 1503 if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) { 1504 if ((err = sp_open_dev(un, md_oflags)) != 0) 1505 goto out; 1506 1507 if (MD_MNSET_SETNO(setno)) { 1508 /* For probe, don't incur the overhead of validate */ 1509 if (!(md_oflags & MD_OFLG_PROBEDEV)) { 1510 /* 1511 * Don't call sp_validate while 1512 * unit_openclose lock is held. So, actually 1513 * open the device, drop openclose lock, 1514 * call sp_validate, reacquire openclose lock, 1515 * and close the device. If sp_validate 1516 * succeeds, then device will be re-opened. 1517 */ 1518 if ((err = md_unit_incopen(mnum, flag, 1519 otyp)) != 0) 1520 goto out; 1521 1522 mutex_enter(&ui->ui_mx); 1523 ui->ui_lock |= MD_UL_OPENINPROGRESS; 1524 mutex_exit(&ui->ui_mx); 1525 md_unit_openclose_exit(ui); 1526 if (otyp != OTYP_LYR) 1527 rw_exit(&md_unit_array_rw.lock); 1528 1529 err = sp_validate(un); 1530 1531 if (otyp != OTYP_LYR) 1532 rw_enter(&md_unit_array_rw.lock, 1533 RW_READER); 1534 (void) md_unit_openclose_enter(ui); 1535 (void) md_unit_decopen(mnum, otyp); 1536 mutex_enter(&ui->ui_mx); 1537 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 1538 cv_broadcast(&ui->ui_cv); 1539 mutex_exit(&ui->ui_mx); 1540 /* 1541 * Should be in the same state as before 1542 * the sp_validate. 1543 */ 1544 if (err != 0) { 1545 /* close the device opened above */ 1546 md_layered_close(un->un_dev, md_oflags); 1547 err = EIO; 1548 goto out; 1549 } 1550 } 1551 /* 1552 * As we're a multi-owner metadevice we need to ensure 1553 * that all nodes have the same idea of the status. 1554 * sp_validate() will mark the device as errored (if 1555 * it cannot read the watermark) or ok (if it was 1556 * previously errored but the watermark is now valid). 1557 * This code-path is only entered on the non-probe open 1558 * so we will maintain the errored state during a probe 1559 * call. This means the sys-admin must metarecover -m 1560 * to reset the soft-partition error. 1561 */ 1562 } else { 1563 /* For probe, don't incur the overhead of validate */ 1564 if (!(md_oflags & MD_OFLG_PROBEDEV) && 1565 (err = sp_validate(un)) != 0) { 1566 /* close the device opened above */ 1567 md_layered_close(un->un_dev, md_oflags); 1568 err = EIO; 1569 goto out; 1570 } else { 1571 /* 1572 * we succeeded in validating the on disk 1573 * format versus the in core, so reset the 1574 * status if it's in error 1575 */ 1576 if (un->un_status == MD_SP_ERR) { 1577 un->un_status = MD_SP_OK; 1578 } 1579 } 1580 } 1581 } 1582 1583 /* count open */ 1584 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 1585 goto out; 1586 1587 out: 1588 md_unit_openclose_exit(ui); 1589 return (err); 1590 } 1591 1592 /* 1593 * FUNCTION: sp_close() 1594 * INPUT: dev - device to close. 1595 * flag - pass-through flag. 1596 * otyp - pass-through type. 1597 * cred_p - credentials. 1598 * md_cflags - close flags. 1599 * OUTPUT: none. 1600 * RETURNS: 0 - success. 1601 * non-zero - err. 1602 * PURPOSE: close a soft paritition. 1603 */ 1604 /* ARGSUSED */ 1605 static int 1606 sp_close( 1607 dev_t dev, 1608 int flag, 1609 int otyp, 1610 cred_t *cred_p, 1611 int md_cflags 1612 ) 1613 { 1614 minor_t mnum = getminor(dev); 1615 mdi_unit_t *ui = MDI_UNIT(mnum); 1616 mp_unit_t *un; 1617 int err = 0; 1618 1619 /* grab necessary locks */ 1620 un = (mp_unit_t *)md_unit_openclose_enter(ui); 1621 1622 /* count closed */ 1623 if ((err = md_unit_decopen(mnum, otyp)) != 0) 1624 goto out; 1625 1626 /* close devices, if necessary */ 1627 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 1628 md_layered_close(un->un_dev, md_cflags); 1629 } 1630 1631 /* 1632 * If a MN set and transient capabilities (eg ABR/DMR) are set, 1633 * clear these capabilities if this is the last close in 1634 * the cluster 1635 */ 1636 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1637 (ui->ui_tstate & MD_ABR_CAP)) { 1638 md_unit_openclose_exit(ui); 1639 mdmn_clear_all_capabilities(mnum); 1640 return (0); 1641 } 1642 /* unlock, return success */ 1643 out: 1644 md_unit_openclose_exit(ui); 1645 return (err); 1646 } 1647 1648 1649 /* used in sp_dump routine */ 1650 static struct buf dumpbuf; 1651 1652 /* 1653 * FUNCTION: sp_dump() 1654 * INPUT: dev - device to dump to. 1655 * addr - address to dump. 1656 * blkno - blkno on device. 1657 * nblk - number of blocks to dump. 1658 * OUTPUT: none. 1659 * RETURNS: result from bdev_dump. 1660 * PURPOSE: This routine dumps memory to the disk. It assumes that 1661 * the memory has already been mapped into mainbus space. 1662 * It is called at disk interrupt priority when the system 1663 * is in trouble. 1664 * NOTE: this function is defined using 32-bit arguments, 1665 * but soft partitioning is internally 64-bit. Arguments 1666 * are casted where appropriate. 1667 */ 1668 static int 1669 sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1670 { 1671 mp_unit_t *un; 1672 buf_t *bp; 1673 sp_ext_length_t nb; 1674 daddr_t mapblk; 1675 int result; 1676 int more; 1677 int saveresult = 0; 1678 1679 /* 1680 * Don't need to grab the unit lock. 1681 * Cause nothing else is supposed to be happenning. 1682 * Also dump is not supposed to sleep. 1683 */ 1684 un = (mp_unit_t *)MD_UNIT(getminor(dev)); 1685 1686 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 1687 return (EINVAL); 1688 1689 if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks) 1690 return (EINVAL); 1691 1692 bp = &dumpbuf; 1693 nb = (sp_ext_length_t)dbtob(nblk); 1694 do { 1695 bzero((caddr_t)bp, sizeof (*bp)); 1696 more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp); 1697 nblk = (int)(btodb(bp->b_bcount)); 1698 mapblk = bp->b_blkno; 1699 result = bdev_dump(bp->b_edev, addr, mapblk, nblk); 1700 if (result) 1701 saveresult = result; 1702 1703 nb -= bp->b_bcount; 1704 addr += bp->b_bcount; 1705 blkno += nblk; 1706 } while (more); 1707 1708 return (saveresult); 1709 } 1710 1711 static int 1712 sp_imp_set( 1713 set_t setno 1714 ) 1715 { 1716 mddb_recid_t recid; 1717 int gotsomething; 1718 mddb_type_t rec_type; 1719 mddb_de_ic_t *dep; 1720 mddb_rb32_t *rbp; 1721 mp_unit_t *un64; 1722 mp_unit32_od_t *un32; 1723 md_dev64_t self_devt; 1724 minor_t *self_id; /* minor needs to be updated */ 1725 md_parent_t *parent_id; /* parent needs to be updated */ 1726 mddb_recid_t *record_id; /* record id needs to be updated */ 1727 1728 gotsomething = 0; 1729 1730 rec_type = (mddb_type_t)md_getshared_key(setno, 1731 sp_md_ops.md_driver.md_drivername); 1732 recid = mddb_makerecid(setno, 0); 1733 1734 while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 1735 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 1736 continue; 1737 1738 dep = mddb_getrecdep(recid); 1739 rbp = dep->de_rb; 1740 1741 switch (rbp->rb_revision) { 1742 case MDDB_REV_RB: 1743 case MDDB_REV_RBFN: 1744 /* 1745 * Small device 1746 */ 1747 un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid); 1748 self_id = &(un32->c.un_self_id); 1749 parent_id = &(un32->c.un_parent); 1750 record_id = &(un32->c.un_record_id); 1751 1752 if (!md_update_minor(setno, mddb_getsidenum 1753 (setno), un32->un_key)) 1754 goto out; 1755 break; 1756 1757 case MDDB_REV_RB64: 1758 case MDDB_REV_RB64FN: 1759 un64 = (mp_unit_t *)mddb_getrecaddr(recid); 1760 self_id = &(un64->c.un_self_id); 1761 parent_id = &(un64->c.un_parent); 1762 record_id = &(un64->c.un_record_id); 1763 1764 if (!md_update_minor(setno, mddb_getsidenum 1765 (setno), un64->un_key)) 1766 goto out; 1767 break; 1768 } 1769 1770 /* 1771 * If this is a top level and a friendly name metadevice, 1772 * update its minor in the namespace. 1773 */ 1774 if ((*parent_id == MD_NO_PARENT) && 1775 ((rbp->rb_revision == MDDB_REV_RBFN) || 1776 (rbp->rb_revision == MDDB_REV_RB64FN))) { 1777 1778 self_devt = md_makedevice(md_major, *self_id); 1779 if (!md_update_top_device_minor(setno, 1780 mddb_getsidenum(setno), self_devt)) 1781 goto out; 1782 } 1783 1784 /* 1785 * Update unit with the imported setno 1786 * 1787 */ 1788 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1789 1790 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 1791 if (*parent_id != MD_NO_PARENT) 1792 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 1793 *record_id = MAKERECID(setno, DBID(*record_id)); 1794 1795 gotsomething = 1; 1796 } 1797 1798 out: 1799 return (gotsomething); 1800 } 1801 1802 static md_named_services_t sp_named_services[] = { 1803 {NULL, 0} 1804 }; 1805 1806 md_ops_t sp_md_ops = { 1807 sp_open, /* open */ 1808 sp_close, /* close */ 1809 md_sp_strategy, /* strategy */ 1810 NULL, /* print */ 1811 sp_dump, /* dump */ 1812 NULL, /* read */ 1813 NULL, /* write */ 1814 md_sp_ioctl, /* ioctl, */ 1815 sp_snarf, /* snarf */ 1816 sp_halt, /* halt */ 1817 NULL, /* aread */ 1818 NULL, /* awrite */ 1819 sp_imp_set, /* import set */ 1820 sp_named_services 1821 }; 1822 1823 static void 1824 init_init() 1825 { 1826 sp_parent_cache = kmem_cache_create("md_softpart_parent", 1827 sizeof (md_spps_t), 0, sp_parent_constructor, 1828 sp_parent_destructor, sp_run_queue, NULL, NULL, 0); 1829 sp_child_cache = kmem_cache_create("md_softpart_child", 1830 sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0, 1831 sp_child_constructor, sp_child_destructor, sp_run_queue, 1832 NULL, NULL, 0); 1833 } 1834 1835 static void 1836 fini_uninit() 1837 { 1838 kmem_cache_destroy(sp_parent_cache); 1839 kmem_cache_destroy(sp_child_cache); 1840 sp_parent_cache = sp_child_cache = NULL; 1841 } 1842 1843 /* define the module linkage */ 1844 MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit()) 1845