/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified against the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close.
 *
 */
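
/*
 * For illustration (hypothetical numbers, not taken from any real
 * configuration): a 3000-block soft partition built from two extents
 * might carry an extent array like
 *
 *	un_ext[0]: un_voff = 0,    un_poff = 1,    un_len = 1000
 *	un_ext[1]: un_voff = 1000, un_poff = 5001, un_len = 2000
 *
 * Virtual blocks 0-999 map to physical blocks 1-1000, and virtual
 * blocks 1000-2999 map to physical blocks 5001-7000.  An I/O that
 * spans the virtual 999/1000 boundary crosses an extent boundary and
 * must be fragmented into one child request per extent.
 */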

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t	sp_md_ops;
#ifndef	lint
char		_depends_on[] = "drv/md";
md_ops_t	*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	none.
 * OUTPUT:	ps	- parent save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
    mutex_init(&((md_spps_t *)p)->ps_mx,
        NULL, MUTEX_DEFAULT, NULL);
    return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
    bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
    mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
    bioinit(&((md_spcs_t *)p)->cs_buf);
    return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
    cs->cs_mdunit = 0;
    cs->cs_ps = NULL;
    md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
    biofini(&((md_spcs_t *)p)->cs_buf);
}
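
/*
 * Note on the cache lifecycle (descriptive only): the kmem cache
 * constructors/destructors above run once per cached object, so they
 * only set up the embedded mutex and buf, which survive across
 * allocations.  The sp_parent_init()/sp_child_init() routines run on
 * every allocation and reset just the per-request fields, e.g.:
 *
 *	md_spps_t *ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
 *	sp_parent_init(ps);	(zeroes the fields that precede ps_mx)
 *	...use ps...
 *	kmem_cache_free(sp_parent_cache, ps);
 *
 * This is the usual Solaris slab pattern for objects with
 * expensive-to-initialize embedded state.
 */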

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
    if (!(md_status & MD_GBL_DAEMONS_LIVE))
        md_daemon(1, &md_done_daemon);
}


/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p	- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
    mp_unit_t	*un = (mp_unit_t *)p;
    minor_t	mnum;
    set_t	setno;
    md_dev64_t	tmpdev;

    mnum = MD_SID(un);

    if (MD_UNIT(mnum) != NULL)
        return (0);

    MD_STATUS(un) = 0;

    if (snarfing) {
        /*
         * if we are snarfing, we get the device information
         * from the metadb record (using the metadb key for
         * that device).
         */
        setno = MD_MIN2SET(mnum);

        tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
            un->un_key, MD_NOTRUST_DEVT);
        un->un_dev = tmpdev;
    }

    /* place various information in the in-core data structures */
    md_nblocks_set(mnum, un->c.un_total_blocks);
    MD_UNIT(mnum) = un;

    return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un	- unit structure to be reset/removed.
 *		mnum	- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *			  permanently or just resetting in-core
 *			  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
    sv_dev_t	*sv;
    mddb_recid_t	vtoc_id;

    /* clean up in-core structures */
    md_destroy_unit_incore(mnum, &sp_md_ops);

    md_nblocks_set(mnum, -1ULL);
    MD_UNIT(mnum) = NULL;

    /*
     * Attempt release of minor node
     */
    md_remove_minor_node(mnum);

    if (!removing)
        return;

    /* we are removing the soft partition from the metadb */

    /*
     * Save off device information so we can get to
     * it after we do the mddb_deleterec().
     */
    sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
    sv->setno = MD_MIN2SET(mnum);
    sv->key = un->un_key;
    vtoc_id = un->c.un_vtoc_id;

    /*
     * Remove self from the namespace
     */
    if (un->c.un_revision & MD_FN_META_DEV) {
        (void) md_rem_selfname(un->c.un_self_id);
    }

    /* Remove the unit structure */
    mddb_deleterec_wrapper(un->c.un_record_id);

    if (vtoc_id)
        mddb_deleterec_wrapper(vtoc_id);

    SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
        MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

    /*
     * remove the underlying device name from the metadb.  if other
     * soft partitions are built on this device, this will simply
     * decrease the reference count for this device.  otherwise the
     * name record for this device will be removed from the metadb.
     */
    md_rem_names(sv, 1);
    kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			  MD_SP_OK	- soft-partition is now OK
 *			  MD_SP_ERR	- soft-partition is errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node.  If
 *		the message succeeds we simply return.  If it fails we panic
 *		as the cluster-wide view of the metadevices is now
 *		inconsistent.
 * CALLING CONTEXT:
 *	Blockable.  No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
    md_mn_msg_sp_setstat_t	sp_msg;
    md_mn_kresult_t		*kres;
    set_t			setno = MD_UN2SET(un);
    int				rval;
    const char			*str = (status == MD_SP_ERR) ?
        "MD_SP_ERR" : "MD_SP_OK";

    sp_msg.sp_setstat_mnum = MD_SID(un);
    sp_msg.sp_setstat_status = status;

    kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

    rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
        (char *)&sp_msg, sizeof (sp_msg), kres);

    if (!MDMN_KSEND_MSG_OK(rval, kres)) {
        mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");

        /*
         * Panic as we are now in an inconsistent state.
         */

        cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
            md_shortname(MD_SID(un)), str);
    }

    kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION:	sp_finish_error
 * INPUT:	ps	- parent save structure for errored I/O.
 *		lock_held	- set if the unit readerlock is held
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
    struct buf	*pb = ps->ps_bp;
    mdi_unit_t	*ui = ps->ps_ui;
    md_dev64_t	un_dev;			/* underlying device */
    md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
    char	*str;

    un_dev = md_expldev(ps->ps_un->un_dev);
    /* set error type */
    if (pb->b_flags & B_READ) {
        str = "read";
    } else {
        str = "write";
    }


    SPPS_FREE(sp_parent_cache, ps);
    pb->b_flags |= B_ERROR;

    md_kstat_done(ui, pb, 0);

    if (lock_held) {
        md_unit_readerexit(ui);
    }
    md_biodone(pb);

    cmn_err(CE_WARN, "md: %s: %s error on %s",
        md_shortname(md_getminor(md_dev)), str,
        md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}


/*
 * FUNCTION:	sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
    md_spps_t	*ps = (md_spps_t *)dq;

    /* Send a MD_MN_MSG_SP_SETSTAT to the master */
    sp_send_stat_msg(ps->ps_un, MD_SP_OK);

    /*
     * Successfully transmitted the OK state to all nodes, now release
     * this parent structure.
     */
    SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION:	sp_xmit_error
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable.  No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
    md_spps_t	*ps = (md_spps_t *)dq;

    /* Send a MD_MN_MSG_SP_SETSTAT to the master */
    sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

    /*
     * Successfully transmitted error state to all nodes, now release this
     * parent structure.
     */
    SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
    minor_t	mnum = MD_SID(un);
    md_spps_t	*ps;

    ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    sp_parent_init(ps);
    ps->ps_un = un;
    ps->ps_ui = MDI_UNIT(mnum);

    daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
        REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
    minor_t	mnum = MD_SID(un);
    md_spps_t	*ps;

    ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    sp_parent_init(ps);
    ps->ps_un = un;
    ps->ps_ui = MDI_UNIT(mnum);

    daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
        REQ_OLD);
}


/*
 * FUNCTION:	sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
    set_t	setno = MD_UN2SET(ps->ps_un);

    /*
     * Drop the mutex associated with this request before (potentially)
     * enqueuing the free onto a separate thread.  We have to release the
     * mutex before destroying the parent structure.
     */
    if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
        if (MUTEX_HELD(&ps->ps_mx)) {
            mutex_exit(&ps->ps_mx);
        }
    } else {
        /*
         * this should only ever happen if we are panicking,
         * since DONTFREE is only set on the parent if panicstr
         * is non-NULL.
         */
        ASSERT(panicstr);
    }

    /*
     * For a multi-owner set we need to send a message to the master so
     * that all nodes get the errored status when we first encounter it.
     * To avoid deadlocking when multiple soft-partitions encounter an
     * error on one physical unit we drop the unit readerlock before
     * enqueueing the request.  That way we can service any messages that
     * require a writerlock to be held.  Additionally, to avoid
     * deadlocking when at the bottom of a metadevice stack and a higher
     * level mirror has multiple requests outstanding on this soft-part,
     * we clone the ps that failed and pass the error back up the stack
     * to release the reference that this i/o may have in the
     * higher-level metadevice.  The other nodes in the cluster just have
     * to modify the soft-part status and we do not need to block the
     * i/o completion for this.
     */
    if (MD_MNSET_SETNO(setno)) {
        md_spps_t	*err_ps;
        err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
        sp_parent_init(err_ps);

        err_ps->ps_un = ps->ps_un;
        err_ps->ps_ui = ps->ps_ui;

        md_unit_readerexit(ps->ps_ui);

        daemon_request(&md_sp_daemon, sp_xmit_error,
            (daemon_queue_t *)err_ps, REQ_OLD);

        sp_finish_error(ps, 0);

        return;
    } else {
        ps->ps_un->un_status = MD_SP_ERR;
    }

    /* Flag the error */
    sp_finish_error(ps, 1);

}

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of blocks in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
    mp_unit_t		*un,
    sp_ext_offset_t	voff,
    sp_ext_length_t	bcount,
    buf_t		*bp
)
{
    int		lo, mid, hi, found, more;
    size_t	new_bcount;
    sp_ext_offset_t	new_blkno;
    sp_ext_offset_t	new_offset;
    sp_ext_offset_t	ext_endblk;
    md_dev64_t	new_edev;
    extern unsigned	md_maxphys;

    found = 0;
    lo = 0;
    hi = un->un_numexts - 1;

    /*
     * do a binary search to find the extent that contains the
     * starting offset.  after this loop, mid contains the index
     * of the correct extent.
     */
    while (lo <= hi && !found) {
        mid = (lo + hi) / 2;
        /* is the starting offset contained within the mid-ext? */
        if (voff >= un->un_ext[mid].un_voff &&
            voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
            found = 1;
        else if (voff < un->un_ext[mid].un_voff)
            hi = mid - 1;
        else	/* voff >= un->un_ext[mid].un_voff + un_len */
            lo = mid + 1;
    }

    if (!found) {
        cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
        return (-1);
    }

    /* translate to underlying physical offset/device */
    new_offset = voff - un->un_ext[mid].un_voff;
    new_blkno = un->un_ext[mid].un_poff + new_offset;
    new_edev = un->un_dev;

    /* determine if we need to break the I/O into fragments */
    ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
    if (voff + btodb(bcount) > ext_endblk) {
        new_bcount = dbtob(ext_endblk - voff);
        more = 1;
    } else {
        new_bcount = bcount;
        more = 0;
    }

    /* only break up the I/O if we're not built on another metadevice */
    if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
        new_bcount = md_maxphys;
        more = 1;
    }
    if (bp != (buf_t *)NULL) {
        /* do bp updates */
        bp->b_bcount = new_bcount;
        bp->b_lblkno = new_blkno;
        bp->b_edev = md_dev64_to_dev(new_edev);
    }
    return (more);
}
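
/*
 * Worked example for sp_mapbuf() (hypothetical numbers, using the
 * two-extent array sketched in the file header): a 16-block
 * (8192-byte) read at voff 996 binary-searches to extent 0 and
 * translates to new_blkno = 1 + 996 = 997.  Since 996 + 16 exceeds
 * ext_endblk (1000), only dbtob(1000 - 996) = 2048 bytes are mapped
 * and 1 is returned; the caller then re-invokes sp_mapbuf() at
 * voff 1000 for the remainder, which maps to extent 1 (block 5001)
 * and returns 0.
 */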

/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks we send a
 *		message to clear the error to the master node.
 */
static int
sp_validate(mp_unit_t *un)
{
    uint_t	ext;
    struct buf	*buf;
    sp_ext_length_t	len;
    mp_watermark_t	*wm;
    set_t	setno;
    int		reset_error = 0;

    setno = MD_UN2SET(un);

    /* sanity check unit structure components ?? */
    if (un->un_status != MD_SP_OK) {
        if (un->un_status != MD_SP_ERR) {
            cmn_err(CE_WARN, "md: %s: open failed, soft partition "
                "status is %u.",
                md_shortname(MD_SID(un)),
                un->un_status);
            return (-1);
        } else {
            cmn_err(CE_WARN, "md: %s: open of soft partition "
                "in Errored state.",
                md_shortname(MD_SID(un)));
            reset_error = 1;
        }
    }

    if (un->un_numexts == 0) {
        cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
            "not have any extents.", md_shortname(MD_SID(un)));
        return (-1);
    }

    len = 0LL;
    for (ext = 0; ext < un->un_numexts; ext++) {

        /* tally extent lengths to check total size */
        len += un->un_ext[ext].un_len;

        /* allocate buffer for watermark */
        buf = getrbuf(KM_SLEEP);

        /* read watermark */
        buf->b_flags = B_READ;
        buf->b_edev = md_dev64_to_dev(un->un_dev);
        buf->b_iodone = NULL;
        buf->b_proc = NULL;
        buf->b_bcount = sizeof (mp_watermark_t);
        buf->b_lblkno = un->un_ext[ext].un_poff - 1;
        buf->b_bufsize = sizeof (mp_watermark_t);
        buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
            KM_SLEEP);

        /*
         * make the call non-blocking so that it is not affected
         * by a set take.
         */
        md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
        (void) biowait(buf);

        if (buf->b_flags & B_ERROR) {
            cmn_err(CE_WARN, "md: %s: open failed, could not "
                "read watermark at block %llu for extent %u, "
                "error %d.", md_shortname(MD_SID(un)),
                buf->b_lblkno, ext, buf->b_error);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);

            /*
             * If we're a multi-owner diskset we send a message
             * indicating that this soft-part has an invalid
             * extent to the master node.  This ensures a
             * consistent view of the soft-part across the
             * cluster.
             */
            if (MD_MNSET_SETNO(setno)) {
                sp_send_stat_err(un);
            }
            return (-1);
        }
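
        /*
         * Each extent is preceded on disk by a one-block watermark,
         * which is why the read above targets block un_poff - 1.
         * The checks that follow validate the watermark's checksum,
         * magic number, sequence number, length and allocation type
         * against the in-core extent.
         */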

        wm = (mp_watermark_t *)buf->b_un.b_addr;

        /* make sure the checksum is correct first */
        if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
            (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
            cmn_err(CE_WARN, "md: %s: open failed, watermark "
                "at block %llu for extent %u does not have a "
                "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
                buf->b_lblkno, ext, wm->wm_checksum);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);
            return (-1);
        }

        if (wm->wm_magic != MD_SP_MAGIC) {
            cmn_err(CE_WARN, "md: %s: open failed, watermark "
                "at block %llu for extent %u does not have a "
                "valid watermark magic number, expected 0x%x, "
                "found 0x%x.", md_shortname(MD_SID(un)),
                buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);
            return (-1);
        }

        /* make sure sequence number matches the current extent */
        if (wm->wm_seq != ext) {
            cmn_err(CE_WARN, "md: %s: open failed, watermark "
                "at block %llu for extent %u has invalid "
                "sequence number %u.", md_shortname(MD_SID(un)),
                buf->b_lblkno, ext, wm->wm_seq);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);
            return (-1);
        }

        /* make sure watermark length matches unit structure */
        if (wm->wm_length != un->un_ext[ext].un_len) {
            cmn_err(CE_WARN, "md: %s: open failed, watermark "
                "at block %llu for extent %u has inconsistent "
                "length, expected %llu, found %llu.",
                md_shortname(MD_SID(un)), buf->b_lblkno,
                ext, un->un_ext[ext].un_len,
                (u_longlong_t)wm->wm_length);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);
            return (-1);
        }

        /*
         * make sure the type is a valid soft partition and not
         * a free extent or the end.
         */
        if (wm->wm_type != EXTTYP_ALLOC) {
            cmn_err(CE_WARN, "md: %s: open failed, watermark "
                "at block %llu for extent %u is not marked "
                "as in-use, type = %u.", md_shortname(MD_SID(un)),
                buf->b_lblkno, ext, wm->wm_type);
            kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
            freerbuf(buf);
            return (-1);
        }
        /* free up buffer */
        kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
        freerbuf(buf);
    }

    if (len != un->un_length) {
        cmn_err(CE_WARN, "md: %s: open failed, computed length "
            "%llu != expected length %llu.", md_shortname(MD_SID(un)),
            len, un->un_length);
        return (-1);
    }

    /*
     * If we're a multi-owner set _and_ reset_error is set, we should clear
     * the error condition on all nodes in the set.  Use SP_SETSTAT2 with
     * MD_SP_OK.
     */
    if (MD_MNSET_SETNO(setno) && reset_error) {
        sp_send_stat_ok(un);
    }
    return (0);
}
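
/*
 * On-disk layout sketch (illustrative, using the hypothetical extents
 * from the file header): every extent's data area is preceded by a
 * one-block watermark, so sp_validate() reads block un_poff - 1 for
 * each extent:
 *
 *	block 0:		watermark for extent 0 (wm_seq = 0)
 *	blocks 1-1000:		extent 0 data
 *	...
 *	block 5000:		watermark for extent 1 (wm_seq = 1)
 *	blocks 5001-7000:	extent 1 data
 *
 * The watermark carries a checksum, magic number, sequence number,
 * length and allocation type, all of which must agree with the
 * in-core unit structure before the open is allowed to proceed.
 */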

/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *			  this is the buffer on which I/O has just
 *			  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		1	- error, or more child I/Os outstanding.
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
    struct buf	*parent_buf;
    mdi_unit_t	*ui;
    md_spps_t	*ps;
    md_spcs_t	*cs;

    /* find the child save structure to which this buffer belongs */
    cs = (md_spcs_t *)((caddr_t)child_buf -
        (sizeof (md_spcs_t) - sizeof (buf_t)));
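    /*
     * The pointer arithmetic above relies on cs_buf being the last
     * member of md_spcs_t: the child cache objects are created with
     * size "sizeof (md_spcs_t) - sizeof (buf_t) + biosize()" (see
     * init_init() below), so backing up from the embedded buf by the
     * offset of cs_buf recovers the enclosing child save structure.
     */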
    /* now get the parent save structure */
    ps = cs->cs_ps;
    parent_buf = ps->ps_bp;

    mutex_enter(&ps->ps_mx);
    /* pass any errors back up to the parent */
    if (child_buf->b_flags & B_ERROR) {
        ps->ps_flags |= MD_SPPS_ERROR;
        parent_buf->b_error = child_buf->b_error;
    }
    /* mapout, if needed */
    if (child_buf->b_flags & B_REMAPPED)
        bp_mapout(child_buf);

    ps->ps_frags--;
    if (ps->ps_frags != 0) {
        /*
         * if this parent has more children, we just free the
         * child and return.
         */
        kmem_cache_free(sp_child_cache, cs);
        mutex_exit(&ps->ps_mx);
        return (1);
    }
    /* there are no more children */
    kmem_cache_free(sp_child_cache, cs);
    if (ps->ps_flags & MD_SPPS_ERROR) {
        sp_error(ps);
        return (1);
    }
    ui = ps->ps_ui;
    if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
        mutex_exit(&ps->ps_mx);
    } else {
        /*
         * this should only ever happen if we are panicking,
         * since DONTFREE is only set on the parent if panicstr
         * is non-NULL.
         */
        ASSERT(panicstr);
    }
    SPPS_FREE(sp_parent_cache, ps);
    md_kstat_done(ui, parent_buf, 0);
    md_unit_readerexit(ui);
    md_biodone(parent_buf);
    return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *		- Allocate a child save structure to keep track
 *		  of the I/O we are going to pass down.
 *		- Map the I/O to the correct extent in the soft
 *		  partition (see sp_mapbuf()).
 *		- bioclone() the buffer and pass it down the
 *		  stack using md_call_strategy.
 *		- If the I/O needs to split across extents,
 *		  repeat the above steps until all fragments
 *		  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
    md_spps_t	*ps;
    md_spcs_t	*cs;
    int		more;
    mp_unit_t	*un;
    mdi_unit_t	*ui;
    size_t	current_count;
    off_t	current_offset;
    sp_ext_offset_t	current_blkno;
    buf_t	*child_buf;
    set_t	setno = MD_MIN2SET(getminor(parent_buf->b_edev));
    int		strat_flag = flag;

    /*
     * When doing I/O to a multi owner meta device, check if the set is
     * halted.  We do this check without the needed lock held, for
     * performance reasons.
     * If an I/O just slips through while the set is locked via an
     * MD_MN_SUSPEND_SET, we don't care about it.
     * Only check for suspension if we are a top-level i/o request
     * (MD_STR_NOTTOP is cleared in 'flag').
     */
    if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
        (MD_SET_HALTED | MD_SET_MNSET)) {
        if ((flag & MD_STR_NOTTOP) == 0) {
            mutex_enter(&md_mx);
            /* Here we loop until the set is no longer halted */
            while (md_set[setno].s_status & MD_SET_HALTED) {
                cv_wait(&md_cv, &md_mx);
            }
            mutex_exit(&md_mx);
        }
    }

    ui = MDI_UNIT(getminor(parent_buf->b_edev));

    md_kstat_waitq_enter(ui);

    un = (mp_unit_t *)md_unit_readerlock(ui);

    if ((flag & MD_NOBLOCK) == 0) {
        if (md_inc_iocount(setno) != 0) {
            parent_buf->b_flags |= B_ERROR;
            parent_buf->b_error = ENXIO;
            parent_buf->b_resid = parent_buf->b_bcount;
            md_kstat_waitq_exit(ui);
            md_unit_readerexit(ui);
            biodone(parent_buf);
            return;
        }
    } else {
        md_inc_iocount_noblock(setno);
    }

    if (!(flag & MD_STR_NOTTOP)) {
        if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
            md_kstat_waitq_exit(ui);
            return;
        }
    }

    ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    sp_parent_init(ps);

    /*
     * Save essential information from the original buffhdr
     * in the parent.
     */
    ps->ps_un = un;
    ps->ps_ui = ui;
    ps->ps_bp = parent_buf;
    ps->ps_addr = parent_buf->b_un.b_addr;

    current_count = parent_buf->b_bcount;
    current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
    current_offset = 0;

    /*
     * if we are at the top and we are panicking,
     * we don't free in order to save state.
     */
    if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
        ps->ps_flags |= MD_SPPS_DONTFREE;

    md_kstat_waitq_to_runq(ui);

    ps->ps_frags++;

    /*
     * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
     * metadevice.
     */
    if (ui->ui_tstate & MD_ABR_CAP)
        strat_flag |= MD_STR_ABR;

    /*
     * this loop does the main work of an I/O.  we allocate a
     * child save for each buf, do the logical to physical
     * mapping, decide if we need to frag the I/O, clone the
     * new I/O to pass down the stack.  repeat until we've
     * taken care of the entire buf that was passed to us.
     */
    do {
        cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
        sp_child_init(cs);
        child_buf = &cs->cs_buf;
        cs->cs_ps = ps;

        more = sp_mapbuf(un, current_blkno, current_count, child_buf);
        if (more == -1) {
            parent_buf->b_flags |= B_ERROR;
            parent_buf->b_error = EIO;
            md_kstat_done(ui, parent_buf, 0);
            md_unit_readerexit(ui);
            md_biodone(parent_buf);
            kmem_cache_free(sp_parent_cache, ps);
            return;
        }

        child_buf = md_bioclone(parent_buf, current_offset,
            child_buf->b_bcount, child_buf->b_edev,
            child_buf->b_blkno, sp_done, child_buf,
            KM_NOSLEEP);
        /* calculate new offset, counts, etc... */
        current_offset += child_buf->b_bcount;
        current_count -= child_buf->b_bcount;
        current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

        if (more) {
            mutex_enter(&ps->ps_mx);
            ps->ps_frags++;
            mutex_exit(&ps->ps_mx);
        }

        md_call_strategy(child_buf, strat_flag, private);
    } while (more);

    if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
        while (!(ps->ps_flags & MD_SPPS_DONE)) {
            md_daemon(1, &md_done_daemon);
        }
        kmem_cache_free(sp_parent_cache, ps);
    }
}
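
/*
 * Note (descriptive): ps_frags tracks the outstanding child I/Os for a
 * parent request.  It is incremented once before the first child is
 * issued, bumped under ps_mx before each additional fragment, and
 * decremented in sp_done(); the parent buf is only biodone()'d when
 * the count reaches zero, so a multi-extent request completes exactly
 * once regardless of the order in which fragments finish.
 */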

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user.  This is essentially
 *		the same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
    md_spps_t	*ps;
    md_spcs_t	*cs;
    int		more;
    mp_unit_t	*un;
    mdi_unit_t	*ui;
    size_t	current_count;
    off_t	current_offset;
    sp_ext_offset_t	current_blkno;
    buf_t	*child_buf, *parent_buf;
    void	*kbuffer;
    vol_directed_rd_t	cvdr;
    caddr_t	userbuf;
    offset_t	useroff;
    int		ret = 0;

    ui = MDI_UNIT(mnum);

    md_kstat_waitq_enter(ui);

    bzero(&cvdr, sizeof (cvdr));

    un = (mp_unit_t *)md_unit_readerlock(ui);

    /*
     * Construct a parent_buf header which reflects the user-supplied
     * request.
     */

    kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
    if (kbuffer == NULL) {
        vdr->vdr_flags |= DKV_DMR_ERROR;
        md_kstat_waitq_exit(ui);
        md_unit_readerexit(ui);
        return (ENOMEM);
    }

    parent_buf = getrbuf(KM_NOSLEEP);
    if (parent_buf == NULL) {
        vdr->vdr_flags |= DKV_DMR_ERROR;
        md_kstat_waitq_exit(ui);
        md_unit_readerexit(ui);
        kmem_free(kbuffer, vdr->vdr_nbytes);
        return (ENOMEM);
    }
    parent_buf->b_un.b_addr = kbuffer;
    parent_buf->b_flags = B_READ;
    parent_buf->b_bcount = vdr->vdr_nbytes;
    parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
    parent_buf->b_edev = un->un_dev;


    ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
    sp_parent_init(ps);

    /*
     * Save essential information from the original buffhdr
     * in the parent.
     */
    ps->ps_un = un;
    ps->ps_ui = ui;
    ps->ps_bp = parent_buf;
    ps->ps_addr = parent_buf->b_un.b_addr;

    current_count = parent_buf->b_bcount;
    current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
    current_offset = 0;

    md_kstat_waitq_to_runq(ui);

    ps->ps_frags++;
    vdr->vdr_bytesread = 0;

    /*
     * this loop does the main work of an I/O.  we allocate a
     * child save for each buf, do the logical to physical
     * mapping, decide if we need to frag the I/O, clone the
     * new I/O to pass down the stack.  repeat until we've
     * taken care of the entire buf that was passed to us.
     */
    do {
        cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
        sp_child_init(cs);
        child_buf = &cs->cs_buf;
        cs->cs_ps = ps;

        more = sp_mapbuf(un, current_blkno, current_count, child_buf);
        if (more == -1) {
            ret = EIO;
            vdr->vdr_flags |= DKV_DMR_SHORT;
            kmem_cache_free(sp_child_cache, cs);
            goto err_out;
        }

        cvdr.vdr_flags = vdr->vdr_flags;
        cvdr.vdr_side = vdr->vdr_side;
        cvdr.vdr_nbytes = child_buf->b_bcount;
        cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
        /* Work out where we are in the allocated buffer */
        useroff = (offset_t)(uintptr_t)kbuffer;
        useroff = useroff + (offset_t)current_offset;
        cvdr.vdr_data = (void *)(uintptr_t)useroff;
        child_buf = md_bioclone(parent_buf, current_offset,
            child_buf->b_bcount, child_buf->b_edev,
            child_buf->b_blkno, NULL,
            child_buf, KM_NOSLEEP);
        /* calculate new offset, counts, etc... */
        current_offset += child_buf->b_bcount;
        current_count -= child_buf->b_bcount;
        current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

        if (more) {
            mutex_enter(&ps->ps_mx);
            ps->ps_frags++;
            mutex_exit(&ps->ps_mx);
        }

        ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
            (mode | FKIOCTL), NULL);

        /*
         * Free the child structure as we've finished with it.
         * Normally this would be done by sp_done() but we're just
         * using md_bioclone() to segment the transfer and we never
         * issue a strategy request so the iodone will not be called.
         */
        kmem_cache_free(sp_child_cache, cs);
        if (ret == 0) {
            /* copyout the returned data to vdr_data + offset */
            userbuf = (caddr_t)kbuffer;
            userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
            if (ddi_copyout(userbuf, vdr->vdr_data,
                cvdr.vdr_bytesread, mode)) {
                ret = EFAULT;
                goto err_out;
            }
            vdr->vdr_bytesread += cvdr.vdr_bytesread;
        } else {
            goto err_out;
        }
    } while (more);

    /*
     * Update the user-supplied vol_directed_rd_t structure with the
     * contents of the last issued child request.
     */
    vdr->vdr_flags = cvdr.vdr_flags;
    vdr->vdr_side = cvdr.vdr_side;
    bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
    if (ret != 0) {
        vdr->vdr_flags |= DKV_DMR_ERROR;
    }
    if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
        vdr->vdr_flags |= DKV_DMR_SHORT;
    }
    kmem_cache_free(sp_parent_cache, ps);
    kmem_free(kbuffer, vdr->vdr_nbytes);
    freerbuf(parent_buf);
    md_unit_readerexit(ui);
    return (ret);
}
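
/*
 * For orientation (an illustrative sketch, not part of this driver):
 * user-level code reaches sp_directed_read() through the DKIOCDMR
 * ioctl with a vol_directed_rd_t describing the transfer, roughly:
 *
 *	vol_directed_rd_t vdr;
 *	bzero(&vdr, sizeof (vdr));
 *	vdr.vdr_offset = byte_offset;	(where to read)
 *	vdr.vdr_nbytes = nbytes;	(how much to read)
 *	vdr.vdr_data = user_buffer;	(destination buffer)
 *	vdr.vdr_side = DKV_SIDE_INIT;	(initial side, per sys/dkio.h)
 *	error = ioctl(fd, DKIOCDMR, &vdr);
 *
 * On return vdr_bytesread, vdr_flags, vdr_side and vdr_side_name
 * reflect the last child request issued above; DKV_DMR_SHORT or
 * DKV_DMR_ERROR indicate a partial or failed transfer.
 */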

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevice records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
    mp_unit_t	*un;
    mddb_recid_t	recid;
    int		gotsomething;
    int		all_sp_gotten;
    mddb_type_t	rec_type;
    mddb_de_ic_t	*dep;
    mddb_rb32_t	*rbp;
    mp_unit_t	*big_un;
    mp_unit32_od_t	*small_un;
    size_t	newreqsize;


    if (cmd == MD_SNARF_CLEANUP)
        return (0);

    all_sp_gotten = 1;
    gotsomething = 0;

    /* get the record type */
    rec_type = (mddb_type_t)md_getshared_key(setno,
        sp_md_ops.md_driver.md_drivername);
    recid = mddb_makerecid(setno, 0);

    /*
     * walk soft partition records in the metadb and call
     * sp_build_incore to build in-core structures.
     */
    while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
        /* if we've already gotten this record, go to the next one */
        if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
            continue;


        dep = mddb_getrecdep(recid);
        dep->de_flags = MDDB_F_SOFTPART;
        rbp = dep->de_rb;

        switch (rbp->rb_revision) {
        case MDDB_REV_RB:
        case MDDB_REV_RBFN:
            if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
                /*
                 * This is an old, small record that has not
                 * yet been converted.  Before we create an
                 * incore metadevice from it, we have to
                 * convert it to a big record.
                 */
                small_un =
                    (mp_unit32_od_t *)mddb_getrecaddr(recid);
                newreqsize = sizeof (mp_unit_t) +
                    ((small_un->un_numexts - 1) *
                    sizeof (struct mp_ext));
                big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
                    KM_SLEEP);
                softpart_convert((caddr_t)small_un,
                    (caddr_t)big_un, SMALL_2_BIG);
                kmem_free(small_un, dep->de_reqsize);
                dep->de_rb_userdata = big_un;
                dep->de_reqsize = newreqsize;
                rbp->rb_private |= MD_PRV_CONVD;
                un = big_un;
            } else {
                /* Record has already been converted */
                un = (mp_unit_t *)mddb_getrecaddr(recid);
            }
            un->c.un_revision &= ~MD_64BIT_META_DEV;
            break;
        case MDDB_REV_RB64:
        case MDDB_REV_RB64FN:
            /* Large device */
            un = (mp_unit_t *)mddb_getrecaddr(recid);
            un->c.un_revision |= MD_64BIT_META_DEV;
            un->c.un_flag |= MD_EFILABEL;
            break;
        }
        MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

        /*
         * Create minor node for snarfed entry.
         */
        (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
            MD_SID(un));

        if (MD_UNIT(MD_SID(un)) != NULL) {
            /* unit is already in-core */
            mddb_setrecprivate(recid, MD_PRV_PENDDEL);
            continue;
        }
        all_sp_gotten = 0;
        if (sp_build_incore((void *)un, 1) == 0) {
            mddb_setrecprivate(recid, MD_PRV_GOTIT);
            md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
            gotsomething = 1;
        }
    }

    if (!all_sp_gotten)
        return (gotsomething);
    /* double-check records */
    recid = mddb_makerecid(setno, 0);
    while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
        if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
            mddb_setrecprivate(recid, MD_PRV_PENDDEL);

    return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *		does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and
 *		shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
    int		i;
    mdi_unit_t	*ui;
    minor_t	mnum;

    if (cmd == MD_HALT_CLOSE)
        return (0);

    if (cmd == MD_HALT_OPEN)
        return (0);

    if (cmd == MD_HALT_UNLOAD)
        return (0);

    if (cmd == MD_HALT_CHECK) {
        for (i = 0; i < md_nunits; i++) {
            mnum = MD_MKMIN(setno, i);
            if ((ui = MDI_UNIT(mnum)) == NULL)
                continue;
            if (ui->ui_opsindex != sp_md_ops.md_selfindex)
                continue;
            if (md_unit_isopen(ui))
                return (1);
        }
        return (0);
    }

    if (cmd != MD_HALT_DOIT)
        return (1);

    for (i = 0; i < md_nunits; i++) {
        mnum = MD_MKMIN(setno, i);
        if ((ui = MDI_UNIT(mnum)) == NULL)
            continue;
        if (ui->ui_opsindex != sp_md_ops.md_selfindex)
            continue;
        reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
    }

    return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
    minor_t	mnum = MD_SID(un);
    int		err;
    md_dev64_t	tmpdev;
    set_t	setno = MD_MIN2SET(MD_SID(un));
    side_t	side = mddb_getsidenum(setno);

    tmpdev = un->un_dev;
    /*
     * Do the open by device id if underlying is regular
     */
    if ((md_getmajor(tmpdev) != md_major) &&
        md_devid_found(setno, side, un->un_key) == 1) {
        tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
    }
    err = md_layered_open(mnum, &tmpdev, oflags);
    un->un_dev = tmpdev;

    if (err)
        return (ENXIO);

    return (0);
}
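
/*
 * Note (descriptive): resolving the underlying dev_t by device id
 * before md_layered_open() means the open tracks the physical disk
 * rather than a fixed device number, so a soft partition built on a
 * regular device should still open correctly if that device has been
 * renumbered (e.g. after a controller reconfiguration).
 */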

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev	- device to open.
 *		flag	- pass-through flag.
 *		otyp	- pass-through open type.
 *		cred_p	- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
    dev_t	*dev,
    int		flag,
    int		otyp,
    cred_t	*cred_p,
    int		md_oflags
)
{
    minor_t	mnum = getminor(*dev);
    mdi_unit_t	*ui = MDI_UNIT(mnum);
    mp_unit_t	*un;
    int		err = 0;
    set_t	setno;

    /*
     * When doing an open of a multi owner metadevice, check to see if
     * this node is a starting node and if a reconfig cycle is underway.
     * If so, the system isn't sufficiently set up to handle the open
     * (which involves I/O during sp_validate), so fail with ENXIO.
     */
    setno = MD_MIN2SET(mnum);
    if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
        (MD_SET_MNSET | MD_SET_MN_START_RC)) {
        return (ENXIO);
    }

    /* grab necessary locks */
    un = (mp_unit_t *)md_unit_openclose_enter(ui);
    setno = MD_UN2SET(un);

    /* open underlying device, if necessary */
    if (!md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
        if ((err = sp_open_dev(un, md_oflags)) != 0)
            goto out;

        if (MD_MNSET_SETNO(setno)) {
            /* For probe, don't incur the overhead of validate */
            if (!(md_oflags & MD_OFLG_PROBEDEV)) {
                /*
                 * Don't call sp_validate while the
                 * unit_openclose lock is held.  So, actually
                 * open the device, drop openclose lock,
                 * call sp_validate, reacquire openclose lock,
                 * and close the device.  If sp_validate
                 * succeeds, then device will be re-opened.
                 */
                if ((err = md_unit_incopen(mnum, flag,
                    otyp)) != 0)
                    goto out;

                mutex_enter(&ui->ui_mx);
                ui->ui_lock |= MD_UL_OPENINPROGRESS;
                mutex_exit(&ui->ui_mx);
                md_unit_openclose_exit(ui);
                if (otyp != OTYP_LYR)
                    rw_exit(&md_unit_array_rw.lock);

                err = sp_validate(un);

                if (otyp != OTYP_LYR)
                    rw_enter(&md_unit_array_rw.lock,
                        RW_READER);
                (void) md_unit_openclose_enter(ui);
                (void) md_unit_decopen(mnum, otyp);
                mutex_enter(&ui->ui_mx);
                ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
                cv_broadcast(&ui->ui_cv);
                mutex_exit(&ui->ui_mx);
                /*
                 * Should be in the same state as before
                 * the sp_validate.
                 */
                if (err != 0) {
                    /* close the device opened above */
                    md_layered_close(un->un_dev, md_oflags);
                    err = EIO;
                    goto out;
                }
            }
            /*
             * As we're a multi-owner metadevice we need to ensure
             * that all nodes have the same idea of the status.
             * sp_validate() will mark the device as errored (if
             * it cannot read the watermark) or ok (if it was
             * previously errored but the watermark is now valid).
             * This code-path is only entered on the non-probe
             * open so we will maintain the errored state during
             * a probe call.  This means the sys-admin must
             * metarecover -m to reset the soft-partition error.
             */
        } else {
            /* For probe, don't incur the overhead of validate */
            if (!(md_oflags & MD_OFLG_PROBEDEV) &&
                (err = sp_validate(un)) != 0) {
                /* close the device opened above */
                md_layered_close(un->un_dev, md_oflags);
                err = EIO;
                goto out;
            } else {
                /*
                 * we succeeded in validating the on disk
                 * format versus the in core, so reset the
                 * status if it's in error
                 */
                if (un->un_status == MD_SP_ERR) {
                    un->un_status = MD_SP_OK;
                }
            }
        }
    }

    /* count open */
    if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
        goto out;

out:
    md_unit_openclose_exit(ui);
    return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev	- device to close.
 *		flag	- pass-through flag.
 *		otyp	- pass-through type.
 *		cred_p	- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
    dev_t	dev,
    int		flag,
    int		otyp,
    cred_t	*cred_p,
    int		md_cflags
)
{
    minor_t	mnum = getminor(dev);
    mdi_unit_t	*ui = MDI_UNIT(mnum);
    mp_unit_t	*un;
    int		err = 0;

    /* grab necessary locks */
    un = (mp_unit_t *)md_unit_openclose_enter(ui);

    /* count closed */
    if ((err = md_unit_decopen(mnum, otyp)) != 0)
        goto out;

    /* close devices, if necessary */
    if (!md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
        md_layered_close(un->un_dev, md_cflags);
    }

    /*
     * If a MN set and transient capabilities (eg ABR/DMR) are set,
     * clear these capabilities if this is the last close in
     * the cluster
     */
    if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
        (ui->ui_tstate & MD_ABR_CAP)) {
        md_unit_openclose_exit(ui);
        mdmn_clear_all_capabilities(mnum);
        return (0);
    }
    /* unlock, return success */
out:
    md_unit_openclose_exit(ui);
    return (err);
}


/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 * NOTE:	this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
    mp_unit_t	*un;
    buf_t	*bp;
    sp_ext_length_t	nb;
    daddr_t	mapblk;
    int		result;
    int		more;
    int		saveresult = 0;

    /*
     * We don't need to grab the unit lock: nothing else is supposed
     * to be happening, and dump is not supposed to sleep.
     */
    un = (mp_unit_t *)MD_UNIT(getminor(dev));

    if ((diskaddr_t)blkno >= un->c.un_total_blocks)
        return (EINVAL);

    if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
        return (EINVAL);

    bp = &dumpbuf;
    nb = (sp_ext_length_t)dbtob(nblk);
    do {
        bzero((caddr_t)bp, sizeof (*bp));
        more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
        nblk = (int)(btodb(bp->b_bcount));
        mapblk = bp->b_blkno;
        result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
        if (result)
            saveresult = result;

        nb -= bp->b_bcount;
        addr += bp->b_bcount;
        blkno += nblk;
    } while (more);

    return (saveresult);
}

static int
sp_imp_set(
    set_t	setno
)
{
    mddb_recid_t	recid;
    int		gotsomething;
    mddb_type_t	rec_type;
    mddb_de_ic_t	*dep;
    mddb_rb32_t	*rbp;
    mp_unit_t	*un64;
    mp_unit32_od_t	*un32;
    md_dev64_t	self_devt;
    minor_t	*self_id;	/* minor needs to be updated */
    md_parent_t	*parent_id;	/* parent needs to be updated */
    mddb_recid_t	*record_id;	/* record id needs to be updated */

    gotsomething = 0;

    rec_type = (mddb_type_t)md_getshared_key(setno,
        sp_md_ops.md_driver.md_drivername);
    recid = mddb_makerecid(setno, 0);

    while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
        if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
            continue;

        dep = mddb_getrecdep(recid);
        rbp = dep->de_rb;

        switch (rbp->rb_revision) {
        case MDDB_REV_RB:
        case MDDB_REV_RBFN:
            /*
             * Small device
             */
            un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
            self_id = &(un32->c.un_self_id);
            parent_id = &(un32->c.un_parent);
            record_id = &(un32->c.un_record_id);

            if (!md_update_minor(setno, mddb_getsidenum
                (setno), un32->un_key))
                goto out;
            break;

        case MDDB_REV_RB64:
        case MDDB_REV_RB64FN:
            un64 = (mp_unit_t *)mddb_getrecaddr(recid);
            self_id = &(un64->c.un_self_id);
            parent_id = &(un64->c.un_parent);
            record_id = &(un64->c.un_record_id);

            if (!md_update_minor(setno, mddb_getsidenum
                (setno), un64->un_key))
                goto out;
            break;
        }

        /*
         * If this is a top level and a friendly name metadevice,
         * update its minor in the namespace.
         */
        if ((*parent_id == MD_NO_PARENT) &&
            ((rbp->rb_revision == MDDB_REV_RBFN) ||
            (rbp->rb_revision == MDDB_REV_RB64FN))) {

            self_devt = md_makedevice(md_major, *self_id);
            if (!md_update_top_device_minor(setno,
                mddb_getsidenum(setno), self_devt))
                goto out;
        }

        /*
         * Update unit with the imported setno
         */
        mddb_setrecprivate(recid, MD_PRV_GOTIT);

        *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
        if (*parent_id != MD_NO_PARENT)
            *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
        *record_id = MAKERECID(setno, DBID(*record_id));

        gotsomething = 1;
    }

out:
    return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
    {NULL,	0}
};

md_ops_t sp_md_ops = {
    sp_open,		/* open */
    sp_close,		/* close */
    md_sp_strategy,	/* strategy */
    NULL,		/* print */
    sp_dump,		/* dump */
    NULL,		/* read */
    NULL,		/* write */
    md_sp_ioctl,	/* ioctl */
    sp_snarf,		/* snarf */
    sp_halt,		/* halt */
    NULL,		/* aread */
    NULL,		/* awrite */
    sp_imp_set,		/* import set */
    sp_named_services
};

static void
init_init()
{
    sp_parent_cache = kmem_cache_create("md_softpart_parent",
        sizeof (md_spps_t), 0, sp_parent_constructor,
        sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
    sp_child_cache = kmem_cache_create("md_softpart_child",
        sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
        sp_child_constructor, sp_child_destructor, sp_run_queue,
        NULL, NULL, 0);
}

static void
fini_uninit()
{
    kmem_cache_destroy(sp_parent_cache);
    kmem_cache_destroy(sp_child_cache);
    sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())