/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Md is the meta-disk driver.  It sits below the UFS file system
 * but above the 'real' disk drivers, xy, id, sd, etc.
 *
 * To the UFS software, md looks like a normal driver, since it has
 * the normal kinds of entries in the bdevsw and cdevsw arrays.  So
 * UFS accesses md in the usual ways.  In particular, the strategy
 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
 * and ufs_writelbn().
 *
 * Md maintains an array of minor devices (meta-partitions).  Each
 * meta-partition stands for a matrix of real partitions, in rows
 * which are not necessarily of equal length.  Md maintains a table,
 * with one entry for each meta-partition, which lists the rows and
 * columns of actual partitions, and the job of the strategy routine
 * is to translate from the meta-partition device and block numbers
 * known to UFS into the actual partitions' device and block numbers.
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>

#ifndef	lint
char	_depends_on[] = "strmod/rpcmod";
#endif	/* lint */
int	md_init_debug = 0;	/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 */
int	md_ff_disable = 0;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

md_resync_t	md_cpr_resync;

extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

#define	VERSION_LENGTH	6
#define	VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int	*md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in a replica
 * during conversion of a non-devid style replica to a devid style replica.
 */
int	md_conv_perc = MDDB_DEVID_CONV_PERC;
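
/*
 * Worked example for this tunable: md_snarf_db_set() below pads the
 * computed conversion size as
 *
 *	cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
 *
 * so with md_conv_perc set to 5, a conversion computed to need 100
 * free blocks is only attempted when at least 105 blocks are free.
 */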

#ifdef	DEBUG
/* debug code to verify framework exclusion guarantees */
int	md_in;
kmutex_t	md_in_mx;	/* protects md_in */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & x)						\
		debug_enter("MD_SET_IN already set");		\
	md_in |= x;						\
	mutex_exit(&md_in_mx);					\
}

#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & x))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~x;						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
hrtime_t savetime1, savetime2;
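
/*
 * Usage sketch for the exclusion macros above: each framework entry
 * point brackets its body with a matching pair, for example
 *
 *	MD_SET_IN(IN_ATTACH);
 *	... body of mdattach() ...
 *	MD_CLR_IN(IN_ATTACH);
 *
 * so a DEBUG kernel drops into debug_enter() whenever two framework
 * entry points are ever active at the same time.
 */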

/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;		/* used to protect md global stuff */
kcondvar_t	md_cv;		/* md_status events */
int	md_status = 0;		/* global status for the meta-driver */
int	md_num_daemons = 0;
int	md_ioctl_cnt = 0;
int	md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t	md_mdelay = 10;		/* variable so it can be patched */

int	(*mdv_strategy_tstpnt)(buf_t *, int, void *);

major_t	md_major, md_major_targ;

unit_t	md_nunits = MD_MAXUNITS;
set_t	md_nsets = MD_MAXSETS;
int	md_nmedh = 0;
char	*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];

md_krwlock_t	hsp_rwlp;	/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;	/* protects notify_interface */
md_ops_t	**md_ops;
ddi_modhandle_t	*md_mods;
md_ops_t	*md_opslist;
clock_t	md_hz;
md_event_queue_t	*md_event_queue = NULL;

int	md_in_upgrade;
int	md_keep_repl_state;
int	md_devid_destroy;

/* for sending messages through a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int	mdmn_door_did = -1;

dev_info_t	*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

static uint_t	md_ocnt[OTYPCNT];

static int	mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	mdattach(dev_info_t *, ddi_attach_cmd_t);
static int	mddetach(dev_info_t *, ddi_detach_cmd_t);
static int	mdopen(dev_t *, int, int, cred_t *);
static int	mdclose(dev_t, int, int, cred_t *);
static int	mddump(dev_t, caddr_t, daddr_t, int);
static int	mdread(dev_t, struct uio *, cred_t *);
static int	mdwrite(dev_t, struct uio *, cred_t *);
static int	mdaread(dev_t, struct aio_req *, cred_t *);
static int	mdawrite(dev_t, struct aio_req *, cred_t *);
static int	mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int	mdprop_op(dev_t, dev_info_t *,
		    ddi_prop_op_t, int, char *, caddr_t, int *);

static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
	/* print routine -- none yet */
	(int (*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
	/* devmap */
	(int (*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
	    uint_t))nodev,
	/* mmap */
	(int (*)(dev_t, off_t, int))nodev,
	/* segmap */
	(int (*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
	    unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};

static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* power management */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

static struct modldrv modldrv = {
	&mod_driverops,		/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module %I%",	/* name of the module */
	&md_devops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};


/* md_medd.c */
extern void	med_init(void);
extern void	med_fini(void);
extern void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void	*lookup_entry(struct nm_next_hdr *, set_t,
		    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern int	remove_entry(struct nm_next_hdr *,
		    side_t, mdkey_t, int);

int	md_maxphys = 0;		/* maximum io size in bytes */
#define	MD_MAXBCOUNT	(1024 * 1024)
unsigned	md_maxbcount = 0;	/* maximum physio size in bytes */

/* allocate/free dynamic space associated with driver globals */
void
md_global_alloc_free(int alloc)
{
	set_t	s;

	if (alloc) {
		/* initialize driver global locks */
		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
		    MUTEX_DEFAULT, NULL);

		/* initialize per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			/* initialize per set driver globals locks */
			mutex_init(&md_set[s].s_dbmx,
			    NULL, MUTEX_DEFAULT, NULL);
			mutex_init(&md_set_io[s].md_io_mx,
			    NULL, MUTEX_DEFAULT, NULL);
			cv_init(&md_set_io[s].md_io_cv,
			    NULL, CV_DEFAULT, NULL);
		}
	} else {
		/* destroy per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			cv_destroy(&md_set_io[s].md_io_cv);
			mutex_destroy(&md_set_io[s].md_io_mx);
			mutex_destroy(&md_set[s].s_dbmx);
		}

		/* destroy driver global locks */
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
	}
}

int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}

int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try to get the md_nmedh property */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try to get the md_med_trans_lst property */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}
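
	/*
	 * The block above is the standard two-step DDI property read:
	 * query the value's length with PROP_LEN, allocate a buffer of
	 * that size, then fetch the value with PROP_LEN_AND_VAL_BUF,
	 * falling back to the default "tcp" transport list if either
	 * step fails.
	 */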

	/* try to get the md_xlate property */
	/* Should we only do this if upgrade? */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				md_tuple_length =
				    len / (2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}
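
	/*
	 * Example for the loop above: each md_targ_nm_table entry is a
	 * "<driver name> <major number>" string such as "sd 32"
	 * (illustrative values); the space is overwritten with a NUL to
	 * split the pair, the name is duplicated, the decimal major is
	 * converted by hand, and the space is then restored.
	 */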

	/*
	 * Check for properties:
	 *	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* alloc md_ops and md_mods struct */
	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate a device node for the root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (which would slow
	 * down a boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing, and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example, so we need to point p at the first
		 * digit of the minor number and convert it from ASCII.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	med_init();

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}
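
/*
 * Boot-path walk-through for mdattach() above: with svm_bootpath set
 * to "/pseudo/md@0:0,150,blk" (the example from the comment in the
 * function), p is left pointing at "150", the loop converts it to
 * mnum = 150, and only that root mirror minor node is created at
 * attach time.
 */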
645 */ 646 if (!(md_get_status() & MD_GBL_HALTED)) { 647 if (check_active_locators() == 0) { 648 /* 649 * NOTE: a successful md_halt will have done the 650 * mddb_unload to free allocations done in mddb_init 651 */ 652 if (md_halt(MD_NO_GBL_LOCKS_HELD)) { 653 cmn_err(CE_NOTE, "md:detach: " 654 "Could not halt Solaris Volume Manager"); 655 MD_CLR_IN(IN_DETACH); 656 return (DDI_FAILURE); 657 } 658 } 659 660 /* fail detach if we have not halted */ 661 if (!(md_get_status() & MD_GBL_HALTED)) { 662 MD_CLR_IN(IN_DETACH); 663 return (DDI_FAILURE); 664 } 665 } 666 667 /* must be in halted state, this will be cleared on next attach */ 668 ASSERT(md_get_status() & MD_GBL_HALTED); 669 670 /* cleanup attach allocations and initializations */ 671 md_major_targ = 0; 672 673 sz = sizeof (void *) * md_nunits; 674 for (s = 0; s < md_nsets; s++) { 675 if (md_set[s].s_un != NULL) { 676 kmem_free(md_set[s].s_un, sz); 677 md_set[s].s_un = NULL; 678 } 679 680 if (md_set[s].s_ui != NULL) { 681 kmem_free(md_set[s].s_ui, sz); 682 md_set[s].s_ui = NULL; 683 } 684 } 685 md_nunits = 0; 686 md_nsets = 0; 687 md_nmedh = 0; 688 689 if (md_med_trans_lst != NULL) { 690 kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1); 691 md_med_trans_lst = NULL; 692 } 693 694 if (md_mods != NULL) { 695 kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS); 696 md_mods = NULL; 697 } 698 699 if (md_ops != NULL) { 700 kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS); 701 md_ops = NULL; 702 } 703 704 if (MD_UPGRADE) { 705 len = md_tuple_length * (2 * ((int)sizeof (dev32_t))); 706 md_in_upgrade = 0; 707 md_xlate_free(len); 708 md_majortab_free(); 709 } 710 711 /* 712 * Undo what we did in mdattach, freeing resources 713 * and removing things we installed. The system 714 * framework guarantees we are not active with this devinfo 715 * node in any other entry points at this time. 716 */ 717 ddi_prop_remove_all(dip); 718 ddi_remove_minor_node(dip, NULL); 719 720 med_fini(); 721 md_devinfo = NULL; 722 723 MD_CLR_IN(IN_DETACH); 724 return (DDI_SUCCESS); 725 } 726 727 728 /* 729 * Given the device number return the devinfo pointer 730 * given to md via md_attach 731 */ 732 /*ARGSUSED*/ 733 static int 734 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 735 { 736 int error = DDI_FAILURE; 737 738 switch (infocmd) { 739 case DDI_INFO_DEVT2DEVINFO: 740 if (md_devinfo) { 741 *result = (void *)md_devinfo; 742 error = DDI_SUCCESS; 743 } 744 break; 745 746 case DDI_INFO_DEVT2INSTANCE: 747 *result = (void *)0; 748 error = DDI_SUCCESS; 749 break; 750 } 751 return (error); 752 } 753 754 /* 755 * property operation routine. return the number of blocks for the partition 756 * in question or forward the request to the property facilities. 757 */ 758 static int 759 mdprop_op( 760 dev_t dev, /* device number associated with device */ 761 dev_info_t *dip, /* device info struct for this device */ 762 ddi_prop_op_t prop_op, /* property operator */ 763 int mod_flags, /* property flags */ 764 char *name, /* name of property */ 765 caddr_t valuep, /* where to put property value */ 766 int *lengthp) /* put length of property here */ 767 { 768 minor_t mnum; 769 set_t setno; 770 md_unit_t *un; 771 mdi_unit_t *ui; 772 uint64_t nblocks64; 773 774 /* 775 * Our dynamic properties are all device specific and size oriented. 776 * Requests issued under conditions where size is valid are passed 777 * to ddi_prop_op_nblocks with the size information, otherwise the 778 * request is passed to ddi_prop_op. 

/*
 * Property operation routine.  Return the number of blocks for the
 * partition in question, or forward the request to the property
 * facilities.
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	minor_t		mnum;
	set_t		setno;
	md_unit_t	*un;
	mdi_unit_t	*ui;
	uint64_t	nblocks64;

	/*
	 * Our dynamic properties are all device specific and size oriented.
	 * Requests issued under conditions where size is valid are passed
	 * to ddi_prop_op_nblocks with the size information, otherwise the
	 * request is passed to ddi_prop_op.  Make sure that the minor device
	 * is a valid part of the Virtual Disk subsystem.
	 */
	mnum = getminor(dev);
	setno = MD_MIN2SET(mnum);
	if ((dev == DDI_DEV_T_ANY) || (mnum == MD_ADM_MINOR) ||
	    (setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
pass:		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	} else {
		rw_enter(&md_unit_array_rw.lock, RW_READER);
		if (((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) ||
		    ((ui = MDI_UNIT(mnum)) == NULL)) {
			rw_exit(&md_unit_array_rw.lock);
			goto pass;
		}

		/* get nblocks value */
		un = (md_unit_t *)md_unit_readerlock(ui);
		nblocks64 = un->c.un_total_blocks;
		md_unit_readerexit(ui);
		rw_exit(&md_unit_array_rw.lock);

		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp, nblocks64));
	}
}

static void
snarf_user_data(set_t setno)
{
	mddb_recid_t		recid;
	mddb_recstatus_t	status;

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE)
			continue;

		if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}

		ASSERT(status == MDDB_OK);

		mddb_setrecprivate(recid, MD_PRV_GOTIT);
	}
}

static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t			ib;
	int			li;
	mddb_mb_ic_t		*mbip;
	uint_t			max_blk_needed;
	mddb_lb_t		*lbp;
	mddb_sidelocator_t	*slp;
	int			drv_index;
	md_splitname		sn;
	char			*name;
	char			*suffix;
	size_t			prefixlen;
	size_t			suffixlen;
	int			alloc_sz;


	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;


	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    " Additional Blocks Needed: %d\n\n"
	    " Increase size of following replicas for\n"
	    " device relocatability by deleting listed\n"
	    " replica and re-adding replica with\n"
	    " increased size (see metadb(1M)):\n"
	    " Replica Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    " %s (%s:%d:%d) %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}
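
/*
 * Name assembly in md_print_block_usage() above, with illustrative
 * values: a locator whose split name has prefix "/dev/dsk" and suffix
 * "c0t0d0s7" prints as "/dev/dsk/c0t0d0s7"; alloc_sz is
 * prefixlen + suffixlen + 2 to leave room for the '/' separator and
 * the terminating NUL.
 */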

/*
 * md_create_minor_node:
 *	Create the minor device for the given set and un_self_id.
 *
 * Input:
 *	setno	- set number
 *	mnum	- selfID of unit
 *
 * Output:
 *	None.
 *
 * Returns 0 for success, 1 for failure.
 *
 * Side-effects:
 *	None.
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
	char	name[20];

	/* Check for valid arguments */
	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
		return (1);

	(void) snprintf(name, 20, "%u,%u,blk",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	(void) snprintf(name, 20, "%u,%u,raw",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	return (0);
}
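
/*
 * A sketch of the minor-number round trip used above (assuming the
 * MD_MKMIN/MD_MIN2SET/MD_MIN2UNIT macros invert each other, which is
 * how this file uses them): for set 0 and unit 150,
 * md_create_minor_node() creates nodes named "0,150,blk" and
 * "0,150,raw" whose minor number satisfies
 *
 *	MD_MIN2SET(MD_MKMIN(0, 150)) == 0
 *	MD_MIN2UNIT(MD_MKMIN(0, 150)) == 150
 */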

/*
 * For a given key check if it is an orphaned record.
 * The following conditions are used to determine an orphan.
 * 1. The device associated with that key is not a metadevice.
 * 2. If DEVID_STYLE then the physical device does not have a device Id
 *    associated with it.
 *
 * If a key does not have an entry in the devid namespace it could be
 * a device that does not support device ids.  Hence the record is not
 * deleted.
 */
static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
	md_dev64_t	odev;	/* orphaned dev */
	mddb_set_t	*s;
	side_t		side = 0;
	struct nm_next_hdr	*did_nh = NULL;

	s = (mddb_set_t *)md_set[setno].s_db;
	if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
	    == NULL)
		return (0);
	/*
	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
	 */
	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
			return (0);
		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
		    NULL)
			return (1);
	}
	return (0);
}

int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int		err = 0;
	int		i;
	mddb_recid_t	recid;
	mddb_type_t	drvrid;
	mddb_recstatus_t	status;
	md_ops_t	*ops;
	uint_t		privat;
	mddb_set_t	*s;
	uint_t		cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t		key = MD_KEYWILD;
	side_t		side = 0;
	int		size;
	int		devid_flag;
	int		retval;
	uint_t		un;
	int		un_next_set = 0;

	md_haltsnarf_enter(setno);

	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If the replica is in non-devid state, convert it if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space is available in the replica
	 *	- it is a local set
	 *	- it is not a multi-node diskset
	 *	- clustering is not present (for a non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine the number of free blocks needed to convert
		 * the entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print a message that the replica can't be
			 * converted for lack of space.  No failure -
			 * just continue to run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    " To use device relocation feature:\n"
			    " - Increase size of listed replicas\n"
			    " - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    " Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * the database
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array.
	 * This array is indexed by key and is set by
	 * md_getdevnum during snarf time.
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
					    0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go through and clean up the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration,
				 * so remove it.  This can happen when the
				 * system panics between the primary name
				 * space update and the device id name space
				 * update.
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done; free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}

void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
	md_unit_t	*un;
	mdi_unit_t	*ui;

	info->dki_capacity = 0;
	info->dki_lbsize = 0;
	info->dki_media_type = 0;

	if ((ui = MDI_UNIT(mnum)) == NULL) {
		return;
	}
	un = (md_unit_t *)md_unit_readerlock(ui);
	info->dki_capacity = un->c.un_total_blocks;
	md_unit_readerexit(ui);
	info->dki_lbsize = DEV_BSIZE;
	info->dki_media_type = DK_UNKNOWN;
}


void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller Information
	 */
	info->dki_ctype = DKC_MD;
	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	/*
	 * Unit Information
	 */
	info->dki_unit = mnum;
	info->dki_slave = 0;
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_flags = 0;
	info->dki_partition = 0;
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

	/*
	 * We can't get from here to there yet
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}
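
/*
 * Example for dki_maxtransfer above (illustrative values): with
 * md_maxphys = 1048576 and DEV_BSIZE = 512, get_info() reports a
 * maximum transfer of 2048 blocks.
 */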

/*
 * open admin device
 */
static int
mdadminopen(
	int	flag,
	int	otyp)
{
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
		err = EINVAL;
		goto out;
	}
	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
	    (md_status & MD_GBL_EXCL)) {
		err = EBUSY;
		goto out;
	}

	/* count and flag open */
	md_ocnt[otyp]++;
	md_status |= MD_GBL_OPEN;
	if (flag & FEXCL)
		md_status |= MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}
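
/*
 * Exclusive-open semantics of mdadminopen() above: an open with FEXCL
 * succeeds only while no other open of the admin device exists and
 * then sets MD_GBL_EXCL, after which every further open (exclusive or
 * not) fails with EBUSY until the exclusive holder closes.
 */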

/*
 * open entry point
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}

/*
 * close admin device
 */
static int
mdadminclose(
	int	otyp)
{
	int	i;
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp < 0) || (otyp >= OTYPCNT)) {
		err = EINVAL;
		goto out;
	} else if (md_ocnt[otyp] == 0) {
		err = ENXIO;
		goto out;
	}

	/* count and flag closed */
	if (otyp == OTYP_LYR)
		md_ocnt[otyp]--;
	else
		md_ocnt[otyp] = 0;
	md_status &= ~MD_GBL_OPEN;
	for (i = 0; (i < OTYPCNT); ++i)
		if (md_ocnt[i] != 0)
			md_status |= MD_GBL_OPEN;
	if (! (md_status & MD_GBL_OPEN))
		md_status &= ~MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * close entry point
 */
static int
mdclose(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(dev);
	set_t		setno = MD_MIN2SET(mnum);
	unit_t		unit = MD_MIN2UNIT(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;

	/* dispatch admin device closes */
	if (mnum == MD_ADM_MINOR)
		return (mdadminclose(otyp));

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
		goto out;
	}

	/* close underlying driver */
	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_decopen(mnum, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* return success */
out:
	return (err);
}
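
/*
 * Open-count behavior of mdadminclose() above: layered opens
 * (OTYP_LYR) are reference counted, one close per open, while a close
 * of any other type clears that type's count outright; MD_GBL_OPEN is
 * then recomputed from all per-type counts and MD_GBL_EXCL dropped
 * once nothing remains open.
 */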

/*
 * This routine performs raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdread(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_read != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_read)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
}

/*
 * This routine performs async raw read operations.  It is called from
 * the device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_aread != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_aread)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
}

/*
 * This routine performs raw write operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This is
 * handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_write != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_write)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
}
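
/*
 * Common dispatch pattern of the raw I/O routines here: if the
 * submodule for the unit supplies its own md_read/md_write/md_aread/
 * md_awrite entry point it is called directly; otherwise the request
 * is validated with md_chk_uio() and handed to physio()/aphysio(),
 * which build the buf structures and call mdstrategy() below.
 */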

/*
 * This routine performs async raw write operations.  It is called from
 * the device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This is
 * handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_awrite != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_awrite)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
}

int
mdstrategy(struct buf *bp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;

	ASSERT((bp->b_flags & B_DONE) == 0);

	if (panicstr)
		md_clr_status(MD_GBL_DAEMONS_LIVE);

	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	bp->b_flags &= ~(B_ERROR | B_DONE);
	if (md_ops[ui->ui_opsindex]->md_strategy != NULL) {
		(*md_ops[ui->ui_opsindex]->md_strategy)(bp, 0, NULL);
	} else {
		(void) errdone(ui, bp, ENXIO);
	}
	return (0);
}

/*
 * Return true if the ioctl is allowed to be multithreaded.
 * All the MN ioctls are sent only from the message handlers through
 * rpc.mdcommd, which (via its own locking mechanism) takes care that
 * no two ioctls for the same metadevice are issued at the same time.
 * So we are safe here.
 * The other ioctls do not mess with any metadevice structures and are
 * therefore harmless too, even if called multiple times at the same
 * time.
 */
static boolean_t
is_mt_ioctl(int cmd)
{

	switch (cmd) {
	case MD_IOCGUNIQMSGID:
	case MD_IOCGVERSION:
	case MD_IOCISOPEN:
	case MD_MN_SET_MM_OWNER:
	case MD_MN_SET_STATE:
	case MD_MN_SUSPEND_WRITES:
	case MD_MN_ALLOCATE_HOTSPARE:
	case MD_MN_SET_SETFLAGS:
	case MD_MN_GET_SETFLAGS:
	case MD_MN_MDDB_OPTRECFIX:
	case MD_MN_MDDB_PARSE:
	case MD_MN_MDDB_BLOCK:
	case MD_MN_DB_USERREQ:
	case MD_IOC_SPSTATUS:
	case MD_MN_COMMD_ERR:
	case MD_MN_SET_COMMD_RUNNING:
	case MD_MN_RESYNC:
	case MD_MN_SETSYNC:
	case MD_MN_POKE_HOTSPARES:
		return (1);
	default:
		return (0);
	}
}

/*
 * This routine implements the ioctl calls for the Virtual Disk System.
 * It is called from the device switch at normal priority.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets, a number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures, or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open, i.e. this avoids a race between metaclear and
	 * the progress of a multithreaded ioctl.
	 */
	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work.  It may interfere with other modules'
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	minor_t		mnum;
	set_t		setno;
	mdi_unit_t	*ui;

	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
		return (ENXIO);

	setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);


	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_dump != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_dump)
		    (dev, addr, blkno, nblk));

	return (ENXIO);
}

/*
 * Metadevice unit number dispatcher.
 * When this routine is called, it scans the in-core unit array and
 * returns the first available slot, hence the unit number, to the
 * caller.
 *
 * Return MD_UNITBAD if there is nothing available.
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	un, start;

	/*
	 * If nothing available
	 */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);
	start = un = md_set[setno].s_un_next;

	/* LINTED: E_CONSTANT_CONDITION */
	while (1) {
		if (md_set[setno].s_un[un] == NULL) {
			/*
			 * Advance the starting index for the next
			 * md_get_nextunit call
			 */
			if (un == MD_MAXUNITS - 1) {
				md_set[setno].s_un_next = 0;
			} else {
				md_set[setno].s_un_next = un + 1;
			}
			break;
		}

		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);

		if (un == start) {
			un = MD_UNITBAD;
			break;
		}

	}

	mutex_exit(&md_mx);
	return (un);
}
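
/*
 * Scan example for md_get_nextunit() (illustrative slot states): with
 * s_un_next == 5 and units 5 and 6 in use, the scan visits 5, 6, 7,
 * returns unit 7, and leaves s_un_next at 8 for the next call; if the
 * scan ever wraps all the way back to its starting point, MD_UNITBAD
 * is returned.
 */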