/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Md is the meta-disk driver.  It sits below the UFS file system
 * but above the 'real' disk drivers, xy, id, sd, etc.
 *
 * To the UFS software, md looks like a normal driver, since it has
 * the normal kinds of entries in the bdevsw and cdevsw arrays.  So
 * UFS accesses md in the usual ways.  In particular, the strategy
 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
 * and ufs_writelbn().
 *
 * Md maintains an array of minor devices (meta-partitions).  Each
 * meta-partition stands for a matrix of real partitions, in rows
 * which are not necessarily of equal length.  Md maintains a table,
 * with one entry for each meta-partition, which lists the rows and
 * columns of actual partitions, and the job of the strategy routine
 * is to translate from the meta-partition device and block numbers
 * known to UFS into the actual partitions' device and block numbers.
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>

#ifndef lint
char _depends_on[] = "strmod/rpcmod";
#endif	/* lint */
int md_init_debug = 0;		/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 */
int md_ff_disable = 0;

/*
 * Dynamically allocated list of non-failfast (FF) driver names - needs to
 * be freed when md is detached.
 */
char **non_ff_drivers = NULL;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

md_resync_t	md_cpr_resync;

extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

#define	VERSION_LENGTH	6
#define	VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int *md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int md_conv_perc = MDDB_DEVID_CONV_PERC;

#ifdef	DEBUG
/* debug code to verify framework exclusion guarantees */
int md_in;
kmutex_t md_in_mx;		/* protects md_in */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & x)						\
		debug_enter("MD_SET_IN already set");		\
	md_in |= x;						\
	mutex_exit(&md_in_mx);					\
}

#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & x))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~x;						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
hrtime_t savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;			/* used to protect md global stuff */
kcondvar_t	md_cv;			/* md_status events */
int		md_status = 0;		/* global status for the meta-driver */
int		md_num_daemons = 0;
int		md_ioctl_cnt = 0;
int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t		md_mdelay = 10;		/* variable so can be patched */

int (*mdv_strategy_tstpnt)(buf_t *, int, void *);

major_t		md_major, md_major_targ;

unit_t		md_nunits = MD_MAXUNITS;
set_t		md_nsets = MD_MAXSETS;
int		md_nmedh = 0;
char		*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];
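
/*
 * Note (added commentary): each md minor number encodes a (diskset, unit)
 * pair; the MD_MIN2SET()/MD_MIN2UNIT() macros used throughout this file
 * extract the two fields and MD_MKMIN() composes them.  Minor MD_ADM_MINOR
 * is reserved for the admin device.  For example, the boot path
 * "/pseudo/md@0:0,150,blk" (see mdattach() below) names set 0, unit 150.
 */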

md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;		/* protects notify_interface */
md_ops_t	**md_ops;
ddi_modhandle_t	*md_mods;
md_ops_t	*md_opslist;
clock_t		md_hz;
md_event_queue_t	*md_event_queue = NULL;

int		md_in_upgrade;
int		md_keep_repl_state;
int		md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int		mdmn_door_did = -1;

dev_info_t	*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

static uint_t	md_ocnt[OTYPCNT];

static int	mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	mdattach(dev_info_t *, ddi_attach_cmd_t);
static int	mddetach(dev_info_t *, ddi_detach_cmd_t);
static int	mdopen(dev_t *, int, int, cred_t *);
static int	mdclose(dev_t, int, int, cred_t *);
static int	mddump(dev_t, caddr_t, daddr_t, int);
static int	mdread(dev_t, struct uio *, cred_t *);
static int	mdwrite(dev_t, struct uio *, cred_t *);
static int	mdaread(dev_t, struct aio_req *, cred_t *);
static int	mdawrite(dev_t, struct aio_req *, cred_t *);
static int	mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int	mdprop_op(dev_t, dev_info_t *,
		    ddi_prop_op_t, int, char *, caddr_t, int *);

static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
	/* print routine -- none yet */
	(int (*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
	/* devmap */
	(int (*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
	    uint_t))nodev,
	/* mmap */
	(int (*)(dev_t, off_t, int))nodev,
	/* segmap */
	(int (*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
	    unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};

static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* power management */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

static struct modldrv modldrv = {
	&mod_driverops,		/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module %I%",	/* name of the module */
	&md_devops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};


/* md_medd.c */
extern void	med_init(void);
extern void	med_fini(void);
extern void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void	*lookup_entry(struct nm_next_hdr *, set_t,
		    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern int	remove_entry(struct nm_next_hdr *,
		    side_t, mdkey_t, int);

int md_maxphys = 0;		/* maximum io size in bytes */
#define	MD_MAXBCOUNT	(1024 * 1024)
unsigned md_maxbcount = 0;	/* maximum physio size in bytes */

/* allocate/free dynamic space associated with driver globals */
void
md_global_alloc_free(int alloc)
{
	set_t	s;

	if (alloc) {
		/* initialize driver global locks */
		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
		    MUTEX_DEFAULT, NULL);

		/* initialize per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			/* initialize per set driver globals locks */
			mutex_init(&md_set[s].s_dbmx,
			    NULL, MUTEX_DEFAULT, NULL);
			mutex_init(&md_set_io[s].md_io_mx,
			    NULL, MUTEX_DEFAULT, NULL);
			cv_init(&md_set_io[s].md_io_cv,
			    NULL, CV_DEFAULT, NULL);
		}
	} else {
		/* destroy per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			cv_destroy(&md_set_io[s].md_io_cv);
			mutex_destroy(&md_set_io[s].md_io_mx);
			mutex_destroy(&md_set[s].s_dbmx);
		}

		/* destroy driver global locks */
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
	}
}
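
/*
 * Note (added commentary): md_global_alloc_free(1) is always paired with
 * a later md_global_alloc_free(0) - _init() allocates, while _fini() and
 * a failed mod_install() in _init() free - so the lock/cv state above is
 * torn down on the same path that created it.
 */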

int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}

int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}
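
	/*
	 * Note (added commentary): the md_xlate/md_targ_nm_table
	 * properties handled below are only expected to be present
	 * during an operating system upgrade; md_tuple_table maps old
	 * dev32_t values to new ones and md_major_tuple_table maps
	 * target driver names to major numbers, so that on-disk
	 * configurations can be translated (the MD_UPGRADE path).
	 */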
	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				md_tuple_length =
				    len / (2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 *	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* alloc md_ops and md_mods struct */
	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.
	 * The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	med_init();

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}

/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * If we have not already halted and we have no active config,
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		kmem_free(non_ff_drivers, 2 * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	if (MD_UPGRADE) {
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();
	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}


/*
 * Given the device number return the devinfo pointer
 * given to md via md_attach
 */
/*ARGSUSED*/
static int
mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int	error = DDI_FAILURE;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (md_devinfo) {
			*result = (void *)md_devinfo;
			error = DDI_SUCCESS;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	}
	return (error);
}

/*
 * Property operation routine.  Return the number of blocks for the
 * partition in question, or forward the request to the property facilities.
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	minor_t		mnum;
	set_t		setno;
	md_unit_t	*un;
	mdi_unit_t	*ui;
	uint64_t	nblocks64;

	/*
	 * Our dynamic properties are all device specific and size oriented.
	 * Requests issued under conditions where size is valid are passed
	 * to ddi_prop_op_nblocks with the size information, otherwise the
	 * request is passed to ddi_prop_op.  Make sure that the minor device
	 * is a valid part of the Virtual Disk subsystem.
	 */
	mnum = getminor(dev);
	setno = MD_MIN2SET(mnum);
	if ((dev == DDI_DEV_T_ANY) || (mnum == MD_ADM_MINOR) ||
	    (setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
pass:		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	} else {
		rw_enter(&md_unit_array_rw.lock, RW_READER);
		if (((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) ||
		    ((ui = MDI_UNIT(mnum)) == NULL)) {
			rw_exit(&md_unit_array_rw.lock);
			goto pass;
		}

		/* get nblocks value */
		un = (md_unit_t *)md_unit_readerlock(ui);
		nblocks64 = un->c.un_total_blocks;
		md_unit_readerexit(ui);
		rw_exit(&md_unit_array_rw.lock);

		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp, nblocks64));
	}
}
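
/*
 * Note (added commentary): ddi_prop_op_nblocks() answers the standard
 * dynamic size/block-count properties from the block count it is handed,
 * so a property query against a snarfed metadevice reports the unit's
 * un_total_blocks without any extra bookkeeping here.
 */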

static void
snarf_user_data(set_t setno)
{
	mddb_recid_t		recid;
	mddb_recstatus_t	status;

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE)
			continue;

		if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}

		ASSERT(status == MDDB_OK);

		mddb_setrecprivate(recid, MD_PRV_GOTIT);
	}
}

static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t			ib;
	int			li;
	mddb_mb_ic_t		*mbip;
	uint_t			max_blk_needed;
	mddb_lb_t		*lbp;
	mddb_sidelocator_t	*slp;
	int			drv_index;
	md_splitname		sn;
	char			*name;
	char			*suffix;
	size_t			prefixlen;
	size_t			suffixlen;
	int			alloc_sz;


	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;


	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    " Additional Blocks Needed: %d\n\n"
	    " Increase size of following replicas for\n"
	    " device relocatability by deleting listed\n"
	    " replica and re-adding replica with\n"
	    " increased size (see metadb(1M)):\n"
	    " Replica Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    " %s (%s:%d:%d) %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}

/*
 * md_create_minor_node:
 *	Create the minor device for the given set and un_self_id.
 *
 * Input:
 *	setno	- set number
 *	mnum	- selfID of unit
 *
 * Output:
 *	None.
 *
 * Returns 0 for success, 1 for failure.
 *
 * Side-effects:
 *	None.
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
	char	name[20];

	/* Check for valid arguments */
	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
		return (1);

	(void) snprintf(name, 20, "%u,%u,blk",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	(void) snprintf(name, 20, "%u,%u,raw",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	return (0);
}

/*
 * For a given key check if it is an orphaned record.
 * The following conditions are used to determine an orphan:
 * 1. The device associated with that key is not a metadevice.
 * 2. If DEVID_STYLE then the physical device does not have a device id
 *    associated with it.
 *
 * If a key does not have an entry in the devid namespace it could be
 * a device that does not support device ids.  Hence the record is not
 * deleted.
 */
static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
	md_dev64_t	odev;	/* orphaned dev */
	mddb_set_t	*s;
	side_t		side = 0;
	struct nm_next_hdr	*did_nh = NULL;

	s = (mddb_set_t *)md_set[setno].s_db;
	if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
	    == NULL)
		return (0);
	/*
	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
	 */
	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
			return (0);
		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
		    NULL)
			return (1);
	}
	return (0);
}
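
/*
 * Note (added commentary): "snarfing" below is the process of reading a
 * set's metadevice state database (mddb) into core: the name spaces are
 * loaded, any referenced submodules are loaded, each module's md_snarf
 * op is called repeatedly until no more records are claimed, and pending
 * record commits and deletes are then applied.
 */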
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int		err = 0;
	int		i;
	mddb_recid_t	recid;
	mddb_type_t	drvrid;
	mddb_recstatus_t	status;
	md_ops_t	*ops;
	uint_t		privat;
	mddb_set_t	*s;
	uint_t		cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t		key = MD_KEYWILD;
	side_t		side = 0;
	int		size;
	int		devid_flag;
	int		retval;
	uint_t		un;
	int		un_next_set = 0;

	md_haltsnarf_enter(setno);

	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (!(md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space.  No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    " To use device relocation feature:\n"
			    " - Increase size of listed replicas\n"
			    " - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    " Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * data base
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array;
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
					    0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) &
				    MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) &
				    MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go through and clean up the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration,
				 * so remove it.  This can happen when the
				 * system panics between the primary name
				 * space update and the device id name space
				 * update.
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}

void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
	md_unit_t	*un;
	mdi_unit_t	*ui;

	info->dki_capacity = 0;
	info->dki_lbsize = 0;
	info->dki_media_type = 0;

	if ((ui = MDI_UNIT(mnum)) == NULL) {
		return;
	}
	un = (md_unit_t *)md_unit_readerlock(ui);
	info->dki_capacity = un->c.un_total_blocks;
	md_unit_readerexit(ui);
	info->dki_lbsize = DEV_BSIZE;
	info->dki_media_type = DK_UNKNOWN;
}


void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller Information
	 */
	info->dki_ctype = DKC_MD;
	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	/*
	 * Unit Information
	 */
	info->dki_unit = mnum;
	info->dki_slave = 0;
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_flags = 0;
	info->dki_partition = 0;
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

	/*
	 * We can't get from here to there yet
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}
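
/*
 * Note (added commentary): the admin device (minor MD_ADM_MINOR) keeps only
 * per-open-type counts in md_ocnt[] rather than per-open state: FEXCL opens
 * set MD_GBL_EXCL and exclude all others, and mdadminclose() recomputes
 * MD_GBL_OPEN/MD_GBL_EXCL from the counts that remain.
 */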

/*
 * open admin device
 */
static int
mdadminopen(
	int	flag,
	int	otyp)
{
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
		err = EINVAL;
		goto out;
	}
	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
	    (md_status & MD_GBL_EXCL)) {
		err = EBUSY;
		goto out;
	}

	/* count and flag open */
	md_ocnt[otyp]++;
	md_status |= MD_GBL_OPEN;
	if (flag & FEXCL)
		md_status |= MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * open entry point
 */
static int
mdopen(
	dev_t	*dev,
	int	flag,
	int	otyp,
	cred_t	*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}

/*
 * close admin device
 */
static int
mdadminclose(
	int	otyp)
{
	int	i;
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp < 0) || (otyp >= OTYPCNT)) {
		err = EINVAL;
		goto out;
	} else if (md_ocnt[otyp] == 0) {
		err = ENXIO;
		goto out;
	}

	/* count and flag closed */
	if (otyp == OTYP_LYR)
		md_ocnt[otyp]--;
	else
		md_ocnt[otyp] = 0;
	md_status &= ~MD_GBL_OPEN;
	for (i = 0; (i < OTYPCNT); ++i)
		if (md_ocnt[i] != 0)
			md_status |= MD_GBL_OPEN;
	if (!(md_status & MD_GBL_OPEN))
		md_status &= ~MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * close entry point
 */
static int
mdclose(
	dev_t	dev,
	int	flag,
	int	otyp,
	cred_t	*cred_p)
{
	minor_t		mnum = getminor(dev);
	set_t		setno = MD_MIN2SET(mnum);
	unit_t		unit = MD_MIN2UNIT(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;

	/* dispatch admin device closes */
	if (mnum == MD_ADM_MINOR)
		return (mdadminclose(otyp));

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
		goto out;
	}

	/* close underlying driver */
	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_decopen(mnum, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* return success */
out:
	return (err);
}
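
/*
 * Note (added commentary): the four raw entry points below either hand the
 * request to the unit's own md_read/md_write/md_aread/md_awrite op or fall
 * back to (a)physio() with mdstrategy() as the strategy routine; md_minphys
 * is expected to clamp each transfer to the md_maxphys tunable.
 */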

/*
 * This routine performs raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdread(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_read != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_read)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
}

/*
 * This routine performs async raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_aread != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_aread)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
}

/*
 * This routine performs raw write operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This is
 * handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_write != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_write)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
}

/*
 * This routine performs async raw write operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This is
 * handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_awrite != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_awrite)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
}

int
mdstrategy(struct buf *bp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;

	ASSERT((bp->b_flags & B_DONE) == 0);

	if (panicstr)
		md_clr_status(MD_GBL_DAEMONS_LIVE);

	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	bp->b_flags &= ~(B_ERROR | B_DONE);
	if (md_ops[ui->ui_opsindex]->md_strategy != NULL) {
		(*md_ops[ui->ui_opsindex]->md_strategy)(bp, 0, NULL);
	} else {
		(void) errdone(ui, bp, ENXIO);
	}
	return (0);
}

/*
 * Return true if the ioctl is allowed to be multithreaded.
 * All the MN ioctls are sent only from the message handlers through
 * rpc.mdcommd, which (via its own locking mechanism) ensures that no
 * two ioctls for the same metadevice are issued at the same time.
 * So we are safe here.
 * The other ioctls do not mess with any metadevice structures and
 * therefore are harmless too, even if called multiple times at the
 * same time.
 */
static boolean_t
is_mt_ioctl(int cmd)
{
	switch (cmd) {
	case MD_IOCGUNIQMSGID:
	case MD_IOCGVERSION:
	case MD_IOCISOPEN:
	case MD_MN_SET_MM_OWNER:
	case MD_MN_SET_STATE:
	case MD_MN_SUSPEND_WRITES:
	case MD_MN_ALLOCATE_HOTSPARE:
	case MD_MN_SET_SETFLAGS:
	case MD_MN_GET_SETFLAGS:
	case MD_MN_MDDB_OPTRECFIX:
	case MD_MN_MDDB_PARSE:
	case MD_MN_MDDB_BLOCK:
	case MD_MN_DB_USERREQ:
	case MD_IOC_SPSTATUS:
	case MD_MN_COMMD_ERR:
	case MD_MN_SET_COMMD_RUNNING:
	case MD_MN_RESYNC:
	case MD_MN_SETSYNC:
	case MD_MN_POKE_HOTSPARES:
		return (1);
	default:
		return (0);
	}
}
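
/*
 * Note (added commentary): non-MT ioctls serialize on the global ioctl lock
 * via md_ioctl_lock_enter(); MT-capable ioctls instead bump md_mtioctl_cnt
 * and mark the IOLOCK tracker with MD_MT_IOCTL, so that
 * IOLOCK_RETURN_IOCTLEND() can presumably release whichever state was taken
 * on the way in.
 */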

/*
 * This routine implements the ioctl calls for the Virtual Disk System.
 * It is called from the device switch at normal priority.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
    int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets, a number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open, i.e. avoid a race between metaclear and the
	 * progress of a multithreaded ioctl.
	 */

	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work.  It may interfere with other modules'
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	minor_t		mnum;
	set_t		setno;
	mdi_unit_t	*ui;

	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
		return (ENXIO);

	setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);


	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_dump != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_dump)
		    (dev, addr, blkno, nblk));

	return (ENXIO);
}

/*
 * Metadevice unit number dispatcher.
 * When this routine is called it will scan the
 * incore unit array and return the first available slot,
 * hence the unit number, to the caller.
 *
 * Return MD_UNITBAD if there is nothing available.
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	un, start;

	/*
	 * If nothing available
	 */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);
	start = un = md_set[setno].s_un_next;

	/* LINTED: E_CONSTANT_CONDITION */
	while (1) {
		if (md_set[setno].s_un[un] == NULL) {
			/*
			 * Advance the starting index for the next
			 * md_get_nextunit call
			 */
			if (un == MD_MAXUNITS - 1) {
				md_set[setno].s_un_next = 0;
			} else {
				md_set[setno].s_un_next = un + 1;
			}
			break;
		}

		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);

		if (un == start) {
			un = MD_UNITBAD;
			break;
		}

	}

	mutex_exit(&md_mx);
	return (un);
}