1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Md - is the meta-disk driver. It sits below the UFS file system 30 * but above the 'real' disk drivers, xy, id, sd etc. 31 * 32 * To the UFS software, md looks like a normal driver, since it has 33 * the normal kinds of entries in the bdevsw and cdevsw arrays. So 34 * UFS accesses md in the usual ways. In particular, the strategy 35 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(), 36 * and ufs_writelbn(). 37 * 38 * Md maintains an array of minor devices (meta-partitions). Each 39 * meta partition stands for a matrix of real partitions, in rows 40 * which are not necessarily of equal length. Md maintains a table, 41 * with one entry for each meta-partition, which lists the rows and 42 * columns of actual partitions, and the job of the strategy routine 43 * is to translate from the meta-partition device and block numbers 44 * known to UFS into the actual partitions' device and block numbers. 
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>

#ifndef	lint
char _depends_on[] = "strmod/rpcmod";
#endif	/* lint */
int		md_init_debug = 0;	/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 */
int		md_ff_disable = 0;

/*
 * dynamically allocated list of non FF driver names - needs to
 * be freed when md is detached.
 */
char		**non_ff_drivers = NULL;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

md_resync_t	md_cpr_resync;		/* CPR (suspend/resume) resync state */

extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

#define	VERSION_LENGTH	6
#define	VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int		*md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int		md_conv_perc = MDDB_DEVID_CONV_PERC;

#ifdef	DEBUG
/* debug code to verify framework exclusion guarantees */
int		md_in;
kmutex_t	md_in_mx;		/* used to md global stuff */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & x)						\
		debug_enter("MD_SET_IN already set");		\
	md_in |= x;						\
	mutex_exit(&md_in_mx);					\
}

#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & x))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~x;						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
hrtime_t	savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;			/* used to md global stuff */
kcondvar_t	md_cv;			/* md_status events */
int		md_status = 0;		/* global status for the meta-driver */
int		md_num_daemons = 0;
int		md_ioctl_cnt = 0;
int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t		md_mdelay = 10;		/* variable so can be patched */

/* test-point hook invoked from the strategy path when set (debug aid) */
int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);

major_t		md_major, md_major_targ;

unit_t		md_nunits = MD_MAXUNITS;
set_t		md_nsets = MD_MAXSETS;
int		md_nmedh = 0;
char		*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];

md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;		/* protects notify_interface */
md_ops_t	**md_ops;
ddi_modhandle_t	*md_mods;
md_ops_t	*md_opslist;
clock_t		md_hz;
md_event_queue_t	*md_event_queue = NULL;

int		md_in_upgrade;
int		md_keep_repl_state;
int		md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int		mdmn_door_did = -1;

dev_info_t	*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

/* per open-type open counts; indexed by OTYP_* */
static uint_t	md_ocnt[OTYPCNT];

static int	mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	mdattach(dev_info_t *, ddi_attach_cmd_t);
static int	mddetach(dev_info_t *, ddi_detach_cmd_t);
static int	mdopen(dev_t *, int, int, cred_t *);
static int	mdclose(dev_t, int, int, cred_t *);
static int	mddump(dev_t, caddr_t, daddr_t, int);
static int	mdread(dev_t, struct uio *, cred_t *);
static int	mdwrite(dev_t, struct uio *, cred_t *);
static int	mdaread(dev_t, struct aio_req *, cred_t *);
static int	mdawrite(dev_t, struct aio_req *, cred_t *);
static int	mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int	mdprop_op(dev_t, dev_info_t *,
		    ddi_prop_op_t, int, char *, caddr_t, int *);

static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
	/* print routine -- none yet */
	(int(*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
	/* devmap */
	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
	    uint_t))nodev,
	/* mmap */
	(int(*)(dev_t, off_t, int))nodev,
	/* segmap */
	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
	    unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};

static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* power management */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

static struct modldrv modldrv = {
	&mod_driverops,		/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module %I%", /* name of the module */
	&md_devops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};


/* md_medd.c */
extern void	med_init(void);
extern void	med_fini(void);
extern void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void	*lookup_entry(struct nm_next_hdr *, set_t,
		    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern int
remove_entry(struct nm_next_hdr *,
		    side_t, mdkey_t, int);

int		md_maxphys = 0;		/* maximum io size in bytes */
#define	MD_MAXBCOUNT	(1024 * 1024)
unsigned	md_maxbcount = 0;	/* maximum physio size in bytes */

/*
 * allocate/free dynamic space associated with driver globals.
 * alloc != 0 initializes every driver-global and per-set lock/cv;
 * alloc == 0 destroys them in the reverse order.
 */
void
md_global_alloc_free(int alloc)
{
	set_t	s;

	if (alloc) {
		/* initialize driver global locks */
		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
		    MUTEX_DEFAULT, NULL);

		/* initialize per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			/* initialize per set driver globals locks */
			mutex_init(&md_set[s].s_dbmx,
			    NULL, MUTEX_DEFAULT, NULL);
			mutex_init(&md_set_io[s].md_io_mx,
			    NULL, MUTEX_DEFAULT, NULL);
			cv_init(&md_set_io[s].md_io_cv,
			    NULL, CV_DEFAULT, NULL);
		}
	} else {
		/* destroy per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			cv_destroy(&md_set_io[s].md_io_cv);
			mutex_destroy(&md_set_io[s].md_io_mx);
			mutex_destroy(&md_set[s].s_dbmx);
		}

		/* destroy driver global locks */
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
	}
}

/*
 * Loadable-module entry point: set up driver globals and register
 * the driver with the module framework via mod_install().
 */
int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		/* mod_install failed: undo the global allocations above */
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}

/*
 * Loadable-module exit point: unregister the driver, then release the
 * dynamic global state allocated in _init().
 */
int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}

/* report module information to modinfo(1M) */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * attach(9E) entry point: initialize the metadevice database, start the
 * md daemon threads, read the driver tuning properties, and create the
 * "admin" minor node.  On any failure after mddb_init() the attach_failure
 * path (below) calls our own detach routine to unwind.
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	/* only a single instance of md may attach */
	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property; default to "tcp" */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	/* 5 bytes holds the "1.0" version string; ver[] is VERSION_LENGTH */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				/* table is pairs of dev32_t (old, new) */
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			/* each entry is "<driver name> <major number>" */
			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				/* restore the string before freeing it */
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 *	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* alloc md_ops and md_mods struct */
	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	/* unit and unit-incore pointer arrays for the local set */
	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
583 * 584 * There are two scenarios to be aware of here: 585 * 1) when we are booting from a mirrored root we need the root 586 * metadevice to exist very early (during vfs_mountroot processing) 587 * 2) we need all of the nodes to be created so that any mnttab entries 588 * will succeed (handled by metainit reading the mddb during boot). 589 */ 590 if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1) 591 == 0) { 592 char *p; 593 int mnum = 0; 594 595 /* 596 * The svm_bootpath string looks something like 597 * /pseudo/md@0:0,150,blk where 150 is the minor number 598 * in this example so we need to set the pointer p onto 599 * the first digit of the minor number and convert it 600 * from ascii. 601 */ 602 for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1; 603 *p >= '0' && *p <= '9'; p++) { 604 mnum *= 10; 605 mnum += *p - '0'; 606 } 607 608 if (md_create_minor_node(0, mnum)) { 609 kmem_free(md_set[0].s_un, sz); 610 kmem_free(md_set[0].s_ui, sz); 611 goto attach_failure; 612 } 613 } 614 615 med_init(); 616 617 MD_CLR_IN(IN_ATTACH); 618 return (DDI_SUCCESS); 619 620 attach_failure: 621 /* 622 * Use our own detach routine to toss any stuff we allocated above. 623 * NOTE: detach will call md_halt to free the mddb_init allocations. 624 */ 625 MD_CLR_IN(IN_ATTACH); 626 if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS) 627 cmn_err(CE_WARN, "detach from attach failed"); 628 return (DDI_FAILURE); 629 } 630 631 /* ARGSUSED */ 632 static int 633 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd) 634 { 635 extern int check_active_locators(); 636 set_t s; 637 size_t sz; 638 int len; 639 640 MD_SET_IN(IN_DETACH); 641 642 /* check command */ 643 if (cmd != DDI_DETACH) { 644 MD_CLR_IN(IN_DETACH); 645 return (DDI_FAILURE); 646 } 647 648 /* 649 * if we have not already halted yet we have no active config 650 * then automatically initiate a halt so we can detach. 
651 */ 652 if (!(md_get_status() & MD_GBL_HALTED)) { 653 if (check_active_locators() == 0) { 654 /* 655 * NOTE: a successful md_halt will have done the 656 * mddb_unload to free allocations done in mddb_init 657 */ 658 if (md_halt(MD_NO_GBL_LOCKS_HELD)) { 659 cmn_err(CE_NOTE, "md:detach: " 660 "Could not halt Solaris Volume Manager"); 661 MD_CLR_IN(IN_DETACH); 662 return (DDI_FAILURE); 663 } 664 } 665 666 /* fail detach if we have not halted */ 667 if (!(md_get_status() & MD_GBL_HALTED)) { 668 MD_CLR_IN(IN_DETACH); 669 return (DDI_FAILURE); 670 } 671 } 672 673 /* must be in halted state, this will be cleared on next attach */ 674 ASSERT(md_get_status() & MD_GBL_HALTED); 675 676 /* cleanup attach allocations and initializations */ 677 md_major_targ = 0; 678 679 sz = sizeof (void *) * md_nunits; 680 for (s = 0; s < md_nsets; s++) { 681 if (md_set[s].s_un != NULL) { 682 kmem_free(md_set[s].s_un, sz); 683 md_set[s].s_un = NULL; 684 } 685 686 if (md_set[s].s_ui != NULL) { 687 kmem_free(md_set[s].s_ui, sz); 688 md_set[s].s_ui = NULL; 689 } 690 } 691 md_nunits = 0; 692 md_nsets = 0; 693 md_nmedh = 0; 694 695 if (non_ff_drivers != NULL) { 696 int i; 697 698 for (i = 0; non_ff_drivers[i] != NULL; i++) 699 kmem_free(non_ff_drivers[i], strlen(non_ff_drivers[i]) + 1); 700 701 /* free i+1 entries because there is a null entry at list end */ 702 kmem_free(non_ff_drivers, (i + 1) * sizeof (char *)); 703 non_ff_drivers = NULL; 704 } 705 706 if (md_med_trans_lst != NULL) { 707 kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1); 708 md_med_trans_lst = NULL; 709 } 710 711 if (md_mods != NULL) { 712 kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS); 713 md_mods = NULL; 714 } 715 716 if (md_ops != NULL) { 717 kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS); 718 md_ops = NULL; 719 } 720 721 if (MD_UPGRADE) { 722 len = md_tuple_length * (2 * ((int)sizeof (dev32_t))); 723 md_in_upgrade = 0; 724 md_xlate_free(len); 725 md_majortab_free(); 726 } 727 728 /* 729 * Undo what we 
	 * did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();
	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}


/*
 * Given the device number return the devinfo pointer
 * given to md via md_attach
 */
/*ARGSUSED*/
static int
mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int	error = DDI_FAILURE;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		/* only valid once attach has recorded md_devinfo */
		if (md_devinfo) {
			*result = (void *)md_devinfo;
			error = DDI_SUCCESS;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/* md is a single-instance pseudo driver: instance 0 */
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	}
	return (error);
}

/*
 * property operation routine.  return the number of blocks for the partition
 * in question or forward the request to the property facilities.
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	minor_t		mnum;
	set_t		setno;
	md_unit_t	*un;
	mdi_unit_t	*ui;
	uint64_t	nblocks64;

	/*
	 * Our dynamic properties are all device specific and size oriented.
	 * Requests issued under conditions where size is valid are passed
	 * to ddi_prop_op_nblocks with the size information, otherwise the
	 * request is passed to ddi_prop_op.  Make sure that the minor device
	 * is a valid part of the Virtual Disk subsystem.
	 */
	mnum = getminor(dev);
	setno = MD_MIN2SET(mnum);
	if ((dev == DDI_DEV_T_ANY) || (mnum == MD_ADM_MINOR) ||
	    (setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		/* not a sized metadevice minor: defer to the framework */
pass:		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	} else {
		rw_enter(&md_unit_array_rw.lock, RW_READER);
		if (((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) ||
		    ((ui = MDI_UNIT(mnum)) == NULL)) {
			rw_exit(&md_unit_array_rw.lock);
			goto pass;
		}

		/* get nblocks value */
		un = (md_unit_t *)md_unit_readerlock(ui);
		nblocks64 = un->c.un_total_blocks;
		md_unit_readerexit(ui);
		rw_exit(&md_unit_array_rw.lock);

		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp, nblocks64));
	}

}

/*
 * Walk the MDDB_USER records of a set and mark each usable one as seen
 * (MD_PRV_GOTIT); records with no data are marked for deletion.
 */
static void
snarf_user_data(set_t setno)
{
	mddb_recid_t		recid;
	mddb_recstatus_t	status;

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE)
			continue;

		if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}

		ASSERT(status == MDDB_OK);

		mddb_setrecprivate(recid, MD_PRV_GOTIT);
	}
}

/*
 * Warn about each replica that is too small to hold device-id
 * (relocation) data, and report by how many blocks it falls short.
 * blks is the additional block count required for the conversion.
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;


	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "		Additional Blocks Needed: %d\n\n"
	    "		Increase size of following replicas for\n"
	    "		device relocatability by deleting listed\n"
	    "		replica and re-adding replica with\n"
	    "		increased size (see metadb(1M)):\n"
	    "		Replica			Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		/* sum the block count over this locator's master blocks */
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			/* "<prefix>/<suffix>\0" */
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)\t%d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}

/*
 * md_create_minor_node:
 *	Create the minor device for the given set and un_self_id.
 *	Both a block ("<set>,<unit>,blk") and a raw ("<set>,<unit>,raw")
 *	node are created under the md devinfo node.
 *
 * Input:
 *	setno	- set number
 *	mnum	- selfID of unit
 *
 * Output:
 *	None.
 *
 * Returns 0 for success, 1 for failure.
 *
 * Side-effects:
 *	None.
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
	char		name[20];

	/* Check for valid arguments */
	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
		return (1);

	(void) snprintf(name, 20, "%u,%u,blk",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	(void) snprintf(name, 20, "%u,%u,raw",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	return (0);
}

/*
 * For a given key check if it is an orphaned record.
 * The following conditions are used to determine an orphan.
 * 1. The device associated with that key is not a metadevice.
 * 2. If DEVID_STYLE then the physical device does not have a device Id
 * associated with it.
 *
 * If a key does not have an entry in the devid namespace it could be
 * a device that does not support device ids.  Hence the record is not
 * deleted.
 */

/*
 * Returns 1 only when the replica is devid-style, the key resolves to a
 * non-metadevice dev_t, and that dev_t has no entry in the devid
 * namespace; returns 0 in every other case.
 */
static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
	md_dev64_t	odev; /* orphaned dev */
	mddb_set_t	*s;
	side_t		side = 0;
	struct nm_next_hdr	*did_nh = NULL;

	s = (mddb_set_t *)md_set[setno].s_db;
	if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
	    == NULL)
		return (0);
	/*
	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
	 */
	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
			return (0);
		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
		    NULL)
			return (1);
	}
	return (0);
}

int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int		err = 0;
	int		i;
	mddb_recid_t	recid;
	mddb_type_t	drvrid;
	mddb_recstatus_t	status;
	md_ops_t	*ops;
	uint_t		privat;
	mddb_set_t	*s;
	uint_t		cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t		key = MD_KEYWILD;
	side_t		side = 0;
	int		size;
	int		devid_flag;
	int		retval;
	uint_t		un;
	int		un_next_set = 0;

	md_haltsnarf_enter(setno);

	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (!
(md_get_status() & MD_GBL_DAEMONS_LIVE)) { 1030 if (md_start_daemons(TRUE)) { 1031 if (ep != NULL) 1032 (void) mdsyserror(ep, ENXIO); 1033 err = -1; 1034 goto out; 1035 } 1036 } 1037 1038 1039 /* 1040 * Load the devid name space if it exists 1041 */ 1042 (void) md_load_namespace(setno, NULL, NM_DEVID); 1043 if (!md_load_namespace(setno, ep, 0L)) { 1044 /* 1045 * Unload the devid namespace 1046 */ 1047 (void) md_unload_namespace(setno, NM_DEVID); 1048 err = -1; 1049 goto out; 1050 } 1051 1052 /* 1053 * If replica is in non-devid state, convert if: 1054 * - not in probe during upgrade (md_keep_repl_state = 0) 1055 * - enough space available in replica 1056 * - local set 1057 * - not a multi-node diskset 1058 * - clustering is not present (for non-local set) 1059 */ 1060 s = (mddb_set_t *)md_set[setno].s_db; 1061 devid_flag = 0; 1062 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state) 1063 devid_flag = 1; 1064 if (cluster_bootflags & CLUSTER_CONFIGURED) 1065 if (setno != MD_LOCAL_SET) 1066 devid_flag = 0; 1067 if (MD_MNSET_SETNO(setno)) 1068 devid_flag = 0; 1069 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 1070 devid_flag = 0; 1071 1072 /* 1073 * if we weren't devid style before and md_keep_repl_state=1 1074 * we need to stay non-devid 1075 */ 1076 if ((md_keep_repl_state == 1) && 1077 ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0)) 1078 devid_flag = 0; 1079 if (devid_flag) { 1080 /* 1081 * Determine number of free blocks needed to convert 1082 * entire replica to device id format - locator blocks 1083 * and namespace. 
1084 */ 1085 cvt_blks = 0; 1086 if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) { 1087 if (ep != NULL) 1088 (void) mdsyserror(ep, EIO); 1089 err = -1; 1090 goto out; 1091 1092 } 1093 cvt_blks += md_nm_did_chkspace(setno); 1094 1095 /* add MDDB_DEVID_CONV_PERC% */ 1096 if ((md_conv_perc > 0) && (md_conv_perc <= 100)) { 1097 cvt_blks = cvt_blks * (100 + md_conv_perc) / 100; 1098 } 1099 1100 if (cvt_blks <= s->s_freeblkcnt) { 1101 if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) { 1102 if (ep != NULL) 1103 (void) mdsyserror(ep, EIO); 1104 err = -1; 1105 goto out; 1106 } 1107 1108 } else { 1109 /* 1110 * Print message that replica can't be converted for 1111 * lack of space. No failure - just continue to 1112 * run without device ids. 1113 */ 1114 cmn_err(CE_WARN, 1115 "Unable to add Solaris Volume Manager device " 1116 "relocation data.\n" 1117 " To use device relocation feature:\n" 1118 " - Increase size of listed replicas\n" 1119 " - Reboot"); 1120 md_print_block_usage(s, cvt_blks); 1121 cmn_err(CE_WARN, 1122 "Loading set without device relocation data.\n" 1123 " Solaris Volume Manager disk movement " 1124 "not tracked in local set."); 1125 } 1126 } 1127 1128 /* 1129 * go through and load any modules referenced in 1130 * data base 1131 */ 1132 recid = mddb_makerecid(setno, 0); 1133 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1134 status = mddb_getrecstatus(recid); 1135 if (status == MDDB_STALE) { 1136 if (! 
(md_get_setstatus(setno) & MD_SET_STALE)) { 1137 md_set_setstatus(setno, MD_SET_STALE); 1138 cmn_err(CE_WARN, 1139 "md: state database is stale"); 1140 } 1141 } else if (status == MDDB_NODATA) { 1142 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 1143 continue; 1144 } 1145 drvrid = mddb_getrectype1(recid); 1146 if (drvrid < MDDB_FIRST_MODID) 1147 continue; 1148 if (md_loadsubmod(setno, md_getshared_name(setno, drvrid), 1149 drvrid) < 0) { 1150 cmn_err(CE_NOTE, "md: could not load misc/%s", 1151 md_getshared_name(setno, drvrid)); 1152 } 1153 } 1154 1155 if (recid < 0) 1156 goto out; 1157 1158 snarf_user_data(setno); 1159 1160 /* 1161 * Initialize the md_nm_snarfed array 1162 * this array is indexed by the key and 1163 * is set by md_getdevnum during the snarf time 1164 */ 1165 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) { 1166 size = (int)((((struct nm_rec_hdr *)nh->nmn_record)-> 1167 r_next_key) * (sizeof (int))); 1168 md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP); 1169 } 1170 1171 /* 1172 * go through and snarf until nothing gets added 1173 */ 1174 do { 1175 i = 0; 1176 for (ops = md_opslist; ops != NULL; ops = ops->md_next) { 1177 if (ops->md_snarf != NULL) { 1178 retval = ops->md_snarf(MD_SNARF_DOIT, setno); 1179 if (retval == -1) { 1180 err = -1; 1181 /* Don't know the failed unit */ 1182 (void) mdmderror(ep, MDE_RR_ALLOC_ERROR, 1183 0); 1184 (void) md_halt_set(setno, MD_HALT_ALL); 1185 (void) mddb_unload_set(setno); 1186 md_haltsnarf_exit(setno); 1187 return (err); 1188 } else { 1189 i += retval; 1190 } 1191 } 1192 } 1193 } while (i); 1194 1195 /* 1196 * Set the first available slot and availability 1197 */ 1198 md_set[setno].s_un_avail = 0; 1199 for (un = 0; un < MD_MAXUNITS; un++) { 1200 if (md_set[setno].s_un[un] != NULL) { 1201 continue; 1202 } else { 1203 if (!un_next_set) { 1204 md_set[setno].s_un_next = un; 1205 un_next_set = 1; 1206 } 1207 md_set[setno].s_un_avail++; 1208 } 1209 } 1210 1211 md_set_setstatus(setno, MD_SET_SNARFED); 
1212 1213 recid = mddb_makerecid(setno, 0); 1214 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1215 privat = mddb_getrecprivate(recid); 1216 if (privat & MD_PRV_COMMIT) { 1217 if (mddb_commitrec(recid)) { 1218 if (!(md_get_setstatus(setno) & MD_SET_STALE)) { 1219 md_set_setstatus(setno, MD_SET_STALE); 1220 cmn_err(CE_WARN, 1221 "md: state database is stale"); 1222 } 1223 } 1224 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1225 } 1226 } 1227 1228 /* Deletes must happen after all the commits */ 1229 recid = mddb_makerecid(setno, 0); 1230 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1231 privat = mddb_getrecprivate(recid); 1232 if (privat & MD_PRV_DELETE) { 1233 if (mddb_deleterec(recid)) { 1234 if (!(md_get_setstatus(setno) & MD_SET_STALE)) { 1235 md_set_setstatus(setno, MD_SET_STALE); 1236 cmn_err(CE_WARN, 1237 "md: state database is stale"); 1238 } 1239 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1240 } 1241 recid = mddb_makerecid(setno, 0); 1242 } 1243 } 1244 1245 /* 1246 * go through and clean up records until nothing gets cleaned up. 1247 */ 1248 do { 1249 i = 0; 1250 for (ops = md_opslist; ops != NULL; ops = ops->md_next) 1251 if (ops->md_snarf != NULL) 1252 i += ops->md_snarf(MD_SNARF_CLEANUP, setno); 1253 } while (i); 1254 1255 if (md_nm_snarfed != NULL && 1256 !(md_get_setstatus(setno) & MD_SET_STALE)) { 1257 /* 1258 * go thru and cleanup the namespace and the device id 1259 * name space 1260 */ 1261 for (key = 1; 1262 key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key; 1263 key++) { 1264 /* 1265 * Is the entry an 'orphan'? 
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration.
				 * Remove it; this can happen when the system
				 * panics between the primary name space
				 * update and the device id name space update.
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done with the orphan scan; free the tracking array
		 * that was populated during snarf.
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}

/*
 * get_minfo -- fill in the dk_minfo structure for the metadevice
 * identified by mnum.  The fields are pre-zeroed so a nonexistent
 * unit yields all-zero info rather than an error; the capacity is
 * sampled under the unit reader lock.
 */
void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
	md_unit_t	*un;
	mdi_unit_t	*ui;

	info->dki_capacity = 0;
	info->dki_lbsize = 0;
	info->dki_media_type = 0;

	if ((ui = MDI_UNIT(mnum)) == NULL) {
		return;
	}
	un = (md_unit_t *)md_unit_readerlock(ui);
	info->dki_capacity = un->c.un_total_blocks;
	md_unit_readerexit(ui);
	info->dki_lbsize = DEV_BSIZE;
	info->dki_media_type = DK_UNKNOWN;
}


/*
 * get_info -- fill in the dk_cinfo structure for the metadevice
 * identified by mnum.  Controller fields are derived from the parent
 * of md_devinfo; unit fields describe the metadevice itself.
 */
void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller Information
	 */
	info->dki_ctype = DKC_MD;
	info->dki_cnum =
	    ddi_get_instance(ddi_get_parent(md_devinfo));
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	/*
	 * Unit Information
	 */
	info->dki_unit = mnum;
	info->dki_slave = 0;
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_flags = 0;
	info->dki_partition = 0;
	/* largest single transfer, expressed in DEV_BSIZE blocks */
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

	/*
	 * We can't get from here to there yet
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}

/*
 * open admin device
 *
 * Single threaded under md_mx.  Only character and layered opens are
 * accepted.  An FEXCL open fails with EBUSY while any open is
 * outstanding, and any open fails while an exclusive open is in force.
 * Successful opens are counted per open-type in md_ocnt[].
 */
static int
mdadminopen(
	int	flag,
	int	otyp)
{
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
		err = EINVAL;
		goto out;
	}
	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
	    (md_status & MD_GBL_EXCL)) {
		err = EBUSY;
		goto out;
	}

	/* count and flag open */
	md_ocnt[otyp]++;
	md_status |= MD_GBL_OPEN;
	if (flag & FEXCL)
		md_status |= MD_GBL_EXCL;

	/* unlock, return success or error */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * open entry point
 *
 * Admin-device opens are handed to mdadminopen().  For metadevices
 * the minor number is validated, the local set is snarfed on first
 * use, and the open is either passed to the underlying unit type's
 * md_open routine or counted here under the unit open/close lock.
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			/* the world may have changed while we slept */
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}

/*
 * close admin device
 *
 * Single threaded under md_mx.  Layered closes are counted one-for-one;
 * a character close zeroes its count (last-close semantics).  The
 * global open/exclusive flags are recomputed from the remaining counts.
 */
static int
mdadminclose(
	int	otyp)
{
	int	i;
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp < 0) || (otyp >= OTYPCNT)) {
		err = EINVAL;
		goto out;
	} else if (md_ocnt[otyp] == 0) {
		err = ENXIO;
		goto out;
	}

	/* count and flag closed */
	if (otyp == OTYP_LYR)
		md_ocnt[otyp]--;
	else
		md_ocnt[otyp] = 0;
	/* recompute MD_GBL_OPEN from the per-type open counts */
	md_status &= ~MD_GBL_OPEN;
	for (i = 0; (i < OTYPCNT); ++i)
		if (md_ocnt[i] != 0)
			md_status |= MD_GBL_OPEN;
	/* dropping the last open also drops any exclusive claim */
	if (! (md_status & MD_GBL_OPEN))
		md_status &= ~MD_GBL_EXCL;

	/* unlock, return success or error */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * close entry point
 *
 * Admin-device closes are handed to mdadminclose().  For metadevices
 * the close is either passed to the underlying unit type's md_close
 * routine or the open count is decremented here under the unit
 * open/close lock.
 */
static int
mdclose(
	dev_t	dev,
	int	flag,
	int	otyp,
	cred_t	*cred_p)
{
	minor_t	mnum = getminor(dev);
	set_t	setno = MD_MIN2SET(mnum);
	unit_t	unit = MD_MIN2UNIT(mnum);
	mdi_unit_t	*ui = NULL;
	int	err = 0;

	/* dispatch admin device closes */
	if (mnum == MD_ADM_MINOR)
		return (mdadminclose(otyp));

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
		goto out;
	}

	/* close underlying driver */
	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single
		   thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_decopen(mnum, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* return success or error */
out:
	return (err);
}


/*
 * This routine performs raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The admin minor and out-of-range minors are rejected with ENXIO.
 * If the unit type supplies its own md_read routine the read is
 * delegated to it; otherwise the uio is validated and passed to
 * physio() against mdstrategy().
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdread(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_read != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_read)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
}

/*
 * This routine performs async raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition.  This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	/* delegate to the unit type's own async read routine if present */
	if (md_ops[ui->ui_opsindex]->md_aread != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_aread)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
}

/*
 * This routine performs raw write operations. It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition. This is
 * handled by mdstrategy.
 *
 */
/*ARGSUSED*/
static int
mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;

	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	/* delegate to the unit type's own write routine if present */
	if (md_ops[ui->ui_opsindex]->md_write != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_write)
		    (dev, uio, credp));

	if ((error = md_chk_uio(uio)) != 0)
		return (error);

	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
}

/*
 * This routine performs async raw write operations. It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition, but not on a striped partition. This is
 * handled by mdstrategy.
 *
 */
/*ARGSUSED*/
static int
mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;
	int		error;


	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);

	/* delegate to the unit type's own async write routine if present */
	if (md_ops[ui->ui_opsindex]->md_awrite != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_awrite)
		    (dev, aio, credp));

	if ((error = md_chk_uio(aio->aio_uio)) != 0)
		return (error);

	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
}

/*
 * Block I/O entry point.  Invalid minors complete the buf immediately
 * with ENXIO; otherwise the buf is handed to the unit type's
 * md_strategy routine (or failed via errdone() if there is none).
 * Always returns 0; errors are reported through the buf itself.
 */
int
mdstrategy(struct buf *bp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;

	ASSERT((bp->b_flags & B_DONE) == 0);

	/* when panicking, the helper daemons can no longer be relied upon */
	if (panicstr)
		md_clr_status(MD_GBL_DAEMONS_LIVE);

	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	bp->b_flags &= ~(B_ERROR | B_DONE);
	if (md_ops[ui->ui_opsindex]->md_strategy != NULL) {
		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
	} else {
		(void) errdone(ui, bp, ENXIO);
	}
	return (0);
}

/*
 * Return true if the ioctl is allowed to be multithreaded.
 * All the ioctls with MN are sent only from the message handlers through
 * rpc.mdcommd, which (via its own locking mechanism) takes care that no two
 * ioctls for the same metadevice are issued at the same time.
 * So we are safe here.
 * The other ioctls do not mess with any metadevice structures and are
 * therefore harmless too, if called multiple times at the same time.
 */
static boolean_t
is_mt_ioctl(int cmd) {

	switch (cmd) {
	case MD_IOCGUNIQMSGID:
	case MD_IOCGVERSION:
	case MD_IOCISOPEN:
	case MD_MN_SET_MM_OWNER:
	case MD_MN_SET_STATE:
	case MD_MN_SUSPEND_WRITES:
	case MD_MN_ALLOCATE_HOTSPARE:
	case MD_MN_SET_SETFLAGS:
	case MD_MN_GET_SETFLAGS:
	case MD_MN_MDDB_OPTRECFIX:
	case MD_MN_MDDB_PARSE:
	case MD_MN_MDDB_BLOCK:
	case MD_MN_DB_USERREQ:
	case MD_IOC_SPSTATUS:
	case MD_MN_COMMD_ERR:
	case MD_MN_SET_COMMD_RUNNING:
	case MD_MN_RESYNC:
	case MD_MN_SETSYNC:
	case MD_MN_POKE_HOTSPARES:
		return (1);
	default:
		return (0);
	}
}

/*
 * This routine implements the ioctl calls for the Virtual Disk System.
 * It is called from the device switch at normal priority.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets a number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open. i.e. avoid a race between metaclear and the
	 * progress of a multithreaded ioctl.
	 */

	/* single-thread all non-MT ioctls behind the global ioctl lock */
	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work. It may interfere with other modules'
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

/*
 * Crash dump entry point.  Rejects the admin minor and invalid minors,
 * requires the set to have been snarfed, and then delegates to the
 * unit type's md_dump routine; ENXIO if the unit type has none.
 */
static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	minor_t		mnum;
	set_t		setno;
	mdi_unit_t	*ui;

	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
		return (ENXIO);

	setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);


	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_dump != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_dump)
		    (dev, addr,
		    blkno, nblk));

	return (ENXIO);
}

/*
 * Metadevice unit number dispatcher
 * When this routine is called it will scan the
 * incore unit array and return the first available slot,
 * hence the unit number, to the caller.
 *
 * Returns MD_UNITBAD if there is nothing available.
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	un, start;

	/*
	 * If nothing available.
	 * NOTE(review): s_un_avail is read before md_mx is taken;
	 * presumably callers tolerate a slightly stale value here --
	 * confirm.
	 */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);
	/* circular scan starting at the slot after the last hand-out */
	start = un = md_set[setno].s_un_next;

	/* LINTED: E_CONSTANT_CONDITION */
	while (1) {
		if (md_set[setno].s_un[un] == NULL) {
			/*
			 * Advance the starting index for the next
			 * md_get_nextunit call
			 */
			if (un == MD_MAXUNITS - 1) {
				md_set[setno].s_un_next = 0;
			} else {
				md_set[setno].s_un_next = un + 1;
			}
			break;
		}

		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);

		/* wrapped all the way around: nothing free after all */
		if (un == start) {
			un = MD_UNITBAD;
			break;
		}

	}

	mutex_exit(&md_mx);
	return (un);
}