/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Md is the meta-disk driver.  It sits below the UFS file system
 * but above the 'real' disk drivers: xy, id, sd, etc.
 *
 * To the UFS software, md looks like a normal driver, since it has
 * the normal kinds of entries in the bdevsw and cdevsw arrays.  So
 * UFS accesses md in the usual ways.  In particular, the strategy
 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
 * and ufs_writelbn().
 *
 * Md maintains an array of minor devices (meta-partitions).  Each
 * meta-partition stands for a matrix of real partitions, in rows
 * which are not necessarily of equal length.  Md maintains a table,
 * with one entry for each meta-partition, which lists the rows and
 * columns of actual partitions, and the job of the strategy routine
 * is to translate from the meta-partition device and block numbers
 * known to UFS into the actual partitions' device and block numbers.
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */
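
/*
 * Illustrative only (not part of the driver): a minimal sketch of the
 * kind of block-number translation described above, assuming the
 * simplest possible layout -- a single row concatenating 'ncomp'
 * equal-size components of 'compblks' blocks each.  The names below
 * (compblks, comp_dev) are hypothetical; the real code in
 * mdstrategy(), mdreal() and mddone() must also cope with rows of
 * unequal length, striping, and I/O spanning component boundaries.
 *
 *	daddr_t meta_blkno = bp->b_blkno;
 *	int     comp       = meta_blkno / compblks;	which component
 *	daddr_t phys_blkno = meta_blkno % compblks;	block within it
 *	dev_t   phys_dev   = comp_dev[comp];		hypothetical map
 */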

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>
#include <sys/modhash.h>

#ifndef lint
char _depends_on[] = "strmod/rpcmod";
#endif	/* lint */

int	md_init_debug = 0;	/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 */
int	md_ff_disable = 0;

/*
 * Dynamically allocated list of non-FF driver names - needs to
 * be freed when md is detached.
 */
char	**non_ff_drivers = NULL;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

md_resync_t	md_cpr_resync;

extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

#define	VERSION_LENGTH	6
#define	VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int	*md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int	md_conv_perc = MDDB_DEVID_CONV_PERC;

#ifdef	DEBUG
/* debug code to verify framework exclusion guarantees */
int	md_in;
kmutex_t	md_in_mx;	/* used to protect md global stuff */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & x)						\
		debug_enter("MD_SET_IN already set");		\
	md_in |= x;						\
	mutex_exit(&md_in_mx);					\
}

#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & x))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~x;						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
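
/*
 * Usage sketch (illustrative): each of the entry points below brackets
 * its body with these markers so that DEBUG builds can assert the
 * framework's exclusion guarantees, e.g.:
 *
 *	MD_SET_IN(IN_ATTACH);
 *	... attach work ...
 *	MD_CLR_IN(IN_ATTACH);
 *
 * In non-DEBUG builds the macros compile away to nothing.
 */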

hrtime_t	savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;			/* used to protect md global stuff */
kcondvar_t	md_cv;			/* md_status events */
int		md_status = 0;		/* global status for the meta-driver */
int		md_num_daemons = 0;
int		md_ioctl_cnt = 0;
int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t		md_mdelay = 10;		/* variable so can be patched */

int	(*mdv_strategy_tstpnt)(buf_t *, int, void *);

major_t		md_major, md_major_targ;

unit_t		md_nunits = MD_MAXUNITS;
set_t		md_nsets = MD_MAXSETS;
int		md_nmedh = 0;
char		*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];

md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;		/* protects notify_interface */
md_ops_t	**md_ops = NULL;
ddi_modhandle_t	*md_mods = NULL;
md_ops_t	*md_opslist;
clock_t		md_hz;
md_event_queue_t	*md_event_queue = NULL;

int		md_in_upgrade;
int		md_keep_repl_state;
int		md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int		mdmn_door_did = -1;

dev_info_t	*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

static uint_t	md_ocnt[OTYPCNT];

static int	mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int	mdattach(dev_info_t *, ddi_attach_cmd_t);
static int	mddetach(dev_info_t *, ddi_detach_cmd_t);
static int	mdopen(dev_t *, int, int, cred_t *);
static int	mdclose(dev_t, int, int, cred_t *);
static int	mddump(dev_t, caddr_t, daddr_t, int);
static int	mdread(dev_t, struct uio *, cred_t *);
static int	mdwrite(dev_t, struct uio *, cred_t *);
static int	mdaread(dev_t, struct aio_req *, cred_t *);
static int	mdawrite(dev_t, struct aio_req *, cred_t *);
static int	mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int	mdprop_op(dev_t, dev_info_t *,
		    ddi_prop_op_t, int, char *, caddr_t, int *);

static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
	/* print routine -- none yet */
	(int (*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
	/* devmap */
	(int (*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
	    uint_t))nodev,
	/* mmap */
	(int (*)(dev_t, off_t, int))nodev,
	/* segmap */
	(int (*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
	    unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};

static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* power management */
	ddi_quiesce_not_needed,	/* quiesce */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

static struct modldrv modldrv = {
	&mod_driverops,				/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module",	/* name of the module */
	&md_devops,				/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};


/* md_medd.c */
extern void	med_init(void);
extern void	med_fini(void);
extern void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern struct nm_next_hdr	*get_first_record(set_t, int, int);

int		md_maxphys = 0;		/* maximum io size in bytes */
#define	MD_MAXBCOUNT	(1024 * 1024)
unsigned	md_maxbcount = 0;	/* maximum physio size in bytes */

/*
 * Some md ioctls trigger io framework device tree operations.  An
 * example is md ioctls that call md_resolve_bydevid(): which uses the
 * io framework to resolve a devid.  Such operations result in acquiring
 * io framework locks (like ndi_devi_enter() of "/") while holding
 * driver locks (like md_unit_writerlock()).
 *
 * The prop_op(9E) entry point is called from the devinfo driver with
 * an active ndi_devi_enter of "/".  To avoid deadlock, md's prop_op
 * implementation must avoid taking a lock that is held per the above md
 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
 * without risking deadlock.
 *
 * To service "size" requests without risking deadlock, we maintain a
 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
 */
static kmutex_t		md_nblocks_mutex;
static mod_hash_t	*md_nblocksmap;		/* mnum -> nblocks */
int			md_nblocksmap_size = 512;

/*
 * Maintain "mnum->nblocks" sizemap for mdprop_op use:
 *
 * Create: any code that establishes a unit's un_total_blocks needs the
 * following type of call to establish nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
 *	NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
 *	    ...or "MD_UNIT..*="
 *
 * Change: any code that changes a unit's un_total_blocks needs the
 * following type of call to sync nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
 *	NOTE: locate via cscope for "un_total_blocks[ \t]*="
 *
 * Destroy: any code that deletes a unit needs the following type of call
 * to sync nblocks for mdprop_op():
 *	md_nblocks_set(mnum, -1ULL);
 *	NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
 *	    ...or "MD_UNIT..*="
 */
void
md_nblocks_set(minor_t mnum, uint64_t nblocks)
{
	mutex_enter(&md_nblocks_mutex);
	if (nblocks == -1ULL)
		(void) mod_hash_destroy(md_nblocksmap,
		    (mod_hash_key_t)(intptr_t)mnum);
	else
		(void) mod_hash_replace(md_nblocksmap,
		    (mod_hash_key_t)(intptr_t)mnum,
		    (mod_hash_val_t)(intptr_t)nblocks);
	mutex_exit(&md_nblocks_mutex);
}

/* get the size of a mnum from "mnum->nblocks" sizemap */
uint64_t
md_nblocks_get(minor_t mnum)
{
	mod_hash_val_t	hv;

	mutex_enter(&md_nblocks_mutex);
	if (mod_hash_find(md_nblocksmap,
	    (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
		mutex_exit(&md_nblocks_mutex);
		return ((uint64_t)(intptr_t)hv);
	}
	mutex_exit(&md_nblocks_mutex);
	return (0);
}
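
/*
 * Lifecycle sketch (illustrative, hypothetical call sites): per the
 * maintenance comment above, unit create/resize/delete paths keep the
 * sizemap in sync roughly as follows:
 *
 *	md_nblocks_set(mnum, un->c.un_total_blocks);	create/resize
 *	...
 *	md_nblocks_set(mnum, -1ULL);			delete the entry
 *
 * mdprop_op() can then answer size queries from md_nblocks_get()
 * without taking any per-unit lock.
 */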
/* allocate/free dynamic space associated with driver globals */
void
md_global_alloc_free(int alloc)
{
	set_t	s;

	if (alloc) {
		/* initialize driver global locks */
		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);

		/* initialize per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			/* initialize per set driver globals locks */
			mutex_init(&md_set[s].s_dbmx,
			    NULL, MUTEX_DEFAULT, NULL);
			mutex_init(&md_set_io[s].md_io_mx,
			    NULL, MUTEX_DEFAULT, NULL);
			cv_init(&md_set_io[s].md_io_cv,
			    NULL, CV_DEFAULT, NULL);
		}
	} else {
		/* destroy per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			cv_destroy(&md_set_io[s].md_io_cv);
			mutex_destroy(&md_set_io[s].md_io_mx);
			mutex_destroy(&md_set[s].s_dbmx);
		}

		/* destroy driver global locks */
		mutex_destroy(&md_nblocks_mutex);
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
	}
}

int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}

int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/*
	 * Must initialize the internal data structures before any
	 * possible call to 'goto attach_failure', as the _fini
	 * routine references them.
	 */
	med_init();

	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 *	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.
	 * The mddbs will be read by metainit during the boot process when
	 * it is doing the auto-take processing, and any other minor nodes
	 * will be allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	/* create the hash to store the meta device sizes */
	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
	    md_nblocksmap_size, mod_hash_null_valdtor);

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}

/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * If we have not already halted and we have no active config,
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		/* free i+1 entries because there is a null entry at list end */
		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	if (MD_UPGRADE) {
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();

	mod_hash_destroy_idhash(md_nblocksmap);

	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}


/*
 * Given the device number return the devinfo pointer
 * given to md via mdattach
 */
/*ARGSUSED*/
static int
mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int	error = DDI_FAILURE;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (md_devinfo) {
			*result = (void *)md_devinfo;
			error = DDI_SUCCESS;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	}
	return (error);
}

/*
 * property operation routine.  return the number of blocks for the partition
 * in question or forward the request to the property facilities.
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, md_nblocks_get(getminor(dev))));
}

static void
snarf_user_data(set_t setno)
{
	mddb_recid_t		recid;
	mddb_recstatus_t	status;

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE)
			continue;

		if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}

		ASSERT(status == MDDB_OK);

		mddb_setrecprivate(recid, MD_PRV_GOTIT);
	}
}

static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "    Additional Blocks Needed:            %d\n\n"
	    "    Increase size of following replicas for\n"
	    "    device relocatability by deleting listed\n"
	    "    replica and re-adding replica with\n"
	    "    increased size (see metadb(1M)):\n"
	    "        Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)   %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}

/*
 * md_create_minor_node:
 *	Create the minor device for the given set and un_self_id.
 *
 * Input:
 *	setno	- set number
 *	mnum	- selfID of unit
 *
 * Output:
 *	None.
 *
 * Returns 0 for success, 1 for failure.
 *
 * Side-effects:
 *	None.
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
	char	name[20];

	/* Check for valid arguments */
	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
		return (1);

	(void) snprintf(name, 20, "%u,%u,blk",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	(void) snprintf(name, 20, "%u,%u,raw",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));

	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
		return (1);

	return (0);
}

/*
 * For a given key check if it is an orphaned record.
 * The following conditions are used to determine an orphan.
 * 1. The device associated with that key is not a metadevice.
 * 2. If DEVID_STYLE then the physical device does not have a device Id
 *    associated with it.
 *
 * If a key does not have an entry in the devid namespace it could be
 * a device that does not support device ids.  Hence the record is not
 * deleted.
 */
static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
	md_dev64_t	odev;	/* orphaned dev */
	mddb_set_t	*s;
	side_t		side = 0;
	struct nm_next_hdr	*did_nh = NULL;

	s = (mddb_set_t *)md_set[setno].s_db;
	if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
	    == NULL)
		return (0);
	/*
	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
	 */
	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
			return (0);
		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
		    NULL)
			return (1);
	}
	return (0);
}

int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int			err = 0;
	int			i;
	mddb_recid_t		recid;
	mddb_type_t		drvrid;
	mddb_recstatus_t	status;
	md_ops_t		*ops;
	uint_t			privat;
	mddb_set_t		*s;
	uint_t			cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = 0;
	int			size;
	int			devid_flag;
	int			retval;
	uint_t			un;
	int			un_next_set = 0;

	md_haltsnarf_enter(setno);

	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (!(md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space.  No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    "    To use device relocation feature:\n"
			    "    - Increase size of listed replicas\n"
			    "    - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    "    Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * the database
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep,
					    MDE_RR_ALLOC_ERROR, 0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) &
				    MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) &
				    MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go through and clean up the namespace and the
		 * device id name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration;
				 * remove it.  This can happen when the system
				 * panics between the primary name space update
				 * and the device id name space update.
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}

void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
	md_unit_t	*un;
	mdi_unit_t	*ui;

	info->dki_capacity = 0;
	info->dki_lbsize = 0;
	info->dki_media_type = 0;

	if ((ui = MDI_UNIT(mnum)) == NULL) {
		return;
	}
	un = (md_unit_t *)md_unit_readerlock(ui);
	info->dki_capacity = un->c.un_total_blocks;
	md_unit_readerexit(ui);
	info->dki_lbsize = DEV_BSIZE;
	info->dki_media_type = DK_UNKNOWN;
}


void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller Information
	 */
	info->dki_ctype = DKC_MD;
	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	/*
	 * Unit Information
	 */
	info->dki_unit = mnum;
	info->dki_slave = 0;
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_flags = 0;
	info->dki_partition = 0;
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

	/*
	 * We can't get from here to there yet
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}

/*
 * open admin device
 */
static int
mdadminopen(
	int	flag,
	int	otyp)
{
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
		err = EINVAL;
		goto out;
	}
	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
	    (md_status & MD_GBL_EXCL)) {
		err = EBUSY;
		goto out;
	}

	/* count and flag open */
	md_ocnt[otyp]++;
	md_status |= MD_GBL_OPEN;
	if (flag & FEXCL)
		md_status |= MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * open entry point
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}

/*
 * close admin device
 */
static int
mdadminclose(
	int	otyp)
{
	int	i;
	int	err = 0;

	/* single thread */
	mutex_enter(&md_mx);

	/* check type and flags */
	if ((otyp < 0) || (otyp >= OTYPCNT)) {
		err = EINVAL;
		goto out;
	} else if (md_ocnt[otyp] == 0) {
		err = ENXIO;
		goto out;
	}

	/* count and flag closed */
	if (otyp == OTYP_LYR)
		md_ocnt[otyp]--;
	else
		md_ocnt[otyp] = 0;
	md_status &= ~MD_GBL_OPEN;
	for (i = 0; (i < OTYPCNT); ++i)
		if (md_ocnt[i] != 0)
			md_status |= MD_GBL_OPEN;
	if (!(md_status & MD_GBL_OPEN))
		md_status &= ~MD_GBL_EXCL;

	/* unlock, return success */
out:
	mutex_exit(&md_mx);
	return (err);
}

/*
 * close entry point
 */
static int
mdclose(
	dev_t	dev,
	int	flag,
	int	otyp,
	cred_t	*cred_p)
{
	minor_t		mnum = getminor(dev);
	set_t		setno = MD_MIN2SET(mnum);
	unit_t		unit = MD_MIN2UNIT(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;

	/* dispatch admin device closes */
	if (mnum == MD_ADM_MINOR)
		return (mdadminclose(otyp));

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
		goto out;
	}

	/* close underlying driver */
	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_decopen(mnum, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* return success */
out:
	return (err);
}


/*
 * This routine performs raw read operations.  It is called from the
 * device switch at normal priority.
1660 * 1661 * The main catch is that the *uio struct which is passed to us may 1662 * specify a read which spans two buffers, which would be contiguous 1663 * on a single partition, but not on a striped partition. This will 1664 * be handled by mdstrategy. 1665 */ 1666 /*ARGSUSED*/ 1667 static int 1668 mdread(dev_t dev, struct uio *uio, cred_t *credp) 1669 { 1670 minor_t mnum; 1671 mdi_unit_t *ui; 1672 int error; 1673 1674 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1675 (MD_MIN2SET(mnum) >= md_nsets) || 1676 (MD_MIN2UNIT(mnum) >= md_nunits) || 1677 ((ui = MDI_UNIT(mnum)) == NULL)) 1678 return (ENXIO); 1679 1680 if (md_ops[ui->ui_opsindex]->md_read != NULL) 1681 return ((*md_ops[ui->ui_opsindex]->md_read) 1682 (dev, uio, credp)); 1683 1684 if ((error = md_chk_uio(uio)) != 0) 1685 return (error); 1686 1687 return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio)); 1688 } 1689 1690 /* 1691 * This routine performs async raw read operations. It is called from the 1692 * device switch at normal priority. 1693 * 1694 * The main catch is that the *aio struct which is passed to us may 1695 * specify a read which spans two buffers, which would be contiguous 1696 * on a single partition, but not on a striped partition. This will 1697 * be handled by mdstrategy. 1698 */ 1699 /*ARGSUSED*/ 1700 static int 1701 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp) 1702 { 1703 minor_t mnum; 1704 mdi_unit_t *ui; 1705 int error; 1706 1707 1708 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1709 (MD_MIN2SET(mnum) >= md_nsets) || 1710 (MD_MIN2UNIT(mnum) >= md_nunits) || 1711 ((ui = MDI_UNIT(mnum)) == NULL)) 1712 return (ENXIO); 1713 1714 if (md_ops[ui->ui_opsindex]->md_aread != NULL) 1715 return ((*md_ops[ui->ui_opsindex]->md_aread) 1716 (dev, aio, credp)); 1717 1718 if ((error = md_chk_uio(aio->aio_uio)) != 0) 1719 return (error); 1720 1721 return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio)); 1722 } 1723 1724 /* 1725 * This routine performs raw write operations. It is called from the 1726 * device switch at normal priority. 1727 * 1728 * The main catch is that the *uio struct which is passed to us may 1729 * specify a write which spans two buffers, which would be contiguous 1730 * on a single partition, but not on a striped partition. This is 1731 * handled by mdstrategy. 1732 * 1733 */ 1734 /*ARGSUSED*/ 1735 static int 1736 mdwrite(dev_t dev, struct uio *uio, cred_t *credp) 1737 { 1738 minor_t mnum; 1739 mdi_unit_t *ui; 1740 int error; 1741 1742 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1743 (MD_MIN2SET(mnum) >= md_nsets) || 1744 (MD_MIN2UNIT(mnum) >= md_nunits) || 1745 ((ui = MDI_UNIT(mnum)) == NULL)) 1746 return (ENXIO); 1747 1748 if (md_ops[ui->ui_opsindex]->md_write != NULL) 1749 return ((*md_ops[ui->ui_opsindex]->md_write) 1750 (dev, uio, credp)); 1751 1752 if ((error = md_chk_uio(uio)) != 0) 1753 return (error); 1754 1755 return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio)); 1756 } 1757 1758 /* 1759 * This routine performs async raw write operations. It is called from the 1760 * device switch at normal priority. 1761 * 1762 * The main catch is that the *aio struct which is passed to us may 1763 * specify a write which spans two buffers, which would be contiguous 1764 * on a single partition, but not on a striped partition. This is 1765 * handled by mdstrategy. 
1766 * 1767 */ 1768 /*ARGSUSED*/ 1769 static int 1770 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp) 1771 { 1772 minor_t mnum; 1773 mdi_unit_t *ui; 1774 int error; 1775 1776 1777 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1778 (MD_MIN2SET(mnum) >= md_nsets) || 1779 (MD_MIN2UNIT(mnum) >= md_nunits) || 1780 ((ui = MDI_UNIT(mnum)) == NULL)) 1781 return (ENXIO); 1782 1783 if (md_ops[ui->ui_opsindex]->md_awrite != NULL) 1784 return ((*md_ops[ui->ui_opsindex]->md_awrite) 1785 (dev, aio, credp)); 1786 1787 if ((error = md_chk_uio(aio->aio_uio)) != 0) 1788 return (error); 1789 1790 return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio)); 1791 } 1792 1793 int 1794 mdstrategy(struct buf *bp) 1795 { 1796 minor_t mnum; 1797 mdi_unit_t *ui; 1798 1799 ASSERT((bp->b_flags & B_DONE) == 0); 1800 1801 if (panicstr) 1802 md_clr_status(MD_GBL_DAEMONS_LIVE); 1803 1804 if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) || 1805 (MD_MIN2SET(mnum) >= md_nsets) || 1806 (MD_MIN2UNIT(mnum) >= md_nunits) || 1807 ((ui = MDI_UNIT(mnum)) == NULL)) { 1808 bp->b_flags |= B_ERROR; 1809 bp->b_error = ENXIO; 1810 bp->b_resid = bp->b_bcount; 1811 biodone(bp); 1812 return (0); 1813 } 1814 1815 bp->b_flags &= ~(B_ERROR | B_DONE); 1816 if (md_ops[ui->ui_opsindex]->md_strategy != NULL) { 1817 (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL); 1818 } else { 1819 (void) errdone(ui, bp, ENXIO); 1820 } 1821 return (0); 1822 } 1823 1824 /* 1825 * Return true if the ioctl is allowed to be multithreaded. 1826 * All the ioctls with MN are sent only from the message handlers through 1827 * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two 1828 * ioctl for the same metadevice are issued at the same time. 1829 * So we are safe here. 1830 * The other ioctls do not mess with any metadevice structures and therefor 1831 * are harmless too, if called multiple times at the same time. 1832 */ 1833 static boolean_t 1834 is_mt_ioctl(int cmd) { 1835 1836 switch (cmd) { 1837 case MD_IOCGUNIQMSGID: 1838 case MD_IOCGVERSION: 1839 case MD_IOCISOPEN: 1840 case MD_MN_SET_MM_OWNER: 1841 case MD_MN_SET_STATE: 1842 case MD_MN_SUSPEND_WRITES: 1843 case MD_MN_ALLOCATE_HOTSPARE: 1844 case MD_MN_SET_SETFLAGS: 1845 case MD_MN_GET_SETFLAGS: 1846 case MD_MN_MDDB_OPTRECFIX: 1847 case MD_MN_MDDB_PARSE: 1848 case MD_MN_MDDB_BLOCK: 1849 case MD_MN_DB_USERREQ: 1850 case MD_IOC_SPSTATUS: 1851 case MD_MN_COMMD_ERR: 1852 case MD_MN_SET_COMMD_RUNNING: 1853 case MD_MN_RESYNC: 1854 case MD_MN_SETSYNC: 1855 case MD_MN_POKE_HOTSPARES: 1856 case MD_MN_RR_DIRTY: 1857 case MD_MN_RR_CLEAN: 1858 case MD_MN_IOC_SPUPDATEWM: 1859 return (1); 1860 default: 1861 return (0); 1862 } 1863 } 1864 1865 /* 1866 * This routine implements the ioctl calls for the Virtual Disk System. 1867 * It is called from the device switch at normal priority. 1868 */ 1869 /* ARGSUSED */ 1870 static int 1871 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p, 1872 int *rval_p) 1873 { 1874 minor_t mnum = getminor(dev); 1875 mdi_unit_t *ui; 1876 IOLOCK lock; 1877 int err; 1878 1879 /* 1880 * For multinode disksets number of ioctls are allowed to be 1881 * multithreaded. 1882 * A fundamental assumption made in this implementation is that 1883 * ioctls either do not interact with other md structures or the 1884 * ioctl to the admin device can only occur if the metadevice 1885 * device is open. i.e. avoid a race between metaclear and the 1886 * progress of a multithreaded ioctl. 
	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work.  It may interfere with other modules'
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	minor_t		mnum;
	set_t		setno;
	mdi_unit_t	*ui;

	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
		return (ENXIO);

	setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL))
		return (ENXIO);


	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
		return (ENXIO);

	if (md_ops[ui->ui_opsindex]->md_dump != NULL)
		return ((*md_ops[ui->ui_opsindex]->md_dump)
		    (dev, addr, blkno, nblk));

	return (ENXIO);
}

/*
 * Metadevice unit number dispatcher
 * When this routine is called it will scan the
 * incore unit array and return the available slot,
 * hence the unit number, to the caller
 *
 * Return -1 if there is nothing available
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	un, start;

	/*
	 * If nothing available
	 */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);
	start = un = md_set[setno].s_un_next;

	/* LINTED: E_CONSTANT_CONDITION */
	while (1) {
		if (md_set[setno].s_un[un] == NULL) {
			/*
			 * Advance the starting index for the next
			 * md_get_nextunit call
			 */
			if (un == MD_MAXUNITS - 1) {
				md_set[setno].s_un_next = 0;
			} else {
				md_set[setno].s_un_next = un + 1;
			}
			break;
		}

		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);

		if (un == start) {
			un = MD_UNITBAD;
			break;
		}

	}

	mutex_exit(&md_mx);
	return (un);
}
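
/*
 * Usage sketch (illustrative, hypothetical caller): a unit-creation
 * path would typically ask for the next free slot and validate it:
 *
 *	unit_t un = md_get_nextunit(setno);
 *	if (un == MD_UNITBAD)
 *		return (ENOSPC);	hypothetical error choice
 *
 * The s_un_avail/s_un_next accounting maintained in md_snarf_db_set()
 * keeps this scan short in the common case.
 */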