1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <unistd.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <fcntl.h> 33 #include <pthread.h> 34 #include <errno.h> 35 #include <libnvpair.h> 36 #include <dlfcn.h> 37 #include <link.h> 38 #include <assert.h> 39 40 #include <sys/processor.h> 41 #include <sys/stat.h> 42 #include <sys/mdesc.h> 43 #include <sys/param.h> 44 #include <sys/systeminfo.h> 45 #include <sys/mem.h> 46 #include <sys/bl.h> 47 #include <sys/fm/protocol.h> 48 #include <fm/fmd_fmri.h> 49 #include <sys/pri.h> 50 51 #include "ldom.h" 52 #include "ldmsvcs_utils.h" 53 54 #define MD_STR_PLATFORM "platform" 55 #define MD_STR_DOM_CAPABLE "domaining-enabled" 56 57 static int ldom_ldmd_is_up = 0; /* assume stays up if ever seen up */ 58 59 static void *ldom_dl_hp = (void *)NULL; 60 static const char *ldom_dl_path = "libpri.so.1"; 61 static int ldom_dl_mode = (RTLD_NOW | RTLD_LOCAL); 62 63 static pthread_mutex_t ldom_pri_lock = PTHREAD_MUTEX_INITIALIZER; 64 static int ldom_pri_ref_cnt = 0; /* num of outstanding ldom_pri_init()s */ 65 static int ldom_pri_init_done = 0; /* bool for real pri_init() done */ 66 static int (*ldom_pri_fp_init)(void) = (int (*)(void))NULL; 67 static void (*ldom_pri_fp_fini)(void) = (void (*)(void))NULL; 68 static ssize_t (*ldom_pri_fp_get)(uint8_t wait, uint64_t *token, uint64_t **buf, 69 void *(*allocp)(size_t), void (*freep)(void *, size_t)) = 70 (ssize_t (*)(uint8_t wait, uint64_t *token, uint64_t **buf, 71 void *(*allocp)(size_t), void (*freep)(void *, size_t)))NULL; 72 73 static void 74 ldom_pri_config(void) 75 { 76 char isa[MAXNAMELEN]; /* used to see if machine is sun4v */ 77 78 if (sysinfo(SI_MACHINE, isa, MAXNAMELEN) < 0) 79 return; 80 if (strcmp(isa, "sun4v") != 0) 81 return; 82 if ((ldom_dl_hp = dlopen(ldom_dl_path, ldom_dl_mode)) == NULL) 83 return; 84 85 ldom_pri_fp_init = (int (*)(void))dlsym(ldom_dl_hp, "pri_init"); 86 ldom_pri_fp_fini = (void (*)(void))dlsym(ldom_dl_hp, "pri_fini"); 87 ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token, 88 uint64_t **buf, void *(*allocp)(size_t), 89 void (*freep)(void *, size_t)))dlsym(ldom_dl_hp, "pri_get"); 90 } 91 92 static void 93 ldom_pri_unconfig(void) 94 { 95 if (ldom_dl_hp == NULL) 96 return; 97 98 ldom_pri_fp_init = (int (*)(void))NULL; 99 ldom_pri_fp_fini = (void (*)(void))NULL; 100 ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token, 101 uint64_t **buf, void *(*allocp)(size_t), 102 void (*freep)(void *, size_t)))NULL; 103 (void) dlclose(ldom_dl_hp); 104 ldom_dl_hp = (void *)NULL; 105 } 106 107 /* 108 * ldom_pri_lock is assumed already held by anyone accessing ldom_pri_ref_cnt 109 */ 110 111 static int 112 ldom_pri_init(void) 113 { 114 if (ldom_pri_ref_cnt == 0) { 115 ldom_pri_config(); 116 /* 117 * ldom_pri_init() is called before we know whether we 118 * have LDOMS FW or not; defer calling pri_init() via 119 * ldom_pri_fp_init until the first time we try to 120 * actually get a PRI 121 */ 122 } 123 ldom_pri_ref_cnt++; 124 125 assert(ldom_pri_ref_cnt > 0); 126 127 return (0); 128 } 129 130 static void 131 ldom_pri_fini(void) 132 { 133 assert(ldom_pri_ref_cnt > 0); 134 135 ldom_pri_ref_cnt--; 136 if (ldom_pri_ref_cnt == 0) { 137 if (ldom_pri_init_done && (ldom_pri_fp_fini != NULL)) { 138 (*ldom_pri_fp_fini)(); 139 ldom_pri_init_done = 0; 140 } 141 ldom_pri_unconfig(); 142 } 143 } 144 145 static ssize_t 146 ldom_pri_get(uint8_t wait, uint64_t *token, uint64_t **buf, 147 void *(*allocp)(size_t), void (*freep)(void *, size_t)) 148 { 149 assert(ldom_pri_ref_cnt > 0); 150 151 if ((!ldom_pri_init_done) && (ldom_pri_fp_init != NULL)) { 152 if ((*ldom_pri_fp_init)() < 0) 153 return (-1); 154 ldom_pri_init_done = 1; 155 } 156 157 if (ldom_pri_fp_get != NULL) 158 return ((*ldom_pri_fp_get)(wait, token, buf, allocp, freep)); 159 else 160 return (-1); 161 } 162 163 static ssize_t 164 get_local_core_md(ldom_hdl_t *lhp, uint64_t **buf) 165 { 166 int fh; 167 size_t size; 168 uint64_t *bufp; 169 170 if ((fh = open("/devices/pseudo/mdesc@0:mdesc", O_RDONLY, 0)) < 0) 171 return (-1); 172 173 if (ioctl(fh, MDESCIOCGSZ, &size) < 0) { 174 (void) close(fh); 175 return (-1); 176 } 177 178 bufp = (uint64_t *)lhp->allocp(size); 179 180 if (read(fh, bufp, size) < 0) { 181 lhp->freep(bufp, size); 182 (void) close(fh); 183 return (-1); 184 } 185 (void) close(fh); 186 187 *buf = bufp; 188 189 return ((ssize_t)size); 190 } 191 192 193 static int 194 get_local_md_prop_value(ldom_hdl_t *lhp, char *node, char *prop, uint64_t *val) 195 { 196 int rc = 1; 197 uint64_t *bufp; 198 ssize_t bufsiz; 199 200 if ((bufsiz = get_local_core_md(lhp, &bufp)) > 0) { 201 md_t *mdp; 202 203 if (mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) { 204 int num_nodes; 205 mde_cookie_t *listp; 206 207 num_nodes = md_node_count(mdp); 208 listp = lhp->allocp(sizeof (mde_cookie_t) * num_nodes); 209 210 if (md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, 211 md_find_name(mdp, node), 212 md_find_name(mdp, "fwd"), listp) > 0 && 213 md_get_prop_val(mdp, listp[0], prop, val) >= 0) { 214 /* found the property */ 215 rc = 0; 216 } 217 218 lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes); 219 (void) md_fini(mdp); 220 } 221 lhp->freep(bufp, bufsiz); 222 } 223 return (rc); 224 } 225 226 static int 227 ldom_getinfo(struct ldom_hdl *lhp) 228 { 229 static pthread_mutex_t mt = PTHREAD_MUTEX_INITIALIZER; 230 static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; 231 static int major_version = -1; 232 static int service_ldom = -1; 233 static int busy_init = 0; 234 235 int ier, rc = 0; 236 uint64_t domain_capable; 237 238 (void) pthread_mutex_lock(&mt); 239 240 while (busy_init == 1) 241 (void) pthread_cond_wait(&cv, &mt); 242 243 if (major_version != -1 && service_ldom != -1) { 244 lhp->major_version = major_version; 245 lhp->service_ldom = service_ldom; 246 (void) pthread_mutex_unlock(&mt); 247 return (0); 248 } 249 250 /* 251 * get to this point if major_version and service_ldom have not yet 252 * been determined 253 */ 254 busy_init = 1; 255 (void) pthread_mutex_unlock(&mt); 256 257 /* 258 * set defaults which correspond to the case of "LDOMS not 259 * available". note that these can (and will) also apply to 260 * non-sun4v machines. 261 */ 262 major_version = 0; 263 service_ldom = 0; 264 265 if (get_local_md_prop_value(lhp, MD_STR_PLATFORM, MD_STR_DOM_CAPABLE, 266 &domain_capable) == 0) { 267 268 /* 269 * LDOMS capable FW is installed; it should be ok to 270 * try to communicate with ldmd and if that fails/timesout 271 * then use libpri 272 */ 273 major_version = 1; 274 275 if ((ier = ldmsvcs_check_channel()) == 0) { 276 /* 277 * control ldom 278 * ldmfma channel between FMA and ldmd only exists 279 * on the control domain. 280 */ 281 service_ldom = 1; 282 } else if (ier == 1) { 283 /* 284 * guest ldom 285 * non-control ldom such as guest and io service ldom 286 */ 287 service_ldom = 0; 288 } 289 } 290 291 (void) pthread_mutex_lock(&mt); 292 lhp->major_version = major_version; 293 lhp->service_ldom = service_ldom; 294 busy_init = 0; 295 (void) pthread_mutex_unlock(&mt); 296 297 (void) pthread_cond_broadcast(&cv); 298 299 return (rc); 300 } 301 302 303 /* 304 * search the machine description for a "pid" entry (physical cpuid) and 305 * return the corresponding "id" entry (virtual cpuid). 306 * return -1 if not found. 307 * if the pid property does not exist in a cpu node, assume pid = id. 308 */ 309 static processorid_t 310 cpu_phys2virt(ldom_hdl_t *lhp, uint32_t cpuid) 311 { 312 char isa[MAXNAMELEN]; 313 md_t *mdp; 314 mde_cookie_t *listp; 315 ssize_t bufsize; 316 processorid_t vid; 317 uint64_t *bufp; 318 uint64_t pval, pid, id; 319 int num_nodes, ncpus, i; 320 321 (void) sysinfo(SI_MACHINE, isa, MAXNAMELEN); 322 323 if (strcmp(isa, "sun4v") != 0) 324 return ((processorid_t)cpuid); 325 326 /* 327 * convert the physical cpuid to a virtual cpuid 328 */ 329 if ((bufsize = get_local_core_md(lhp, &bufp)) < 1) 330 return (-1); 331 332 if ((mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) == NULL || 333 (num_nodes = md_node_count(mdp)) < 1) { 334 lhp->freep(bufp, bufsize); 335 return (-1); 336 } 337 338 listp = (mde_cookie_t *)lhp->allocp(sizeof (mde_cookie_t) * num_nodes); 339 ncpus = md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, 340 md_find_name(mdp, "cpu"), md_find_name(mdp, "fwd"), listp); 341 342 vid = -1; 343 for (i = 0; i < ncpus; i++) { 344 if (md_get_prop_val(mdp, listp[i], "id", &pval) < 0) 345 pval = (uint64_t)-1; 346 id = pval; 347 348 /* if pid does not exist, assume pid=id */ 349 if (md_get_prop_val(mdp, listp[i], "pid", &pval) < 0) 350 pval = id; 351 pid = pval; 352 353 if (pid == (uint64_t)cpuid) { 354 /* Found the entry */ 355 vid = (processorid_t)id; 356 break; 357 } 358 } 359 360 lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes); 361 (void) md_fini(mdp); 362 lhp->freep(bufp, bufsize); 363 364 return (vid); 365 } 366 367 /* 368 * if checking for status of a retired page: 369 * 0 - page is retired 370 * EAGAIN - page is scheduled for retirement 371 * EIO - page not scheduled for retirement 372 * EINVAL - error 373 * 374 * if retiring a page: 375 * 0 - success in retiring page 376 * EIO - page is already retired 377 * EAGAIN - page is scheduled for retirement 378 * EINVAL - error 379 * 380 * the original decoder for ioctl() return values is 381 * http://fma.eng/documents/engineering/cpumem/page_retire_api.txt 382 */ 383 static int 384 os_mem_page_retire(ldom_hdl_t *lhp, int cmd, nvlist_t *nvl) 385 { 386 mem_page_t mpage; 387 char *fmribuf; 388 size_t fmrisz; 389 int fd, rc, err; 390 391 if (cmd != MEM_PAGE_RETIRE && cmd != MEM_PAGE_FMRI_RETIRE && 392 cmd != MEM_PAGE_ISRETIRED && cmd != MEM_PAGE_FMRI_ISRETIRED) 393 return (EINVAL); 394 395 if ((fd = open("/dev/mem", O_RDONLY)) < 0) 396 return (EINVAL); 397 398 if ((errno = nvlist_size(nvl, &fmrisz, NV_ENCODE_NATIVE)) != 0 || 399 fmrisz > MEM_FMRI_MAX_BUFSIZE || 400 (fmribuf = lhp->allocp(fmrisz)) == NULL) { 401 (void) close(fd); 402 return (EINVAL); 403 } 404 405 if ((errno = nvlist_pack(nvl, &fmribuf, &fmrisz, 406 NV_ENCODE_NATIVE, 0)) != 0) { 407 lhp->freep(fmribuf, fmrisz); 408 (void) close(fd); 409 return (EINVAL); 410 } 411 412 mpage.m_fmri = fmribuf; 413 mpage.m_fmrisz = fmrisz; 414 415 rc = ioctl(fd, cmd, &mpage); 416 err = errno; 417 418 lhp->freep(fmribuf, fmrisz); 419 (void) close(fd); 420 421 if (rc < 0) { 422 rc = err; 423 } 424 425 return (rc); 426 } 427 428 429 int 430 ldom_fmri_status(ldom_hdl_t *lhp, nvlist_t *nvl) 431 { 432 char *name; 433 int ret = ENOTSUP; 434 435 if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) 436 return (EINVAL); 437 438 /* 439 * ldom_ldmd_is_up can only be true if ldom_major_version() 440 * returned 1 earlier; the major version is constant for the 441 * life of the client process 442 */ 443 444 if (!ldom_ldmd_is_up) { 445 /* Zeus is unavail; use local routines for status/retire */ 446 447 if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { 448 processorid_t vid; 449 uint32_t cpuid; 450 451 if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) 452 == 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1) 453 return (p_online(vid, P_STATUS)); 454 } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { 455 return (os_mem_page_retire(lhp, 456 MEM_PAGE_FMRI_ISRETIRED, nvl)); 457 } 458 459 return (EINVAL); 460 } else { 461 /* Zeus is avail; use Zeus for status/retire */ 462 463 if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { 464 uint32_t cpuid; 465 466 if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, 467 &cpuid) == 0) 468 ret = ldmsvcs_cpu_req_status(lhp, cpuid); 469 } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { 470 uint64_t pa; 471 472 if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, 473 &pa) == 0) 474 ret = ldmsvcs_mem_req_status(lhp, pa); 475 else 476 ret = EINVAL; 477 } 478 return (ret); 479 } 480 } 481 482 483 int 484 ldom_fmri_retire(ldom_hdl_t *lhp, nvlist_t *nvl) 485 { 486 char *name; 487 int ret = ENOTSUP; 488 489 if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) 490 return (EINVAL); 491 492 /* 493 * ldom_ldmd_is_up can only be true if ldom_major_version() 494 * returned 1 earlier; the major version is constant for the 495 * life of the client process 496 */ 497 498 if (!ldom_ldmd_is_up) { 499 /* Zeus is unavail; use local routines for status/retire */ 500 501 if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { 502 processorid_t vid; 503 uint32_t cpuid; 504 505 if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) 506 == 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1) 507 return (p_online(vid, P_FAULTED)); 508 } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { 509 return (os_mem_page_retire(lhp, 510 MEM_PAGE_FMRI_RETIRE, nvl)); 511 } 512 513 return (EINVAL); 514 } else { 515 /* Zeus is avail; use Zeus for status/retire */ 516 517 if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { 518 uint32_t cpuid; 519 520 if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, 521 &cpuid) == 0) 522 ret = ldmsvcs_cpu_req_offline(lhp, cpuid); 523 } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { 524 uint64_t pa; 525 526 if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, 527 &pa) == 0) 528 ret = ldmsvcs_mem_req_retire(lhp, pa); 529 else 530 ret = EINVAL; 531 } 532 return (ret); 533 } 534 } 535 536 537 /* 538 * blacklist cpus in a non-LDOMS environment 539 */ 540 int 541 ldom_fmri_blacklist(ldom_hdl_t *lhp, nvlist_t *nvl) 542 { 543 char *name; 544 545 if (ldom_major_version(lhp) != 0) 546 return (0); 547 548 if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) 549 return (EINVAL); 550 551 if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { 552 bl_req_t blr; 553 char *class; 554 int fd, rc, err; 555 556 if ((nvlist_lookup_string(nvl, FM_CLASS, &class) != 0) || 557 (class == NULL) || (*class == '\0')) 558 return (EINVAL); 559 560 if ((fd = open("/dev/bl", O_RDONLY)) < 0) 561 return (EIO); 562 563 if (nvlist_size(nvl, &blr.bl_fmrisz, NV_ENCODE_NATIVE) != 0 || 564 blr.bl_fmrisz == 0 || 565 (blr.bl_fmri = (caddr_t)lhp->allocp(blr.bl_fmrisz)) == 566 NULL) { 567 (void) close(fd); 568 return (EINVAL); 569 } 570 571 blr.bl_class = class; 572 573 rc = ioctl(fd, BLIOC_INSERT, &blr); 574 err = errno; 575 576 lhp->freep((void *)&blr.bl_fmri, blr.bl_fmrisz); 577 (void) close(fd); 578 579 if (rc < 0 && err != ENOTSUP) { 580 errno = err; 581 return (-1); 582 } 583 } 584 585 return (0); 586 } 587 588 589 ssize_t 590 ldom_get_core_md(ldom_hdl_t *lhp, uint64_t **buf) 591 { 592 ssize_t rv; /* return value */ 593 uint64_t tok; /* opaque PRI token */ 594 595 switch (ldom_major_version(lhp)) { 596 case 0: 597 /* pre LDOMS */ 598 rv = get_local_core_md(lhp, buf); 599 break; 600 case 1: 601 /* LDOMS 1.0 - Zeus and libpri usable only on service dom */ 602 if (ldom_on_service(lhp) == 1) { 603 if ((rv = ldmsvcs_get_core_md(lhp, buf)) < 1) { 604 (void) pthread_mutex_lock(&ldom_pri_lock); 605 rv = ldom_pri_get(PRI_GET, &tok, 606 buf, lhp->allocp, lhp->freep); 607 (void) pthread_mutex_unlock(&ldom_pri_lock); 608 } else { 609 ldom_ldmd_is_up = 1; 610 } 611 } else { 612 rv = get_local_core_md(lhp, buf); 613 } 614 break; 615 default: 616 rv = -1; 617 break; 618 } 619 620 return (rv); 621 } 622 623 /* 624 * version 0 means no LDOMS 625 */ 626 int 627 ldom_major_version(ldom_hdl_t *lhp) 628 { 629 if (lhp == NULL) 630 return (-1); 631 632 if (ldom_getinfo(lhp) == 0) 633 return (lhp->major_version); 634 else 635 return (0); 636 } 637 638 /* 639 * in the absence of ldoms we are on a single OS instance which is the 640 * equivalent of the service ldom 641 */ 642 int 643 ldom_on_service(ldom_hdl_t *lhp) 644 { 645 if (lhp == NULL) 646 return (-1); 647 648 if (ldom_getinfo(lhp) == 0) 649 return (lhp->service_ldom); 650 else 651 return (1); 652 } 653 654 655 ldom_hdl_t * 656 ldom_init(void *(*allocp)(size_t size), 657 void (*freep)(void *addr, size_t size)) 658 { 659 struct ldom_hdl *lhp; 660 661 (void) pthread_mutex_lock(&ldom_pri_lock); 662 663 if (ldom_pri_init() < 0) { 664 (void) pthread_mutex_unlock(&ldom_pri_lock); 665 return (NULL); 666 } 667 668 if ((lhp = allocp(sizeof (struct ldom_hdl))) == NULL) { 669 ldom_pri_fini(); 670 (void) pthread_mutex_unlock(&ldom_pri_lock); 671 return (NULL); 672 } 673 674 (void) pthread_mutex_unlock(&ldom_pri_lock); 675 676 lhp->major_version = -1; /* version not yet determined */ 677 lhp->allocp = allocp; 678 lhp->freep = freep; 679 680 ldmsvcs_init(lhp); 681 682 return (lhp); 683 } 684 685 686 void 687 ldom_fini(ldom_hdl_t *lhp) 688 { 689 if (lhp == NULL) 690 return; 691 692 ldmsvcs_fini(lhp); 693 lhp->freep(lhp, sizeof (struct ldom_hdl)); 694 695 (void) pthread_mutex_lock(&ldom_pri_lock); 696 697 ldom_pri_fini(); 698 699 (void) pthread_mutex_unlock(&ldom_pri_lock); 700 } 701 702 /* end file */ 703