/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ldom.h" #include "ldmsvcs_utils.h" #define MD_STR_PLATFORM "platform" #define MD_STR_DOM_CAPABLE "domaining-enabled" static int ldom_ldmd_is_up = 0; /* assume stays up if ever seen up */ static void *ldom_dl_hp = (void *)NULL; static const char *ldom_dl_path = "libpri.so.1"; static int ldom_dl_mode = (RTLD_NOW | RTLD_LOCAL); static pthread_mutex_t ldom_pri_lock = PTHREAD_MUTEX_INITIALIZER; static int ldom_pri_ref_cnt = 0; /* num of outstanding ldom_pri_init()s */ static int ldom_pri_init_done = 0; /* bool for real pri_init() done */ static int (*ldom_pri_fp_init)(void) = (int (*)(void))NULL; static void (*ldom_pri_fp_fini)(void) = (void (*)(void))NULL; static ssize_t (*ldom_pri_fp_get)(uint8_t wait, uint64_t *token, uint64_t **buf, void *(*allocp)(size_t), void (*freep)(void *, size_t)) = (ssize_t (*)(uint8_t wait, uint64_t *token, uint64_t **buf, void *(*allocp)(size_t), void (*freep)(void *, size_t)))NULL; static void ldom_pri_config(void) { char isa[MAXNAMELEN]; /* used to see if machine is sun4v */ if (sysinfo(SI_MACHINE, isa, MAXNAMELEN) < 0) return; if (strcmp(isa, "sun4v") != 0) return; if ((ldom_dl_hp = dlopen(ldom_dl_path, ldom_dl_mode)) == NULL) return; ldom_pri_fp_init = (int (*)(void))dlsym(ldom_dl_hp, "pri_init"); ldom_pri_fp_fini = (void (*)(void))dlsym(ldom_dl_hp, "pri_fini"); ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token, uint64_t **buf, void *(*allocp)(size_t), void (*freep)(void *, size_t)))dlsym(ldom_dl_hp, "pri_get"); } static void ldom_pri_unconfig(void) { if (ldom_dl_hp == NULL) return; ldom_pri_fp_init = (int (*)(void))NULL; ldom_pri_fp_fini = (void (*)(void))NULL; ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token, uint64_t **buf, void *(*allocp)(size_t), void (*freep)(void *, size_t)))NULL; (void) dlclose(ldom_dl_hp); ldom_dl_hp = (void *)NULL; } /* * ldom_pri_lock is assumed already held by anyone accessing ldom_pri_ref_cnt */ static int ldom_pri_init(void) { if (ldom_pri_ref_cnt == 0) { ldom_pri_config(); /* * ldom_pri_init() is called before we know whether we * have LDOMS FW or not; defer calling pri_init() via * ldom_pri_fp_init until the first time we try to * actually get a PRI */ } ldom_pri_ref_cnt++; assert(ldom_pri_ref_cnt > 0); return (0); } static void ldom_pri_fini(void) { assert(ldom_pri_ref_cnt > 0); ldom_pri_ref_cnt--; if (ldom_pri_ref_cnt == 0) { if (ldom_pri_init_done && (ldom_pri_fp_fini != NULL)) { (*ldom_pri_fp_fini)(); ldom_pri_init_done = 0; } ldom_pri_unconfig(); } } static ssize_t ldom_pri_get(uint8_t wait, uint64_t *token, uint64_t **buf, void *(*allocp)(size_t), void (*freep)(void *, size_t)) { assert(ldom_pri_ref_cnt > 0); if ((!ldom_pri_init_done) && (ldom_pri_fp_init != NULL)) { if ((*ldom_pri_fp_init)() < 0) return (-1); ldom_pri_init_done = 1; } if (ldom_pri_fp_get != NULL) return ((*ldom_pri_fp_get)(wait, token, buf, allocp, freep)); else return (-1); } static ssize_t get_local_core_md(ldom_hdl_t *lhp, uint64_t **buf) { int fh; size_t size; uint64_t *bufp; if ((fh = open("/devices/pseudo/mdesc@0:mdesc", O_RDONLY, 0)) < 0) return (-1); if (ioctl(fh, MDESCIOCGSZ, &size) < 0) { (void) close(fh); return (-1); } bufp = (uint64_t *)lhp->allocp(size); if (read(fh, bufp, size) < 0) { lhp->freep(bufp, size); (void) close(fh); return (-1); } (void) close(fh); *buf = bufp; return ((ssize_t)size); } static int get_local_md_prop_value(ldom_hdl_t *lhp, char *node, char *prop, uint64_t *val) { int rc = 1; uint64_t *bufp; ssize_t bufsiz; if ((bufsiz = get_local_core_md(lhp, &bufp)) > 0) { md_t *mdp; if (mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) { int num_nodes; mde_cookie_t *listp; num_nodes = md_node_count(mdp); listp = lhp->allocp(sizeof (mde_cookie_t) * num_nodes); if (md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, node), md_find_name(mdp, "fwd"), listp) > 0 && md_get_prop_val(mdp, listp[0], prop, val) >= 0) { /* found the property */ rc = 0; } lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes); (void) md_fini(mdp); } lhp->freep(bufp, bufsiz); } return (rc); } static int ldom_getinfo(struct ldom_hdl *lhp) { static pthread_mutex_t mt = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; static int major_version = -1; static int service_ldom = -1; static int busy_init = 0; int ier, rc = 0; uint64_t domain_capable; (void) pthread_mutex_lock(&mt); while (busy_init == 1) (void) pthread_cond_wait(&cv, &mt); if (major_version != -1 && service_ldom != -1) { lhp->major_version = major_version; lhp->service_ldom = service_ldom; (void) pthread_mutex_unlock(&mt); return (0); } /* * get to this point if major_version and service_ldom have not yet * been determined */ busy_init = 1; (void) pthread_mutex_unlock(&mt); /* * set defaults which correspond to the case of "LDOMS not * available". note that these can (and will) also apply to * non-sun4v machines. */ major_version = 0; service_ldom = 0; if (get_local_md_prop_value(lhp, MD_STR_PLATFORM, MD_STR_DOM_CAPABLE, &domain_capable) == 0) { /* * LDOMS capable FW is installed; it should be ok to * try to communicate with ldmd and if that fails/timesout * then use libpri */ major_version = 1; if ((ier = ldmsvcs_check_channel()) == 0) { /* * control ldom * ldmfma channel between FMA and ldmd only exists * on the control domain. */ service_ldom = 1; } else if (ier == 1) { /* * guest ldom * non-control ldom such as guest and io service ldom */ service_ldom = 0; } } (void) pthread_mutex_lock(&mt); lhp->major_version = major_version; lhp->service_ldom = service_ldom; busy_init = 0; (void) pthread_mutex_unlock(&mt); (void) pthread_cond_broadcast(&cv); return (rc); } /* * search the machine description for a "pid" entry (physical cpuid) and * return the corresponding "id" entry (virtual cpuid). * return -1 if not found. * if the pid property does not exist in a cpu node, assume pid = id. */ static processorid_t cpu_phys2virt(ldom_hdl_t *lhp, uint32_t cpuid) { char isa[MAXNAMELEN]; md_t *mdp; mde_cookie_t *listp; ssize_t bufsize; processorid_t vid; uint64_t *bufp; uint64_t pval, pid, id; int num_nodes, ncpus, i; (void) sysinfo(SI_MACHINE, isa, MAXNAMELEN); if (strcmp(isa, "sun4v") != 0) return ((processorid_t)cpuid); /* * convert the physical cpuid to a virtual cpuid */ if ((bufsize = get_local_core_md(lhp, &bufp)) < 1) return (-1); if ((mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) == NULL || (num_nodes = md_node_count(mdp)) < 1) { lhp->freep(bufp, bufsize); return (-1); } listp = (mde_cookie_t *)lhp->allocp(sizeof (mde_cookie_t) * num_nodes); ncpus = md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "cpu"), md_find_name(mdp, "fwd"), listp); vid = -1; for (i = 0; i < ncpus; i++) { if (md_get_prop_val(mdp, listp[i], "id", &pval) < 0) pval = (uint64_t)-1; id = pval; /* if pid does not exist, assume pid=id */ if (md_get_prop_val(mdp, listp[i], "pid", &pval) < 0) pval = id; pid = pval; if (pid == (uint64_t)cpuid) { /* Found the entry */ vid = (processorid_t)id; break; } } lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes); (void) md_fini(mdp); lhp->freep(bufp, bufsize); return (vid); } /* * if checking for status of a retired page: * 0 - page is retired * EAGAIN - page is scheduled for retirement * EIO - page not scheduled for retirement * EINVAL - error * * if retiring a page: * 0 - success in retiring page * EIO - page is already retired * EAGAIN - page is scheduled for retirement * EINVAL - error * * the original decoder for ioctl() return values is * http://fma.eng/documents/engineering/cpumem/page_retire_api.txt */ static int os_mem_page_retire(ldom_hdl_t *lhp, int cmd, nvlist_t *nvl) { mem_page_t mpage; char *fmribuf; size_t fmrisz; int fd, rc, err; if (cmd != MEM_PAGE_RETIRE && cmd != MEM_PAGE_FMRI_RETIRE && cmd != MEM_PAGE_ISRETIRED && cmd != MEM_PAGE_FMRI_ISRETIRED && cmd != MEM_PAGE_UNRETIRE && cmd != MEM_PAGE_FMRI_UNRETIRE) return (EINVAL); if ((fd = open("/dev/mem", O_RDONLY)) < 0) return (EINVAL); if ((errno = nvlist_size(nvl, &fmrisz, NV_ENCODE_NATIVE)) != 0 || fmrisz > MEM_FMRI_MAX_BUFSIZE || (fmribuf = lhp->allocp(fmrisz)) == NULL) { (void) close(fd); return (EINVAL); } if ((errno = nvlist_pack(nvl, &fmribuf, &fmrisz, NV_ENCODE_NATIVE, 0)) != 0) { lhp->freep(fmribuf, fmrisz); (void) close(fd); return (EINVAL); } mpage.m_fmri = fmribuf; mpage.m_fmrisz = fmrisz; rc = ioctl(fd, cmd, &mpage); err = errno; lhp->freep(fmribuf, fmrisz); (void) close(fd); if (rc < 0) { rc = err; } return (rc); } int ldom_fmri_status(ldom_hdl_t *lhp, nvlist_t *nvl) { char *name; int ret = ENOTSUP; if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) return (EINVAL); /* * ldom_ldmd_is_up can only be true if ldom_major_version() * returned 1 earlier; the major version is constant for the * life of the client process */ if (!ldom_ldmd_is_up) { /* Zeus is unavail; use local routines for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { processorid_t vid; uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1) return (p_online(vid, P_STATUS)); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { return (os_mem_page_retire(lhp, MEM_PAGE_FMRI_ISRETIRED, nvl)); } return (EINVAL); } else { /* Zeus is avail; use Zeus for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0) ret = ldmsvcs_cpu_req_status(lhp, cpuid); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { uint64_t pa; if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) == 0) ret = ldmsvcs_mem_req_status(lhp, pa); else ret = EINVAL; } return (ret); } } int ldom_fmri_retire(ldom_hdl_t *lhp, nvlist_t *nvl) { char *name; int ret = ENOTSUP; if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) return (EINVAL); /* * ldom_ldmd_is_up can only be true if ldom_major_version() * returned 1 earlier; the major version is constant for the * life of the client process */ if (!ldom_ldmd_is_up) { /* Zeus is unavail; use local routines for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { processorid_t vid; uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1) return (p_online(vid, P_FAULTED)); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { return (os_mem_page_retire(lhp, MEM_PAGE_FMRI_RETIRE, nvl)); } return (EINVAL); } else { /* Zeus is avail; use Zeus for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0) ret = ldmsvcs_cpu_req_offline(lhp, cpuid); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { uint64_t pa; if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) == 0) ret = ldmsvcs_mem_req_retire(lhp, pa); else ret = EINVAL; } return (ret); } } int ldom_fmri_unretire(ldom_hdl_t *lhp, nvlist_t *nvl) { char *name; int ret = ENOTSUP; if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) return (EINVAL); /* * ldom_ldmd_is_up can only be true if ldom_major_version() * returned 1 earlier; the major version is constant for the * life of the client process */ if (!ldom_ldmd_is_up) { /* Zeus is unavail; use local routines for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { processorid_t vid; uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1) return (p_online(vid, P_ONLINE)); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { return (os_mem_page_retire(lhp, MEM_PAGE_FMRI_UNRETIRE, nvl)); } return (EINVAL); } else { /* Zeus is avail; use Zeus for status/retire */ if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { uint32_t cpuid; if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) == 0) ret = ldmsvcs_cpu_req_online(lhp, cpuid); } else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) { uint64_t pa; if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) == 0) ret = ldmsvcs_mem_req_unretire(lhp, pa); else ret = EINVAL; } return (ret); } } static int fmri_blacklist(ldom_hdl_t *lhp, nvlist_t *nvl, int cmd) { char *name; if (ldom_major_version(lhp) != 0) return (0); if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0) return (EINVAL); if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) { bl_req_t blr; char *class; int fd, rc, err; if ((nvlist_lookup_string(nvl, FM_CLASS, &class) != 0) || (class == NULL) || (*class == '\0')) return (EINVAL); if ((fd = open("/dev/bl", O_RDONLY)) < 0) return (EIO); if (nvlist_size(nvl, &blr.bl_fmrisz, NV_ENCODE_NATIVE) != 0 || blr.bl_fmrisz == 0 || (blr.bl_fmri = (caddr_t)lhp->allocp(blr.bl_fmrisz)) == NULL) { (void) close(fd); return (EINVAL); } blr.bl_class = class; rc = ioctl(fd, cmd, &blr); err = errno; lhp->freep((void *)&blr.bl_fmri, blr.bl_fmrisz); (void) close(fd); if (rc < 0 && err != ENOTSUP) { errno = err; return (-1); } } return (0); } /* * blacklist cpus in a non-LDOMS environment */ int ldom_fmri_blacklist(ldom_hdl_t *lhp, nvlist_t *nvl) { return (fmri_blacklist(lhp, nvl, BLIOC_INSERT)); } /* * unblacklist cpus */ int ldom_fmri_unblacklist(ldom_hdl_t *lhp, nvlist_t *nvl) { return (fmri_blacklist(lhp, nvl, BLIOC_DELETE)); } ssize_t ldom_get_core_md(ldom_hdl_t *lhp, uint64_t **buf) { ssize_t rv; /* return value */ uint64_t tok; /* opaque PRI token */ switch (ldom_major_version(lhp)) { case 0: /* pre LDOMS */ rv = get_local_core_md(lhp, buf); break; case 1: /* LDOMS 1.0 - Zeus and libpri usable only on service dom */ if (ldom_on_service(lhp) == 1) { if ((rv = ldmsvcs_get_core_md(lhp, buf)) < 1) { (void) pthread_mutex_lock(&ldom_pri_lock); rv = ldom_pri_get(PRI_GET, &tok, buf, lhp->allocp, lhp->freep); (void) pthread_mutex_unlock(&ldom_pri_lock); } else { ldom_ldmd_is_up = 1; } } else { rv = get_local_core_md(lhp, buf); } break; default: rv = -1; break; } return (rv); } /* * version 0 means no LDOMS */ int ldom_major_version(ldom_hdl_t *lhp) { if (lhp == NULL) return (-1); if (ldom_getinfo(lhp) == 0) return (lhp->major_version); else return (0); } /* * in the absence of ldoms we are on a single OS instance which is the * equivalent of the service ldom */ int ldom_on_service(ldom_hdl_t *lhp) { if (lhp == NULL) return (-1); if (ldom_getinfo(lhp) == 0) return (lhp->service_ldom); else return (1); } ldom_hdl_t * ldom_init(void *(*allocp)(size_t size), void (*freep)(void *addr, size_t size)) { struct ldom_hdl *lhp; (void) pthread_mutex_lock(&ldom_pri_lock); if (ldom_pri_init() < 0) { (void) pthread_mutex_unlock(&ldom_pri_lock); return (NULL); } if ((lhp = allocp(sizeof (struct ldom_hdl))) == NULL) { ldom_pri_fini(); (void) pthread_mutex_unlock(&ldom_pri_lock); return (NULL); } (void) pthread_mutex_unlock(&ldom_pri_lock); lhp->major_version = -1; /* version not yet determined */ lhp->allocp = allocp; lhp->freep = freep; ldmsvcs_init(lhp); return (lhp); } void ldom_fini(ldom_hdl_t *lhp) { if (lhp == NULL) return; ldmsvcs_fini(lhp); lhp->freep(lhp, sizeof (struct ldom_hdl)); (void) pthread_mutex_lock(&ldom_pri_lock); ldom_pri_fini(); (void) pthread_mutex_unlock(&ldom_pri_lock); } /* end file */