/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_PSET_NAME 1024 /* Taken from PV_NAME_MAX_LEN */ #define ZSD_PSET_UNLIMITED UINT16_MAX #define ZONESTAT_EXACCT_FILE "/var/adm/exacct/zonestat-process" /* * zonestatd implements gathering cpu and memory utilization data for * running zones. It has these components: * * zsd_server: * Door server to respond to client connections. Each client * will connect using libzonestat.so, which will open and * call /var/tmp/.zonestat_door. Each connecting client is given * a file descriptor to the stat server. * * The zsd_server also responds to zoneadmd, which reports when a * new zone is booted. 
This is used to fattach the zsd_server door
 *	into the new zone.
 *
 *   zsd_stat_server:
 *	Receives client requests for the current utilization data. Each
 *	client request will cause zonestatd to update the current utilization
 *	data by kicking the stat_thread.
 *
 *	If the client is in a non-global zone, the utilization data will
 *	be filtered to only show the given zone. The usage by all other zones
 *	will be added to the system utilization.
 *
 *   stat_thread:
 *	The stat thread implements querying the system to determine the
 *	current utilization data for each running zone. This includes
 *	inspecting the system's processor set configuration, as well as details
 *	of each zone, such as their configured limits, and which processor
 *	sets they are running in.
 *
 *	The stat_thread will only update memory utilization data as often as
 *	the configured config/sample_interval on the zones-monitoring service.
 */

/*
 * The private vmusage structure unfortunately uses size_t types, and assumes
 * the caller's bitness matches the kernel's bitness. Since the getvmusage()
 * system call is contracted, and zonestatd is 32 bit, the following structures
 * are used to interact with a 32bit or 64 bit kernel.
 */

/* vmusage result record as laid out with 32-bit counters */
typedef struct zsd_vmusage32 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;

	uint32_t vmu_rss_all;
	uint32_t vmu_rss_private;
	uint32_t vmu_rss_shared;
	uint32_t vmu_swap_all;
	uint32_t vmu_swap_private;
	uint32_t vmu_swap_shared;
} zsd_vmusage32_t;

/* vmusage result record as laid out with 64-bit counters */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;

/* Forward declaration: zsd_pset_usage points at a zone before it is defined */
struct zsd_zone;

/* Used to store a zone's usage of a pset */
typedef struct zsd_pset_usage {
	struct zsd_zone	*zsu_zone;
	struct zsd_pset	*zsu_pset;

	/* Linkage on the owning pset's zsp_usage_list */
	list_node_t	zsu_next;

	zoneid_t	zsu_zoneid;
	boolean_t	zsu_found;	/* zone bound at end of interval */
	boolean_t	zsu_active;	/* zone was bound during interval */
	boolean_t	zsu_new;	/* zone newly bound in this interval */
	boolean_t	zsu_deleted;	/* zone was unbound in this interval */
	boolean_t	zsu_empty;	/* no procs in pset in this interval */
	time_t		zsu_start;	/* time when zone was found in pset */
	hrtime_t	zsu_hrstart;	/* time when zone was found in pset */
	uint64_t	zsu_cpu_shares;
	uint_t		zsu_scheds;	/* schedulers found in this pass */
	timestruc_t	zsu_cpu_usage;	/* cpu time used */
} zsd_pset_usage_t;

/* Used to store a pset's utilization */
typedef struct zsd_pset {
	psetid_t	zsp_id;

	/* Linkage on zsd_ctl's zsctl_psets list */
	list_node_t	zsp_next;
	char		zsp_name[ZS_PSETNAME_MAX];

	uint_t		zsp_cputype;	/* default, dedicated or shared */
	boolean_t	zsp_found;	/* pset found at end of interval */
	boolean_t	zsp_new;	/* pset new in this interval */
	boolean_t	zsp_deleted;	/* pset deleted in this interval */
	boolean_t	zsp_active;	/* pset existed during interval */
	boolean_t	zsp_empty;	/* no processes in pset */
	time_t		zsp_start;
	hrtime_t	zsp_hrstart;

	uint64_t	zsp_online;	/* online cpus in interval */
	uint64_t	zsp_size;	/* size in this interval */
	uint64_t	zsp_min;	/* configured min in this interval */
	uint64_t	zsp_max;	/* configured max in this interval */
	int64_t		zsp_importance;	/* configured importance in interval */

	uint_t		zsp_scheds;	/* scheds of processes found in pset */
	uint64_t	zsp_cpu_shares;	/* total shares in this interval */

	timestruc_t	zsp_total_time;
	timestruc_t	zsp_usage_kern;
	timestruc_t	zsp_usage_zones; /* sum of per-zone cpu time charged */

	/* Individual zone usages of pset */
	list_t		zsp_usage_list;
	int		zsp_nusage;

	/* Summed kstat values from individual cpus in pset */
	timestruc_t	zsp_idle;
	timestruc_t	zsp_intr;
	timestruc_t	zsp_kern;
	timestruc_t	zsp_user;

} zsd_pset_t;

/* Used to track an individual cpu's utilization as reported by kstats */
typedef struct zsd_cpu {
	processorid_t	zsc_id;

	/* Linkage on zsd_ctl's zsctl_cpus list */
	list_node_t	zsc_next;
	psetid_t	zsc_psetid;
	psetid_t	zsc_psetid_prev;
	zsd_pset_t	*zsc_pset;

	boolean_t	zsc_found;	/* cpu online in this interval */
	boolean_t	zsc_onlined;	/* cpu onlined during this interval */
	boolean_t	zsc_offlined;	/* cpu offlined during this interval */
	boolean_t	zsc_active;	/* cpu online during this interval */
	boolean_t	zsc_allocated;	/* True if cpu has ever been found */

	/* kstats this interval */
	uint64_t	zsc_nsec_idle;
	uint64_t	zsc_nsec_intr;
	uint64_t	zsc_nsec_kern;
	uint64_t	zsc_nsec_user;

	/* kstats in most recent interval */
	uint64_t	zsc_nsec_idle_prev;
	uint64_t	zsc_nsec_intr_prev;
	uint64_t	zsc_nsec_kern_prev;
	uint64_t	zsc_nsec_user_prev;

	/* Total kstat increases since zonestatd started reading kstats */
	timestruc_t	zsc_idle;
	timestruc_t	zsc_intr;
	timestruc_t	zsc_kern;
	timestruc_t	zsc_user;

} zsd_cpu_t;

/* Used to describe an individual zone and its utilization */
typedef struct zsd_zone {
	zoneid_t	zsz_id;

	/* Linkage on zsd_ctl's zsctl_zones list */
	list_node_t	zsz_next;
	char		zsz_name[ZS_ZONENAME_MAX];
	uint_t		zsz_cputype;
	uint_t		zsz_iptype;
	time_t		zsz_start;
	hrtime_t	zsz_hrstart;

	char		zsz_pool[ZS_POOLNAME_MAX];
	char		zsz_pset[ZS_PSETNAME_MAX];
	int		zsz_default_sched;
	/* These are deduced by inspecting processes */
	psetid_t	zsz_psetid;
	uint_t		zsz_scheds;

	boolean_t	zsz_new;	/* zone booted during this interval */
	boolean_t	zsz_deleted;	/* halted during this interval */
	boolean_t	zsz_active;	/* running in this interval */
	boolean_t	zsz_empty;	/* no processes in this interval */
	boolean_t	zsz_gone;	/* not installed in this interval */
	boolean_t	zsz_found;	/* Running at end of this interval */

	/* Configured limits */
	uint64_t	zsz_cpu_shares;
	uint64_t	zsz_cpu_cap;
	uint64_t	zsz_ram_cap;
	uint64_t	zsz_locked_cap;
	uint64_t	zsz_vm_cap;

	uint64_t	zsz_cpus_online;
	timestruc_t	zsz_cpu_usage;	/* cpu time used by zone's processes */
	timestruc_t	zsz_cap_time;	/* cpu time of cpu cap */
	timestruc_t	zsz_share_time; /* cpu time of share of cpu */
	timestruc_t	zsz_pset_time;  /* time of all psets zone is bound to */

	uint64_t	zsz_usage_ram;
	uint64_t	zsz_usage_locked;
	uint64_t	zsz_usage_vm;

	/* Configured limits for counted resources */
	uint64_t	zsz_processes_cap;
	uint64_t	zsz_lwps_cap;
	uint64_t	zsz_shm_cap;
	uint64_t	zsz_shmids_cap;
	uint64_t	zsz_semids_cap;
	uint64_t	zsz_msgids_cap;
	uint64_t	zsz_lofi_cap;

	/* Current values for the counted resources above */
	uint64_t	zsz_processes;
	uint64_t	zsz_lwps;
	uint64_t	zsz_shm;
	uint64_t	zsz_shmids;
	uint64_t	zsz_semids;
	uint64_t	zsz_msgids;
	uint64_t	zsz_lofi;

} zsd_zone_t;

/*
 * Used to track the cpu usage of an individual processes.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes.
 * to their zone. As processes exit, their extended accounting records are
 * read and the difference of their total and known usage is charged to their
 * zone.
 *
 * If a process is never seen in /proc, the total usage on its extended
 * accounting record will be charged to its zone.
*/
typedef struct zsd_proc {
	list_node_t	zspr_next;
	pid_t		zspr_ppid;
	psetid_t	zspr_psetid;
	zoneid_t	zspr_zoneid;
	int		zspr_sched;
	timestruc_t	zspr_usage;	/* cpu usage already known/charged */
} zsd_proc_t;

/* Used to track the overall resource usage of the system */
typedef struct zsd_system {

	/* Memory: totals, and the split between kernel and zones */
	uint64_t zss_ram_total;
	uint64_t zss_ram_kern;
	uint64_t zss_ram_zones;

	uint64_t zss_locked_kern;
	uint64_t zss_locked_zones;

	uint64_t zss_vm_total;
	uint64_t zss_vm_kern;
	uint64_t zss_vm_zones;

	uint64_t zss_swap_total;
	uint64_t zss_swap_used;

	/* Cpu time, system wide */
	timestruc_t zss_idle;
	timestruc_t zss_intr;
	timestruc_t zss_kern;
	timestruc_t zss_user;

	timestruc_t zss_cpu_total_time;
	timestruc_t zss_cpu_usage_kern;
	timestruc_t zss_cpu_usage_zones; /* sum of cpu time charged to zones */

	/* System wide maximums for counted resources */
	uint64_t zss_maxpid;
	uint64_t zss_processes_max;
	uint64_t zss_lwps_max;
	uint64_t zss_shm_max;
	uint64_t zss_shmids_max;
	uint64_t zss_semids_max;
	uint64_t zss_msgids_max;
	uint64_t zss_lofi_max;

	/* System wide totals, summed from each zone as it is found */
	uint64_t zss_processes;
	uint64_t zss_lwps;
	uint64_t zss_shm;
	uint64_t zss_shmids;
	uint64_t zss_semids;
	uint64_t zss_msgids;
	uint64_t zss_lofi;

	uint64_t zss_ncpus;
	uint64_t zss_ncpus_online;

} zsd_system_t;

/*
 * A dumping ground for various information and structures used to compute
 * utilization.
 *
 * This structure is used to track the system while clients are connected.
 * When The first client connects, a zsd_ctl is allocated and configured by
 * zsd_open(). When all clients disconnect, the zsd_ctl is closed.
*/
typedef struct zsd_ctl {

	kstat_ctl_t	*zsctl_kstat_ctl;

	/* To track extended accounting */
	int		zsctl_proc_fd;		/* Log currently being used */
	ea_file_t	zsctl_proc_eaf;
	struct stat64	zsctl_proc_stat;
	int		zsctl_proc_open;
	int		zsctl_proc_fd_next;	/* Log file to use next */
	ea_file_t	zsctl_proc_eaf_next;
	struct stat64	zsctl_proc_stat_next;
	int		zsctl_proc_open_next;

	/* pool configuration handle */
	pool_conf_t	*zsctl_pool_conf;
	int		zsctl_pool_status;
	int		zsctl_pool_changed;

	/* The above usage tracking structures */
	zsd_system_t	*zsctl_system;
	list_t		zsctl_zones;
	list_t		zsctl_psets;
	list_t		zsctl_cpus;
	/* Indexed by cpu id; entries are linked onto zsctl_cpus when found */
	zsd_cpu_t	*zsctl_cpu_array;
	zsd_proc_t	*zsctl_proc_array;

	/* Various system info */
	uint64_t	zsctl_maxcpuid;
	uint64_t	zsctl_maxproc;
	uint64_t	zsctl_kern_bits;
	uint64_t	zsctl_pagesize;

	/* Used to track time available under a cpu cap. */
	uint64_t	zsctl_hrtime;
	uint64_t	zsctl_hrtime_prev;
	timestruc_t	zsctl_hrtime_total;

	struct timeval	zsctl_timeofday;

	/* Caches for arrays allocated for use by various system calls */
	psetid_t	*zsctl_pset_cache;
	uint_t		zsctl_pset_ncache;
	processorid_t	*zsctl_cpu_cache;
	uint_t		zsctl_cpu_ncache;
	zoneid_t	*zsctl_zone_cache;
	uint_t		zsctl_zone_ncache;
	struct swaptable *zsctl_swap_cache;
	uint64_t	zsctl_swap_cache_size;
	uint64_t	zsctl_swap_cache_num;
	zsd_vmusage64_t	*zsctl_vmusage_cache;
	uint64_t	zsctl_vmusage_cache_num;

	/*
	 * Info about procfs for scanning /proc
	 * NOTE(review): the member below is an array of pool_value_t pointers,
	 * which does not look procfs related; this comment may be stale.
	 */
	pool_value_t	*zsctl_pool_vals[3];

	/* Counts on tracked entities */
	uint_t		zsctl_nzones;
	uint_t		zsctl_npsets;
	uint_t		zsctl_npset_usages;
} zsd_ctl_t;

/* The single tracking context; see the zsd_ctl description above */
zsd_ctl_t		*g_ctl;
boolean_t		g_open;		/* True if g_ctl is open */
int			g_hasclient;	/* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server. Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
*/ zs_usage_cache_t *g_usage_cache; mutex_t g_usage_cache_lock; cond_t g_usage_cache_kick; uint_t g_usage_cache_kickers; cond_t g_usage_cache_wait; char *g_usage_cache_buf; uint_t g_usage_cache_bufsz; uint64_t g_gen_next; /* fds of door servers */ int g_server_door; int g_stat_door; /* * Starting and current time. Used to throttle memory calculation, and to * mark new zones and psets with their boot and creation time. */ time_t g_now; time_t g_start; hrtime_t g_hrnow; hrtime_t g_hrstart; uint64_t g_interval; /* * main() thread. */ thread_t g_main; /* PRINTFLIKE1 */ static void zsd_warn(const char *fmt, ...) { va_list alist; va_start(alist, fmt); (void) fprintf(stderr, gettext("zonestat: Warning: ")); (void) vfprintf(stderr, fmt, alist); (void) fprintf(stderr, "\n"); va_end(alist); } /* PRINTFLIKE1 */ static void zsd_error(const char *fmt, ...) { va_list alist; va_start(alist, fmt); (void) fprintf(stderr, gettext("zonestat: Error: ")); (void) vfprintf(stderr, fmt, alist); (void) fprintf(stderr, "\n"); va_end(alist); exit(1); } /* Turns on extended accounting if not configured externally */ int zsd_enable_cpu_stats() { char *path = ZONESTAT_EXACCT_FILE; char oldfile[MAXPATHLEN]; int ret, state = AC_ON; ac_res_t res[6]; /* * Start a new accounting file if accounting not configured * externally. 
*/ res[0].ar_id = AC_PROC_PID; res[0].ar_state = AC_ON; res[1].ar_id = AC_PROC_ANCPID; res[1].ar_state = AC_ON; res[2].ar_id = AC_PROC_CPU; res[2].ar_state = AC_ON; res[3].ar_id = AC_PROC_TIME; res[3].ar_state = AC_ON; res[4].ar_id = AC_PROC_ZONENAME; res[4].ar_state = AC_ON; res[5].ar_id = AC_NONE; res[5].ar_state = AC_ON; if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) { zsd_warn(gettext("Unable to set accounting resources")); return (-1); } /* Only set accounting file if none is configured */ ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); if (ret < 0) { (void) unlink(path); if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) { zsd_warn(gettext("Unable to set accounting file")); return (-1); } } if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) { zsd_warn(gettext("Unable to enable accounting")); return (-1); } return (0); } /* Turns off extended accounting if not configured externally */ int zsd_disable_cpu_stats() { char *path = ZONESTAT_EXACCT_FILE; int ret, state = AC_OFF; ac_res_t res[6]; char oldfile[MAXPATHLEN]; /* If accounting file is externally configured, leave it alone */ ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); if (ret == 0 && strcmp(oldfile, path) != 0) return (0); res[0].ar_id = AC_PROC_PID; res[0].ar_state = AC_OFF; res[1].ar_id = AC_PROC_ANCPID; res[1].ar_state = AC_OFF; res[2].ar_id = AC_PROC_CPU; res[2].ar_state = AC_OFF; res[3].ar_id = AC_PROC_TIME; res[3].ar_state = AC_OFF; res[4].ar_id = AC_PROC_ZONENAME; res[4].ar_state = AC_OFF; res[5].ar_id = AC_NONE; res[5].ar_state = AC_OFF; if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) { zsd_warn(gettext("Unable to clear accounting resources")); return (-1); } if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) { zsd_warn(gettext("Unable to clear accounting file")); return (-1); } if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) { zsd_warn(gettext("Unable to diable accounting")); return 
(-1); } (void) unlink(path); return (0); } /* * If not configured externally, deletes the current extended accounting file * and starts a new one. * * Since the stat_thread holds an open handle to the accounting file, it will * read all remaining entries from the old file before switching to * read the new one. */ int zsd_roll_exacct(void) { int ret; char *path = ZONESTAT_EXACCT_FILE; char oldfile[MAXPATHLEN]; /* If accounting file is externally configured, leave it alone */ ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); if (ret == 0 && strcmp(oldfile, path) != 0) return (0); if (unlink(path) != 0) /* Roll it next time */ return (0); if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) { zsd_warn(gettext("Unable to set accounting file")); return (-1); } return (0); } /* Contract stuff for zone_enter() */ int init_template(void) { int fd; int err = 0; fd = open64(CTFS_ROOT "/process/template", O_RDWR); if (fd == -1) return (-1); /* * For now, zoneadmd doesn't do anything with the contract. * Deliver no events, don't inherit, and allow it to be orphaned. 
*/ err |= ct_tmpl_set_critical(fd, 0); err |= ct_tmpl_set_informative(fd, 0); err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR); err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT); if (err || ct_tmpl_activate(fd)) { (void) close(fd); return (-1); } return (fd); } /* * Contract stuff for zone_enter() */ int contract_latest(ctid_t *id) { int cfd, r; ct_stathdl_t st; ctid_t result; if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1) return (errno); if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) { (void) close(cfd); return (r); } result = ct_status_get_id(st); ct_status_free(st); (void) close(cfd); *id = result; return (0); } static int close_on_exec(int fd) { int flags = fcntl(fd, F_GETFD, 0); if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1)) return (0); return (-1); } int contract_open(ctid_t ctid, const char *type, const char *file, int oflag) { char path[PATH_MAX]; int n, fd; if (type == NULL) type = "all"; n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file); if (n >= sizeof (path)) { errno = ENAMETOOLONG; return (-1); } fd = open64(path, oflag); if (fd != -1) { if (close_on_exec(fd) == -1) { int err = errno; (void) close(fd); errno = err; return (-1); } } return (fd); } int contract_abandon_id(ctid_t ctid) { int fd, err; fd = contract_open(ctid, "all", "ctl", O_WRONLY); if (fd == -1) return (errno); err = ct_ctl_abandon(fd); (void) close(fd); return (err); } /* * Attach the zsd_server to a zone. Called for each zone when zonestatd * starts, and for each newly booted zone when zoneadmd contacts the zsd_server * * Zone_enter is used to avoid reaching into zone to fattach door. 
*/
/*
 * zid:		zone to attach into; zone_enter() is skipped for zone id 0.
 * door:	door descriptor to fattach at ZS_DOOR_PATH inside the zone.
 * detach_only:	when true, only remove any existing attachment and file.
 *
 * The work is done in a forked child, which reports its result through its
 * exit status:
 *	0  success (also used when zone_enter() fails with EINVAL)
 *	1  zone_enter() failed
 *	2  the door file could not be created
 *	3  fattach() failed
 * The parent abandons the child's process contract, waits for the child, and
 * prints warnings on failure.  Nothing is returned to the caller.
 */
static void
zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
{
	char *path = ZS_DOOR_PATH;
	int fd, pid, stat, tmpl_fd;
	int wret, code;
	ctid_t ct;

	if ((tmpl_fd = init_template()) == -1) {
		zsd_warn("Unable to init template");
		return;
	}

	pid = forkx(0);
	if (pid < 0) {
		(void) ct_tmpl_clear(tmpl_fd);
		/* Do not leak the template descriptor on fork failure */
		(void) close(tmpl_fd);
		zsd_warn(gettext(
		    "Unable to fork to add zonestat to zoneid %d\n"), zid);
		return;
	}

	if (pid == 0) {
		/* Child: the template is only needed by the parent's fork */
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);
		if (zid != 0 && zone_enter(zid) != 0) {
			/* EINVAL is deliberately not reported as a failure */
			if (errno == EINVAL) {
				_exit(0);
			}
			_exit(1);
		}
		/* Remove any previous attachment before (re)creating it */
		(void) fdetach(path);
		(void) unlink(path);
		if (detach_only)
			_exit(0);
		fd = open(path, O_CREAT|O_RDWR, 0644);
		if (fd < 0)
			_exit(2);
		if (fattach(door, path) != 0)
			_exit(3);
		_exit(0);
	}

	/*
	 * contract_latest() returns 0 on success and a non-zero error value
	 * on failure; it never returns -1.  Testing for -1 left ct
	 * uninitialized on failure, so test for any non-zero result and only
	 * abandon a contract id that was actually obtained.
	 */
	if (contract_latest(&ct) != 0)
		ct = -1;
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);
	if (ct != -1)
		(void) contract_abandon_id(ct);

	/* Retry only when interrupted; any other failure cannot succeed */
	while ((wret = waitpid(pid, &stat, 0)) != pid) {
		if (wret == -1 && errno != EINTR) {
			zsd_warn(gettext(
			    "Unable to attach door to zoneid: %d"), zid);
			return;
		}
	}

	/* Only trust the exit status if the child actually exited */
	code = WIFEXITED(stat) ? WEXITSTATUS(stat) : -1;
	if (code == 0)
		return;

	zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);

	switch (code) {
	case 1:
		zsd_warn(gettext("Cannot entering zone"));
		break;
	case 2:
		zsd_warn(gettext("Unable to create door file: %s"), path);
		break;
	case 3:
		zsd_warn(gettext("Unable to fattach file: %s"), path);
		break;
	default:
		/* Killed by a signal, or an exit status we never produce */
		zsd_warn(gettext("Internal error entering zone: %d"), zid);
		break;
	}
}

/*
 * Zone lookup and allocation functions to manage list of currently running
 * zones.
*/ static zsd_zone_t * zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) { zsd_zone_t *zone; for (zone = list_head(&ctl->zsctl_zones); zone != NULL; zone = list_next(&ctl->zsctl_zones, zone)) { if (strcmp(zone->zsz_name, zonename) == 0) { if (zoneid != -1) zone->zsz_id = zoneid; return (zone); } } return (NULL); } static zsd_zone_t * zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid) { zsd_zone_t *zone; for (zone = list_head(&ctl->zsctl_zones); zone != NULL; zone = list_next(&ctl->zsctl_zones, zone)) { if (zone->zsz_id == zoneid) return (zone); } return (NULL); } static zsd_zone_t * zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) { zsd_zone_t *zone; if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL) return (NULL); (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name)); zone->zsz_id = zoneid; zone->zsz_found = B_FALSE; /* * Allocate as deleted so if not found in first pass, zone is deleted * from list. This can happen if zone is returned by zone_list, but * exits before first attempt to fetch zone details. 
*/ zone->zsz_start = g_now; zone->zsz_hrstart = g_hrnow; zone->zsz_deleted = B_TRUE; zone->zsz_cpu_shares = ZS_LIMIT_NONE; zone->zsz_cpu_cap = ZS_LIMIT_NONE; zone->zsz_ram_cap = ZS_LIMIT_NONE; zone->zsz_locked_cap = ZS_LIMIT_NONE; zone->zsz_vm_cap = ZS_LIMIT_NONE; zone->zsz_processes_cap = ZS_LIMIT_NONE; zone->zsz_lwps_cap = ZS_LIMIT_NONE; zone->zsz_shm_cap = ZS_LIMIT_NONE; zone->zsz_shmids_cap = ZS_LIMIT_NONE; zone->zsz_semids_cap = ZS_LIMIT_NONE; zone->zsz_msgids_cap = ZS_LIMIT_NONE; zone->zsz_lofi_cap = ZS_LIMIT_NONE; ctl->zsctl_nzones++; return (zone); } static zsd_zone_t * zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) { zsd_zone_t *zone, *tmp; if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL) return (zone); if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL) return (NULL); /* Insert sorted by zonename */ tmp = list_head(&ctl->zsctl_zones); while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0) tmp = list_next(&ctl->zsctl_zones, tmp); list_insert_before(&ctl->zsctl_zones, tmp, zone); return (zone); } /* * Mark all zones as not existing. As zones are found, they will * be marked as existing. If a zone is not found, then it must have * halted. */ static void zsd_mark_zones_start(zsd_ctl_t *ctl) { zsd_zone_t *zone; for (zone = list_head(&ctl->zsctl_zones); zone != NULL; zone = list_next(&ctl->zsctl_zones, zone)) { zone->zsz_found = B_FALSE; } } /* * Mark each zone as not using pset. If processes are found using the * pset, the zone will remain bound to the pset. If none of a zones * processes are bound to the pset, the zone's usage of the pset will * be deleted. * */ static void zsd_mark_pset_usage_start(zsd_pset_t *pset) { zsd_pset_usage_t *usage; for (usage = list_head(&pset->zsp_usage_list); usage != NULL; usage = list_next(&pset->zsp_usage_list, usage)) { usage->zsu_found = B_FALSE; usage->zsu_empty = B_TRUE; } } /* * Mark each pset as not existing. If a pset is found, it will be marked * as existing. 
If a pset is not found, it will be deleted.
 */
static void
zsd_mark_psets_start(zsd_ctl_t *ctl)
{
	zsd_pset_t *cur = list_head(&ctl->zsctl_psets);

	while (cur != NULL) {
		cur->zsp_found = B_FALSE;
		/* Reset the per-zone usages hanging off this pset as well */
		zsd_mark_pset_usage_start(cur);
		cur = list_next(&ctl->zsctl_psets, cur);
	}
}

/*
 * Record that a pset exists in this interval and store its current
 * configuration.  The pset is flagged new when it was not active in the
 * previous interval.  Scheduler and share totals are reset here and are
 * rebuilt elsewhere during the interval.
 *
 * Must be called at most once per pset per interval (asserted).
 */
static void
zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
    uint64_t size, uint64_t min, uint64_t max, int64_t importance)
{
	pset->zsp_empty = B_TRUE;
	pset->zsp_deleted = B_FALSE;

	assert(pset->zsp_found == B_FALSE);

	/* New if it was not seen on the previous interval */
	pset->zsp_new = (pset->zsp_active == B_FALSE) ? B_TRUE : B_FALSE;
	pset->zsp_found = B_TRUE;

	/* Current configuration */
	pset->zsp_cputype = type;
	pset->zsp_online = online;
	pset->zsp_size = size;
	pset->zsp_min = min;
	pset->zsp_max = max;
	pset->zsp_importance = importance;

	/* Totals accumulated later in the interval */
	pset->zsp_cpu_shares = 0;
	pset->zsp_scheds = 0;

	pset->zsp_active = B_TRUE;
}

/*
 * A zone's process was found using a pset.  Charge the process to the pset and
 * the per-zone data for the pset.
*/ static void zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched) { zsd_zone_t *zone = usage->zsu_zone; zsd_pset_t *pset = usage->zsu_pset; /* Nothing to do if already found */ if (usage->zsu_found == B_TRUE) goto add_stats; usage->zsu_found = B_TRUE; usage->zsu_empty = B_FALSE; usage->zsu_deleted = B_FALSE; /* update usage flags */ if (usage->zsu_active == B_FALSE) usage->zsu_new = B_TRUE; else usage->zsu_new = B_FALSE; usage->zsu_scheds = 0; usage->zsu_cpu_shares = ZS_LIMIT_NONE; usage->zsu_active = B_TRUE; pset->zsp_empty = B_FALSE; zone->zsz_empty = B_FALSE; add_stats: /* Detect zone's pset id, and if it is bound to multiple psets */ if (zone->zsz_psetid == ZS_PSET_ERROR) zone->zsz_psetid = pset->zsp_id; else if (zone->zsz_psetid != pset->zsp_id) zone->zsz_psetid = ZS_PSET_MULTI; usage->zsu_scheds |= sched; pset->zsp_scheds |= sched; zone->zsz_scheds |= sched; /* Record if FSS is co-habitating with conflicting scheduler */ if ((pset->zsp_scheds & ZS_SCHED_FSS) && usage->zsu_scheds & ( ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) { usage->zsu_scheds |= ZS_SCHED_CONFLICT; pset->zsp_scheds |= ZS_SCHED_CONFLICT; } } /* Add cpu time for a process to a pset, zone, and system totals */ static void zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta) { zsd_system_t *system = ctl->zsctl_system; zsd_zone_t *zone = usage->zsu_zone; zsd_pset_t *pset = usage->zsu_pset; TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta); TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta); TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta); TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta); } /* Determine which processor sets have been deleted */ static void zsd_mark_psets_end(zsd_ctl_t *ctl) { zsd_pset_t *pset, *tmp; /* * Mark pset as not exists, and deleted if it existed * previous interval. 
*/ pset = list_head(&ctl->zsctl_psets); while (pset != NULL) { if (pset->zsp_found == B_FALSE) { pset->zsp_empty = B_TRUE; if (pset->zsp_deleted == B_TRUE) { tmp = pset; pset = list_next(&ctl->zsctl_psets, pset); list_remove(&ctl->zsctl_psets, tmp); free(tmp); ctl->zsctl_npsets--; continue; } else { /* Pset vanished during this interval */ pset->zsp_new = B_FALSE; pset->zsp_deleted = B_TRUE; pset->zsp_active = B_TRUE; } } pset = list_next(&ctl->zsctl_psets, pset); } } /* Determine which zones are no longer bound to processor sets */ static void zsd_mark_pset_usages_end(zsd_ctl_t *ctl) { zsd_pset_t *pset; zsd_zone_t *zone; zsd_pset_usage_t *usage, *tmp; /* * Mark pset as not exists, and deleted if it existed previous * interval. */ for (pset = list_head(&ctl->zsctl_psets); pset != NULL; pset = list_next(&ctl->zsctl_psets, pset)) { usage = list_head(&pset->zsp_usage_list); while (usage != NULL) { /* * Mark pset as not exists, and deleted if it existed * previous interval. */ if (usage->zsu_found == B_FALSE || usage->zsu_zone->zsz_deleted == B_TRUE || usage->zsu_pset->zsp_deleted == B_TRUE) { tmp = usage; usage = list_next(&pset->zsp_usage_list, usage); list_remove(&pset->zsp_usage_list, tmp); free(tmp); pset->zsp_nusage--; ctl->zsctl_npset_usages--; continue; } else { usage->zsu_new = B_FALSE; usage->zsu_deleted = B_TRUE; usage->zsu_active = B_TRUE; } /* Add cpu shares for usages that are in FSS */ zone = usage->zsu_zone; if (usage->zsu_scheds & ZS_SCHED_FSS && zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED && zone->zsz_cpu_shares != 0) { zone = usage->zsu_zone; usage->zsu_cpu_shares = zone->zsz_cpu_shares; pset->zsp_cpu_shares += zone->zsz_cpu_shares; } usage = list_next(&pset->zsp_usage_list, usage); } } } /* A zone has been found. 
Update its information */ static void zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares, uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap, uint64_t vm_cap, uint64_t processes_cap, uint64_t processes, uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm, uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap, uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap, uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype, uint_t iptype) { zsd_system_t *sys = ctl->zsctl_system; assert(zone->zsz_found == B_FALSE); /* * Mark zone as exists, and new if it did not exist in previous * interval. */ zone->zsz_found = B_TRUE; zone->zsz_empty = B_TRUE; zone->zsz_deleted = B_FALSE; /* * Zone is new. Assume zone's properties are the same over entire * interval. */ if (zone->zsz_active == B_FALSE) zone->zsz_new = B_TRUE; else zone->zsz_new = B_FALSE; (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool)); (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset)); zone->zsz_default_sched = sched; /* Schedulers updated later as processes are found */ zone->zsz_scheds = 0; /* Cpus updated later as psets bound are identified */ zone->zsz_cpus_online = 0; zone->zsz_cputype = cputype; zone->zsz_iptype = iptype; zone->zsz_psetid = ZS_PSET_ERROR; zone->zsz_cpu_cap = cpu_cap; zone->zsz_cpu_shares = cpu_shares; zone->zsz_ram_cap = ram_cap; zone->zsz_locked_cap = locked_cap; zone->zsz_vm_cap = vm_cap; zone->zsz_processes_cap = processes_cap; zone->zsz_processes = processes; zone->zsz_lwps_cap = lwps_cap; zone->zsz_lwps = lwps; zone->zsz_shm_cap = shm_cap; zone->zsz_shm = shm; zone->zsz_shmids_cap = shmids_cap; zone->zsz_shmids = shmids; zone->zsz_semids_cap = semids_cap; zone->zsz_semids = semids; zone->zsz_msgids_cap = msgids_cap; zone->zsz_msgids = msgids; zone->zsz_lofi_cap = lofi_cap; zone->zsz_lofi = lofi; sys->zss_processes += processes; sys->zss_lwps += lwps; sys->zss_shm += shm; 
sys->zss_shmids += shmids; sys->zss_semids += semids; sys->zss_msgids += msgids; sys->zss_lofi += lofi; zone->zsz_active = B_TRUE; } /* Determine which zones have halted */ static void zsd_mark_zones_end(zsd_ctl_t *ctl) { zsd_zone_t *zone, *tmp; /* * Mark zone as not existing, or delete if it did not exist in * previous interval. */ zone = list_head(&ctl->zsctl_zones); while (zone != NULL) { if (zone->zsz_found == B_FALSE) { zone->zsz_empty = B_TRUE; if (zone->zsz_deleted == B_TRUE) { /* * Zone deleted in prior interval, * so it no longer exists. */ tmp = zone; zone = list_next(&ctl->zsctl_zones, zone); list_remove(&ctl->zsctl_zones, tmp); free(tmp); ctl->zsctl_nzones--; continue; } else { zone->zsz_new = B_FALSE; zone->zsz_deleted = B_TRUE; zone->zsz_active = B_TRUE; } } zone = list_next(&ctl->zsctl_zones, zone); } } /* * Mark cpus as not existing. If a cpu is found, it will be updated. If * a cpu is not found, then it must have gone offline, so it will be * deleted. * * The kstat tracking data is rolled so that the usage since the previous * interval can be determined. */ static void zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll) { zsd_cpu_t *cpu; /* * Mark all cpus as not existing. As cpus are found, they will * be marked as existing. */ for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL; cpu = list_next(&ctl->zsctl_cpus, cpu)) { cpu->zsc_found = B_FALSE; if (cpu->zsc_active == B_TRUE && roll) { cpu->zsc_psetid_prev = cpu->zsc_psetid; cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle; cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr; cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern; cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user; } } } /* * An array the size of the maximum number of cpus is kept. Within this array * a list of the online cpus is maintained. 
*/ zsd_cpu_t * zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid) { zsd_cpu_t *cpu; assert(cpuid < ctl->zsctl_maxcpuid); cpu = &(ctl->zsctl_cpu_array[cpuid]); assert(cpuid == cpu->zsc_id); if (cpu->zsc_allocated == B_FALSE) { cpu->zsc_allocated = B_TRUE; list_insert_tail(&ctl->zsctl_cpus, cpu); } return (cpu); } /* A cpu has been found. Update its information */ static void zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid) { /* * legacy processor sets, the cpu may move while zonestatd is * inspecting, causing it to be found twice. In this case, just * leave cpu in the first processor set in which it was found. */ if (cpu->zsc_found == B_TRUE) return; /* Mark cpu as online */ cpu->zsc_found = B_TRUE; cpu->zsc_offlined = B_FALSE; cpu->zsc_pset = pset; /* * cpu is newly online. */ if (cpu->zsc_active == B_FALSE) { /* * Cpu is newly online. */ cpu->zsc_onlined = B_TRUE; cpu->zsc_psetid = psetid; cpu->zsc_psetid_prev = psetid; } else { /* * cpu online during previous interval. Save properties at * start of interval */ cpu->zsc_onlined = B_FALSE; cpu->zsc_psetid = psetid; } cpu->zsc_active = B_TRUE; } /* Remove all offlined cpus from the list of tracked cpus */ static void zsd_mark_cpus_end(zsd_ctl_t *ctl) { zsd_cpu_t *cpu, *tmp; int id; /* Mark cpu as online or offline */ cpu = list_head(&ctl->zsctl_cpus); while (cpu != NULL) { if (cpu->zsc_found == B_FALSE) { if (cpu->zsc_offlined == B_TRUE) { /* * cpu offlined in prior interval. It is gone. */ tmp = cpu; cpu = list_next(&ctl->zsctl_cpus, cpu); list_remove(&ctl->zsctl_cpus, tmp); /* Clear structure for future use */ id = tmp->zsc_id; bzero(tmp, sizeof (zsd_cpu_t)); tmp->zsc_id = id; tmp->zsc_allocated = B_FALSE; tmp->zsc_psetid = ZS_PSET_ERROR; tmp->zsc_psetid_prev = ZS_PSET_ERROR; } else { /* * cpu online at start of interval. Treat * as still online, since it was online for * some portion of the interval. 
*/ cpu->zsc_offlined = B_TRUE; cpu->zsc_onlined = B_FALSE; cpu->zsc_active = B_TRUE; cpu->zsc_psetid = cpu->zsc_psetid_prev; cpu->zsc_pset = NULL; } } cpu = list_next(&ctl->zsctl_cpus, cpu); } } /* Some utility functions for managing the list of processor sets */ static zsd_pset_t * zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid) { zsd_pset_t *pset; for (pset = list_head(&ctl->zsctl_psets); pset != NULL; pset = list_next(&ctl->zsctl_psets, pset)) { if (pset->zsp_id == psetid) return (pset); } return (NULL); } static zsd_pset_t * zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) { zsd_pset_t *pset; for (pset = list_head(&ctl->zsctl_psets); pset != NULL; pset = list_next(&ctl->zsctl_psets, pset)) { if (strcmp(pset->zsp_name, psetname) == 0) { if (psetid != -1) pset->zsp_id = psetid; return (pset); } } return (NULL); } static zsd_pset_t * zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) { zsd_pset_t *pset; if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL) return (NULL); (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name)); pset->zsp_id = psetid; pset->zsp_found = B_FALSE; /* * Allocate as deleted so if not found in first pass, pset is deleted * from list. This can happen if pset is returned by pset_list, but * is destroyed before first attempt to fetch pset details. 
*/ list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t), offsetof(zsd_pset_usage_t, zsu_next)); pset->zsp_hrstart = g_hrnow; pset->zsp_deleted = B_TRUE; pset->zsp_empty = B_TRUE; ctl->zsctl_npsets++; return (pset); } static zsd_pset_t * zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) { zsd_pset_t *pset, *tmp; if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL) return (pset); if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL) return (NULL); /* Insert sorted by psetname */ tmp = list_head(&ctl->zsctl_psets); while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0) tmp = list_next(&ctl->zsctl_psets, tmp); list_insert_before(&ctl->zsctl_psets, tmp, pset); return (pset); } /* Some utility functions for managing the list of zones using each pset */ static zsd_pset_usage_t * zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone) { zsd_pset_usage_t *usage; for (usage = list_head(&pset->zsp_usage_list); usage != NULL; usage = list_next(&pset->zsp_usage_list, usage)) if (usage->zsu_zone == zone) return (usage); return (NULL); } static zsd_pset_usage_t * zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone) { zsd_pset_usage_t *usage; if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t))) == NULL) return (NULL); list_link_init(&usage->zsu_next); usage->zsu_zone = zone; usage->zsu_zoneid = zone->zsz_id; usage->zsu_pset = pset; usage->zsu_found = B_FALSE; usage->zsu_active = B_FALSE; usage->zsu_new = B_FALSE; /* * Allocate as not deleted. If a process is found in a pset for * a zone, the usage will not be deleted until at least the next * interval. 
*/ usage->zsu_start = g_now; usage->zsu_hrstart = g_hrnow; usage->zsu_deleted = B_FALSE; usage->zsu_empty = B_TRUE; usage->zsu_scheds = 0; usage->zsu_cpu_shares = ZS_LIMIT_NONE; ctl->zsctl_npset_usages++; pset->zsp_nusage++; return (usage); } static zsd_pset_usage_t * zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone) { zsd_pset_usage_t *usage, *tmp; if ((usage = zsd_lookup_usage(pset, zone)) != NULL) return (usage); if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL) return (NULL); tmp = list_head(&pset->zsp_usage_list); while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name) > 0) tmp = list_next(&pset->zsp_usage_list, tmp); list_insert_before(&pset->zsp_usage_list, tmp, usage); return (usage); } static void zsd_refresh_system(zsd_ctl_t *ctl) { zsd_system_t *system = ctl->zsctl_system; /* Re-count these values each interval */ system->zss_processes = 0; system->zss_lwps = 0; system->zss_shm = 0; system->zss_shmids = 0; system->zss_semids = 0; system->zss_msgids = 0; system->zss_lofi = 0; } /* Reads each cpu's kstats, and adds the usage to the cpu's pset */ static void zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu) { zsd_system_t *sys; processorid_t cpuid; zsd_pset_t *pset_prev; zsd_pset_t *pset; kstat_t *kstat; kstat_named_t *knp; kid_t kid; uint64_t idle, intr, kern, user; sys = ctl->zsctl_system; pset = cpu->zsc_pset; knp = NULL; kid = -1; cpuid = cpu->zsc_id; /* Get the cpu time totals for this cpu */ kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys"); if (kstat == NULL) return; kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); if (kid == -1) return; knp = kstat_data_lookup(kstat, "cpu_nsec_idle"); if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64) return; idle = knp->value.ui64; knp = kstat_data_lookup(kstat, "cpu_nsec_kernel"); if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64) return; kern = knp->value.ui64; knp = kstat_data_lookup(kstat, "cpu_nsec_user"); if (knp == NULL || 
knp->data_type != KSTAT_DATA_UINT64) return; user = knp->value.ui64; /* * Tracking intr time per cpu just exists for future enhancements. * The value is presently always zero. */ intr = 0; cpu->zsc_nsec_idle = idle; cpu->zsc_nsec_intr = intr; cpu->zsc_nsec_kern = kern; cpu->zsc_nsec_user = user; if (cpu->zsc_onlined == B_TRUE) { /* * cpu is newly online. There is no reference value, * so just record its current stats for comparison * on next stat read. */ cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle; cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr; cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern; cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user; return; } /* * Calculate relative time since previous refresh. * Paranoia. Don't let time go backwards. */ idle = intr = kern = user = 0; if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev) idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev; if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev) intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev; if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev) kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev; if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev) user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev; /* Update totals for cpu usage */ TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle); TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr); TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern); TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user); /* * Add cpu's stats to its pset if it is known to be in * the pset since previous read. */ if (cpu->zsc_psetid == cpu->zsc_psetid_prev || cpu->zsc_psetid_prev == ZS_PSET_ERROR || (pset_prev = zsd_lookup_pset_byid(ctl, cpu->zsc_psetid_prev)) == NULL) { TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle); TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr); TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern); TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user); } else { /* * Last pset was different than current pset. * Best guess is to split usage between the two. 
*/ TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2); TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2); TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2); TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2); TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, (idle / 2) + (idle % 2)); TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, (intr / 2) + (intr % 2)); TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, (kern / 2) + (kern % 2)); TIMESTRUC_ADD_NANOSEC(pset->zsp_user, (user / 2) + (user % 2)); } TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle); TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr); TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern); TIMESTRUC_ADD_NANOSEC(sys->zss_user, user); } /* Determine the details of a processor set by pset_id */ static int zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname, size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size, uint64_t *min, uint64_t *max, int64_t *importance) { uint_t old, num; pool_conf_t *conf = ctl->zsctl_pool_conf; pool_value_t **vals = ctl->zsctl_pool_vals; pool_resource_t **res_list = NULL; pool_resource_t *pset; pool_component_t **cpus = NULL; processorid_t *cache; const char *string; uint64_t uint64; int64_t int64; int i, ret, type; if (ctl->zsctl_pool_status == POOL_DISABLED) { /* * Inspect legacy psets */ for (;;) { old = num = ctl->zsctl_cpu_ncache; ret = pset_info(psetid, &type, &num, ctl->zsctl_cpu_cache); if (ret < 0) { /* pset is gone. Tell caller to retry */ errno = EINTR; return (-1); } if (num <= old) { /* Success */ break; } if ((cache = (processorid_t *)realloc( ctl->zsctl_cpu_cache, num * sizeof (processorid_t))) != NULL) { ctl->zsctl_cpu_ncache = num; ctl->zsctl_cpu_cache = cache; } else { /* * Could not allocate to get new cpu list. */ zsd_warn(gettext( "Could not allocate for cpu list")); errno = ENOMEM; return (-1); } } /* * Old school pset. 
Just make min and max equal * to its size */ if (psetid == ZS_PSET_DEFAULT) { *cputype = ZS_CPUTYPE_DEFAULT_PSET; (void) strlcpy(psetname, "pset_default", namelen); } else { *cputype = ZS_CPUTYPE_PSRSET_PSET; (void) snprintf(psetname, namelen, "SUNWlegacy_pset_%d", psetid); } /* * Just treat legacy pset as a simple pool pset */ *online = num; *size = num; *min = num; *max = num; *importance = 1; return (0); } /* Look up the pool pset using the pset id */ res_list = NULL; pool_value_set_int64(vals[1], psetid); if (pool_value_set_name(vals[1], "pset.sys_id") != PO_SUCCESS) goto err; if (pool_value_set_name(vals[0], "type") != PO_SUCCESS) goto err; if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS) goto err; if ((res_list = pool_query_resources(conf, &num, vals)) == NULL) goto err; if (num != 1) goto err; pset = res_list[0]; free(res_list); res_list = NULL; if (pool_get_property(conf, pool_resource_to_elem(conf, pset), "pset.name", vals[0]) != POC_STRING || pool_value_get_string(vals[0], &string) != PO_SUCCESS) goto err; (void) strlcpy(psetname, string, namelen); if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0) *cputype = ZS_CPUTYPE_DEDICATED; else if (psetid == ZS_PSET_DEFAULT) *cputype = ZS_CPUTYPE_DEFAULT_PSET; else *cputype = ZS_CPUTYPE_POOL_PSET; /* Get size, min, max, and importance */ if (pool_get_property(conf, pool_resource_to_elem(conf, pset), "pset.size", vals[0]) == POC_UINT && pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) *size = uint64; else *size = 0; /* Get size, min, max, and importance */ if (pool_get_property(conf, pool_resource_to_elem(conf, pset), "pset.min", vals[0]) == POC_UINT && pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) *min = uint64; else *min = 0; if (*min >= ZSD_PSET_UNLIMITED) *min = ZS_LIMIT_NONE; if (pool_get_property(conf, pool_resource_to_elem(conf, pset), "pset.max", vals[0]) == POC_UINT && pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) *max = uint64; else *max = ZS_LIMIT_NONE; if (*max >= 
ZSD_PSET_UNLIMITED) *max = ZS_LIMIT_NONE; if (pool_get_property(conf, pool_resource_to_elem(conf, pset), "pset.importance", vals[0]) == POC_INT && pool_value_get_int64(vals[0], &int64) == PO_SUCCESS) *importance = int64; else *importance = (uint64_t)1; *online = 0; if (*size == 0) return (0); /* get cpus */ cpus = pool_query_resource_components(conf, pset, &num, NULL); if (cpus == NULL) goto err; /* Make sure there is space for cpu id list */ if (num > ctl->zsctl_cpu_ncache) { if ((cache = (processorid_t *)realloc( ctl->zsctl_cpu_cache, num * sizeof (processorid_t))) != NULL) { ctl->zsctl_cpu_ncache = num; ctl->zsctl_cpu_cache = cache; } else { /* * Could not allocate to get new cpu list. */ zsd_warn(gettext( "Could not allocate for cpu list")); goto err; } } /* count the online cpus */ for (i = 0; i < num; i++) { if (pool_get_property(conf, pool_component_to_elem( conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING || pool_value_get_string(vals[0], &string) != PO_SUCCESS) goto err; if (strcmp(string, "on-line") != 0 && strcmp(string, "no-intr") != 0) continue; if (pool_get_property(conf, pool_component_to_elem( conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT || pool_value_get_int64(vals[0], &int64) != PO_SUCCESS) goto err; (*online)++; ctl->zsctl_cpu_cache[i] = (psetid_t)int64; } free(cpus); return (0); err: if (res_list != NULL) free(res_list); if (cpus != NULL) free(cpus); /* * The pools operations should succeed since the conf is a consistent * snapshot. Tell caller there is no need to retry. */ errno = EINVAL; return (-1); } /* * Update the current list of processor sets. * This also updates the list of online cpus, and each cpu's pset membership. 
*/
static void
zsd_refresh_psets(zsd_ctl_t *ctl)
{
	int i, j, ret, state;
	uint_t old, num;
	uint_t cputype;
	int64_t sys_id, importance;
	uint64_t online, size, min, max;
	zsd_system_t *system;
	zsd_pset_t *pset;
	zsd_cpu_t *cpu;
	psetid_t *cache;
	char psetname[ZS_PSETNAME_MAX];
	processorid_t cpuid;
	pool_value_t *pv_save = NULL;
	pool_resource_t **res_list = NULL;
	pool_resource_t *res;
	pool_value_t **vals = ctl->zsctl_pool_vals;
	pool_conf_t *conf;
	boolean_t roll_cpus = B_TRUE;

	system = ctl->zsctl_system;

retry:
	/*
	 * Zero cpu counters to recount them.  This is done on every pass so
	 * that a restarted scan does not count cpus twice.
	 */
	system->zss_ncpus = 0;
	system->zss_ncpus_online = 0;

	ret = pool_get_status(&state);
	if (ret == 0 && state == POOL_ENABLED) {

		conf = ctl->zsctl_pool_conf;
		vals = ctl->zsctl_pool_vals;
		/*
		 * Temporarily terminate the value list after vals[0] so the
		 * queries below filter on a single property.  vals[1] is
		 * restored on every path out of this branch.
		 */
		pv_save = vals[1];
		vals[1] = NULL;

		if (ctl->zsctl_pool_status == POOL_DISABLED) {
			if (pool_conf_open(ctl->zsctl_pool_conf,
			    pool_dynamic_location(), PO_RDONLY) == 0) {
				ctl->zsctl_pool_status = POOL_ENABLED;
				ctl->zsctl_pool_changed = POU_PSET;
			}
		} else {
			ctl->zsctl_pool_changed = 0;
			ret = pool_conf_update(ctl->zsctl_pool_conf,
			    &(ctl->zsctl_pool_changed));
			if (ret < 0) {
				/* Pools must have become disabled */
				(void) pool_conf_close(ctl->zsctl_pool_conf);
				ctl->zsctl_pool_status = POOL_DISABLED;
				if (pool_error() == POE_SYSTEM && errno ==
				    ENOTACTIVE) {
					/*
					 * Put vals[1] back first.  Otherwise
					 * the next pass would save NULL and
					 * the original value would be lost.
					 */
					vals[1] = pv_save;
					pv_save = NULL;
					goto retry;
				}

				zsd_warn(gettext(
				    "Unable to update pool configuration"));
				/* Not able to get pool info.  Don't update. */
				goto err;
			}
		}
		/* Get the list of psets using libpool */
		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
			goto err;

		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
			goto err;
		if ((res_list = pool_query_resources(conf, &num, vals))
		    == NULL)
			goto err;

		if (num > ctl->zsctl_pset_ncache) {
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num) * sizeof (psetid_t))) == NULL) {
				goto err;
			}
			ctl->zsctl_pset_ncache = num;
			ctl->zsctl_pset_cache = cache;
		}
		/* Save the pset id of each pset */
		for (i = 0; i < num; i++) {
			res = res_list[i];
			if (pool_get_property(conf, pool_resource_to_elem(conf,
			    res), "pset.sys_id", vals[0]) != POC_INT ||
			    pool_value_get_int64(vals[0], &sys_id)
			    != PO_SUCCESS)
				goto err;
			ctl->zsctl_pset_cache[i] = (int)sys_id;
		}
		vals[1] = pv_save;
		pv_save = NULL;
	} else {
		if (ctl->zsctl_pool_status == POOL_ENABLED) {
			(void) pool_conf_close(ctl->zsctl_pool_conf);
			ctl->zsctl_pool_status = POOL_DISABLED;
		}
		/*
		 * Get the pset list using legacy psets.  One extra slot is
		 * always reserved for the default pset added below.
		 */
		for (;;) {
			old = num = ctl->zsctl_pset_ncache;
			(void) pset_list(ctl->zsctl_pset_cache, &num);
			if ((num + 1) <= old) {
				break;
			}
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num + 1) * sizeof (psetid_t))) != NULL) {
				ctl->zsctl_pset_ncache = num + 1;
				ctl->zsctl_pset_cache = cache;
			} else {
				/*
				 * Could not allocate to get new pset list.
				 * Give up
				 */
				return;
			}
		}
		/* Add the default pset to list */
		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
		num++;
	}

	/* Only roll the cpu kstat baselines on the first pass */
	zsd_mark_cpus_start(ctl, roll_cpus);
	zsd_mark_psets_start(ctl);
	roll_cpus = B_FALSE;

	/* Refresh cpu membership of all psets */
	for (i = 0; i < num; i++) {

		/* Get pool pset information */
		sys_id = ctl->zsctl_pset_cache[i];
		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
		    &cputype, &online, &size, &min, &max, &importance)
		    != 0) {
			if (errno == EINTR) {
				/*
				 * A pset vanished underneath us.  The cached
				 * id list is stale, so fetch it again rather
				 * than re-walking the same ids, which could
				 * never succeed.
				 */
				if (res_list != NULL) {
					free(res_list);
					res_list = NULL;
				}
				goto retry;
			}
			zsd_warn(gettext("Failed to get info for pset %d"),
			    (int)sys_id);
			continue;
		}

		system->zss_ncpus += size;
		system->zss_ncpus_online += online;

		pset = zsd_lookup_insert_pset(ctl, psetname,
		    ctl->zsctl_pset_cache[i]);
		if (pset == NULL) {
			/* Allocation failed; skip this pset for now */
			zsd_warn(gettext("Could not allocate for pset %d"),
			    (int)sys_id);
			continue;
		}

		/* update pset info */
		zsd_mark_pset_found(pset, cputype, online, size, min,
		    max, importance);

		/* update each cpu in pset */
		for (j = 0; j < pset->zsp_online; j++) {
			cpuid = ctl->zsctl_cpu_cache[j];
			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
			zsd_mark_cpu_found(cpu, pset, sys_id);
		}
	}
err:
	if (res_list != NULL)
		free(res_list);
	if (pv_save != NULL)
		vals[1] = pv_save;
}

/*
 * Fetch the current pool and pset name for the given zone.
*/ static void zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone, char *pool, int poollen, char *pset, int psetlen, uint_t *cputype) { poolid_t poolid; pool_t **pools = NULL; pool_resource_t **res_list = NULL; char poolname[ZS_POOLNAME_MAX]; char psetname[ZS_PSETNAME_MAX]; pool_conf_t *conf = ctl->zsctl_pool_conf; pool_value_t *pv_save = NULL; pool_value_t **vals = ctl->zsctl_pool_vals; const char *string; int ret; int64_t int64; uint_t num; ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID, &poolid, sizeof (poolid)); if (ret < 0) goto lookup_done; pv_save = vals[1]; vals[1] = NULL; pools = NULL; res_list = NULL; /* Default values if lookup fails */ (void) strlcpy(poolname, "pool_default", sizeof (poolname)); (void) strlcpy(psetname, "pset_default", sizeof (poolname)); *cputype = ZS_CPUTYPE_DEFAULT_PSET; /* no dedicated cpu if pools are disabled */ if (ctl->zsctl_pool_status == POOL_DISABLED) goto lookup_done; /* Get the pool name using the id */ pool_value_set_int64(vals[0], poolid); if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS) goto lookup_done; if ((pools = pool_query_pools(conf, &num, vals)) == NULL) goto lookup_done; if (num != 1) goto lookup_done; if (pool_get_property(conf, pool_to_elem(conf, pools[0]), "pool.name", vals[0]) != POC_STRING || pool_value_get_string(vals[0], &string) != PO_SUCCESS) goto lookup_done; (void) strlcpy(poolname, (char *)string, sizeof (poolname)); /* Get the name of the pset for the pool */ if (pool_value_set_name(vals[0], "type") != PO_SUCCESS) goto lookup_done; if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS) goto lookup_done; if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals)) == NULL) goto lookup_done; if (num != 1) goto lookup_done; if (pool_get_property(conf, pool_resource_to_elem(conf, res_list[0]), "pset.sys_id", vals[0]) != POC_INT || pool_value_get_int64(vals[0], &int64) != PO_SUCCESS) goto lookup_done; if (int64 == ZS_PSET_DEFAULT) *cputype = ZS_CPUTYPE_DEFAULT_PSET; if 
(pool_get_property(conf, pool_resource_to_elem(conf, res_list[0]), "pset.name", vals[0]) != POC_STRING || pool_value_get_string(vals[0], &string) != PO_SUCCESS) goto lookup_done; (void) strlcpy(psetname, (char *)string, sizeof (psetname)); if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0) *cputype = ZS_CPUTYPE_DEDICATED; if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0) *cputype = ZS_CPUTYPE_PSRSET_PSET; else *cputype = ZS_CPUTYPE_POOL_PSET; lookup_done: if (pv_save != NULL) vals[1] = pv_save; if (res_list) free(res_list); if (pools) free(pools); (void) strlcpy(pool, poolname, poollen); (void) strlcpy(pset, psetname, psetlen); } /* Convert scheduler names to ZS_* scheduler flags */ static uint_t zsd_schedname2int(char *clname, int pri) { uint_t sched = 0; if (strcmp(clname, "TS") == 0) { sched = ZS_SCHED_TS; } else if (strcmp(clname, "IA") == 0) { sched = ZS_SCHED_IA; } else if (strcmp(clname, "FX") == 0) { if (pri > 59) { sched = ZS_SCHED_FX_60; } else { sched = ZS_SCHED_FX; } } else if (strcmp(clname, "RT") == 0) { sched = ZS_SCHED_RT; } else if (strcmp(clname, "FSS") == 0) { sched = ZS_SCHED_FSS; } return (sched); } static uint64_t zsd_get_zone_rctl_limit(char *name) { rctlblk_t *rblk; rblk = (rctlblk_t *)alloca(rctlblk_size()); if (getrctl(name, NULL, rblk, RCTL_FIRST) != 0) { return (ZS_LIMIT_NONE); } return (rctlblk_get_value(rblk)); } static uint64_t zsd_get_zone_rctl_usage(char *name) { rctlblk_t *rblk; rblk = (rctlblk_t *)alloca(rctlblk_size()); if (getrctl(name, NULL, rblk, RCTL_USAGE) != 0) { return (0); } return (rctlblk_get_value(rblk)); } #define ZSD_NUM_RCTL_VALS 19 /* * Fetch the limit information for a zone. This uses zone_enter() as the * getrctl(2) system call only returns rctl information for the zone of * the caller. 
*/ static int zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares, uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap, uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes, uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm, uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap, uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids, uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched) { int p[2], pid, tmpl_fd, ret; ctid_t ct; char class[PC_CLNMSZ]; uint64_t vals[ZSD_NUM_RCTL_VALS]; zsd_system_t *sys = ctl->zsctl_system; int i = 0; int res = 0; /* Treat all caps as no cap on error */ *cpu_shares = ZS_LIMIT_NONE; *cpu_cap = ZS_LIMIT_NONE; *ram_cap = ZS_LIMIT_NONE; *locked_cap = ZS_LIMIT_NONE; *vm_cap = ZS_LIMIT_NONE; *processes_cap = ZS_LIMIT_NONE; *lwps_cap = ZS_LIMIT_NONE; *shm_cap = ZS_LIMIT_NONE; *shmids_cap = ZS_LIMIT_NONE; *semids_cap = ZS_LIMIT_NONE; *msgids_cap = ZS_LIMIT_NONE; *lofi_cap = ZS_LIMIT_NONE; *processes = 0; *lwps = 0; *shm = 0; *shmids = 0; *semids = 0; *msgids = 0; *lofi = 0; /* Get the ram cap first since it is a zone attr */ ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP, ram_cap, sizeof (*ram_cap)); if (ret < 0 || *ram_cap == 0) *ram_cap = ZS_LIMIT_NONE; /* Get the zone's default scheduling class */ ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS, class, sizeof (class)); if (ret < 0) return (-1); *sched = zsd_schedname2int(class, 0); /* rctl caps must be fetched from within the zone */ if (pipe(p) != 0) return (-1); if ((tmpl_fd = init_template()) == -1) { (void) close(p[0]); (void) close(p[1]); return (-1); } pid = forkx(0); if (pid < 0) { (void) ct_tmpl_clear(tmpl_fd); (void) close(p[0]); (void) close(p[1]); return (-1); } if (pid == 0) { (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); (void) close(p[0]); if (zone->zsz_id != getzoneid()) { if (zone_enter(zone->zsz_id) < 0) { (void) close(p[1]); _exit(0); } } /* Get caps for zone, and write them to zonestatd 
parent. */ vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares"); vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids"); vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi"); vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi"); if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) != ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) { (void) close(p[1]); _exit(1); } (void) close(p[1]); _exit(0); } if (contract_latest(&ct) == -1) ct = -1; (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); (void) close(p[1]); while (waitpid(pid, NULL, 0) != pid) ; /* Read cap from child in zone */ if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) != ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) { res = -1; goto cleanup; } i = 0; *cpu_shares = vals[i++]; *cpu_cap = vals[i++]; *locked_cap = vals[i++]; *vm_cap = vals[i++]; *processes_cap = vals[i++]; *processes = vals[i++]; *lwps_cap = vals[i++]; *lwps = vals[i++]; *shm_cap = vals[i++]; *shm = vals[i++]; *shmids_cap = vals[i++]; *shmids = vals[i++]; *semids_cap = vals[i++]; *semids = vals[i++]; *msgids_cap = vals[i++]; *msgids = vals[i++]; *lofi_cap = vals[i++]; *lofi = vals[i++]; /* Interpret maximum values as no cap */ if (*cpu_cap == 
UINT32_MAX || *cpu_cap == 0) *cpu_cap = ZS_LIMIT_NONE; if (*processes_cap == sys->zss_processes_max) *processes_cap = ZS_LIMIT_NONE; if (*lwps_cap == sys->zss_lwps_max) *lwps_cap = ZS_LIMIT_NONE; if (*shm_cap == sys->zss_shm_max) *shm_cap = ZS_LIMIT_NONE; if (*shmids_cap == sys->zss_shmids_max) *shmids_cap = ZS_LIMIT_NONE; if (*semids_cap == sys->zss_semids_max) *semids_cap = ZS_LIMIT_NONE; if (*msgids_cap == sys->zss_msgids_max) *msgids_cap = ZS_LIMIT_NONE; if (*lofi_cap == sys->zss_lofi_max) *lofi_cap = ZS_LIMIT_NONE; cleanup: (void) close(p[0]); (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); (void) contract_abandon_id(ct); return (res); } /* Update the current list of running zones */ static void zsd_refresh_zones(zsd_ctl_t *ctl) { zsd_zone_t *zone; uint_t old, num; ushort_t flags; int i, ret; zoneid_t *cache; uint64_t cpu_shares; uint64_t cpu_cap; uint64_t ram_cap; uint64_t locked_cap; uint64_t vm_cap; uint64_t processes_cap; uint64_t processes; uint64_t lwps_cap; uint64_t lwps; uint64_t shm_cap; uint64_t shm; uint64_t shmids_cap; uint64_t shmids; uint64_t semids_cap; uint64_t semids; uint64_t msgids_cap; uint64_t msgids; uint64_t lofi_cap; uint64_t lofi; char zonename[ZS_ZONENAME_MAX]; char poolname[ZS_POOLNAME_MAX]; char psetname[ZS_PSETNAME_MAX]; uint_t sched; uint_t cputype; uint_t iptype; /* Get the current list of running zones */ for (;;) { old = num = ctl->zsctl_zone_ncache; (void) zone_list(ctl->zsctl_zone_cache, &num); if (num <= old) break; if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache, (num) * sizeof (zoneid_t))) != NULL) { ctl->zsctl_zone_ncache = num; ctl->zsctl_zone_cache = cache; } else { /* Could not allocate to get new zone list. 
Give up */ return; } } zsd_mark_zones_start(ctl); for (i = 0; i < num; i++) { ret = getzonenamebyid(ctl->zsctl_zone_cache[i], zonename, sizeof (zonename)); if (ret < 0) continue; zone = zsd_lookup_insert_zone(ctl, zonename, ctl->zsctl_zone_cache[i]); ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS, &flags, sizeof (flags)); if (ret < 0) continue; if (flags & ZF_NET_EXCL) iptype = ZS_IPTYPE_EXCLUSIVE; else iptype = ZS_IPTYPE_SHARED; zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname), psetname, sizeof (psetname), &cputype); if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap, &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes, &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids, &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap, &lofi, &sched) != 0) continue; zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap, locked_cap, vm_cap, processes_cap, processes, lwps_cap, lwps, shm_cap, shm, shmids_cap, shmids, semids_cap, semids, msgids_cap, msgids, lofi_cap, lofi, poolname, psetname, sched, cputype, iptype); } } /* Fetch the details of a process from its psinfo_t */ static void zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid, psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid, timestruc_t *delta, uint_t *sched) { timestruc_t d; zsd_proc_t *proc; /* Get cached data for proc */ proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]); *psetid = psinfo->pr_lwp.pr_bindpset; if (proc->zspr_psetid == ZS_PSET_ERROR) *prev_psetid = *psetid; else *prev_psetid = proc->zspr_psetid; *zoneid = psinfo->pr_zoneid; if (proc->zspr_zoneid == -1) *prev_zoneid = *zoneid; else *prev_zoneid = proc->zspr_zoneid; TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage); *delta = d; *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname, psinfo->pr_lwp.pr_pri); /* Update cached data for proc */ proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset; proc->zspr_zoneid = psinfo->pr_zoneid; proc->zspr_sched = *sched; proc->zspr_usage.tv_sec 
= psinfo->pr_time.tv_sec; proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec; proc->zspr_ppid = psinfo->pr_ppid; } /* * Reset the known cpu usage of a process. This is done after a process * exits so that if the pid is recycled, data from its previous life is * not reused */ static void zsd_flush_proc_info(zsd_proc_t *proc) { proc->zspr_usage.tv_sec = 0; proc->zspr_usage.tv_nsec = 0; } /* * Open the current extended accounting file. On initialization, open the * file as the current file to be used. Otherwise, open the file as the * next file to use of the current file reaches EOF. */ static int zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init) { int ret, oret, state, trys = 0, flags; int *fd, *open; ea_file_t *eaf; struct stat64 *stat; char path[MAXPATHLEN]; /* * The accounting file is first opened at the tail. Following * opens to new accounting files are opened at the head. */ if (init == B_TRUE) { flags = EO_NO_VALID_HDR | EO_TAIL; fd = &ctl->zsctl_proc_fd; eaf = &ctl->zsctl_proc_eaf; stat = &ctl->zsctl_proc_stat; open = &ctl->zsctl_proc_open; } else { flags = EO_NO_VALID_HDR | EO_HEAD; fd = &ctl->zsctl_proc_fd_next; eaf = &ctl->zsctl_proc_eaf_next; stat = &ctl->zsctl_proc_stat_next; open = &ctl->zsctl_proc_open_next; } *fd = -1; *open = 0; retry: /* open accounting files for cpu consumption */ ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state)); if (ret != 0) { zsd_warn(gettext("Unable to get process accounting state")); goto err; } if (state != AC_ON) { if (trys > 0) { zsd_warn(gettext( "Unable to enable process accounting")); goto err; } (void) zsd_enable_cpu_stats(); trys++; goto retry; } ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path)); if (ret != 0) { zsd_warn(gettext("Unable to get process accounting file")); goto err; } if ((*fd = open64(path, O_RDONLY, 0)) >= 0 && (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0) ret = fstat64(*fd, stat); if (*fd < 0 || oret < 0 || ret < 0) { struct timespec ts; /* * It is possible the 
accounting file is momentarily unavailable * because it is being rolled. Try for up to half a second. * * If failure to open accounting file persists, give up. */ if (oret == 0) (void) ea_close(eaf); else if (*fd >= 0) (void) close(*fd); if (trys > 500) { zsd_warn(gettext( "Unable to open process accounting file")); goto err; } /* wait one millisecond */ ts.tv_sec = 0; ts.tv_nsec = NANOSEC / 1000; (void) nanosleep(&ts, NULL); goto retry; } *open = 1; return (0); err: if (*fd >= 0) (void) close(*fd); *open = 0; *fd = -1; return (-1); } /* * Walk /proc and charge each process to its zone and processor set. * Then read exacct data for exited processes, and charge them as well. */ static void zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init) { DIR *dir; struct dirent *dent; psinfo_t psinfo; int fd, ret; zsd_proc_t *proc, *pproc, *tmp, *next; list_t pplist, plist; zsd_zone_t *zone, *prev_zone; zsd_pset_t *pset, *prev_pset; psetid_t psetid, prev_psetid; zoneid_t zoneid, prev_zoneid; zsd_pset_usage_t *usage, *prev_usage; char path[MAXPATHLEN]; ea_object_t object; ea_object_t pobject; boolean_t hrtime_expired = B_FALSE; struct timeval interval_end; timestruc_t delta, d1, d2; uint_t sched = 0; /* * Get the current accounting file. The current accounting file * may be different than the file in use, as the accounting file * may have been rolled, or manually changed by an admin. */ ret = zsd_open_exacct(ctl, init); if (ret != 0) { zsd_warn(gettext("Unable to track process accounting")); return; } /* * Mark the current time as the interval end time. Don't track * processes that exit after this time. */ (void) gettimeofday(&interval_end, NULL); dir = opendir("/proc"); if (dir == NULL) { zsd_warn(gettext("Unable to open /proc")); return; } /* Walk all processes and compute each zone's usage on each pset. 
*/ while ((dent = readdir(dir)) != NULL) { if (strcmp(dent->d_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) continue; (void) snprintf(path, sizeof (path), "/proc/%s/psinfo", dent->d_name); fd = open(path, O_RDONLY); if (fd < 0) continue; if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) { (void) close(fd); continue; } (void) close(fd); zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid, &zoneid, &prev_zoneid, &delta, &sched); d1.tv_sec = delta.tv_sec / 2; d1.tv_nsec = delta.tv_nsec / 2; d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2); d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2); /* Get the zone and pset this process is running in */ zone = zsd_lookup_zone_byid(ctl, zoneid); if (zone == NULL) continue; pset = zsd_lookup_pset_byid(ctl, psetid); if (pset == NULL) continue; usage = zsd_lookup_insert_usage(ctl, pset, zone); if (usage == NULL) continue; /* * Get the usage of the previous zone and pset if they were * different. */ if (zoneid != prev_zoneid) prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid); else prev_zone = NULL; if (psetid != prev_psetid) prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid); else prev_pset = NULL; prev_usage = NULL; if (prev_zone != NULL || prev_pset != NULL) { if (prev_zone == NULL) prev_zone = zone; if (prev_pset == NULL) prev_pset = pset; prev_usage = zsd_lookup_insert_usage(ctl, prev_pset, prev_zone); } /* Update the usage with the processes info */ if (prev_usage == NULL) { zsd_mark_pset_usage_found(usage, sched); } else { zsd_mark_pset_usage_found(usage, sched); zsd_mark_pset_usage_found(prev_usage, sched); } /* * First time around is just to get a starting point. All * usages will be zero. */ if (init == B_TRUE) continue; if (prev_usage == NULL) { zsd_add_usage(ctl, usage, &delta); } else { zsd_add_usage(ctl, usage, &d1); zsd_add_usage(ctl, prev_usage, &d2); } } (void) closedir(dir); /* * No need to collect exited proc data on initialization. 
Just * caching the usage of the known processes to get a zero starting * point. */ if (init == B_TRUE) return; /* * Add accounting records to account for processes which have * exited. */ list_create(&plist, sizeof (zsd_proc_t), offsetof(zsd_proc_t, zspr_next)); list_create(&pplist, sizeof (zsd_proc_t), offsetof(zsd_proc_t, zspr_next)); for (;;) { pid_t pid; pid_t ppid; timestruc_t user, sys, proc_usage; timestruc_t finish; int numfound = 0; bzero(&object, sizeof (object)); proc = NULL; zone = NULL; pset = NULL; usage = NULL; ret = ea_get_object(&ctl->zsctl_proc_eaf, &object); if (ret == EO_ERROR) { if (ea_error() == EXR_EOF) { struct stat64 *stat; struct stat64 *stat_next; /* * See if the next accounting file is the * same as the current accounting file. */ stat = &(ctl->zsctl_proc_stat); stat_next = &(ctl->zsctl_proc_stat_next); if (stat->st_ino == stat_next->st_ino && stat->st_dev == stat_next->st_dev) { /* * End of current accounting file is * reached, so finished. Clear EOF * bit for next time around. */ ea_clear(&ctl->zsctl_proc_eaf); break; } else { /* * Accounting file has changed. Move * to current accounting file. */ (void) ea_close(&ctl->zsctl_proc_eaf); ctl->zsctl_proc_fd = ctl->zsctl_proc_fd_next; ctl->zsctl_proc_eaf = ctl->zsctl_proc_eaf_next; ctl->zsctl_proc_stat = ctl->zsctl_proc_stat_next; ctl->zsctl_proc_fd_next = -1; ctl->zsctl_proc_open_next = 0; continue; } } else { /* * Other accounting error. Give up on * accounting. 
*/ goto ea_err; } } /* Skip if not a process group */ if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP || (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) { (void) ea_free_item(&object, EUP_ALLOC); continue; } /* The process group entry should be complete */ while (numfound < 9) { bzero(&pobject, sizeof (pobject)); ret = ea_get_object(&ctl->zsctl_proc_eaf, &pobject); if (ret < 0) { (void) ea_free_item(&object, EUP_ALLOC); zsd_warn( "unable to get process accounting data"); goto ea_err; } /* Next entries should be process data */ if ((pobject.eo_catalog & EXT_TYPE_MASK) == EXT_GROUP) { (void) ea_free_item(&object, EUP_ALLOC); (void) ea_free_item(&pobject, EUP_ALLOC); zsd_warn( "process data of wrong type"); goto ea_err; } switch (pobject.eo_catalog & EXD_DATA_MASK) { case EXD_PROC_PID: pid = pobject.eo_item.ei_uint32; proc = &(ctl->zsctl_proc_array[pid]); /* * This process should not be currently in * the list of processes to process. */ assert(!list_link_active(&proc->zspr_next)); numfound++; break; case EXD_PROC_ANCPID: ppid = pobject.eo_item.ei_uint32; pproc = &(ctl->zsctl_proc_array[ppid]); numfound++; break; case EXD_PROC_ZONENAME: zone = zsd_lookup_zone(ctl, pobject.eo_item.ei_string, -1); numfound++; break; case EXD_PROC_CPU_USER_SEC: user.tv_sec = pobject.eo_item.ei_uint64; numfound++; break; case EXD_PROC_CPU_USER_NSEC: user.tv_nsec = pobject.eo_item.ei_uint64; numfound++; break; case EXD_PROC_CPU_SYS_SEC: sys.tv_sec = pobject.eo_item.ei_uint64; numfound++; break; case EXD_PROC_CPU_SYS_NSEC: sys.tv_nsec = pobject.eo_item.ei_uint64; numfound++; break; case EXD_PROC_FINISH_SEC: finish.tv_sec = pobject.eo_item.ei_uint64; numfound++; break; case EXD_PROC_FINISH_NSEC: finish.tv_nsec = pobject.eo_item.ei_uint64; numfound++; break; } (void) ea_free_item(&pobject, EUP_ALLOC); } (void) ea_free_item(&object, EUP_ALLOC); if (numfound != 9) { zsd_warn(gettext( "Malformed process accounting entry found")); goto proc_done; } if (finish.tv_sec > 
interval_end.tv_sec || (finish.tv_sec == interval_end.tv_sec && finish.tv_nsec > (interval_end.tv_usec * 1000))) hrtime_expired = B_TRUE; /* * Try to identify the zone and pset to which this * exited process belongs. */ if (zone == NULL) goto proc_done; /* Save proc info */ proc->zspr_ppid = ppid; proc->zspr_zoneid = zone->zsz_id; prev_psetid = ZS_PSET_ERROR; sched = 0; /* * The following tries to deduce the processes pset. * * First choose pset and sched using cached value from the * most recent time the process has been seen. * * pset and sched can change across zone_enter, so make sure * most recent sighting of this process was in the same * zone before using most recent known value. * * If there is no known value, use value of processes * parent. If parent is unknown, walk parents until a known * parent is found. * * If no parent in the zone is found, use the zone's default * pset and scheduling class. */ if (proc->zspr_psetid != ZS_PSET_ERROR) { prev_psetid = proc->zspr_psetid; pset = zsd_lookup_pset_byid(ctl, prev_psetid); sched = proc->zspr_sched; } else if (pproc->zspr_zoneid == zone->zsz_id && pproc->zspr_psetid != ZS_PSET_ERROR) { prev_psetid = pproc->zspr_psetid; pset = zsd_lookup_pset_byid(ctl, prev_psetid); sched = pproc->zspr_sched; } if (pset == NULL) { /* * Process or processes parent has never been seen. * Save to deduce a known parent later. 
*/ proc_usage = sys; TIMESTRUC_ADD_TIMESTRUC(proc_usage, user); TIMESTRUC_DELTA(delta, proc_usage, proc->zspr_usage); proc->zspr_usage = delta; list_insert_tail(&plist, proc); continue; } /* Add the zone's usage to the pset */ usage = zsd_lookup_insert_usage(ctl, pset, zone); if (usage == NULL) goto proc_done; zsd_mark_pset_usage_found(usage, sched); /* compute the usage to add for the exited proc */ proc_usage = sys; TIMESTRUC_ADD_TIMESTRUC(proc_usage, user); TIMESTRUC_DELTA(delta, proc_usage, proc->zspr_usage); zsd_add_usage(ctl, usage, &delta); proc_done: zsd_flush_proc_info(proc); if (hrtime_expired == B_TRUE) break; } /* * close next accounting file. */ if (ctl->zsctl_proc_open_next) { (void) ea_close( &ctl->zsctl_proc_eaf_next); ctl->zsctl_proc_open_next = 0; ctl->zsctl_proc_fd_next = -1; } /* For the remaining processes, use pset and sched of a known parent */ proc = list_head(&plist); while (proc != NULL) { next = proc; for (;;) { if (next->zspr_ppid == 0 || next->zspr_ppid == -1) { /* * Kernel process, or parent is unknown, skip * process, remove from process list. */ tmp = proc; proc = list_next(&plist, proc); list_link_init(&tmp->zspr_next); break; } pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]); if (pproc->zspr_zoneid != proc->zspr_zoneid) { /* * Parent in different zone. Save process and * use zone's default pset and sched below */ tmp = proc; proc = list_next(&plist, proc); list_remove(&plist, tmp); list_insert_tail(&pplist, tmp); break; } /* Parent has unknown pset, Search parent's parent */ if (pproc->zspr_psetid == ZS_PSET_ERROR) { next = pproc; continue; } /* Found parent with known pset. 
Use its info */ proc->zspr_psetid = pproc->zspr_psetid; proc->zspr_sched = pproc->zspr_sched; next->zspr_psetid = pproc->zspr_psetid; next->zspr_sched = pproc->zspr_sched; zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid); if (zone == NULL) { tmp = proc; proc = list_next(&plist, proc); list_remove(&plist, tmp); list_link_init(&tmp->zspr_next); break; } pset = zsd_lookup_pset_byid(ctl, proc->zspr_psetid); if (pset == NULL) { tmp = proc; proc = list_next(&plist, proc); list_remove(&plist, tmp); list_link_init(&tmp->zspr_next); break; } /* Add the zone's usage to the pset */ usage = zsd_lookup_insert_usage(ctl, pset, zone); if (usage == NULL) { tmp = proc; proc = list_next(&plist, proc); list_remove(&plist, tmp); list_link_init(&tmp->zspr_next); break; } zsd_mark_pset_usage_found(usage, proc->zspr_sched); zsd_add_usage(ctl, usage, &proc->zspr_usage); zsd_flush_proc_info(proc); tmp = proc; proc = list_next(&plist, proc); list_remove(&plist, tmp); list_link_init(&tmp->zspr_next); break; } } /* * Process has never been seen. Using zone info to * determine pset and scheduling class. */ proc = list_head(&pplist); while (proc != NULL) { zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid); if (zone == NULL) goto next; if (zone->zsz_psetid != ZS_PSET_ERROR && zone->zsz_psetid != ZS_PSET_MULTI) { prev_psetid = zone->zsz_psetid; pset = zsd_lookup_pset_byid(ctl, prev_psetid); } else { pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1); if (pset != NULL) prev_psetid = pset->zsp_id; } if (pset == NULL) goto next; sched = zone->zsz_scheds; /* * Ignore FX high scheduling class if it is not the * only scheduling class in the zone. */ if (sched != ZS_SCHED_FX_60) sched &= (~ZS_SCHED_FX_60); /* * If more than one scheduling class has been found * in the zone, use zone's default scheduling class for * this process. 
*/ if ((sched & (sched - 1)) != 0) sched = zone->zsz_default_sched; /* Add the zone's usage to the pset */ usage = zsd_lookup_insert_usage(ctl, pset, zone); if (usage == NULL) goto next; zsd_mark_pset_usage_found(usage, sched); zsd_add_usage(ctl, usage, &proc->zspr_usage); next: tmp = proc; proc = list_next(&pplist, proc); zsd_flush_proc_info(tmp); list_link_init(&tmp->zspr_next); } return; ea_err: /* * Close the next accounting file if we have not transitioned to it * yet. */ if (ctl->zsctl_proc_open_next) { (void) ea_close(&ctl->zsctl_proc_eaf_next); ctl->zsctl_proc_open_next = 0; ctl->zsctl_proc_fd_next = -1; } } /* * getvmusage(2) uses size_t's in the passwd data structure, which differ * in size for 32bit and 64 bit kernels. Since this is a contracted interface, * and zonestatd does not necessarily match the kernel's bitness, marshal * results appropriately. */ static int zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf, uint64_t *nres) { zsd_vmusage32_t *vmu32; zsd_vmusage64_t *vmu64; uint32_t nres32; int i; int ret; if (ctl->zsctl_kern_bits == 32) { nres32 = *nres; ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, flags, age, (uintptr_t)buf, (uintptr_t)&nres32); *nres = nres32; if (ret == 0 && buf != NULL) { /* * An array of vmusage32_t's has been returned. * Convert it to an array of vmusage64_t's. */ vmu32 = (zsd_vmusage32_t *)buf; vmu64 = (zsd_vmusage64_t *)buf; for (i = nres32 - 1; i >= 0; i--) { vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid; vmu64[i].vmu_type = vmu32[i].vmu_type; vmu64[i].vmu_type = vmu32[i].vmu_type; vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all; vmu64[i].vmu_rss_private = vmu32[i].vmu_rss_private; vmu64[i].vmu_rss_shared = vmu32[i].vmu_rss_shared; vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all; vmu64[i].vmu_swap_private = vmu32[i].vmu_swap_private; vmu64[i].vmu_swap_shared = vmu32[i].vmu_swap_shared; } } return (ret); } else { /* * kernel is 64 bit, so use 64 bit structures as zonestat * expects. 
*/ return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, flags, age, (uintptr_t)buf, (uintptr_t)nres)); } } /* * Update the current physical, virtual, and locked memory usage of the * running zones. */ static void zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init) { uint64_t phys_total; uint64_t phys_used; uint64_t phys_zones; uint64_t phys_zones_overcount; uint64_t phys_zones_extra; uint64_t phys_zones_credit; uint64_t vm_free; uint64_t vm_used; uint64_t disk_swap_total; uint64_t disk_swap_used; /* disk swap with contents */ uint64_t physmem; uint64_t pp_kernel; uint64_t arc_size = 0; struct anoninfo ani; int num_swap_devices; struct swaptable *swt; struct swapent *swent; size_t swt_size; char *path; zsd_vmusage64_t *vmusage; uint64_t num_vmusage; int i, ret; zsd_system_t *sys; zsd_zone_t *zone; int vmu_nzones; kstat_t *kstat; char kstat_name[KSTAT_STRLEN]; kstat_named_t *knp; kid_t kid; if (init) return; sys = ctl->zsctl_system; /* interrogate swap devices to find the amount of disk swap */ disk_swap_again: num_swap_devices = swapctl(SC_GETNSWP, NULL); if (num_swap_devices == 0) { sys->zss_swap_total = disk_swap_total = 0; sys->zss_swap_used = disk_swap_used = 0; /* No disk swap */ goto disk_swap_done; } /* see if swap table needs to be larger */ if (num_swap_devices > ctl->zsctl_swap_cache_num) { swt_size = sizeof (int) + (num_swap_devices * sizeof (struct swapent)) + (num_swap_devices * MAXPATHLEN); if (ctl->zsctl_swap_cache != NULL) free(ctl->zsctl_swap_cache); swt = (struct swaptable *)malloc(swt_size); if (swt == NULL) { /* * Could not allocate to get list of swap devices. * Just use data from the most recent read, which will * be zero if this is the first read. 
*/ zsd_warn(gettext("Unable to allocate to determine " "virtual memory")); disk_swap_total = sys->zss_swap_total; disk_swap_used = sys->zss_swap_used; goto disk_swap_done; } swent = swt->swt_ent; path = (char *)swt + (sizeof (int) + num_swap_devices * sizeof (swapent_t)); for (i = 0; i < num_swap_devices; i++, swent++) { swent->ste_path = path; path += MAXPATHLEN; } swt->swt_n = num_swap_devices; ctl->zsctl_swap_cache = swt; ctl->zsctl_swap_cache_size = swt_size; ctl->zsctl_swap_cache_num = num_swap_devices; } num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache); if (num_swap_devices < 0) { /* More swap devices have arrived */ if (errno == ENOMEM) goto disk_swap_again; zsd_warn(gettext("Unable to determine disk swap devices")); /* Unexpected error. Use existing data */ disk_swap_total = sys->zss_swap_total; disk_swap_used = sys->zss_swap_used; goto disk_swap_done; } /* add up the disk swap */ disk_swap_total = 0; disk_swap_used = 0; swent = ctl->zsctl_swap_cache->swt_ent; for (i = 0; i < num_swap_devices; i++, swent++) { disk_swap_total += swent->ste_pages; disk_swap_used += (swent->ste_pages - swent->ste_free); } disk_swap_total *= ctl->zsctl_pagesize; disk_swap_used *= ctl->zsctl_pagesize; sys->zss_swap_total = disk_swap_total; sys->zss_swap_used = disk_swap_used; disk_swap_done: /* get system pages kstat */ kid = -1; kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages"); if (kstat == NULL) zsd_warn(gettext("Unable to lookup system pages kstat")); else kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); if (kid == -1) { zsd_warn(gettext("Unable to read system pages kstat")); return; } else { knp = kstat_data_lookup(kstat, "physmem"); if (knp == NULL) { zsd_warn(gettext("Unable to read physmem")); } else { if (knp->data_type == KSTAT_DATA_UINT64) physmem = knp->value.ui64; else if (knp->data_type == KSTAT_DATA_UINT32) physmem = knp->value.ui32; else return; } knp = kstat_data_lookup(kstat, "pp_kernel"); if (knp == NULL) { 
zsd_warn(gettext("Unable to read pp_kernel")); } else { if (knp->data_type == KSTAT_DATA_UINT64) pp_kernel = knp->value.ui64; else if (knp->data_type == KSTAT_DATA_UINT32) pp_kernel = knp->value.ui32; else return; } } physmem *= ctl->zsctl_pagesize; pp_kernel *= ctl->zsctl_pagesize; /* get the zfs arc size if available */ arc_size = 0; kid = -1; kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats"); if (kstat != NULL) kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); if (kid != -1) { knp = kstat_data_lookup(kstat, "size"); if (knp != NULL) if (knp->data_type == KSTAT_DATA_UINT64) arc_size = knp->value.ui64; } /* Try to get swap information */ if (swapctl(SC_AINFO, &ani) < 0) { zsd_warn(gettext("Unable to get swap info")); return; } vmusage_again: /* getvmusage to get physical memory usage */ vmusage = ctl->zsctl_vmusage_cache; num_vmusage = ctl->zsctl_vmusage_cache_num; ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0, vmusage, &num_vmusage); if (ret != 0) { /* Unexpected error. 
Use existing data */ if (errno != EOVERFLOW) { zsd_warn(gettext( "Unable to read physical memory usage")); phys_zones = sys->zss_ram_zones; goto vmusage_done; } } /* vmusage results cache too small */ if (num_vmusage > ctl->zsctl_vmusage_cache_num) { size_t size = sizeof (zsd_vmusage64_t) * num_vmusage; if (ctl->zsctl_vmusage_cache != NULL) free(ctl->zsctl_vmusage_cache); vmusage = (zsd_vmusage64_t *)malloc(size); if (vmusage == NULL) { zsd_warn(gettext("Unable to alloc to determine " "physical memory usage")); phys_zones = sys->zss_ram_zones; goto vmusage_done; } ctl->zsctl_vmusage_cache = vmusage; ctl->zsctl_vmusage_cache_num = num_vmusage; goto vmusage_again; } phys_zones_overcount = 0; vmu_nzones = 0; for (i = 0; i < num_vmusage; i++) { switch (vmusage[i].vmu_type) { case VMUSAGE_SYSTEM: /* total pages backing user process mappings */ phys_zones = sys->zss_ram_zones = vmusage[i].vmu_rss_all; break; case VMUSAGE_ZONE: vmu_nzones++; phys_zones_overcount += vmusage[i].vmu_rss_all; zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id); if (zone != NULL) zone->zsz_usage_ram = vmusage[i].vmu_rss_all; break; default: break; } } /* * Figure how much memory was double counted due to text sharing * between zones. Credit this back so that the sum of the zones * equals the total zone ram usage; */ phys_zones_extra = phys_zones_overcount - phys_zones; phys_zones_credit = phys_zones_extra / vmu_nzones; vmusage_done: /* walk the zones to get swap and locked kstats. Fetch ram cap. 
*/ sys->zss_locked_zones = 0; sys->zss_vm_zones = 0; for (zone = list_head(&ctl->zsctl_zones); zone != NULL; zone = list_next(&ctl->zsctl_zones, zone)) { /* If zone halted during interval, show memory usage as none */ if (zone->zsz_active == B_FALSE || zone->zsz_deleted == B_TRUE) { zone->zsz_usage_ram = 0; zone->zsz_usage_vm = 0; zone->zsz_usage_locked = 0; continue; } if (phys_zones_credit > 0) { if (zone->zsz_usage_ram > phys_zones_credit) { zone->zsz_usage_ram -= phys_zones_credit; } } /* * Get zone's swap usage. Since zone could have halted, * treats as zero if cannot read */ zone->zsz_usage_vm = 0; (void) snprintf(kstat_name, sizeof (kstat_name), "swapresv_zone_%d", zone->zsz_id); kid = -1; kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps", zone->zsz_id, kstat_name); if (kstat != NULL) kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); if (kid != -1) { knp = kstat_data_lookup(kstat, "usage"); if (knp != NULL && knp->data_type == KSTAT_DATA_UINT64) { zone->zsz_usage_vm = knp->value.ui64; sys->zss_vm_zones += knp->value.ui64; } } /* * Get zone's locked usage. Since zone could have halted, * treats as zero if cannot read */ zone->zsz_usage_locked = 0; (void) snprintf(kstat_name, sizeof (kstat_name), "lockedmem_zone_%d", zone->zsz_id); kid = -1; kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps", zone->zsz_id, kstat_name); if (kstat != NULL) kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); if (kid != -1) { knp = kstat_data_lookup(kstat, "usage"); if (knp != NULL && knp->data_type == KSTAT_DATA_UINT64) { zone->zsz_usage_locked = knp->value.ui64; /* * Since locked memory accounting for zones * can double count ddi locked memory, cap each * zone's locked usage at its ram usage. 
*/ if (zone->zsz_usage_locked > zone->zsz_usage_ram) zone->zsz_usage_locked = zone->zsz_usage_ram; sys->zss_locked_zones += zone->zsz_usage_locked; } } } phys_total = sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize; phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES)) * ctl->zsctl_pagesize; /* Compute remaining statistics */ sys->zss_ram_total = phys_total; sys->zss_ram_zones = phys_zones; sys->zss_ram_kern = phys_used - phys_zones - arc_size; /* * The total for kernel locked memory should include * segkp locked pages, but oh well. The arc size is subtracted, * as that physical memory is reclaimable. */ sys->zss_locked_kern = pp_kernel - arc_size; /* Add memory used by kernel startup and obp to kernel locked */ if ((phys_total - physmem) > 0) sys->zss_locked_kern += phys_total - physmem; /* * Add in the portion of (RAM+DISK) that is not available as swap, * and consider it swap used by the kernel. */ sys->zss_vm_total = phys_total + disk_swap_total; vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize; vm_used = sys->zss_vm_total - vm_free; sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size; } /* * Charge each cpu's usage to its processor sets. Also add the cpu's total * time to each zone using the processor set. This tracks the maximum * amount of cpu time that a zone could have used. 
*/
static void
zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
{
	zsd_system_t *zsys;
	zsd_zone_t *zp;
	zsd_pset_usage_t *pu;
	zsd_cpu_t *cp, *cp_next;
	zsd_pset_t *ps;
	timestruc_t busy;	/* intr + kern + user; idle is added later */
	timestruc_t interval;	/* growth of a pset's total time */
	uint64_t now;
	uint64_t since;
	uint64_t share_ns;
	uint64_t cap_ns;

	/*
	 * Refresh the per-cpu kstat data.  The successor is fetched before
	 * each cpu is updated, as the update is given the list element
	 * itself (presumably it may unlink it -- TODO confirm).
	 */
	for (cp = list_head(&ctl->zsctl_cpus); cp != NULL; cp = cp_next) {
		cp_next = list_next(&ctl->zsctl_cpus, cp);
		zsd_update_cpu_stats(ctl, cp);
	}

	/* Track elapsed real time */
	now = gethrtime();
	if (init) {
		/*
		 * First pass: just remember the timestamp for future
		 * comparison.  All psets have zero time at this point, so
		 * there is nothing else to do.
		 */
		ctl->zsctl_hrtime = now;
		ctl->zsctl_hrtime_prev = now;
		return;
	}

	/* Accumulate the time that passed since the most recent read */
	ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
	ctl->zsctl_hrtime = now;
	since = now - ctl->zsctl_hrtime_prev;
	if (since > 0)
		TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, since);

	for (ps = list_head(&ctl->zsctl_psets); ps != NULL;
	    ps = list_next(&ctl->zsctl_psets, ps)) {

		if (ps->zsp_active == B_FALSE) {
			zsd_warn(gettext("Internal error,inactive pset found"));
			continue;
		}

		/* Time the pset spent doing work of any kind */
		busy.tv_sec = 0;
		busy.tv_nsec = 0;
		TIMESTRUC_ADD_TIMESTRUC(busy, ps->zsp_intr);
		TIMESTRUC_ADD_TIMESTRUC(busy, ps->zsp_kern);
		TIMESTRUC_ADD_TIMESTRUC(busy, ps->zsp_user);

		/* Whatever the zones did not use is charged to the kernel */
		TIMESTRUC_DELTA(ps->zsp_usage_kern, busy,
		    ps->zsp_usage_zones);
		if (ps->zsp_usage_kern.tv_sec < 0 ||
		    ps->zsp_usage_kern.tv_nsec < 0) {
			ps->zsp_usage_kern.tv_sec = 0;
			ps->zsp_usage_kern.tv_nsec = 0;
		}

		/* Adding idle time yields the pset's total elapsed time */
		TIMESTRUC_ADD_TIMESTRUC(busy, ps->zsp_idle);
		TIMESTRUC_DELTA(interval, busy, ps->zsp_total_time);

		for (pu = list_head(&ps->zsp_usage_list); pu != NULL;
		    pu = list_next(&ps->zsp_usage_list, pu)) {

			zp = pu->zsu_zone;
			if (pu->zsu_cpu_shares != ZS_LIMIT_NONE &&
			    pu->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
			    pu->zsu_cpu_shares != 0) {
				/*
				 * Nanoseconds of the interval that the
				 * zone's shares entitle it to: the interval
				 * scaled by zone shares over pset shares.
				 */
				share_ns = interval.tv_sec;
				share_ns *= NANOSEC;
				share_ns += interval.tv_nsec;
				share_ns *= pu->zsu_cpu_shares;
				share_ns /= ps->zsp_cpu_shares;
				TIMESTRUC_ADD_NANOSEC(zp->zsz_share_time,
				    share_ns);
			}
			/* Every zone using the pset gets the pset's time */
			TIMESTRUC_ADD_TIMESTRUC(zp->zsz_pset_time, interval);
			zp->zsz_cpus_online += ps->zsp_online;
		}
		ps->zsp_total_time = busy;
	}

	/* Zones with a cpu cap accumulate cap percent of the elapsed time */
	for (zp = list_head(&ctl->zsctl_zones); zp != NULL;
	    zp = list_next(&ctl->zsctl_zones, zp)) {

		if (zp->zsz_cpu_cap == ZS_LIMIT_NONE)
			continue;

		cap_ns = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
		cap_ns *= zp->zsz_cpu_cap;
		cap_ns = cap_ns / 100;
		TIMESTRUC_ADD_NANOSEC(zp->zsz_cap_time, cap_ns);
	}

	/* Same kernel/total computation, for the system as a whole */
	zsys = ctl->zsctl_system;
	busy.tv_sec = 0;
	busy.tv_nsec = 0;
	TIMESTRUC_ADD_TIMESTRUC(busy, zsys->zss_intr);
	TIMESTRUC_ADD_TIMESTRUC(busy, zsys->zss_kern);
	TIMESTRUC_ADD_TIMESTRUC(busy, zsys->zss_user);

	/* system kernel time is total used time minus zone time */
	TIMESTRUC_DELTA(zsys->zss_cpu_usage_kern, busy,
	    zsys->zss_cpu_usage_zones);
	if (zsys->zss_cpu_usage_kern.tv_sec < 0 ||
	    zsys->zss_cpu_usage_kern.tv_nsec < 0) {
		zsys->zss_cpu_usage_kern.tv_sec = 0;
		zsys->zss_cpu_usage_kern.tv_nsec = 0;
	}

	/* Total system elapsed time is used time plus idle time */
	TIMESTRUC_ADD_TIMESTRUC(busy, zsys->zss_idle);
	zsys->zss_cpu_total_time = busy;
}

/*
 * Saves current usage data to a cache that is read by libzonestat when
 * calling zs_usage_read().
 *
 * All pointers in the cached data structure are set to NULL.  When
 * libzonestat reads the cached data, it will set the pointers relative to
 * its address space.
*/ static void zsd_usage_cache_update(zsd_ctl_t *ctl) { zs_usage_cache_t *cache; zs_usage_cache_t *old; zs_usage_t *usage; zs_system_t *sys; zsd_system_t *dsys; zs_zone_t *zone = NULL; zsd_zone_t *dzone; zs_pset_t *pset = NULL; zsd_pset_t *dpset; zs_pset_zone_t *pusage; zsd_pset_usage_t *dpusage; char *next; uint_t size, i, j; size = sizeof (zs_usage_cache_t) + sizeof (zs_usage_t) + sizeof (zs_system_t) + sizeof (zs_zone_t) * ctl->zsctl_nzones + sizeof (zs_pset_t) * ctl->zsctl_npsets + sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages; cache = (zs_usage_cache_t *)malloc(size); if (cache == NULL) { zsd_warn(gettext("Unable to allocate usage cache\n")); return; } next = (char *)cache; cache->zsuc_size = size - sizeof (zs_usage_cache_t); next += sizeof (zs_usage_cache_t); /* LINTED */ usage = cache->zsuc_usage = (zs_usage_t *)next; next += sizeof (zs_usage_t); usage->zsu_start = g_start; usage->zsu_hrstart = g_hrstart; usage->zsu_time = g_now; usage->zsu_hrtime = g_hrnow; usage->zsu_nzones = ctl->zsctl_nzones; usage->zsu_npsets = ctl->zsctl_npsets; usage->zsu_system = NULL; /* LINTED */ sys = (zs_system_t *)next; next += sizeof (zs_system_t); dsys = ctl->zsctl_system; sys->zss_ram_total = dsys->zss_ram_total; sys->zss_ram_kern = dsys->zss_ram_kern; sys->zss_ram_zones = dsys->zss_ram_zones; sys->zss_locked_kern = dsys->zss_locked_kern; sys->zss_locked_zones = dsys->zss_locked_zones; sys->zss_vm_total = dsys->zss_vm_total; sys->zss_vm_kern = dsys->zss_vm_kern; sys->zss_vm_zones = dsys->zss_vm_zones; sys->zss_swap_total = dsys->zss_swap_total; sys->zss_swap_used = dsys->zss_swap_used; sys->zss_ncpus = dsys->zss_ncpus; sys->zss_ncpus_online = dsys->zss_ncpus_online; sys->zss_processes_max = dsys->zss_maxpid; sys->zss_lwps_max = dsys->zss_lwps_max; sys->zss_shm_max = dsys->zss_shm_max; sys->zss_shmids_max = dsys->zss_shmids_max; sys->zss_semids_max = dsys->zss_semids_max; sys->zss_msgids_max = dsys->zss_msgids_max; sys->zss_lofi_max = dsys->zss_lofi_max; sys->zss_processes 
= dsys->zss_processes; sys->zss_lwps = dsys->zss_lwps; sys->zss_shm = dsys->zss_shm; sys->zss_shmids = dsys->zss_shmids; sys->zss_semids = dsys->zss_semids; sys->zss_msgids = dsys->zss_msgids; sys->zss_lofi = dsys->zss_lofi; sys->zss_cpu_total_time = dsys->zss_cpu_total_time; sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones; sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern; for (i = 0, dzone = list_head(&ctl->zsctl_zones); i < ctl->zsctl_nzones; i++, dzone = list_next(&ctl->zsctl_zones, dzone)) { /* LINTED */ zone = (zs_zone_t *)next; next += sizeof (zs_zone_t); list_link_init(&zone->zsz_next); zone->zsz_system = NULL; (void) strlcpy(zone->zsz_name, dzone->zsz_name, sizeof (zone->zsz_name)); (void) strlcpy(zone->zsz_pool, dzone->zsz_pool, sizeof (zone->zsz_pool)); (void) strlcpy(zone->zsz_pset, dzone->zsz_pset, sizeof (zone->zsz_pset)); zone->zsz_id = dzone->zsz_id; zone->zsz_cputype = dzone->zsz_cputype; zone->zsz_iptype = dzone->zsz_iptype; zone->zsz_start = dzone->zsz_start; zone->zsz_hrstart = dzone->zsz_hrstart; zone->zsz_scheds = dzone->zsz_scheds; zone->zsz_cpu_shares = dzone->zsz_cpu_shares; zone->zsz_cpu_cap = dzone->zsz_cpu_cap; zone->zsz_ram_cap = dzone->zsz_ram_cap; zone->zsz_vm_cap = dzone->zsz_vm_cap; zone->zsz_locked_cap = dzone->zsz_locked_cap; zone->zsz_cpu_usage = dzone->zsz_cpu_usage; zone->zsz_cpus_online = dzone->zsz_cpus_online; zone->zsz_pset_time = dzone->zsz_pset_time; zone->zsz_cap_time = dzone->zsz_cap_time; zone->zsz_share_time = dzone->zsz_share_time; zone->zsz_usage_ram = dzone->zsz_usage_ram; zone->zsz_usage_locked = dzone->zsz_usage_locked; zone->zsz_usage_vm = dzone->zsz_usage_vm; zone->zsz_processes_cap = dzone->zsz_processes_cap; zone->zsz_lwps_cap = dzone->zsz_lwps_cap; zone->zsz_shm_cap = dzone->zsz_shm_cap; zone->zsz_shmids_cap = dzone->zsz_shmids_cap; zone->zsz_semids_cap = dzone->zsz_semids_cap; zone->zsz_msgids_cap = dzone->zsz_msgids_cap; zone->zsz_lofi_cap = dzone->zsz_lofi_cap; zone->zsz_processes = 
dzone->zsz_processes; zone->zsz_lwps = dzone->zsz_lwps; zone->zsz_shm = dzone->zsz_shm; zone->zsz_shmids = dzone->zsz_shmids; zone->zsz_semids = dzone->zsz_semids; zone->zsz_msgids = dzone->zsz_msgids; zone->zsz_lofi = dzone->zsz_lofi; } for (i = 0, dpset = list_head(&ctl->zsctl_psets); i < ctl->zsctl_npsets; i++, dpset = list_next(&ctl->zsctl_psets, dpset)) { /* LINTED */ pset = (zs_pset_t *)next; next += sizeof (zs_pset_t); list_link_init(&pset->zsp_next); (void) strlcpy(pset->zsp_name, dpset->zsp_name, sizeof (pset->zsp_name)); pset->zsp_id = dpset->zsp_id; pset->zsp_cputype = dpset->zsp_cputype; pset->zsp_start = dpset->zsp_start; pset->zsp_hrstart = dpset->zsp_hrstart; pset->zsp_online = dpset->zsp_online; pset->zsp_size = dpset->zsp_size; pset->zsp_min = dpset->zsp_min; pset->zsp_max = dpset->zsp_max; pset->zsp_importance = dpset->zsp_importance; pset->zsp_scheds = dpset->zsp_scheds; pset->zsp_cpu_shares = dpset->zsp_cpu_shares; pset->zsp_total_time = dpset->zsp_total_time; pset->zsp_usage_kern = dpset->zsp_usage_kern; pset->zsp_usage_zones = dpset->zsp_usage_zones; pset->zsp_nusage = dpset->zsp_nusage; /* Add pset usages for pset */ for (j = 0, dpusage = list_head(&dpset->zsp_usage_list); j < dpset->zsp_nusage; j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) { /* LINTED */ pusage = (zs_pset_zone_t *)next; next += sizeof (zs_pset_zone_t); /* pointers are computed by client */ pusage->zspz_pset = NULL; pusage->zspz_zone = NULL; list_link_init(&pusage->zspz_next); pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id; pusage->zspz_start = dpusage->zsu_start; pusage->zspz_hrstart = dpusage->zsu_hrstart; pusage->zspz_hrstart = dpusage->zsu_hrstart; pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares; pusage->zspz_scheds = dpusage->zsu_scheds; pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage; } } /* Update the current cache pointer */ (void) mutex_lock(&g_usage_cache_lock); old = g_usage_cache; cache->zsuc_ref = 1; cache->zsuc_gen = g_gen_next; usage->zsu_gen = 
g_gen_next; usage->zsu_size = size; g_usage_cache = cache; if (old != NULL) { old->zsuc_ref--; if (old->zsuc_ref == 0) free(old); } g_gen_next++; /* Wake up any clients that are waiting for this calculation */ if (g_usage_cache_kickers > 0) { (void) cond_broadcast(&g_usage_cache_wait); } (void) mutex_unlock(&g_usage_cache_lock); } static zs_usage_cache_t * zsd_usage_cache_hold_locked() { zs_usage_cache_t *ret; ret = g_usage_cache; ret->zsuc_ref++; return (ret); } void zsd_usage_cache_rele(zs_usage_cache_t *cache) { (void) mutex_lock(&g_usage_cache_lock); cache->zsuc_ref--; if (cache->zsuc_ref == 0) free(cache); (void) mutex_unlock(&g_usage_cache_lock); } /* Close the handles held by zsd_open() */ void zsd_close(zsd_ctl_t *ctl) { zsd_zone_t *zone; zsd_pset_t *pset; zsd_pset_usage_t *usage; zsd_cpu_t *cpu; int id; if (ctl->zsctl_kstat_ctl) { (void) kstat_close(ctl->zsctl_kstat_ctl); ctl->zsctl_kstat_ctl = NULL; } if (ctl->zsctl_proc_open) { (void) ea_close(&ctl->zsctl_proc_eaf); ctl->zsctl_proc_open = 0; ctl->zsctl_proc_fd = -1; } if (ctl->zsctl_pool_conf) { if (ctl->zsctl_pool_status == POOL_ENABLED) (void) pool_conf_close(ctl->zsctl_pool_conf); ctl->zsctl_pool_status = POOL_DISABLED; } while ((zone = list_head(&ctl->zsctl_zones)) != NULL) { list_remove(&ctl->zsctl_zones, zone); free(zone); ctl->zsctl_nzones--; } while ((pset = list_head(&ctl->zsctl_psets)) != NULL) { while ((usage = list_head(&pset->zsp_usage_list)) != NULL) { list_remove(&pset->zsp_usage_list, usage); ctl->zsctl_npset_usages--; free(usage); } list_remove(&ctl->zsctl_psets, pset); free(pset); ctl->zsctl_npsets--; } /* Release all cpus being tracked */ while (cpu = list_head(&ctl->zsctl_cpus)) { list_remove(&ctl->zsctl_cpus, cpu); id = cpu->zsc_id; bzero(cpu, sizeof (zsd_cpu_t)); cpu->zsc_id = id; cpu->zsc_allocated = B_FALSE; cpu->zsc_psetid = ZS_PSET_ERROR; cpu->zsc_psetid_prev = ZS_PSET_ERROR; } assert(ctl->zsctl_npset_usages == 0); assert(ctl->zsctl_npsets == 0); assert(ctl->zsctl_nzones == 0); 
(void) zsd_disable_cpu_stats(); } /* * Update the utilization data for all zones and processor sets. */ static int zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory) { (void) kstat_chain_update(ctl->zsctl_kstat_ctl); (void) gettimeofday(&(ctl->zsctl_timeofday), NULL); zsd_refresh_system(ctl); /* * Memory calculation is expensive. Only update it on sample * intervals. */ if (do_memory == B_TRUE) zsd_refresh_memory(ctl, init); zsd_refresh_zones(ctl); zsd_refresh_psets(ctl); zsd_refresh_procs(ctl, init); zsd_refresh_cpu_stats(ctl, init); /* * Delete objects that no longer exist. * Pset usages must be deleted first as they point to zone and * pset objects. */ zsd_mark_pset_usages_end(ctl); zsd_mark_psets_end(ctl); zsd_mark_cpus_end(ctl); zsd_mark_zones_end(ctl); /* * Save results for clients. */ zsd_usage_cache_update(ctl); /* * Roll process accounting file. */ (void) zsd_roll_exacct(); return (0); } /* * Get the system rctl, which is the upper most limit */ static uint64_t zsd_get_system_rctl(char *name) { rctlblk_t *rblk, *rblk_last; rblk = (rctlblk_t *)alloca(rctlblk_size()); rblk_last = (rctlblk_t *)alloca(rctlblk_size()); if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0) return (ZS_LIMIT_NONE); while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0) (void) bcopy(rblk, rblk_last, rctlblk_size()); return (rctlblk_get_value(rblk_last)); } /* * Open any necessary subsystems for collecting utilization data, * allocate and initialize data structures, and get initial utilization. 
* * Errors: * ENOMEM out of memory * EINVAL other error */ static zsd_ctl_t * zsd_open(zsd_ctl_t *ctl) { zsd_system_t *system; char path[MAXPATHLEN]; struct statvfs svfs; int ret; int i; size_t size; int err; if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1, sizeof (zsd_ctl_t))) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } ctl->zsctl_proc_fd = -1; /* open kstats */ if (ctl->zsctl_kstat_ctl == NULL && (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) { err = errno; zsd_warn(gettext("Unable to open kstats")); errno = err; if (errno != ENOMEM) errno = EAGAIN; goto err; } /* * These are set when the accounting file is opened by * zsd_update_procs() */ ctl->zsctl_proc_fd = -1; ctl->zsctl_proc_fd_next = -1; ctl->zsctl_proc_open = 0; ctl->zsctl_proc_open_next = 0; check_exacct: (void) zsd_enable_cpu_stats(); /* Create structures to track usage */ if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *) calloc(1, sizeof (zsd_system_t))) == NULL) { ret = -1; zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } system = ctl->zsctl_system; /* get the kernel bitness to know structure layout for getvmusage */ ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path)); if (ret < 0) ctl->zsctl_kern_bits = 32; else ctl->zsctl_kern_bits = 64; ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE); size = sysconf(_SC_CPUID_MAX); ctl->zsctl_maxcpuid = size; if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array = (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } for (i = 0; i <= ctl->zsctl_maxcpuid; i++) { ctl->zsctl_cpu_array[i].zsc_id = i; ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE; ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR; ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR; } if (statvfs("/proc", &svfs) != 0 || strcmp("/proc", svfs.f_fstr) != 0) { zsd_warn(gettext("/proc not a procfs filesystem")); errno = EINVAL; goto err; } size = 
sysconf(_SC_MAXPID) + 1; ctl->zsctl_maxproc = size; if (ctl->zsctl_proc_array == NULL && (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size, sizeof (zsd_proc_t))) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } for (i = 0; i <= ctl->zsctl_maxproc; i++) { list_link_init(&(ctl->zsctl_proc_array[i].zspr_next)); ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR; ctl->zsctl_proc_array[i].zspr_zoneid = -1; ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0; ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0; ctl->zsctl_proc_array[i].zspr_ppid = -1; } list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t), offsetof(zsd_zone_t, zsz_next)); list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t), offsetof(zsd_pset_t, zsp_next)); list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t), offsetof(zsd_cpu_t, zsc_next)); if (ctl->zsctl_pool_conf == NULL && (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } ctl->zsctl_pool_status = POOL_DISABLED; ctl->zsctl_pool_changed = 0; if (ctl->zsctl_pool_vals[0] == NULL && (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } if (ctl->zsctl_pool_vals[1] == NULL && (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) { zsd_warn(gettext("Out of Memory")); errno = ENOMEM; goto err; } ctl->zsctl_pool_vals[2] = NULL; /* * get system limits */ system->zss_maxpid = size = sysconf(_SC_MAXPID); system->zss_processes_max = zsd_get_system_rctl("zone.max-processes"); system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps"); system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory"); system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids"); system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids"); system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids"); system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi"); g_gen_next = 1; if (zsd_read(ctl, B_TRUE, B_FALSE) != 0) 
zsd_warn(gettext("Reading zone statistics failed")); return (ctl); err: if (ctl) zsd_close(ctl); return (NULL); } /* Copy utilization data to buffer, filtering data if non-global zone. */ static void zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage, boolean_t is_gz) { zs_usage_t *cusage; zs_system_t *sys, *csys; zs_zone_t *zone, *czone; zs_pset_t *pset, *cpset; zs_pset_zone_t *pz, *cpz, *foundpz; size_t size = 0, csize = 0; char *start, *cstart; int i, j; timestruc_t delta; /* Privileged users in the global zone get everything */ if (is_gz) { cusage = cache->zsuc_usage; (void) bcopy(cusage, usage, cusage->zsu_size); return; } /* Zones just get their own usage */ cusage = cache->zsuc_usage; start = (char *)usage; cstart = (char *)cusage; size += sizeof (zs_usage_t); csize += sizeof (zs_usage_t); usage->zsu_start = cusage->zsu_start; usage->zsu_hrstart = cusage->zsu_hrstart; usage->zsu_time = cusage->zsu_time; usage->zsu_hrtime = cusage->zsu_hrtime; usage->zsu_gen = cusage->zsu_gen; usage->zsu_nzones = 1; usage->zsu_npsets = 0; /* LINTED */ sys = (zs_system_t *)(start + size); /* LINTED */ csys = (zs_system_t *)(cstart + csize); size += sizeof (zs_system_t); csize += sizeof (zs_system_t); /* Save system limits but not usage */ *sys = *csys; sys->zss_ncpus = 0; sys->zss_ncpus_online = 0; /* LINTED */ zone = (zs_zone_t *)(start + size); /* LINTED */ czone = (zs_zone_t *)(cstart + csize); /* Find the matching zone */ for (i = 0; i < cusage->zsu_nzones; i++) { if (czone->zsz_id == zid) { *zone = *czone; size += sizeof (zs_zone_t); } csize += sizeof (zs_zone_t); /* LINTED */ czone = (zs_zone_t *)(cstart + csize); } sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram); sys->zss_ram_zones = zone->zsz_usage_ram; sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm); sys->zss_vm_zones = zone->zsz_usage_vm; sys->zss_locked_kern += (sys->zss_locked_zones - zone->zsz_usage_locked); sys->zss_locked_zones = zone->zsz_usage_locked; 
TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage); TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta); sys->zss_cpu_usage_zones = zone->zsz_cpu_usage; /* LINTED */ pset = (zs_pset_t *)(start + size); /* LINTED */ cpset = (zs_pset_t *)(cstart + csize); for (i = 0; i < cusage->zsu_npsets; i++) { csize += sizeof (zs_pset_t); /* LINTED */ cpz = (zs_pset_zone_t *)(csize + cstart); foundpz = NULL; for (j = 0; j < cpset->zsp_nusage; j++) { if (cpz->zspz_zoneid == zid) foundpz = cpz; csize += sizeof (zs_pset_zone_t); /* LINTED */ cpz = (zs_pset_zone_t *)(csize + cstart); } if (foundpz != NULL) { size += sizeof (zs_pset_t); /* LINTED */ pz = (zs_pset_zone_t *)(start + size); size += sizeof (zs_pset_zone_t); *pset = *cpset; *pz = *foundpz; TIMESTRUC_DELTA(delta, pset->zsp_usage_zones, pz->zspz_cpu_usage); TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta); pset->zsp_usage_zones = pz->zspz_cpu_usage; pset->zsp_nusage = 1; usage->zsu_npsets++; sys->zss_ncpus += pset->zsp_size; sys->zss_ncpus_online += pset->zsp_online; } /* LINTED */ cpset = (zs_pset_t *)(cstart + csize); } usage->zsu_size = size; } /* * Respond to new connections from libzonestat.so. Also respond to zoneadmd, * which reports new zones. 
*/
/*
 * Door service procedure.  A request is two ints: the command and one
 * argument.
 *
 * ZSD_CMD_CONNECT:  the argument is the client's ZS_VERSION.  The caller
 *	must have PRIV_PROC_INFO in its effective set.  On success the
 *	reply carries ZSD_STATUS_OK in the second int, plus a descriptor for
 *	the statistics door.
 * ZSD_CMD_NEW_ZONE: the argument is a zone id; the server door is attached
 *	for that zone and an empty reply is returned.
 *
 * Every failure is reported by writing a ZSD_STATUS_* code into the second
 * int of the request buffer and returning that buffer.  Unreferenced
 * notifications and requests of the wrong size get an empty reply.
 */
/* ARGSUSED */
static void
zsd_server(void *cookie, char *argp, size_t arg_size, door_desc_t *dp,
    uint_t n_desc)
{
	int *req, op;
	int status;
	door_desc_t stat_door;
	ucred_t *cred;
	const priv_set_t *effective;

	if (argp == DOOR_UNREF_DATA || arg_size != sizeof (op) * 2) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}

	/* LINTED */
	req = (int *)argp;
	op = req[0];

	/* If connection, return door to stat server */
	if (op == ZSD_CMD_CONNECT) {

		/* Verify client compilation version */
		if (req[1] != ZS_VERSION) {
			status = ZSD_STATUS_VERSION_MISMATCH;
			goto fail;
		}

		/* Verify client permission */
		cred = alloca(ucred_size());
		if (door_ucred(&cred) != 0) {
			status = ZSD_STATUS_INTERNAL_ERROR;
			goto fail;
		}

		effective = ucred_getprivset(cred, PRIV_EFFECTIVE);
		if (effective == NULL) {
			status = ZSD_STATUS_INTERNAL_ERROR;
			goto fail;
		}
		if (!priv_ismember(effective, PRIV_PROC_INFO)) {
			status = ZSD_STATUS_PERMISSION;
			goto fail;
		}

		/* Return stat server door */
		req[1] = ZSD_STATUS_OK;
		stat_door.d_attributes = DOOR_DESCRIPTOR;
		stat_door.d_data.d_desc.d_descriptor = g_stat_door;
		(void) door_return(argp, sizeof (op) * 2, &stat_door, 1);
		thr_exit(NULL);
	}

	/* Respond to zoneadmd informing zonestatd of a new zone */
	if (op == ZSD_CMD_NEW_ZONE) {
		zsd_fattach_zone(req[1], g_server_door, B_FALSE);
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}

	/* Anything else is an unknown command */
	status = ZSD_STATUS_INTERNAL_ERROR;
fail:
	req[1] = status;
	(void) door_return(argp, sizeof (op) * 2, NULL, 0);
	thr_exit(NULL);
}

/*
 * Respond to libzonestat.so clients with the current utilization data.
*/ /* ARGSUSED */ static void zsd_stat_server(void *cookie, char *argp, size_t arg_size, door_desc_t *dp, uint_t n_desc) { uint64_t *args, cmd; zs_usage_cache_t *cache; int ret; char *rvalp; size_t rvals; zs_usage_t *usage; ucred_t *ucred; zoneid_t zoneid; const priv_set_t *eset; boolean_t is_gz = B_FALSE; /* Tell stat thread there are no more clients */ if (argp == DOOR_UNREF_DATA) { (void) mutex_lock(&g_usage_cache_lock); g_hasclient = B_FALSE; (void) cond_signal(&g_usage_cache_kick); (void) mutex_unlock(&g_usage_cache_lock); (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } if (arg_size != sizeof (cmd) * 2) { (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } /* LINTED */ args = (uint64_t *)argp; cmd = args[0]; if (cmd != ZSD_CMD_READ) { (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } ucred = alloca(ucred_size()); if (door_ucred(&ucred) != 0) { (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } zoneid = ucred_getzoneid(ucred); if (zoneid == GLOBAL_ZONEID) is_gz = B_TRUE; eset = ucred_getprivset(ucred, PRIV_EFFECTIVE); if (eset == NULL) { (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } if (!priv_ismember(eset, PRIV_PROC_INFO)) { (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } (void) mutex_lock(&g_usage_cache_lock); g_hasclient = B_TRUE; /* * Force a new cpu calculation for client. This will force a * new memory calculation if the memory data is older than the * sample period. 
*/ g_usage_cache_kickers++; (void) cond_signal(&g_usage_cache_kick); ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock); g_usage_cache_kickers--; if (ret != 0 && errno == EINTR) { (void) mutex_unlock(&g_usage_cache_lock); zsd_warn(gettext( "Interrupted before writing usage size to client\n")); (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } cache = zsd_usage_cache_hold_locked(); if (cache == NULL) { zsd_warn(gettext("Usage cache empty.\n")); (void) door_return(NULL, 0, NULL, 0); thr_exit(NULL); } (void) mutex_unlock(&g_usage_cache_lock); /* Copy current usage data to stack to send to client */ usage = (zs_usage_t *)alloca(cache->zsuc_size); /* Filter out results if caller is non-global zone */ zsd_usage_filter(zoneid, cache, usage, is_gz); rvalp = (void *)usage; rvals = usage->zsu_size; zsd_usage_cache_rele(cache); (void) door_return(rvalp, rvals, NULL, 0); thr_exit(NULL); } static volatile boolean_t g_quit; /* ARGSUSED */ static void zonestat_quithandler(int sig) { g_quit = B_TRUE; } /* * The stat thread generates new utilization data when clients request * it. It also manages opening and closing the subsystems used to gather * data depending on if clients exist. */ /* ARGSUSED */ void * stat_thread(void *arg) { time_t start; time_t now; time_t next_memory; boolean_t do_memory; boolean_t do_read; boolean_t do_close; start = time(NULL); if (start < 0) { if (g_quit == B_TRUE) goto quit; zsd_warn(gettext("Unable to fetch current time")); g_quit = B_TRUE; goto quit; } next_memory = start; while (g_quit == B_FALSE) { for (;;) { /* * These are used to decide if the most recent memory * calculation was within a sample interval, * and weather or not the usage collection needs to * be opened or closed. 
*/ do_memory = B_FALSE; do_read = B_FALSE; do_close = B_FALSE; /* * If all clients have gone, close usage collecting */ (void) mutex_lock(&g_usage_cache_lock); if (!g_hasclient && g_open == B_TRUE) { do_close = B_TRUE; (void) mutex_unlock(&g_usage_cache_lock); break; } if (g_quit == B_TRUE) { (void) mutex_unlock( &g_usage_cache_lock); break; } /* * Wait for a usage data request */ if (g_usage_cache_kickers == 0) { (void) cond_wait(&g_usage_cache_kick, &g_usage_cache_lock); } now = time(NULL); if (now < 0) { if (g_quit == B_TRUE) { (void) mutex_unlock( &g_usage_cache_lock); goto quit; } g_quit = B_TRUE; (void) mutex_unlock(&g_usage_cache_lock); zsd_warn(gettext( "Unable to fetch current time")); goto quit; } if (g_hasclient) { do_read = B_TRUE; if (now >= next_memory) { do_memory = B_TRUE; next_memory = now + g_interval; } } else { do_close = B_TRUE; } (void) mutex_unlock(&g_usage_cache_lock); if (do_read || do_close) break; } g_now = now; g_hrnow = gethrtime(); if (g_hasclient && g_open == B_FALSE) { g_start = g_now; g_hrstart = g_hrnow; g_ctl = zsd_open(g_ctl); if (g_ctl == NULL) zsd_warn(gettext( "Unable to open zone statistics")); else g_open = B_TRUE; } if (do_read && g_ctl) { if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) { zsd_warn(gettext( "Unable to read zone statistics")); g_quit = B_TRUE; return (NULL); } } (void) mutex_lock(&g_usage_cache_lock); if (!g_hasclient && g_open == B_TRUE && g_ctl) { (void) mutex_unlock(&g_usage_cache_lock); zsd_close(g_ctl); g_open = B_FALSE; } else { (void) mutex_unlock(&g_usage_cache_lock); } } quit: if (g_open) zsd_close(g_ctl); (void) thr_kill(g_main, SIGINT); thr_exit(NULL); return (NULL); } void zsd_set_fx() { pcinfo_t pcinfo; pcparms_t pcparms; (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname)); if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) { zsd_warn(gettext("cannot get FX class parameters")); return; } pcparms.pc_cid = pcinfo.pc_cid; ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60; ((fxparms_t 
*)pcparms.pc_clparms)->fx_uprilim = 60; ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0; ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE; if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1) zsd_warn(gettext("cannot enter the FX class")); } static int pipe_fd; static void daemonize_ready(char status) { /* * wake the parent with a clue */ (void) write(pipe_fd, &status, 1); (void) close(pipe_fd); } static int daemonize_start(void) { char data; int status; int filedes[2]; pid_t pid; (void) close(0); (void) dup2(2, 1); if (pipe(filedes) < 0) return (-1); (void) fflush(NULL); if ((pid = fork1()) < 0) return (-1); if (pid != 0) { /* * parent */ struct sigaction act; act.sa_sigaction = SIG_DFL; (void) sigemptyset(&act.sa_mask); act.sa_flags = 0; (void) sigaction(SIGPIPE, &act, NULL); /* ignore SIGPIPE */ (void) close(filedes[1]); if (read(filedes[0], &data, 1) == 1) { /* forward ready code via exit status */ exit(data); } status = -1; (void) wait4(pid, &status, 0, NULL); /* daemon process exited before becoming ready */ if (WIFEXITED(status)) { /* assume daemon process printed useful message */ exit(WEXITSTATUS(status)); } else { zsd_warn(gettext("daemon process killed or died")); exit(1); } } /* * child */ pipe_fd = filedes[1]; (void) close(filedes[0]); /* * generic Unix setup */ (void) setsid(); (void) umask(0000); return (0); } static void fattach_all_zones(boolean_t detach_only) { zoneid_t *zids; uint_t nzids, nzids_last; int i; again: (void) zone_list(NULL, &nzids); nzids_last = nzids; zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last); if (zids == NULL) zsd_error(gettext("Out of memory")); (void) zone_list(zids, &nzids); if (nzids > nzids_last) { free(zids); goto again; } for (i = 0; i < nzids; i++) zsd_fattach_zone(zids[i], g_server_door, detach_only); free(zids); } int main(int argc, char *argv[]) { int arg; thread_t tid; scf_simple_prop_t *prop; uint64_t *intervalp; boolean_t opt_cleanup = B_FALSE; g_main = thr_self(); g_quit = 
B_FALSE; (void) signal(SIGINT, zonestat_quithandler); (void) signal(SIGTERM, zonestat_quithandler); (void) signal(SIGHUP, zonestat_quithandler); /* (void) sigignore(SIGCHLD); */ (void) sigignore(SIGPIPE); if (getzoneid() != GLOBAL_ZONEID) zsd_error(gettext("Must be run from global zone only")); while ((arg = getopt(argc, argv, "c")) != EOF) { switch (arg) { case 'c': opt_cleanup = B_TRUE; break; default: zsd_error(gettext("Invalid option")); } } if (opt_cleanup) { if (zsd_disable_cpu_stats() != 0) exit(1); else exit(0); } /* Get the configured sample interval */ prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default", "config", "sample_interval"); if (prop == NULL) zsd_error(gettext("Unable to fetch SMF property " "\"config/sample_interval\"")); if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT) zsd_error(gettext("Malformed SMF property " "\"config/sample_interval\". Must be of type \"count\"")); intervalp = scf_simple_prop_next_count(prop); g_interval = *intervalp; if (g_interval == 0) zsd_error(gettext("Malformed SMF property " "\"config/sample_interval\". 
Must be greater than zero")); scf_simple_prop_free(prop); if (daemonize_start() < 0) zsd_error(gettext("Unable to start daemon\n")); /* Run at high priority */ zsd_set_fx(); (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL); (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL); (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL); g_server_door = door_create(zsd_server, NULL, DOOR_REFUSE_DESC | DOOR_NO_CANCEL); if (g_server_door < 0) zsd_error(gettext("Unable to create server door\n")); g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI | DOOR_REFUSE_DESC | DOOR_NO_CANCEL); if (g_stat_door < 0) zsd_error(gettext("Unable to create statistics door\n")); fattach_all_zones(B_FALSE); if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0) zsd_error(gettext("Unable to create statistics thread\n")); daemonize_ready(0); /* Wait for signal to quit */ while (g_quit == B_FALSE) (void) pause(); /* detach doors */ fattach_all_zones(B_TRUE); (void) door_revoke(g_server_door); (void) door_revoke(g_stat_door); /* kick stat thread and wait for it to close the statistics */ (void) mutex_lock(&g_usage_cache_lock); g_quit = B_TRUE; (void) cond_signal(&g_usage_cache_kick); (void) mutex_unlock(&g_usage_cache_lock); end: (void) thr_join(tid, NULL, NULL); return (0); }