/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <alloca.h>
#include <assert.h>
#include <dirent.h>
#include <dlfcn.h>
#include <door.h>
#include <errno.h>
#include <exacct.h>
#include <ctype.h>
#include <fcntl.h>
#include <kstat.h>
#include <libcontract.h>
#include <libintl.h>
#include <libscf.h>
#include <zonestat.h>
#include <zonestat_impl.h>
#include <limits.h>
#include <pool.h>
#include <procfs.h>
#include <rctl.h>
#include <thread.h>
#include <signal.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <synch.h>
#include <sys/acctctl.h>
#include <sys/contract/process.h>
#include <sys/ctfs.h>
#include <sys/fork.h>
#include <sys/param.h>
#include <sys/priocntl.h>
#include <sys/fxpriocntl.h>
#include <sys/processor.h>
#include <sys/pset.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/swap.h>
#include <sys/systeminfo.h>
#include <thread.h>
#include <sys/list.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <sys/wait.h>
#include <sys/zone.h>
#include <time.h>
#include <ucred.h>
#include <unistd.h>
#include <vm/anon.h>
#include <zone.h>
#include <zonestat.h>

#define	MAX_PSET_NAME	1024	/* Taken from PV_NAME_MAX_LEN */
#define	ZSD_PSET_UNLIMITED	UINT16_MAX
#define	ZONESTAT_EXACCT_FILE	"/var/adm/exacct/zonestat-process"

/*
 * zonestatd implements gathering cpu and memory utilization data for
 * running zones.  It has these components:
 *
 * zsd_server:
 *	Door server to respond to client connections.  Each client
 *	will connect using libzonestat.so, which will open and
 *	call /var/tmp/.zonestat_door.  Each connecting client is given
 *	a file descriptor to the stat server.
 *
 *	The zsd_server also responds to zoneadmd, which reports when a
 *	new zone is booted.  This is used to fattach the zsd_server door
 *	into the new zone.
 *
 * zsd_stat_server:
 *	Receives client requests for the current utilization data.  Each
 *	client request will cause zonestatd to update the current utilization
 *	data by kicking the stat_thread.
 *
 *	If the client is in a non-global zone, the utilization data will
 *	be filtered to only show the given zone.  The usage by all other zones
 *	will be added to the system utilization.
 *
 * stat_thread:
 *	The stat thread implements querying the system to determine the
 *	current utilization data for each running zone.  This includes
 *	inspecting the system's processor set configuration, as well as details
 *	of each zone, such as their configured limits, and which processor
 *	sets they are running in.
 *
 *	The stat_thread will only update memory utilization data as often as
 *	the configured config/sample_interval on the zones-monitoring service.
 */

/*
 * The private vmusage structure unfortunately uses size_t types, and assumes
 * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
 * system call is contracted, and zonestatd is 32 bit, the following structures
 * are used to interact with a 32bit or 64 bit kernel.
 */
/*
 * Fixed-width copy of the private vmusage result layout in which every
 * usage field is 32 bits wide, for exchanging results with a 32 bit kernel.
 */
typedef struct zsd_vmusage32 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;

	/* size_t in the private structure; 32 bits wide on a 32 bit kernel */
	uint32_t vmu_rss_all;
	uint32_t vmu_rss_private;
	uint32_t vmu_rss_shared;
	uint32_t vmu_swap_all;
	uint32_t vmu_swap_private;
	uint32_t vmu_swap_shared;
} zsd_vmusage32_t;

/*
 * Fixed-width copy of the private vmusage result layout in which every
 * usage field is 64 bits wide, for exchanging results with a 64 bit kernel.
 */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.  This explicit pad member
	 * keeps the layout identical in both cases.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;

/* Forward declaration; zsd_zone is defined after the structures using it */
struct zsd_zone;

/*
 * Used to store a zone's usage of a pset.  One of these exists for each
 * (zone, pset) pair being tracked, and is kept on the pset's zsp_usage_list.
 */
typedef struct zsd_pset_usage {
	struct zsd_zone	*zsu_zone;	/* zone this usage is charged to */
	struct zsd_pset	*zsu_pset;	/* pset this usage belongs to */

	list_node_t	zsu_next;	/* list linkage */

	zoneid_t	zsu_zoneid;
	boolean_t	zsu_found;	/* zone bound at end of interval */
	boolean_t	zsu_active;	/* zone was bound during interval */
	boolean_t	zsu_new;	/* zone newly bound in this interval */
	boolean_t	zsu_deleted;	/* zone was unbound in this interval */
	boolean_t	zsu_empty;	/* no procs in pset in this interval */
	time_t		zsu_start;	/* time when zone was found in pset */
	hrtime_t	zsu_hrstart;	/* time when zone was found in pset */
	uint64_t	zsu_cpu_shares;	/* zone's shares, if running in FSS */
	uint_t		zsu_scheds;	/* schedulers found in this pass */
	timestruc_t	zsu_cpu_usage;	/* cpu time used */
} zsd_pset_usage_t;

/* Used to store a pset's utilization */
typedef struct zsd_pset {
	psetid_t	zsp_id;
	list_node_t	zsp_next;
	char		zsp_name[ZS_PSETNAME_MAX];

	uint_t		zsp_cputype;	/* default, dedicated or shared */
	boolean_t	zsp_found;	/* pset found at end of interval */
	boolean_t	zsp_new;	/* pset new in this interval */
	boolean_t	zsp_deleted;	/* pset deleted in this interval */
	boolean_t	zsp_active;	/* pset existed during interval */
	boolean_t	zsp_empty;	/* no processes in pset */
	time_t		zsp_start;
	hrtime_t	zsp_hrstart;

	uint64_t	zsp_online;	/* online cpus in interval */
	uint64_t	zsp_size;	/* size in this interval */
	uint64_t	zsp_min;	/* configured min in this interval */
	uint64_t	zsp_max;	/* configured max in this interval */
	int64_t		zsp_importance;	/* configured importance in interval */

	uint_t		zsp_scheds;	/* scheds of processes found in pset */
	uint64_t	zsp_cpu_shares;	/* total shares in this interval */

	timestruc_t	zsp_total_time;
	timestruc_t	zsp_usage_kern;
	timestruc_t	zsp_usage_zones; /* cpu time charged to zones */

	/* Individual zone usages of pset */
	list_t		zsp_usage_list;
	int		zsp_nusage;	/* number of usages on the list */

	/* Summed kstat values from individual cpus in pset */
	timestruc_t	zsp_idle;
	timestruc_t	zsp_intr;
	timestruc_t	zsp_kern;
	timestruc_t	zsp_user;

} zsd_pset_t;

/* Used to track an individual cpu's utilization as reported by kstats */
typedef struct zsd_cpu {
	processorid_t	zsc_id;
	list_node_t	zsc_next;
	psetid_t	zsc_psetid;
	psetid_t	zsc_psetid_prev; /* zsc_psetid from previous interval */
	zsd_pset_t	*zsc_pset;

	boolean_t	zsc_found;	/* cpu online in this interval */
	boolean_t	zsc_onlined;	/* cpu onlined during this interval */
	boolean_t	zsc_offlined;	/* cpu offlined during this interval */
	boolean_t	zsc_active;	/* cpu online during this interval */
	boolean_t	zsc_allocated;	/* True if cpu has ever been found */

	/* kstats this interval */
	uint64_t	zsc_nsec_idle;
	uint64_t	zsc_nsec_intr;
	uint64_t	zsc_nsec_kern;
	uint64_t	zsc_nsec_user;

	/*
	 * kstats from the previous interval.  These are copied from the
	 * values above when the interval is rolled.
	 */
	uint64_t	zsc_nsec_idle_prev;
	uint64_t	zsc_nsec_intr_prev;
	uint64_t	zsc_nsec_kern_prev;
	uint64_t	zsc_nsec_user_prev;

	/* Total kstat increases since zonestatd started reading kstats */
	timestruc_t	zsc_idle;
	timestruc_t	zsc_intr;
	timestruc_t	zsc_kern;
	timestruc_t	zsc_user;

} zsd_cpu_t;

/* Used to describe an individual zone and its utilization */
typedef struct zsd_zone {
	zoneid_t	zsz_id;
	list_node_t	zsz_next;
	char		zsz_name[ZS_ZONENAME_MAX];
	uint_t		zsz_cputype;
	uint_t		zsz_iptype;
	time_t		zsz_start;
	hrtime_t	zsz_hrstart;

	char		zsz_pool[ZS_POOLNAME_MAX];
	char		zsz_pset[ZS_PSETNAME_MAX];
	int		zsz_default_sched;
	/* These are deduced by inspecting processes */
	psetid_t	zsz_psetid;
	uint_t		zsz_scheds;

	boolean_t	zsz_new;	/* zone booted during this interval */
	boolean_t	zsz_deleted;	/* halted during this interval */
	boolean_t	zsz_active;	/* running in this interval */
	boolean_t	zsz_empty;	/* no processes in this interval */
	boolean_t	zsz_gone;	/* not installed in this interval */
	boolean_t	zsz_found;	/* Running at end of this interval */

	/* Configured limits; ZS_LIMIT_NONE until the zone is first found */
	uint64_t	zsz_cpu_shares;
	uint64_t	zsz_cpu_cap;
	uint64_t	zsz_ram_cap;
	uint64_t	zsz_locked_cap;
	uint64_t	zsz_vm_cap;

	uint64_t	zsz_cpus_online;
	timestruc_t	zsz_cpu_usage;	/* cpu time used by zone's processes */
	timestruc_t	zsz_cap_time;	/* cpu time of cpu cap */
	timestruc_t	zsz_share_time; /* cpu time of share of cpu */
	timestruc_t	zsz_pset_time;  /* time of all psets zone is bound to */

	uint64_t	zsz_usage_ram;
	uint64_t	zsz_usage_locked;
	uint64_t	zsz_usage_vm;

	/* Configured resource control limits */
	uint64_t	zsz_processes_cap;
	uint64_t	zsz_lwps_cap;
	uint64_t	zsz_shm_cap;
	uint64_t	zsz_shmids_cap;
	uint64_t	zsz_semids_cap;
	uint64_t	zsz_msgids_cap;
	uint64_t	zsz_lofi_cap;

	/* Current usage of the resources limited above */
	uint64_t	zsz_processes;
	uint64_t	zsz_lwps;
	uint64_t	zsz_shm;
	uint64_t	zsz_shmids;
	uint64_t	zsz_semids;
	uint64_t	zsz_msgids;
	uint64_t	zsz_lofi;

} zsd_zone_t;

/*
 * Used to track the cpu usage of an individual process.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 * to their zone.  As processes exit, their extended accounting records are
 * read and the difference of their total and known usage is charged to their
 * zone.
 *
 * If a process is never seen in /proc, the total usage on its extended
 * accounting record will be charged to its zone.
 */
typedef struct zsd_proc {
	list_node_t	zspr_next;
	pid_t		zspr_ppid;
	psetid_t	zspr_psetid;
	zoneid_t	zspr_zoneid;
	int		zspr_sched;
	timestruc_t	zspr_usage;	/* cpu usage already known/charged */
} zsd_proc_t;

/* Used to track the overall resource usage of the system */
typedef struct zsd_system {

	/* Physical memory */
	uint64_t zss_ram_total;
	uint64_t zss_ram_kern;
	uint64_t zss_ram_zones;

	/* Locked memory */
	uint64_t zss_locked_kern;
	uint64_t zss_locked_zones;

	/* Virtual memory */
	uint64_t zss_vm_total;
	uint64_t zss_vm_kern;
	uint64_t zss_vm_zones;

	/* Swap devices */
	uint64_t zss_swap_total;
	uint64_t zss_swap_used;

	/* Cpu time by state */
	timestruc_t zss_idle;
	timestruc_t zss_intr;
	timestruc_t zss_kern;
	timestruc_t zss_user;

	timestruc_t zss_cpu_total_time;
	timestruc_t zss_cpu_usage_kern;
	timestruc_t zss_cpu_usage_zones; /* cpu time charged to zones */

	/* System-wide maximums */
	uint64_t zss_maxpid;
	uint64_t zss_processes_max;
	uint64_t zss_lwps_max;
	uint64_t zss_shm_max;
	uint64_t zss_shmids_max;
	uint64_t zss_semids_max;
	uint64_t zss_msgids_max;
	uint64_t zss_lofi_max;

	/* Current usage; each found zone's usage is added into these */
	uint64_t zss_processes;
	uint64_t zss_lwps;
	uint64_t zss_shm;
	uint64_t zss_shmids;
	uint64_t zss_semids;
	uint64_t zss_msgids;
	uint64_t zss_lofi;

	uint64_t zss_ncpus;
	uint64_t zss_ncpus_online;

} zsd_system_t;

/*
 * A dumping ground for various information and structures used to compute
 * utilization.
 *
 * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
 * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
 */
typedef struct zsd_ctl {
	kstat_ctl_t	*zsctl_kstat_ctl;

	/* To track extended accounting */
	int		zsctl_proc_fd;		/* Log currently being used */
	ea_file_t	zsctl_proc_eaf;
	struct stat64	zsctl_proc_stat;
	int		zsctl_proc_open;
	int		zsctl_proc_fd_next;	/* Log file to use next */
	ea_file_t	zsctl_proc_eaf_next;
	struct stat64	zsctl_proc_stat_next;
	int		zsctl_proc_open_next;

	/* pool configuration handle */
	pool_conf_t	*zsctl_pool_conf;
	int		zsctl_pool_status;
	int		zsctl_pool_changed;

	/* The above usage tracking structures */
	zsd_system_t	*zsctl_system;
	list_t		zsctl_zones;
	list_t		zsctl_psets;
	list_t		zsctl_cpus;
	zsd_cpu_t	*zsctl_cpu_array;	/* indexed by cpu id */
	zsd_proc_t	*zsctl_proc_array;

	/* Various system info */
	uint64_t	zsctl_maxcpuid;
	uint64_t	zsctl_maxproc;
	uint64_t	zsctl_kern_bits;
	uint64_t	zsctl_pagesize;

	/* Used to track time available under a cpu cap. */
	uint64_t	zsctl_hrtime;
	uint64_t	zsctl_hrtime_prev;
	timestruc_t	zsctl_hrtime_total;

	struct timeval	zsctl_timeofday;

	/* Caches for arrays allocated for use by various system calls */
	psetid_t	*zsctl_pset_cache;
	uint_t		zsctl_pset_ncache;
	processorid_t	*zsctl_cpu_cache;
	uint_t		zsctl_cpu_ncache;
	zoneid_t	*zsctl_zone_cache;
	uint_t		zsctl_zone_ncache;
	struct swaptable *zsctl_swap_cache;
	uint64_t	zsctl_swap_cache_size;
	uint64_t	zsctl_swap_cache_num;
	zsd_vmusage64_t	*zsctl_vmusage_cache;
	uint64_t	zsctl_vmusage_cache_num;

	/*
	 * Pool property values.
	 * NOTE(review): the original comment here described procfs, which
	 * does not match this member; presumably these are scratch values
	 * used when querying the pool configuration -- confirm.
	 */
	pool_value_t	*zsctl_pool_vals[3];

	/* Counts on tracked entities */
	uint_t		zsctl_nzones;
	uint_t		zsctl_npsets;
	uint_t		zsctl_npset_usages;
} zsd_ctl_t;

/* The tracking state described by zsd_ctl_t above */
zsd_ctl_t		*g_ctl;
boolean_t		g_open;		/* True if g_ctl is open */
int			g_hasclient;	/* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server.  Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
 */
zs_usage_cache_t	*g_usage_cache;
mutex_t			g_usage_cache_lock;
cond_t			g_usage_cache_kick;
uint_t			g_usage_cache_kickers;
cond_t			g_usage_cache_wait;
char			*g_usage_cache_buf;
uint_t			g_usage_cache_bufsz;
uint64_t		g_gen_next;

/* fds of door servers */
int			g_server_door;
int			g_stat_door;

/*
 * Starting and current time.  Used to throttle memory calculation, and to
 * mark new zones and psets with their boot and creation time.
 */
time_t			g_now;
time_t			g_start;
hrtime_t		g_hrnow;
hrtime_t		g_hrstart;
uint64_t		g_interval;

/*
 * main() thread.
 */
thread_t		g_main;

/*
 * Write a warning to stderr.  The message is prefixed with the (localized)
 * "zonestat: Warning: " tag and terminated with a newline.  fmt and the
 * arguments following it are interpreted as for printf(3C).
 */
/* PRINTFLIKE1 */
static void
zsd_warn(const char *fmt, ...)
{
	va_list ap;

	(void) fprintf(stderr, gettext("zonestat: Warning: "));

	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);

	(void) fputc('\n', stderr);
}

/*
 * Write an error to stderr and terminate the process with exit status 1.
 * The message is prefixed with the (localized) "zonestat: Error: " tag and
 * terminated with a newline.  fmt and the arguments following it are
 * interpreted as for printf(3C).  This function does not return.
 */
/* PRINTFLIKE1 */
static void
zsd_error(const char *fmt, ...)
{
	va_list ap;

	(void) fprintf(stderr, gettext("zonestat: Error: "));

	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);

	(void) fputc('\n', stderr);
	exit(1);
}

/*
 * Turns on extended process accounting.
 *
 * The pid, ancestor pid, cpu, time and zonename resources are enabled.  If
 * no accounting file can be fetched (none is configured externally), any
 * stale ZONESTAT_EXACCT_FILE is removed and a new one is started.  Finally
 * the accounting state is switched on.
 *
 * Returns 0 on success, or -1 (after printing a warning) on failure.
 */
int
zsd_enable_cpu_stats()
{
	const int ids[6] = { AC_PROC_PID, AC_PROC_ANCPID, AC_PROC_CPU,
	    AC_PROC_TIME, AC_PROC_ZONENAME, AC_NONE };
	char *acct_path = ZONESTAT_EXACCT_FILE;
	char current[MAXPATHLEN];
	ac_res_t res[6];
	int on = AC_ON;
	int i;

	/* Build the resource list; the AC_NONE entry comes last */
	for (i = 0; i < 6; i++) {
		res[i].ar_id = ids[i];
		res[i].ar_state = AC_ON;
	}

	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
		zsd_warn(gettext("Unable to set accounting resources"));
		return (-1);
	}

	/* Only set accounting file if none is configured */
	if (acctctl(AC_PROC | AC_FILE_GET, current, sizeof (current)) < 0) {
		(void) unlink(acct_path);
		if (acctctl(AC_PROC | AC_FILE_SET, acct_path,
		    strlen(acct_path) + 1) == -1) {
			zsd_warn(gettext("Unable to set accounting file"));
			return (-1);
		}
	}

	if (acctctl(AC_PROC | AC_STATE_SET, &on, sizeof (on)) == -1) {
		zsd_warn(gettext("Unable to enable accounting"));
		return (-1);
	}

	return (0);
}

/*
 * Turns off extended process accounting if not configured externally.
 *
 * If an accounting file is configured and it is not ZONESTAT_EXACCT_FILE,
 * accounting is assumed to be managed by someone else and nothing is
 * changed.  Otherwise the accounting resources are cleared, the accounting
 * file is unset, accounting is switched off, and ZONESTAT_EXACCT_FILE is
 * removed.
 *
 * Returns 0 on success (including the "leave it alone" case), or -1 (after
 * printing a warning) on failure.
 */
int
zsd_disable_cpu_stats()
{
	char *path = ZONESTAT_EXACCT_FILE;
	int ret, state = AC_OFF;
	ac_res_t res[6];
	char oldfile[MAXPATHLEN];

	/* If accounting file is externally configured, leave it alone */
	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
	if (ret == 0 && strcmp(oldfile, path) != 0)
		return (0);

	res[0].ar_id = AC_PROC_PID;
	res[0].ar_state = AC_OFF;
	res[1].ar_id = AC_PROC_ANCPID;
	res[1].ar_state = AC_OFF;
	res[2].ar_id = AC_PROC_CPU;
	res[2].ar_state = AC_OFF;
	res[3].ar_id = AC_PROC_TIME;
	res[3].ar_state = AC_OFF;
	res[4].ar_id = AC_PROC_ZONENAME;
	res[4].ar_state = AC_OFF;
	res[5].ar_id = AC_NONE;
	res[5].ar_state = AC_OFF;
	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
		zsd_warn(gettext("Unable to clear accounting resources"));
		return (-1);
	}
	if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
		zsd_warn(gettext("Unable to clear accounting file"));
		return (-1);
	}
	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
		/* Message previously read "diable" */
		zsd_warn(gettext("Unable to disable accounting"));
		return (-1);
	}

	/* Accounting is off; the log is no longer needed */
	(void) unlink(path);
	return (0);
}

/*
 * Rolls the extended accounting log: unless accounting is configured
 * externally, the current ZONESTAT_EXACCT_FILE is unlinked and a fresh one
 * is started in its place.
 *
 * Since the stat_thread holds an open handle to the accounting file, it will
 * read all remaining entries from the old file before switching to
 * read the new one.
 *
 * Returns 0 if the log was rolled or left alone, -1 (after printing a
 * warning) if the new accounting file could not be set.
 */
int
zsd_roll_exacct(void)
{
	char *acct_path = ZONESTAT_EXACCT_FILE;
	char current[MAXPATHLEN];

	/* An accounting file other than ours is someone else's; skip it */
	if (acctctl(AC_PROC | AC_FILE_GET, current, sizeof (current)) == 0 &&
	    strcmp(current, acct_path) != 0)
		return (0);

	/* If the old log cannot be removed, try again on the next roll */
	if (unlink(acct_path) != 0)
		return (0);

	if (acctctl(AC_PROC | AC_FILE_SET, acct_path,
	    strlen(acct_path) + 1) != -1)
		return (0);

	zsd_warn(gettext("Unable to set accounting file"));
	return (-1);
}

/*
 * Contract stuff for zone_enter().
 *
 * Opens the process contract template, configures it and activates it.
 * Nothing is done with the resulting contract, so the template asks for no
 * critical or informative events, makes only hardware errors fatal, and
 * sets the CT_PR_PGRPONLY and CT_PR_REGENT parameters.
 *
 * Returns the open template fd on success, or -1 on failure.
 */
int
init_template(void)
{
	int tmpl;
	int failed;

	if ((tmpl = open64(CTFS_ROOT "/process/template", O_RDWR)) == -1)
		return (-1);

	/* Every setter is attempted; any non-zero result marks failure */
	failed = ct_tmpl_set_critical(tmpl, 0);
	failed |= ct_tmpl_set_informative(tmpl, 0);
	failed |= ct_pr_tmpl_set_fatal(tmpl, CT_PR_EV_HWERR);
	failed |= ct_pr_tmpl_set_param(tmpl, CT_PR_PGRPONLY | CT_PR_REGENT);

	if (failed == 0 && ct_tmpl_activate(tmpl) == 0)
		return (tmpl);

	(void) close(tmpl);
	return (-1);
}

/*
 * Contract stuff for zone_enter().
 *
 * Stores the id read from the "latest" process contract status file in *id.
 *
 * Returns 0 on success.  On failure returns a non-zero error: errno if the
 * status file cannot be opened, otherwise the value returned by
 * ct_status_read().  *id is left untouched on failure.
 */
int
contract_latest(ctid_t *id)
{
	ct_stathdl_t hdl;
	int latest_fd;
	int rc;

	latest_fd = open64(CTFS_ROOT "/process/latest", O_RDONLY);
	if (latest_fd == -1)
		return (errno);

	rc = ct_status_read(latest_fd, CTD_COMMON, &hdl);
	if (rc == 0) {
		*id = ct_status_get_id(hdl);
		ct_status_free(hdl);
	}

	(void) close(latest_fd);
	return (rc);
}

/*
 * Set the FD_CLOEXEC flag on fd, preserving its other descriptor flags.
 * Returns 0 on success, -1 if either fcntl() call fails.
 */
static int
close_on_exec(int fd)
{
	int cur;

	cur = fcntl(fd, F_GETFD, 0);
	if (cur == -1)
		return (-1);

	if (fcntl(fd, F_SETFD, cur | FD_CLOEXEC) == -1)
		return (-1);

	return (0);
}

/*
 * Open a file belonging to a contract.
 *
 * The path opened is CTFS_ROOT/<type>/<ctid>/<file>; a NULL type selects
 * "all".  The returned descriptor has FD_CLOEXEC set.
 *
 * Returns the open fd, or -1 with errno set.  errno is ENAMETOOLONG if the
 * path cannot be formatted into PATH_MAX bytes.
 */
int
contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
{
	char path[PATH_MAX];
	int n, fd;

	if (type == NULL)
		type = "all";

	/*
	 * The underlying integer type of ctid_t is not visible here, so cast
	 * explicitly to keep the argument in agreement with "%ld".
	 */
	n = snprintf(path, sizeof (path), CTFS_ROOT "/%s/%ld/%s", type,
	    (long)ctid, file);
	if (n < 0 || (size_t)n >= sizeof (path)) {
		errno = ENAMETOOLONG;
		return (-1);
	}

	fd = open64(path, oflag);
	if (fd != -1) {
		if (close_on_exec(fd) == -1) {
			/* close() may clobber errno; report the fcntl error */
			int err = errno;
			(void) close(fd);
			errno = err;
			return (-1);
		}
	}
	return (fd);
}

/*
 * Abandon the contract with the given id by way of its "ctl" file.
 * Returns errno if the ctl file cannot be opened, otherwise whatever
 * ct_ctl_abandon() returns.
 */
int
contract_abandon_id(ctid_t ctid)
{
	int ctl_fd, rc;

	if ((ctl_fd = contract_open(ctid, "all", "ctl", O_WRONLY)) == -1)
		return (errno);

	rc = ct_ctl_abandon(ctl_fd);
	(void) close(ctl_fd);

	return (rc);
}
/*
 * Attach the zsd_server to a zone.  Called for each zone when zonestatd
 * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
 *
 * Zone_enter is used to avoid reaching into zone to fattach door.
 *
 * zid:		zone to attach to; zone_enter() is skipped for zone id 0.
 * door:	door descriptor to fattach at ZS_DOOR_PATH.
 * detach_only:	if true, only remove any existing attachment.
 *
 * The work is done in a forked child so that only the child enters the zone.
 * The child reports its result through its exit status:
 *	0 - success (or zone_enter() failed with EINVAL)
 *	1 - unable to enter the zone
 *	2 - unable to create the door file
 *	3 - unable to fattach the door
 * Failures are reported with zsd_warn(); nothing is returned to the caller.
 */
static void
zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
{
	char *path = ZS_DOOR_PATH;
	int fd, pid, stat, tmpl_fd;
	boolean_t have_ct;
	ctid_t ct;

	if ((tmpl_fd = init_template()) == -1) {
		zsd_warn("Unable to init template");
		return;
	}

	pid = forkx(0);
	if (pid < 0) {
		(void) ct_tmpl_clear(tmpl_fd);
		/* Do not leak the template descriptor on this path */
		(void) close(tmpl_fd);
		zsd_warn(gettext(
		    "Unable to fork to add zonestat to zoneid %d\n"),
		    (int)zid);
		return;
	}

	if (pid == 0) {
		/* Child: enter the zone and (re)attach the door there */
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);
		if (zid != 0 && zone_enter(zid) != 0) {
			if (errno == EINVAL) {
				_exit(0);
			}
			_exit(1);
		}
		(void) fdetach(path);
		(void) unlink(path);
		if (detach_only)
			_exit(0);
		fd = open(path, O_CREAT|O_RDWR, 0644);
		if (fd < 0)
			_exit(2);
		if (fattach(door, path) != 0)
			_exit(3);
		_exit(0);
	}

	/*
	 * Parent.  contract_latest() returns 0 on success and a non-zero
	 * error otherwise, leaving ct unset on failure, so only abandon the
	 * contract when its id was actually obtained.
	 */
	have_ct = (contract_latest(&ct) == 0) ? B_TRUE : B_FALSE;
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);
	if (have_ct)
		(void) contract_abandon_id(ct);

	/* Retry only when interrupted; any other failure cannot succeed */
	for (;;) {
		int w = waitpid(pid, &stat, 0);

		if (w == pid)
			break;
		if (w == -1 && errno != EINTR) {
			zsd_warn(gettext(
			    "Unable to attach door to zoneid: %d"), (int)zid);
			return;
		}
	}
	if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
		return;

	zsd_warn(gettext("Unable to attach door to zoneid: %d"), (int)zid);

	/* The exit status is only meaningful if the child actually exited */
	if (WIFEXITED(stat)) {
		if (WEXITSTATUS(stat) == 1)
			zsd_warn(gettext("Cannot entering zone"));
		else if (WEXITSTATUS(stat) == 2)
			zsd_warn(gettext("Unable to create door file: %s"),
			    path);
		else if (WEXITSTATUS(stat) == 3)
			zsd_warn(gettext("Unable to fattach file: %s"), path);
	}

	/*
	 * NOTE(review): this warning is printed for every failure, including
	 * the ones already explained above; it may have been intended for
	 * the unexplained case only -- confirm before changing.
	 */
	zsd_warn(gettext("Internal error entering zone: %d"), (int)zid);
}

/*
 * Zone lookup and allocation functions to manage list of currently running
 * zones.
 */

/*
 * Find a zone on ctl's zone list by name.  If the zone is found and zoneid
 * is not -1, the zone's id is updated to zoneid.  Returns the zone, or NULL
 * if no zone with that name is on the list.
 */
static zsd_zone_t *
zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
{
	list_t *zones = &ctl->zsctl_zones;
	zsd_zone_t *cur = list_head(zones);

	while (cur != NULL && strcmp(cur->zsz_name, zonename) != 0)
		cur = list_next(zones, cur);

	if (cur != NULL && zoneid != -1)
		cur->zsz_id = zoneid;

	return (cur);
}

/*
 * Find a zone on ctl's zone list by zone id.  Returns the first zone whose
 * id matches, or NULL if there is none.
 */
static zsd_zone_t *
zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
{
	list_t *zones = &ctl->zsctl_zones;
	zsd_zone_t *cur = list_head(zones);

	while (cur != NULL && cur->zsz_id != zoneid)
		cur = list_next(zones, cur);

	return (cur);
}

/*
 * Allocate and initialize a zone with the given name and id, and count it
 * in ctl->zsctl_nzones.  The zone is not placed on any list here.  All caps
 * start out as ZS_LIMIT_NONE.  Returns NULL if memory cannot be allocated.
 */
static zsd_zone_t *
zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
{
	zsd_zone_t *zp;

	zp = (zsd_zone_t *)calloc(1, sizeof (*zp));
	if (zp == NULL)
		return (NULL);

	zp->zsz_id = zoneid;
	(void) strlcpy(zp->zsz_name, zonename, sizeof (zp->zsz_name));
	zp->zsz_start = g_now;
	zp->zsz_hrstart = g_hrnow;

	/*
	 * Allocate as deleted so if not found in first pass, zone is deleted
	 * from list.  This can happen if zone is returned by zone_list, but
	 * exits before first attempt to fetch zone details.
	 */
	zp->zsz_found = B_FALSE;
	zp->zsz_deleted = B_TRUE;

	/* No limits are known yet */
	zp->zsz_cpu_shares = ZS_LIMIT_NONE;
	zp->zsz_cpu_cap = ZS_LIMIT_NONE;
	zp->zsz_ram_cap = ZS_LIMIT_NONE;
	zp->zsz_locked_cap = ZS_LIMIT_NONE;
	zp->zsz_vm_cap = ZS_LIMIT_NONE;
	zp->zsz_processes_cap = ZS_LIMIT_NONE;
	zp->zsz_lwps_cap = ZS_LIMIT_NONE;
	zp->zsz_shm_cap = ZS_LIMIT_NONE;
	zp->zsz_shmids_cap = ZS_LIMIT_NONE;
	zp->zsz_semids_cap = ZS_LIMIT_NONE;
	zp->zsz_msgids_cap = ZS_LIMIT_NONE;
	zp->zsz_lofi_cap = ZS_LIMIT_NONE;

	ctl->zsctl_nzones++;

	return (zp);
}

/*
 * Return the zone named zonename, creating it if it is not yet on ctl's
 * zone list.  An existing zone is looked up with zsd_lookup_zone(), which
 * also refreshes its id.  A new zone is inserted so that the list stays
 * sorted by zone name.  Returns NULL if a new zone cannot be allocated.
 */
static zsd_zone_t *
zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
{
	list_t *zones = &ctl->zsctl_zones;
	zsd_zone_t *zp, *pos;

	zp = zsd_lookup_zone(ctl, zonename, zoneid);
	if (zp != NULL)
		return (zp);

	zp = zsd_allocate_zone(ctl, zonename, zoneid);
	if (zp == NULL)
		return (NULL);

	/* Find the first zone whose name does not sort before zonename */
	for (pos = list_head(zones); pos != NULL;
	    pos = list_next(zones, pos)) {
		if (strcmp(zonename, pos->zsz_name) <= 0)
			break;
	}

	list_insert_before(zones, pos, zp);
	return (zp);
}

/*
 * Begin a pass over the zones by clearing the found flag of every zone on
 * the list.  Zones seen during the pass are marked found again; a zone left
 * unmarked must have halted.
 */
static void
zsd_mark_zones_start(zsd_ctl_t *ctl)
{
	list_t *zones = &ctl->zsctl_zones;
	zsd_zone_t *zp = list_head(zones);

	while (zp != NULL) {
		zp->zsz_found = B_FALSE;
		zp = list_next(zones, zp);
	}
}

/*
 * Begin a pass over a pset's per-zone usages by marking each of them as not
 * found and empty.  If processes of a zone are then found using the pset,
 * the zone remains bound to it.  If none of a zone's processes are bound to
 * the pset, the zone's usage of the pset will be deleted.
 */
static void
zsd_mark_pset_usage_start(zsd_pset_t *pset)
{
	list_t *usages = &pset->zsp_usage_list;
	zsd_pset_usage_t *up = list_head(usages);

	while (up != NULL) {
		up->zsu_found = B_FALSE;
		up->zsu_empty = B_TRUE;
		up = list_next(usages, up);
	}
}

/*
 * Begin a pass over the psets by marking each pset, and each of its
 * per-zone usages, as not found.  Psets seen during the pass are marked
 * found again; a pset left unmarked will be deleted.
 */
static void
zsd_mark_psets_start(zsd_ctl_t *ctl)
{
	list_t *psets = &ctl->zsctl_psets;
	zsd_pset_t *pp = list_head(psets);

	while (pp != NULL) {
		pp->zsp_found = B_FALSE;
		zsd_mark_pset_usage_start(pp);
		pp = list_next(psets, pp);
	}
}

/*
 * A pset was found in this pass.  Record its current configuration and
 * update its state flags.  The pset is flagged new if it was not active in
 * the previous interval.  Its share total and scheduler set are reset so
 * they can be accumulated again during this pass.
 *
 * The pset must not already have been marked found in this pass.
 */
static void
zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
    uint64_t size, uint64_t min, uint64_t max, int64_t importance)
{
	pset->zsp_empty = B_TRUE;
	pset->zsp_deleted = B_FALSE;

	assert(pset->zsp_found == B_FALSE);

	/* A pset that was not active last interval is new in this one */
	pset->zsp_new = (pset->zsp_active == B_FALSE) ? B_TRUE : B_FALSE;
	pset->zsp_found = B_TRUE;
	pset->zsp_active = B_TRUE;

	/* Configuration as of this interval */
	pset->zsp_cputype = type;
	pset->zsp_online = online;
	pset->zsp_size = size;
	pset->zsp_min = min;
	pset->zsp_max = max;
	pset->zsp_importance = importance;

	/* Re-accumulated as processes and zones are examined */
	pset->zsp_cpu_shares = 0;
	pset->zsp_scheds = 0;
}

/*
 * A zone's process was found using a pset.  On the first such process in a
 * pass, the usage is marked found and its per-pass state is reset.  For
 * every process, the scheduler is recorded against the usage, the pset and
 * the zone, and the zone's pset binding is updated.
 */
static void
zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
{
	zsd_pset_t *pp = usage->zsu_pset;
	zsd_zone_t *zp = usage->zsu_zone;
	const uint_t non_fss = ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX;

	/* One-time work for the first process found in this pass */
	if (usage->zsu_found != B_TRUE) {
		usage->zsu_found = B_TRUE;
		usage->zsu_empty = B_FALSE;
		usage->zsu_deleted = B_FALSE;

		/* Usage is new if it was not active in the last interval */
		usage->zsu_new =
		    (usage->zsu_active == B_FALSE) ? B_TRUE : B_FALSE;

		usage->zsu_scheds = 0;
		usage->zsu_cpu_shares = ZS_LIMIT_NONE;
		usage->zsu_active = B_TRUE;
		pp->zsp_empty = B_FALSE;
		zp->zsz_empty = B_FALSE;
	}

	/* Detect zone's pset id, and if it is bound to multiple psets */
	if (zp->zsz_psetid == ZS_PSET_ERROR)
		zp->zsz_psetid = pp->zsp_id;
	else if (zp->zsz_psetid != pp->zsp_id)
		zp->zsz_psetid = ZS_PSET_MULTI;

	usage->zsu_scheds |= sched;
	pp->zsp_scheds |= sched;
	zp->zsz_scheds |= sched;

	/* Record if FSS is co-habitating with conflicting scheduler */
	if ((pp->zsp_scheds & ZS_SCHED_FSS) &&
	    (usage->zsu_scheds & non_fss)) {
		usage->zsu_scheds |= ZS_SCHED_CONFLICT;
		pp->zsp_scheds |= ZS_SCHED_CONFLICT;
	}
}

/*
 * Charge the cpu time in *delta to the given usage, and to the totals of
 * the pset, the zone and the system that the usage belongs to.
 */
static void
zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
{
	zsd_pset_t *pp = usage->zsu_pset;
	zsd_zone_t *zp = usage->zsu_zone;
	zsd_system_t *sys = ctl->zsctl_system;

	TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
	TIMESTRUC_ADD_TIMESTRUC(pp->zsp_usage_zones, *delta);
	TIMESTRUC_ADD_TIMESTRUC(zp->zsz_cpu_usage, *delta);
	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_zones, *delta);
}

/*
 * Determine which processor sets have been deleted.
 *
 * A pset that was not found in this pass is marked empty.  If it was
 * already flagged deleted, it is removed from the list and freed.
 * Otherwise it vanished during this interval, so it is flagged deleted
 * (and still active) and kept for one more interval.
 */
static void
zsd_mark_psets_end(zsd_ctl_t *ctl)
{
	list_t *psets = &ctl->zsctl_psets;
	zsd_pset_t *pp, *next;

	for (pp = list_head(psets); pp != NULL; pp = next) {
		/* Fetch the successor first; pp may be freed below */
		next = list_next(psets, pp);

		if (pp->zsp_found != B_FALSE)
			continue;

		pp->zsp_empty = B_TRUE;
		if (pp->zsp_deleted == B_TRUE) {
			list_remove(psets, pp);
			free(pp);
			ctl->zsctl_npsets--;
			continue;
		}

		/* Pset vanished during this interval */
		pp->zsp_new = B_FALSE;
		pp->zsp_deleted = B_TRUE;
		pp->zsp_active = B_TRUE;
	}
}

/*
 * Determine which zones are no longer bound to processor sets.
 *
 * For every pset, each per-zone usage that was not found in this pass, or
 * whose zone or pset is flagged deleted, is removed from the pset's usage
 * list and freed.  Each remaining usage has its flags updated and, if the
 * zone runs in FSS with a limited, non-zero number of shares, the shares
 * are recorded on the usage and added to the pset's total.
 */
static void
zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
{
	list_t *psets = &ctl->zsctl_psets;
	zsd_pset_t *pp;
	zsd_pset_usage_t *up, *next;
	zsd_zone_t *zp;

	for (pp = list_head(psets); pp != NULL; pp = list_next(psets, pp)) {
		list_t *usages = &pp->zsp_usage_list;

		for (up = list_head(usages); up != NULL; up = next) {
			/* Fetch the successor first; up may be freed below */
			next = list_next(usages, up);

			if (up->zsu_found == B_FALSE ||
			    up->zsu_zone->zsz_deleted == B_TRUE ||
			    up->zsu_pset->zsp_deleted == B_TRUE) {
				list_remove(usages, up);
				free(up);
				pp->zsp_nusage--;
				ctl->zsctl_npset_usages--;
				continue;
			}

			/*
			 * NOTE(review): a usage that survives is flagged
			 * deleted here; the flag is cleared again when the
			 * usage is next found -- confirm this is intended.
			 */
			up->zsu_new = B_FALSE;
			up->zsu_deleted = B_TRUE;
			up->zsu_active = B_TRUE;

			/* Add cpu shares for usages that are in FSS */
			zp = up->zsu_zone;
			if (up->zsu_scheds & ZS_SCHED_FSS &&
			    zp->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
			    zp->zsz_cpu_shares != 0) {
				up->zsu_cpu_shares = zp->zsz_cpu_shares;
				pp->zsp_cpu_shares += zp->zsz_cpu_shares;
			}
		}
	}
}

/*
 * A zone has been found in this pass.  Record its current configuration,
 * limits and resource usage, update its state flags, and add its resource
 * usage to the system totals.
 *
 * The zone is flagged new if it was not active in the previous interval.
 * Its scheduler set, online cpu count and pset id are reset here and filled
 * in later in the pass.  The zone must not already have been marked found
 * in this pass.
 */
static void
zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
    uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
    uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
    uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
    uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
    uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
    uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
    uint_t iptype)
{
	zsd_system_t *totals = ctl->zsctl_system;

	assert(zone->zsz_found == B_FALSE);

	/* State flags: zone exists, and has no processes found as yet */
	zone->zsz_found = B_TRUE;
	zone->zsz_empty = B_TRUE;
	zone->zsz_deleted = B_FALSE;

	/*
	 * A zone that was not active in the previous interval is new.
	 * Assume a new zone's properties are the same over entire interval.
	 */
	zone->zsz_new = (zone->zsz_active == B_FALSE) ? B_TRUE : B_FALSE;

	/* Configuration */
	(void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
	(void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
	zone->zsz_default_sched = sched;
	zone->zsz_cputype = cputype;
	zone->zsz_iptype = iptype;

	/* Schedulers updated later as processes are found */
	zone->zsz_scheds = 0;

	/* Cpus updated later as psets bound are identified */
	zone->zsz_cpus_online = 0;
	zone->zsz_psetid = ZS_PSET_ERROR;

	/* Cpu and memory limits */
	zone->zsz_cpu_shares = cpu_shares;
	zone->zsz_cpu_cap = cpu_cap;
	zone->zsz_ram_cap = ram_cap;
	zone->zsz_locked_cap = locked_cap;
	zone->zsz_vm_cap = vm_cap;

	/* Resource limits and usage; usage also counts toward the system */
	zone->zsz_processes_cap = processes_cap;
	zone->zsz_processes = processes;
	totals->zss_processes += processes;

	zone->zsz_lwps_cap = lwps_cap;
	zone->zsz_lwps = lwps;
	totals->zss_lwps += lwps;

	zone->zsz_shm_cap = shm_cap;
	zone->zsz_shm = shm;
	totals->zss_shm += shm;

	zone->zsz_shmids_cap = shmids_cap;
	zone->zsz_shmids = shmids;
	totals->zss_shmids += shmids;

	zone->zsz_semids_cap = semids_cap;
	zone->zsz_semids = semids;
	totals->zss_semids += semids;

	zone->zsz_msgids_cap = msgids_cap;
	zone->zsz_msgids = msgids;
	totals->zss_msgids += msgids;

	zone->zsz_lofi_cap = lofi_cap;
	zone->zsz_lofi = lofi;
	totals->zss_lofi += lofi;

	zone->zsz_active = B_TRUE;
}


/*
 * Determine which zones have halted.
 *
 * A zone that was not found in this pass is marked empty.  If it was
 * already flagged deleted, it is removed from the list and freed.
 * Otherwise it is flagged deleted (and still active) and kept for one more
 * interval.
 */
static void
zsd_mark_zones_end(zsd_ctl_t *ctl)
{
	list_t *zones = &ctl->zsctl_zones;
	zsd_zone_t *zp, *next;

	for (zp = list_head(zones); zp != NULL; zp = next) {
		/* Fetch the successor first; zp may be freed below */
		next = list_next(zones, zp);

		if (zp->zsz_found != B_FALSE)
			continue;

		zp->zsz_empty = B_TRUE;
		if (zp->zsz_deleted == B_TRUE) {
			/*
			 * Zone deleted in prior interval,
			 * so it no longer exists.
			 */
			list_remove(zones, zp);
			free(zp);
			ctl->zsctl_nzones--;
			continue;
		}

		zp->zsz_new = B_FALSE;
		zp->zsz_deleted = B_TRUE;
		zp->zsz_active = B_TRUE;
	}
}

/*
 * Begin a cpu scan by marking every tracked cpu as not found.  Cpus seen
 * during the scan are marked found again; the ones that stay unmarked are
 * handled by zsd_mark_cpus_end().
 *
 * When "roll" is set, the pset id and kstat counters of each active cpu are
 * copied into their *_prev fields so that usage since the previous interval
 * can be computed.
 */
static void
zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
{
	zsd_cpu_t *cp = list_head(&ctl->zsctl_cpus);

	while (cp != NULL) {
		cp->zsc_found = B_FALSE;

		if (roll && cp->zsc_active == B_TRUE) {
			/* Current values become the reference values. */
			cp->zsc_nsec_user_prev = cp->zsc_nsec_user;
			cp->zsc_nsec_kern_prev = cp->zsc_nsec_kern;
			cp->zsc_nsec_intr_prev = cp->zsc_nsec_intr;
			cp->zsc_nsec_idle_prev = cp->zsc_nsec_idle;
			cp->zsc_psetid_prev = cp->zsc_psetid;
		}
		cp = list_next(&ctl->zsctl_cpus, cp);
	}
}

/*
 * Return the tracking structure for the given cpu id.
 *
 * The structures live in ctl->zsctl_cpu_array, indexed by cpu id.  The first
 * time a slot is handed out it is also appended to the ctl->zsctl_cpus list.
 */
zsd_cpu_t *
zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
{
	zsd_cpu_t *slot;

	assert(cpuid < ctl->zsctl_maxcpuid);
	slot = ctl->zsctl_cpu_array + cpuid;
	assert(slot->zsc_id == cpuid);

	/* Already on the list of tracked cpus. */
	if (slot->zsc_allocated != B_FALSE)
		return (slot);

	slot->zsc_allocated = B_TRUE;
	list_insert_tail(&ctl->zsctl_cpus, slot);
	return (slot);
}

/*
 * A cpu has been found in the given pset.  Record its membership and note
 * whether it came online since the previous interval.
 */
static void
zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
{
	/*
	 * With legacy processor sets, the cpu may move while zonestatd is
	 * inspecting, causing it to be found twice.  In this case, just
	 * leave cpu in the first processor set in which it was found.
	 */
	if (cpu->zsc_found == B_TRUE)
		return;

	/* Mark cpu as online and record where it was seen */
	cpu->zsc_found = B_TRUE;
	cpu->zsc_offlined = B_FALSE;
	cpu->zsc_pset = pset;
	cpu->zsc_psetid = psetid;

	if (cpu->zsc_active == B_FALSE) {
		/*
		 * Cpu is newly online.  There is no earlier membership, so
		 * the previous pset is the current one.
		 */
		cpu->zsc_onlined = B_TRUE;
		cpu->zsc_psetid_prev = psetid;
	} else {
		/*
		 * Cpu was online during the previous interval.  The
		 * previous pset id is left untouched.
		 */
		cpu->zsc_onlined = B_FALSE;
	}
	cpu->zsc_active = B_TRUE;
}

/*
 * Finish a cpu scan.  Remove all offlined cpus from the list of tracked cpus.
 *
 * A cpu that was not found during the scan is reported as offlined for one
 * interval.  If it was already offlined in the prior interval, it is
 * unlinked from ctl->zsctl_cpus and its array slot is reset for reuse.
 */
static void
zsd_mark_cpus_end(zsd_ctl_t *ctl)
{
	zsd_cpu_t *cpu, *next;
	int id;

	for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL; cpu = next) {
		/*
		 * Fetch the successor before the entry is possibly unlinked
		 * and cleared.  The list must never be advanced from a
		 * removed (or NULL) element.
		 */
		next = list_next(&ctl->zsctl_cpus, cpu);

		if (cpu->zsc_found != B_FALSE)
			continue;

		if (cpu->zsc_offlined == B_TRUE) {
			/*
			 * cpu offlined in prior interval. It is gone.
			 */
			list_remove(&ctl->zsctl_cpus, cpu);

			/* Clear structure for future use, keeping its id */
			id = cpu->zsc_id;
			bzero(cpu, sizeof (zsd_cpu_t));
			cpu->zsc_id = id;
			cpu->zsc_allocated = B_FALSE;
			cpu->zsc_psetid = ZS_PSET_ERROR;
			cpu->zsc_psetid_prev = ZS_PSET_ERROR;
		} else {
			/*
			 * cpu online at start of interval.  Treat
			 * as still online, since it was online for
			 * some portion of the interval.
			 */
			cpu->zsc_offlined = B_TRUE;
			cpu->zsc_onlined = B_FALSE;
			cpu->zsc_active = B_TRUE;
			cpu->zsc_psetid = cpu->zsc_psetid_prev;
			cpu->zsc_pset = NULL;
		}
	}
}

/* Some utility functions for managing the list of processor sets */

/*
 * Find a tracked pset by its id.  Returns NULL if no pset on the list has
 * the given id.
 */
static zsd_pset_t *
zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
{
	zsd_pset_t *cur = list_head(&ctl->zsctl_psets);

	while (cur != NULL && cur->zsp_id != psetid)
		cur = list_next(&ctl->zsctl_psets, cur);

	return (cur);
}

/*
 * Find a tracked pset by name.  If a match is found and psetid is not -1,
 * the pset's recorded id is updated to psetid.  Returns NULL if there is no
 * pset with that name.
 */
static zsd_pset_t *
zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
{
	zsd_pset_t *cur = list_head(&ctl->zsctl_psets);

	while (cur != NULL && strcmp(cur->zsp_name, psetname) != 0)
		cur = list_next(&ctl->zsctl_psets, cur);

	if (cur != NULL && psetid != -1)
		cur->zsp_id = psetid;

	return (cur);
}

/*
 * Allocate and initialize a pset structure, and count it in
 * ctl->zsctl_npsets.  The pset is not linked into the pset list here.
 * Returns NULL if memory cannot be allocated.
 */
static zsd_pset_t *
zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
{
	zsd_pset_t *newp;

	newp = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t));
	if (newp == NULL)
		return (NULL);

	(void) strlcpy(newp->zsp_name, psetname, sizeof (newp->zsp_name));
	newp->zsp_id = psetid;
	newp->zsp_found = B_FALSE;

	/* Per-pset list of the zones using it */
	list_create(&newp->zsp_usage_list, sizeof (zsd_pset_usage_t),
	    offsetof(zsd_pset_usage_t, zsu_next));

	newp->zsp_hrstart = g_hrnow;

	/*
	 * Allocate as deleted so if not found in first pass, pset is deleted
	 * from list.  This can happen if pset is returned by pset_list, but
	 * is destroyed before first attempt to fetch pset details.
	 */
	newp->zsp_deleted = B_TRUE;
	newp->zsp_empty = B_TRUE;

	ctl->zsctl_npsets++;
	return (newp);
}

/*
 * Return the tracked pset with the given name, creating it if needed.
 * New psets are linked into ctl->zsctl_psets in ascending name order.
 * Returns NULL if a new pset cannot be allocated.
 */
static zsd_pset_t *
zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
{
	zsd_pset_t *found, *newp, *pos;

	found = zsd_lookup_pset(ctl, psetname, psetid);
	if (found != NULL)
		return (found);

	newp = zsd_allocate_pset(ctl, psetname, psetid);
	if (newp == NULL)
		return (NULL);

	/* Find the first entry that does not sort before the new name */
	for (pos = list_head(&ctl->zsctl_psets); pos != NULL;
	    pos = list_next(&ctl->zsctl_psets, pos)) {
		if (strcmp(psetname, pos->zsp_name) <= 0)
			break;
	}

	list_insert_before(&ctl->zsctl_psets, pos, newp);
	return (newp);
}

/* Some utility functions for managing the list of zones using each pset */

/*
 * Find the usage entry tying the given zone to the given pset.  Returns NULL
 * if the pset has no entry for that zone.
 */
static zsd_pset_usage_t *
zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
{
	zsd_pset_usage_t *cur = list_head(&pset->zsp_usage_list);

	while (cur != NULL && cur->zsu_zone != zone)
		cur = list_next(&pset->zsp_usage_list, cur);

	return (cur);
}

/*
 * Allocate and initialize a usage entry for a zone within a pset.  The entry
 * is counted in ctl->zsctl_npset_usages and pset->zsp_nusage, but is not
 * linked into the pset's usage list here.  Returns NULL if memory cannot be
 * allocated.
 */
static zsd_pset_usage_t *
zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
{
	zsd_pset_usage_t *newu;

	newu = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t));
	if (newu == NULL)
		return (NULL);

	list_link_init(&newu->zsu_next);

	/* Tie the entry to its pset and zone */
	newu->zsu_pset = pset;
	newu->zsu_zone = zone;
	newu->zsu_zoneid = zone->zsz_id;

	newu->zsu_found = B_FALSE;
	newu->zsu_active = B_FALSE;
	newu->zsu_new = B_FALSE;
	newu->zsu_empty = B_TRUE;

	/*
	 * Allocate as not deleted.  If a process is found in a pset for
	 * a zone, the usage will not be deleted until at least the next
	 * interval.
	 */
	newu->zsu_deleted = B_FALSE;
	newu->zsu_start = g_now;
	newu->zsu_hrstart = g_hrnow;

	newu->zsu_scheds = 0;
	newu->zsu_cpu_shares = ZS_LIMIT_NONE;

	pset->zsp_nusage++;
	ctl->zsctl_npset_usages++;

	return (newu);
}

/*
 * Return the usage entry for the given zone within the given pset, creating
 * it if needed.  New entries are linked into the pset's usage list in
 * ascending zone name order.  Returns NULL if a new entry cannot be
 * allocated.
 */
static zsd_pset_usage_t *
zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
{
	zsd_pset_usage_t *found, *newu, *pos;

	found = zsd_lookup_usage(pset, zone);
	if (found != NULL)
		return (found);

	newu = zsd_allocate_pset_usage(ctl, pset, zone);
	if (newu == NULL)
		return (NULL);

	/* Find the first entry whose zone does not sort before this zone */
	for (pos = list_head(&pset->zsp_usage_list); pos != NULL;
	    pos = list_next(&pset->zsp_usage_list, pos)) {
		if (strcmp(zone->zsz_name, pos->zsu_zone->zsz_name) <= 0)
			break;
	}

	list_insert_before(&pset->zsp_usage_list, pos, newu);
	return (newu);
}

/*
 * Reset the system-wide resource usage totals.  They are re-counted each
 * interval as the usage of each zone is added back in.
 */
static void
zsd_refresh_system(zsd_ctl_t *ctl)
{
	zsd_system_t *sys = ctl->zsctl_system;

	/* Process and lwp counts */
	sys->zss_processes = 0;
	sys->zss_lwps = 0;

	/* System V IPC usage */
	sys->zss_shm = 0;
	sys->zss_shmids = 0;
	sys->zss_semids = 0;
	sys->zss_msgids = 0;

	/* lofi devices */
	sys->zss_lofi = 0;
}


/*
 * Reads a cpu's kstats, and adds the usage since the previous read to the
 * cpu's own totals, to its pset(s), and to the system totals.
 *
 * The function returns silently, changing nothing, if the "cpu:<id>:sys"
 * kstat or any of the needed counters cannot be read.  For a cpu that has
 * just come online there is no reference value, so the counters are only
 * recorded for the next read.
 */
static void
zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
{
	zsd_system_t *sys;
	processorid_t cpuid;
	zsd_pset_t *pset_prev;
	zsd_pset_t *pset;
	kstat_t *kstat;
	kstat_named_t *knp;
	uint64_t idle, intr, kern, user;

	sys = ctl->zsctl_system;
	pset = cpu->zsc_pset;
	cpuid = cpu->zsc_id;

	/* Get the cpu time totals for this cpu */
	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
	if (kstat == NULL)
		return;

	if (kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL) == -1)
		return;

	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	idle = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	kern = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	user = knp->value.ui64;

	/*
	 * Tracking intr time per cpu just exists for future enhancements.
	 * The value is presently always zero.
	 */
	intr = 0;
	cpu->zsc_nsec_idle = idle;
	cpu->zsc_nsec_intr = intr;
	cpu->zsc_nsec_kern = kern;
	cpu->zsc_nsec_user = user;

	if (cpu->zsc_onlined == B_TRUE) {
		/*
		 * cpu is newly online.  There is no reference value,
		 * so just record its current stats for comparison
		 * on next stat read.
		 */
		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
		return;
	}

	/*
	 * Calculate relative time since previous refresh.
	 * Paranoia.  Don't let time go backwards.
	 */
	idle = intr = kern = user = 0;
	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;

	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;

	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;

	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;

	/* Update totals for cpu usage */
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);

	/*
	 * Find the pset the cpu was in at the previous read, but only if it
	 * is known, differs from the current one, and still exists.
	 */
	pset_prev = NULL;
	if (cpu->zsc_psetid != cpu->zsc_psetid_prev &&
	    cpu->zsc_psetid_prev != ZS_PSET_ERROR)
		pset_prev = zsd_lookup_pset_byid(ctl, cpu->zsc_psetid_prev);

	/*
	 * zsd_mark_cpus_end() keeps a cpu that went offline during the
	 * interval on the list with zsc_pset set to NULL.  Never dereference
	 * a NULL pset: fall back to the pset matching the cpu's recorded
	 * pset id, then to the previous pset.  If neither exists, only the
	 * cpu and system totals are updated.
	 */
	if (pset == NULL)
		pset = zsd_lookup_pset_byid(ctl, cpu->zsc_psetid);
	if (pset == NULL) {
		pset = pset_prev;
		pset_prev = NULL;
	}

	if (pset != NULL && pset_prev == NULL) {
		/*
		 * Add cpu's stats to its pset if it is known to be in
		 * the pset since previous read.
		 */
		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
	} else if (pset != NULL) {
		/*
		 * Last pset was different than current pset.
		 * Best guess is to split usage between the two.  The odd
		 * nanosecond, if any, goes to the current pset.
		 */
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);

		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
		    (idle / 2) + (idle % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
		    (intr / 2) + (intr % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
		    (kern / 2) + (kern % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
		    (user / 2) + (user % 2));
	}
	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
}

/*
 * Determine the details of a processor set by pset id.
 *
 * On success, returns 0 and fills in psetname (at most namelen bytes),
 * cputype, online, size, min, max and importance.  The ids of the pset's
 * online cpus are packed into the first *online entries of
 * ctl->zsctl_cpu_cache, which is grown as needed.  A pool pset of size 0
 * returns success with *online set to 0 and the cache untouched.
 *
 * On failure, returns -1 with errno set to:
 *	EINTR	a legacy pset could not be inspected (it is gone); the
 *		caller should retry.
 *	ENOMEM	the cpu cache could not be grown (legacy psets).
 *	EINVAL	a libpool operation failed; there is no need to retry.
 */
static int
zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
    size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
    uint64_t *min, uint64_t *max, int64_t *importance)
{
	uint_t old, num, i;

	pool_conf_t *conf = ctl->zsctl_pool_conf;
	pool_value_t **vals = ctl->zsctl_pool_vals;
	pool_resource_t **res_list = NULL;
	pool_resource_t *pset;
	pool_component_t **cpus = NULL;
	processorid_t *cache;
	const char *string;
	uint64_t uint64;
	int64_t int64;
	int ret, type;

	if (ctl->zsctl_pool_status == POOL_DISABLED) {

		/*
		 * Inspect legacy psets.  pset_info() reports the number of
		 * cpus in the pset; if the cache was too small, grow it and
		 * ask again.
		 */
		for (;;) {
			old = num = ctl->zsctl_cpu_ncache;
			ret = pset_info(psetid, &type, &num,
			    ctl->zsctl_cpu_cache);
			if (ret < 0) {
				/* pset is gone.  Tell caller to retry */
				errno = EINTR;
				return (-1);
			}
			if (num <= old) {
				/* Success */
				break;
			}
			if ((cache = (processorid_t *)realloc(
			    ctl->zsctl_cpu_cache, num *
			    sizeof (processorid_t))) != NULL) {
				ctl->zsctl_cpu_ncache = num;
				ctl->zsctl_cpu_cache = cache;
			} else {
				/*
				 * Could not allocate to get new cpu list.
				 */
				zsd_warn(gettext(
				    "Could not allocate for cpu list"));
				errno = ENOMEM;
				return (-1);
			}
		}
		if (psetid == ZS_PSET_DEFAULT) {
			*cputype = ZS_CPUTYPE_DEFAULT_PSET;
			(void) strlcpy(psetname, "pset_default", namelen);
		} else {
			*cputype = ZS_CPUTYPE_PSRSET_PSET;
			(void) snprintf(psetname, namelen,
			    "SUNWlegacy_pset_%d", psetid);
		}

		/*
		 * Old school pset.  Just treat it as a simple pool pset
		 * whose min and max are equal to its size.
		 */
		*online = num;
		*size = num;
		*min = num;
		*max = num;
		*importance = 1;

		return (0);
	}

	/* Look up the pool pset using the pset id */
	res_list = NULL;
	pool_value_set_int64(vals[1], psetid);
	if (pool_value_set_name(vals[1], "pset.sys_id")
	    != PO_SUCCESS)
		goto err;

	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
		goto err;
	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
		goto err;
	if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
		goto err;
	if (num != 1)
		goto err;
	pset = res_list[0];
	free(res_list);
	res_list = NULL;
	if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
	    "pset.name", vals[0]) != POC_STRING ||
	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
		goto err;

	(void) strlcpy(psetname, string, namelen);
	if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
		*cputype = ZS_CPUTYPE_DEDICATED;
	else if (psetid == ZS_PSET_DEFAULT)
		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
	else
		*cputype = ZS_CPUTYPE_POOL_PSET;

	/* Get size, min, max, and importance */
	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.size", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*size = uint64;
	else
		*size = 0;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.min", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*min = uint64;
	else
		*min = 0;
	if (*min >= ZSD_PSET_UNLIMITED)
		*min = ZS_LIMIT_NONE;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.max", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*max = uint64;
	else
		*max = ZS_LIMIT_NONE;

	if (*max >= ZSD_PSET_UNLIMITED)
		*max = ZS_LIMIT_NONE;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.importance", vals[0]) == POC_INT &&
	    pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
		*importance = int64;
	else
		*importance = 1;

	*online = 0;
	if (*size == 0)
		return (0);

	/* get cpus */
	cpus = pool_query_resource_components(conf, pset, &num, NULL);
	if (cpus == NULL)
		goto err;

	/* Make sure there is space for cpu id list */
	if (num > ctl->zsctl_cpu_ncache) {
		if ((cache = (processorid_t *)realloc(
		    ctl->zsctl_cpu_cache, num *
		    sizeof (processorid_t))) != NULL) {
			ctl->zsctl_cpu_ncache = num;
			ctl->zsctl_cpu_cache = cache;
		} else {
			/*
			 * Could not allocate to get new cpu list.
			 */
			zsd_warn(gettext(
			    "Could not allocate for cpu list"));
			goto err;
		}
	}

	/* count the online cpus, and record their ids */
	for (i = 0; i < num; i++) {
		if (pool_get_property(conf, pool_component_to_elem(
		    conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
		    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
			goto err;

		if (strcmp(string, "on-line") != 0 &&
		    strcmp(string, "no-intr") != 0)
			continue;

		if (pool_get_property(conf, pool_component_to_elem(
		    conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
		    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
			goto err;

		/*
		 * Pack the ids at the front of the cache.  The caller reads
		 * the first *online entries, so indexing by the component
		 * position would leave holes wherever an offline cpu was
		 * skipped.
		 */
		ctl->zsctl_cpu_cache[*online] = (processorid_t)int64;
		(*online)++;
	}
	free(cpus);
	return (0);
err:
	if (res_list != NULL)
		free(res_list);
	if (cpus != NULL)
		free(cpus);

	/*
	 * The pools operations should succeed since the conf is a consistent
	 * snapshot.  Tell caller there is no need to retry.
	 */
	errno = EINVAL;
	return (-1);
}

/*
 * Update the current list of processor sets.
 * This also updates the list of online cpus, and each cpu's pset membership.
 *
 * The list of pset ids is fetched through libpool when pools are enabled,
 * and through pset_list() otherwise, in which case the default pset is added
 * to the list by hand.  Each pset is then inspected with zsd_get_pool_pset().
 * If a pset disappears while being inspected (EINTR), the whole pset list is
 * fetched again and the scan restarts.
 *
 * The cpu kstat reference values are rolled only on the first pass of a
 * call, not on restarts.
 */
static void
zsd_refresh_psets(zsd_ctl_t *ctl)
{
	int i, j, ret, state;
	uint_t old, num;
	uint_t cputype;
	int64_t sys_id, importance;
	uint64_t online, size, min, max;
	zsd_system_t *system;
	zsd_pset_t *pset;
	zsd_cpu_t *cpu;
	psetid_t *cache;
	char psetname[ZS_PSETNAME_MAX];
	processorid_t cpuid;
	pool_value_t *pv_save = NULL;
	pool_resource_t **res_list = NULL;
	pool_resource_t *res;
	pool_value_t **vals = ctl->zsctl_pool_vals;
	pool_conf_t *conf;
	boolean_t roll_cpus = B_TRUE;

	system = ctl->zsctl_system;
retry:
	/*
	 * Zero cpu counters to recount them.  This is inside the retry
	 * loop so a restarted scan does not count any pset twice.
	 */
	system->zss_ncpus = 0;
	system->zss_ncpus_online = 0;

	ret = pool_get_status(&state);
	if (ret == 0 && state == POOL_ENABLED) {

		conf = ctl->zsctl_pool_conf;
		/* Query on vals[0] only; vals[1] is restored afterwards */
		pv_save = vals[1];
		vals[1] = NULL;

		if (ctl->zsctl_pool_status == POOL_DISABLED) {
			if (pool_conf_open(ctl->zsctl_pool_conf,
			    pool_dynamic_location(), PO_RDONLY) == 0) {
				ctl->zsctl_pool_status = POOL_ENABLED;
				ctl->zsctl_pool_changed = POU_PSET;
			}
		} else {
			ctl->zsctl_pool_changed = 0;
			ret = pool_conf_update(ctl->zsctl_pool_conf,
			    &(ctl->zsctl_pool_changed));
			if (ret < 0) {
				/* Pools must have become disabled */
				(void) pool_conf_close(ctl->zsctl_pool_conf);
				ctl->zsctl_pool_status = POOL_DISABLED;
				if (pool_error() == POE_SYSTEM && errno ==
				    ENOTACTIVE) {
					/*
					 * Put vals[1] back before starting
					 * over, or the saved value is lost
					 * if this branch is entered again.
					 */
					vals[1] = pv_save;
					pv_save = NULL;
					goto retry;
				}

				zsd_warn(gettext(
				    "Unable to update pool configuration"));
				/* Not able to get pool info.  Don't update. */
				goto err;
			}
		}
		/* Get the list of psets using libpool */
		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
			goto err;

		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
			goto err;
		if ((res_list = pool_query_resources(conf, &num, vals))
		    == NULL)
			goto err;

		if (num > ctl->zsctl_pset_ncache)  {
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num) * sizeof (psetid_t))) == NULL) {
				goto err;
			}
			ctl->zsctl_pset_ncache = num;
			ctl->zsctl_pset_cache = cache;
		}
		/* Save the pset id of each pset */
		for (i = 0; i < num; i++) {
			res = res_list[i];
			if (pool_get_property(conf, pool_resource_to_elem(conf,
			    res), "pset.sys_id", vals[0]) != POC_INT ||
			    pool_value_get_int64(vals[0], &sys_id)
			    != PO_SUCCESS)
				goto err;
			ctl->zsctl_pset_cache[i] = (int)sys_id;
		}
		vals[1] = pv_save;
		pv_save = NULL;
	} else {
		if (ctl->zsctl_pool_status == POOL_ENABLED) {
			(void) pool_conf_close(ctl->zsctl_pool_conf);
			ctl->zsctl_pool_status = POOL_DISABLED;
		}
		/*
		 * Get the pset list using legacy psets.  One extra slot is
		 * kept in the cache for the default pset.
		 */
		for (;;) {
			old = num = ctl->zsctl_pset_ncache;
			(void) pset_list(ctl->zsctl_pset_cache, &num);
			if ((num + 1) <= old) {
				break;
			}
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num + 1) * sizeof (psetid_t))) != NULL) {
				ctl->zsctl_pset_ncache = num + 1;
				ctl->zsctl_pset_cache = cache;
			} else {
				/*
				 * Could not allocate to get new pset list.
				 * Give up
				 */
				return;
			}
		}
		/* Add the default pset to the front of the list */
		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
		num++;
	}

	zsd_mark_cpus_start(ctl, roll_cpus);
	zsd_mark_psets_start(ctl);
	roll_cpus = B_FALSE;

	/* Refresh cpu membership of all psets */
	for (i = 0; i < num; i++) {

		/* Get pool pset information */
		sys_id = ctl->zsctl_pset_cache[i];
		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
		    &cputype, &online, &size, &min, &max, &importance)
		    != 0) {
			if (errno == EINTR) {
				/*
				 * A pset went away.  The cached id list is
				 * stale, so fetch it again; rescanning the
				 * same list would fail the same way forever.
				 */
				if (res_list != NULL) {
					free(res_list);
					res_list = NULL;
				}
				goto retry;
			}
			zsd_warn(gettext("Failed to get info for pset %d"),
			    (int)sys_id);
			continue;
		}

		system->zss_ncpus += size;
		system->zss_ncpus_online += online;

		pset = zsd_lookup_insert_pset(ctl, psetname,
		    ctl->zsctl_pset_cache[i]);
		if (pset == NULL) {
			/* Could not allocate a new pset.  Skip it. */
			continue;
		}

		/* update pset info */
		zsd_mark_pset_found(pset, cputype, online, size, min,
		    max, importance);

		/* update each cpu in pset */
		for (j = 0; j < pset->zsp_online; j++) {
			cpuid = ctl->zsctl_cpu_cache[j];
			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
			zsd_mark_cpu_found(cpu, pset, sys_id);
		}
	}
err:
	if (res_list != NULL)
		free(res_list);
	if (pv_save != NULL)
		vals[1] = pv_save;
}



/*
 * Fetch the current pool and pset name for the given zone.
 *
 * The pool name is copied to "pool" (at most poollen bytes), the pset name
 * to "pset" (at most psetlen bytes), and the kind of pset to *cputype.
 * If the zone's pool cannot be determined, or pools are disabled, the
 * results are "pool_default", "pset_default" and ZS_CPUTYPE_DEFAULT_PSET.
 * If a lookup fails part way through, the values found so far are returned
 * and the rest keep their defaults.
 */
static void
zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
    char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
{
	poolid_t poolid;
	pool_t **pools = NULL;
	pool_resource_t **res_list = NULL;
	char poolname[ZS_POOLNAME_MAX];
	char psetname[ZS_PSETNAME_MAX];
	pool_conf_t *conf = ctl->zsctl_pool_conf;
	pool_value_t *pv_save = NULL;
	pool_value_t **vals = ctl->zsctl_pool_vals;
	const char *string;
	int ret;
	int64_t int64;
	uint_t num;

	/*
	 * Default values if lookup fails.  These must be set before the
	 * first "goto lookup_done", which copies them to the caller.
	 */
	(void) strlcpy(poolname, "pool_default", sizeof (poolname));
	(void) strlcpy(psetname, "pset_default", sizeof (psetname));
	*cputype = ZS_CPUTYPE_DEFAULT_PSET;

	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
	    &poolid, sizeof (poolid));
	if (ret < 0)
		goto lookup_done;

	/* no dedicated cpu if pools are disabled */
	if (ctl->zsctl_pool_status == POOL_DISABLED)
		goto lookup_done;

	/* Query on vals[0] only; vals[1] is restored at lookup_done */
	pv_save = vals[1];
	vals[1] = NULL;

	/* Get the pool name using the id */
	pool_value_set_int64(vals[0], poolid);
	if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
		goto lookup_done;

	if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
		goto lookup_done;

	if (num != 1)
		goto lookup_done;

	if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
	    "pool.name", vals[0]) != POC_STRING ||
	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
		goto lookup_done;
	(void) strlcpy(poolname, (char *)string, sizeof (poolname));

	/* Get the name of the pset for the pool */
	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
		goto lookup_done;

	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
		goto lookup_done;

	if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
	    == NULL)
		goto lookup_done;

	if (num != 1)
		goto lookup_done;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
	    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
		goto lookup_done;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    res_list[0]), "pset.name", vals[0]) != POC_STRING ||
	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
		goto lookup_done;

	(void) strlcpy(psetname, (char *)string, sizeof (psetname));

	/*
	 * Classify the pset.  The tests are mutually exclusive; the default
	 * pset keeps ZS_CPUTYPE_DEFAULT_PSET as set above.
	 *
	 * NOTE(review): zsd_get_pool_pset() names legacy psets
	 * "SUNWlegacy_pset_%d", which does not match the "SUNW_legacy_"
	 * prefix tested here -- confirm which prefix libpool reports.
	 */
	if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
		*cputype = ZS_CPUTYPE_DEDICATED;
	else if (strncmp(psetname, "SUNW_legacy_",
	    strlen("SUNW_legacy_")) == 0)
		*cputype = ZS_CPUTYPE_PSRSET_PSET;
	else if (int64 != ZS_PSET_DEFAULT)
		*cputype = ZS_CPUTYPE_POOL_PSET;

lookup_done:

	if (pv_save != NULL)
		vals[1] = pv_save;

	if (res_list)
		free(res_list);
	if (pools)
		free(pools);

	(void) strlcpy(pool, poolname, poollen);
	(void) strlcpy(pset, psetname, psetlen);
}

/*
 * Convert scheduler names to ZS_* scheduler flags.
 *
 * "pri" only matters for the FX class, where a priority above 59 maps to
 * ZS_SCHED_FX_60 rather than ZS_SCHED_FX.  Unrecognized class names map
 * to 0.
 */
static uint_t
zsd_schedname2int(char *clname, int pri)
{
	if (strcmp(clname, "TS") == 0)
		return (ZS_SCHED_TS);

	if (strcmp(clname, "IA") == 0)
		return (ZS_SCHED_IA);

	if (strcmp(clname, "FX") == 0)
		return ((pri > 59) ? ZS_SCHED_FX_60 : ZS_SCHED_FX);

	if (strcmp(clname, "RT") == 0)
		return (ZS_SCHED_RT);

	if (strcmp(clname, "FSS") == 0)
		return (ZS_SCHED_FSS);

	return (0);
}

/*
 * Return the value of the first (RCTL_FIRST) block of the named resource
 * control, as seen by the calling process.  Returns ZS_LIMIT_NONE if the
 * resource control cannot be read.
 */
static uint64_t
zsd_get_zone_rctl_limit(char *name)
{
	rctlblk_t *blk = (rctlblk_t *)alloca(rctlblk_size());

	if (getrctl(name, NULL, blk, RCTL_FIRST) == 0)
		return (rctlblk_get_value(blk));

	return (ZS_LIMIT_NONE);
}

/*
 * Return the current usage (RCTL_USAGE) of the named resource control, as
 * seen by the calling process.  Returns 0 if the resource control cannot be
 * read.
 */
static uint64_t
zsd_get_zone_rctl_usage(char *name)
{
	rctlblk_t *blk = (rctlblk_t *)alloca(rctlblk_size());

	if (getrctl(name, NULL, blk, RCTL_USAGE) == 0)
		return (rctlblk_get_value(blk));

	return (0);
}

#define	ZSD_NUM_RCTL_VALS 19

/*
 * Fetch the limit information for a zone.  This uses zone_enter() as the
 * getrctl(2) system call only returns rctl information for the zone of
 * the caller.
 *
 * The ram cap and default scheduling class are read with zone_getattr().
 * The rctl limits and usages are read by a child process that enters the
 * zone and writes the values back over a pipe.
 *
 * All caps are preset to ZS_LIMIT_NONE and all usages to 0, so on failure
 * they read as "no cap" and "no usage".  A cap equal to the corresponding
 * system maximum is also reported as ZS_LIMIT_NONE.
 *
 * Returns 0 on success, -1 if the scheduling class cannot be read, the
 * child cannot be set up, or the child does not return a full set of values.
 */
static int
zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
    uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
    uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
    uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
    uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
    uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
    uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
{
	int p[2], pid, tmpl_fd, ret;
	ctid_t ct;
	char class[PC_CLNMSZ];
	uint64_t vals[ZSD_NUM_RCTL_VALS];
	zsd_system_t *sys = ctl->zsctl_system;
	int i = 0;
	int res = 0;

	/* Treat all caps as no cap on error */
	*cpu_shares = ZS_LIMIT_NONE;
	*cpu_cap = ZS_LIMIT_NONE;
	*ram_cap = ZS_LIMIT_NONE;
	*locked_cap = ZS_LIMIT_NONE;
	*vm_cap = ZS_LIMIT_NONE;

	*processes_cap = ZS_LIMIT_NONE;
	*lwps_cap = ZS_LIMIT_NONE;
	*shm_cap = ZS_LIMIT_NONE;
	*shmids_cap = ZS_LIMIT_NONE;
	*semids_cap = ZS_LIMIT_NONE;
	*msgids_cap = ZS_LIMIT_NONE;
	*lofi_cap = ZS_LIMIT_NONE;

	*processes = 0;
	*lwps = 0;
	*shm = 0;
	*shmids = 0;
	*semids = 0;
	*msgids = 0;
	*lofi = 0;

	/* Get the ram cap first since it is a zone attr */
	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
	    ram_cap, sizeof (*ram_cap));
	if (ret < 0 || *ram_cap == 0)
		*ram_cap = ZS_LIMIT_NONE;

	/* Get the zone's default scheduling class */
	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
	    class, sizeof (class));
	if (ret < 0)
		return (-1);

	*sched = zsd_schedname2int(class, 0);

	/*
	 * Not every slot of vals is filled in below.  Zero the array so
	 * no uninitialized stack data is written to the pipe.
	 */
	bzero(vals, sizeof (vals));

	/* rctl caps must be fetched from within the zone */
	if (pipe(p) != 0)
		return (-1);

	if ((tmpl_fd = init_template()) == -1) {
		(void) close(p[0]);
		(void) close(p[1]);
		return (-1);
	}
	pid = forkx(0);
	if (pid < 0) {
		(void) ct_tmpl_clear(tmpl_fd);
		/* Do not leak the template descriptor */
		(void) close(tmpl_fd);
		(void) close(p[0]);
		(void) close(p[1]);
		return (-1);
	}
	if (pid == 0) {

		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);
		(void) close(p[0]);
		if (zone->zsz_id != getzoneid()) {
			if (zone_enter(zone->zsz_id) < 0) {
				(void) close(p[1]);
				_exit(0);
			}
		}

		/* Get caps for zone, and write them to zonestatd parent. */
		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");

		if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
		    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
			(void) close(p[1]);
			_exit(1);
		}

		(void) close(p[1]);
		_exit(0);
	}
	if (contract_latest(&ct) == -1)
		ct = -1;

	/*
	 * The parent is done with the template and the write end of the
	 * pipe.  tmpl_fd is closed here and must not be closed again below.
	 */
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);
	(void) close(p[1]);

	/* Reap the child.  Only an interrupted wait is worth repeating. */
	while (waitpid(pid, NULL, 0) != pid) {
		if (errno != EINTR)
			break;
	}

	/* Read cap from child in zone */
	if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
	    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
		res = -1;
		goto cleanup;
	}

	/* Values arrive in the same order the child stored them */
	i = 0;
	*cpu_shares = vals[i++];
	*cpu_cap = vals[i++];
	*locked_cap = vals[i++];
	*vm_cap = vals[i++];
	*processes_cap = vals[i++];
	*processes = vals[i++];
	*lwps_cap = vals[i++];
	*lwps = vals[i++];
	*shm_cap = vals[i++];
	*shm = vals[i++];
	*shmids_cap = vals[i++];
	*shmids = vals[i++];
	*semids_cap = vals[i++];
	*semids = vals[i++];
	*msgids_cap = vals[i++];
	*msgids = vals[i++];
	*lofi_cap = vals[i++];
	*lofi = vals[i++];

	/* Interpret maximum values as no cap */
	if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
		*cpu_cap = ZS_LIMIT_NONE;
	if (*processes_cap == sys->zss_processes_max)
		*processes_cap = ZS_LIMIT_NONE;
	if (*lwps_cap == sys->zss_lwps_max)
		*lwps_cap = ZS_LIMIT_NONE;
	if (*shm_cap == sys->zss_shm_max)
		*shm_cap = ZS_LIMIT_NONE;
	if (*shmids_cap == sys->zss_shmids_max)
		*shmids_cap = ZS_LIMIT_NONE;
	if (*semids_cap == sys->zss_semids_max)
		*semids_cap = ZS_LIMIT_NONE;
	if (*msgids_cap == sys->zss_msgids_max)
		*msgids_cap = ZS_LIMIT_NONE;
	if (*lofi_cap == sys->zss_lofi_max)
		*lofi_cap = ZS_LIMIT_NONE;

cleanup:
	(void) close(p[0]);
	/* Abandon the child's contract, if it could be identified */
	if (ct != -1)
		(void) contract_abandon_id(ct);

	return (res);
}

/*
 * Update the current list of running zones.
 *
 * The list of zone ids is fetched with zone_list(), growing the id cache as
 * needed.  Each zone is then looked up (or inserted) and its flags,
 * pool/pset names, caps and usages are collected and recorded with
 * zsd_mark_zone_found().  A zone for which any of these steps fails is
 * skipped, which leaves it not marked as found for this interval.
 */
static void
zsd_refresh_zones(zsd_ctl_t *ctl)
{
	zsd_zone_t *zone;
	uint_t old, num;
	ushort_t flags;
	int i, ret;
	zoneid_t *cache;
	uint64_t cpu_shares;
	uint64_t cpu_cap;
	uint64_t ram_cap;
	uint64_t locked_cap;
	uint64_t vm_cap;
	uint64_t processes_cap;
	uint64_t processes;
	uint64_t lwps_cap;
	uint64_t lwps;
	uint64_t shm_cap;
	uint64_t shm;
	uint64_t shmids_cap;
	uint64_t shmids;
	uint64_t semids_cap;
	uint64_t semids;
	uint64_t msgids_cap;
	uint64_t msgids;
	uint64_t lofi_cap;
	uint64_t lofi;

	char zonename[ZS_ZONENAME_MAX];
	char poolname[ZS_POOLNAME_MAX];
	char psetname[ZS_PSETNAME_MAX];
	uint_t sched;
	uint_t cputype;
	uint_t iptype;

	/*
	 * Get the current list of running zones.  zone_list() reports the
	 * number of zones; if the cache was too small, grow it and ask again.
	 */
	for (;;) {
		old = num = ctl->zsctl_zone_ncache;
		(void) zone_list(ctl->zsctl_zone_cache, &num);
		if (num <= old)
			break;
		if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
		    (num) * sizeof (zoneid_t))) != NULL) {
			ctl->zsctl_zone_ncache = num;
			ctl->zsctl_zone_cache = cache;
		} else {
			/* Could not allocate to get new zone list.  Give up */
			return;
		}
	}

	zsd_mark_zones_start(ctl);

	for (i = 0; i < num; i++) {

		ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
		    zonename, sizeof (zonename));
		if (ret < 0)
			continue;

		zone = zsd_lookup_insert_zone(ctl, zonename,
		    ctl->zsctl_zone_cache[i]);
		/*
		 * The zone is dereferenced below, so skip it if no zone
		 * structure is available (presumably an allocation failure).
		 */
		if (zone == NULL)
			continue;

		ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
		    &flags, sizeof (flags));
		if (ret < 0)
			continue;

		if (flags & ZF_NET_EXCL)
			iptype = ZS_IPTYPE_EXCLUSIVE;
		else
			iptype = ZS_IPTYPE_SHARED;

		zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
		    psetname, sizeof (psetname), &cputype);

		if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
		    &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
		    &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
		    &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
		    &lofi, &sched) != 0)
			continue;

		zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
		    locked_cap, vm_cap, processes_cap, processes, lwps_cap,
		    lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
		    semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
		    psetname, sched, cputype, iptype);
	}
}

/*
 * Fetch the details of a process from its psinfo_t.
 *
 * Reports the process's current pset binding, zone and scheduler, the pset
 * and zone recorded for it on the previous call, and the cpu time it has
 * accumulated since then.  When no previous pset or zone is recorded, the
 * current one is reported as the previous one.  The per-pid cache entry is
 * then updated with the current values.
 */
static void
zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
    psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
    timestruc_t *delta, uint_t *sched)
{
	zsd_proc_t *cached = ctl->zsctl_proc_array + psinfo->pr_pid;
	timestruc_t used;

	/* Current and previous pset binding */
	*psetid = psinfo->pr_lwp.pr_bindpset;
	*prev_psetid = (cached->zspr_psetid == ZS_PSET_ERROR) ?
	    *psetid : cached->zspr_psetid;

	/* Current and previous zone */
	*zoneid = psinfo->pr_zoneid;
	*prev_zoneid = (cached->zspr_zoneid == -1) ?
	    *zoneid : cached->zspr_zoneid;

	/* Cpu time used since the cached value was recorded */
	TIMESTRUC_DELTA(used, psinfo->pr_time, cached->zspr_usage);
	*delta = used;

	*sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
	    psinfo->pr_lwp.pr_pri);

	/* Update cached data for proc */
	cached->zspr_ppid = psinfo->pr_ppid;
	cached->zspr_zoneid = psinfo->pr_zoneid;
	cached->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
	cached->zspr_sched = *sched;
	cached->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
	cached->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
}

/*
 * Forget the cpu usage cached for a process.  Called once a process has
 * exited, so that a later process that is handed the same pid does not
 * inherit usage figures from the pid's previous owner.
 */
static void
zsd_flush_proc_info(zsd_proc_t *proc)
{
	timestruc_t *cached_usage = &proc->zspr_usage;

	cached_usage->tv_nsec = 0;
	cached_usage->tv_sec = 0;
}

/*
 * Open the current extended accounting file.  On initialization, open the
 * file as the current file to be used.  Otherwise, open the file as the
 * next file to use if the current file reaches EOF.
 *
 * If process accounting is not enabled, one attempt is made to enable it.
 * A failure to open the accounting file is retried once per millisecond for
 * up to half a second, as the file may be in the middle of being rolled.
 *
 * Returns 0 on success, with the selected fd/ea_file_t/stat64 members of ctl
 * filled in and the matching "open" flag set to 1.  Returns -1 on failure,
 * with the fd set to -1 and the "open" flag cleared.
 */
static int
zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
{
	int ret, oret, state, trys = 0, open_trys = 0, flags;
	int *fd, *opened;
	ea_file_t *eaf;
	struct stat64 *st;
	char path[MAXPATHLEN];

	/*
	 * The accounting file is first opened at the tail.  Following
	 * opens to new accounting files are opened at the head.
	 */
	if (init == B_TRUE) {
		flags = EO_NO_VALID_HDR | EO_TAIL;
		fd = &ctl->zsctl_proc_fd;
		eaf = &ctl->zsctl_proc_eaf;
		st = &ctl->zsctl_proc_stat;
		opened = &ctl->zsctl_proc_open;
	} else {
		flags = EO_NO_VALID_HDR | EO_HEAD;
		fd = &ctl->zsctl_proc_fd_next;
		eaf = &ctl->zsctl_proc_eaf_next;
		st = &ctl->zsctl_proc_stat_next;
		opened = &ctl->zsctl_proc_open_next;
	}

	*fd = -1;
	*opened = 0;
retry:
	/* open accounting files for cpu consumption */
	ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
	if (ret != 0) {
		zsd_warn(gettext("Unable to get process accounting state"));
		goto err;
	}
	if (state != AC_ON) {
		/* Only one attempt is made to turn accounting on */
		if (trys > 0) {
			zsd_warn(gettext(
			    "Unable to enable process accounting"));
			goto err;
		}
		(void) zsd_enable_cpu_stats();
		trys++;
		goto retry;
	}

	ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
	if (ret != 0) {
		zsd_warn(gettext("Unable to get process accounting file"));
		goto err;
	}

	/*
	 * Reset oret on every pass.  It is only assigned when open64()
	 * succeeds, and the cleanup below must not act on an indeterminate
	 * value or on the result of an earlier pass.
	 */
	oret = -1;
	if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
	    (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
		ret = fstat64(*fd, st);

	if (*fd < 0 || oret < 0 || ret < 0) {
		struct timespec ts;

		/*
		 * It is possible the accounting file is momentarily unavailable
		 * because it is being rolled.  Try for up to half a second.
		 *
		 * If failure to open accounting file persists, give up.
		 */
		if (oret == 0)
			(void) ea_close(eaf);
		else if (*fd >= 0)
			(void) close(*fd);
		/*
		 * The descriptor has been released one way or the other
		 * (this file treats ea_close() as releasing the fd as well);
		 * forget it so that the err path cannot close it again.
		 */
		*fd = -1;

		/* Count the attempt so that the retry loop is bounded */
		if (++open_trys > 500) {
			zsd_warn(gettext(
			    "Unable to open process accounting file"));
			goto err;
		}
		/* wait one millisecond */
		ts.tv_sec = 0;
		ts.tv_nsec = NANOSEC / 1000;
		(void) nanosleep(&ts, NULL);
		goto retry;
	}
	*opened = 1;
	return (0);
err:
	if (*fd >= 0)
		(void) close(*fd);
	*opened = 0;
	*fd = -1;
	return (-1);
}

/*
 * Walk /proc and charge each process to its zone and processor set.
 * Then read exacct data for exited processes, and charge them as well.
 *
 * When init is B_TRUE only the per-process cache is primed (so that later
 * deltas start from zero); no usage is charged and no exacct records are
 * read.
 *
 * Exited processes whose pset cannot be determined directly are queued on
 * a local list and resolved afterwards, first by walking up their cached
 * parent chain, and finally by falling back on the zone's pset and
 * scheduling class.
 */
static void
zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
{
	DIR *dir;
	struct dirent *dent;
	psinfo_t psinfo;
	int fd, ret;
	zsd_proc_t *proc, *pproc, *tmp, *next;
	list_t pplist, plist;
	zsd_zone_t *zone, *prev_zone;
	zsd_pset_t *pset, *prev_pset;
	psetid_t psetid, prev_psetid;
	zoneid_t zoneid, prev_zoneid;
	zsd_pset_usage_t *usage, *prev_usage;
	char path[MAXPATHLEN];

	ea_object_t object;
	ea_object_t pobject;
	boolean_t hrtime_expired = B_FALSE;
	struct timeval interval_end;

	timestruc_t delta, d1, d2;
	uint_t sched = 0;

	/*
	 * Get the current accounting file.  The current accounting file
	 * may be different than the file in use, as the accounting file
	 * may have been rolled, or manually changed by an admin.
	 */
	ret = zsd_open_exacct(ctl, init);
	if (ret != 0) {
		zsd_warn(gettext("Unable to track process accounting"));
		return;
	}

	/*
	 * Mark the current time as the interval end time.  Don't track
	 * processes that exit after this time.
	 */
	(void) gettimeofday(&interval_end, NULL);

	dir = opendir("/proc");
	if (dir == NULL) {
		zsd_warn(gettext("Unable to open /proc"));
		/*
		 * The "next" accounting file was just opened above and will
		 * not be consumed; close it so that it is not leaked.
		 */
		if (init != B_TRUE && ctl->zsctl_proc_open_next) {
			(void) ea_close(&ctl->zsctl_proc_eaf_next);
			ctl->zsctl_proc_open_next = 0;
			ctl->zsctl_proc_fd_next = -1;
		}
		return;
	}

	/* Walk all processes and compute each zone's usage on each pset. */
	while ((dent = readdir(dir)) != NULL) {

		if (strcmp(dent->d_name, ".") == 0 ||
		    strcmp(dent->d_name, "..") == 0)
			continue;

		(void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
		    dent->d_name);

		fd = open(path, O_RDONLY);
		if (fd < 0)
			continue;

		if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
			(void) close(fd);
			continue;
		}
		(void) close(fd);

		zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
		    &zoneid, &prev_zoneid, &delta, &sched);

		/*
		 * Split the delta in two halves (d2 takes any odd remainder)
		 * in case the usage must be shared between the current and
		 * the previous zone/pset.
		 */
		d1.tv_sec = delta.tv_sec / 2;
		d1.tv_nsec = delta.tv_nsec / 2;
		d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
		d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);

		/* Get the zone and pset this process is running in */
		zone = zsd_lookup_zone_byid(ctl, zoneid);
		if (zone == NULL)
			continue;
		pset = zsd_lookup_pset_byid(ctl, psetid);
		if (pset == NULL)
			continue;
		usage = zsd_lookup_insert_usage(ctl, pset, zone);
		if (usage == NULL)
			continue;

		/*
		 * Get the usage of the previous zone and pset if they were
		 * different.
		 */
		if (zoneid != prev_zoneid)
			prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
		else
			prev_zone = NULL;

		if (psetid != prev_psetid)
			prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
		else
			prev_pset = NULL;

		prev_usage = NULL;
		if (prev_zone != NULL || prev_pset != NULL) {
			if (prev_zone == NULL)
				prev_zone = zone;
			if (prev_pset == NULL)
				prev_pset = pset;

			prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
			    prev_zone);
		}

		/* Update the usage with the processes info */
		if (prev_usage == NULL) {
			zsd_mark_pset_usage_found(usage, sched);
		} else {
			zsd_mark_pset_usage_found(usage, sched);
			zsd_mark_pset_usage_found(prev_usage, sched);
		}

		/*
		 * First time around is just to get a starting point.  All
		 * usages will be zero.
		 */
		if (init == B_TRUE)
			continue;

		if (prev_usage == NULL) {
			zsd_add_usage(ctl, usage, &delta);
		} else {
			zsd_add_usage(ctl, usage, &d1);
			zsd_add_usage(ctl, prev_usage, &d2);
		}
	}
	(void) closedir(dir);

	/*
	 * No need to collect exited proc data on initialization.  Just
	 * caching the usage of the known processes to get a zero starting
	 * point.
	 */
	if (init == B_TRUE)
		return;

	/*
	 * Add accounting records to account for processes which have
	 * exited.
	 */
	list_create(&plist, sizeof (zsd_proc_t),
	    offsetof(zsd_proc_t, zspr_next));
	list_create(&pplist, sizeof (zsd_proc_t),
	    offsetof(zsd_proc_t, zspr_next));

	for (;;) {
		pid_t pid;
		pid_t ppid;
		timestruc_t user, sys, proc_usage;
		timestruc_t finish;
		int numfound = 0;

		bzero(&object, sizeof (object));
		proc = NULL;
		zone = NULL;
		pset = NULL;
		usage = NULL;
		ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
		if (ret == EO_ERROR) {
			if (ea_error() == EXR_EOF) {

				struct stat64 *stat;
				struct stat64 *stat_next;

				/*
				 * See if the next accounting file is the
				 * same as the current accounting file.
				 */
				stat = &(ctl->zsctl_proc_stat);
				stat_next = &(ctl->zsctl_proc_stat_next);
				if (stat->st_ino == stat_next->st_ino &&
				    stat->st_dev == stat_next->st_dev) {
					/*
					 * End of current accounting file is
					 * reached, so finished.  Clear EOF
					 * bit for next time around.
					 */
					ea_clear(&ctl->zsctl_proc_eaf);
					break;
				} else {
					/*
					 * Accounting file has changed.  Move
					 * to current accounting file.
					 */
					(void) ea_close(&ctl->zsctl_proc_eaf);

					ctl->zsctl_proc_fd =
					    ctl->zsctl_proc_fd_next;
					ctl->zsctl_proc_eaf =
					    ctl->zsctl_proc_eaf_next;
					ctl->zsctl_proc_stat =
					    ctl->zsctl_proc_stat_next;

					ctl->zsctl_proc_fd_next = -1;
					ctl->zsctl_proc_open_next = 0;
					continue;
				}
			} else {
				/*
				 * Other accounting error.  Give up on
				 * accounting.
				 */
				goto ea_err;
			}
		}
		/* Skip if not a process group */
		if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
		    (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
			(void) ea_free_item(&object, EUP_ALLOC);
			continue;
		}

		/* The process group entry should be complete */
		while (numfound < 9) {
			bzero(&pobject, sizeof (pobject));
			ret = ea_get_object(&ctl->zsctl_proc_eaf,
			    &pobject);
			if (ret < 0) {
				(void) ea_free_item(&object, EUP_ALLOC);
				zsd_warn(
				    "unable to get process accounting data");
				goto ea_err;
			}
			/* Next entries should be process data */
			if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
			    EXT_GROUP) {
				(void) ea_free_item(&object, EUP_ALLOC);
				(void) ea_free_item(&pobject, EUP_ALLOC);
				zsd_warn(
				    "process data of wrong type");
				goto ea_err;
			}
			switch (pobject.eo_catalog & EXD_DATA_MASK) {
			case EXD_PROC_PID:
				pid = pobject.eo_item.ei_uint32;
				proc = &(ctl->zsctl_proc_array[pid]);
				/*
				 * This process should not be currently in
				 * the list of processes to process.
				 */
				assert(!list_link_active(&proc->zspr_next));
				numfound++;
				break;
			case EXD_PROC_ANCPID:
				ppid = pobject.eo_item.ei_uint32;
				pproc = &(ctl->zsctl_proc_array[ppid]);
				numfound++;
				break;
			case EXD_PROC_ZONENAME:
				zone = zsd_lookup_zone(ctl,
				    pobject.eo_item.ei_string, -1);
				numfound++;
				break;
			case EXD_PROC_CPU_USER_SEC:
				user.tv_sec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			case EXD_PROC_CPU_USER_NSEC:
				user.tv_nsec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			case EXD_PROC_CPU_SYS_SEC:
				sys.tv_sec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			case EXD_PROC_CPU_SYS_NSEC:
				sys.tv_nsec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			case EXD_PROC_FINISH_SEC:
				finish.tv_sec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			case EXD_PROC_FINISH_NSEC:
				finish.tv_nsec =
				    pobject.eo_item.ei_uint64;
				numfound++;
				break;
			}
			(void) ea_free_item(&pobject, EUP_ALLOC);
		}
		(void) ea_free_item(&object, EUP_ALLOC);
		if (numfound != 9) {
			zsd_warn(gettext(
			    "Malformed process accounting entry found"));
			goto proc_done;
		}

		/*
		 * A process that finished after the interval end marks the
		 * last record to consume on this pass.
		 */
		if (finish.tv_sec > interval_end.tv_sec ||
		    (finish.tv_sec == interval_end.tv_sec &&
		    finish.tv_nsec > (interval_end.tv_usec * 1000)))
			hrtime_expired = B_TRUE;

		/*
		 * Try to identify the zone and pset to which this
		 * exited process belongs.
		 */
		if (zone == NULL)
			goto proc_done;

		/* Save proc info */
		proc->zspr_ppid = ppid;
		proc->zspr_zoneid = zone->zsz_id;

		prev_psetid = ZS_PSET_ERROR;
		sched = 0;

		/*
		 * The following tries to deduce the processes pset.
		 *
		 * First choose pset and sched using cached value from the
		 * most recent time the process has been seen.
		 *
		 * pset and sched can change across zone_enter, so make sure
		 * most recent sighting of this process was in the same
		 * zone before using most recent known value.
		 *
		 * If there is no known value, use value of processes
		 * parent.  If parent is unknown, walk parents until a known
		 * parent is found.
		 *
		 * If no parent in the zone is found, use the zone's default
		 * pset and scheduling class.
		 */
		if (proc->zspr_psetid != ZS_PSET_ERROR) {
			prev_psetid = proc->zspr_psetid;
			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
			sched = proc->zspr_sched;
		} else if (pproc->zspr_zoneid == zone->zsz_id &&
		    pproc->zspr_psetid != ZS_PSET_ERROR) {
			prev_psetid = pproc->zspr_psetid;
			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
			sched = pproc->zspr_sched;
		}

		if (pset == NULL) {
			/*
			 * Process or processes parent has never been seen.
			 * Save to deduce a known parent later.
			 */
			proc_usage = sys;
			TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
			TIMESTRUC_DELTA(delta, proc_usage,
			    proc->zspr_usage);
			proc->zspr_usage = delta;
			list_insert_tail(&plist, proc);
			continue;
		}

		/* Add the zone's usage to the pset */
		usage = zsd_lookup_insert_usage(ctl, pset, zone);
		if (usage == NULL)
			goto proc_done;

		zsd_mark_pset_usage_found(usage, sched);

		/* compute the usage to add for the exited proc */
		proc_usage = sys;
		TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
		TIMESTRUC_DELTA(delta, proc_usage,
		    proc->zspr_usage);

		zsd_add_usage(ctl, usage, &delta);
proc_done:
		zsd_flush_proc_info(proc);

		if (hrtime_expired == B_TRUE)
			break;
	}
	/*
	 * close next accounting file.
	 */
	if (ctl->zsctl_proc_open_next) {
		(void) ea_close(
		    &ctl->zsctl_proc_eaf_next);
		ctl->zsctl_proc_open_next = 0;
		ctl->zsctl_proc_fd_next = -1;
	}

	/* For the remaining processes, use pset and sched of a known parent */
	proc = list_head(&plist);
	while (proc != NULL) {
		next = proc;
		for (;;) {
			if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
				/*
				 * Kernel process, or parent is unknown, skip
				 * process, remove from process list.
				 *
				 * The entry must really be unlinked, as in
				 * every other branch below.  Merely
				 * re-initializing its link would leave the
				 * list head and the neighbouring entries
				 * pointing at it.
				 */
				tmp = proc;
				proc = list_next(&plist, proc);
				list_remove(&plist, tmp);
				list_link_init(&tmp->zspr_next);
				break;
			}
			pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
			if (pproc->zspr_zoneid != proc->zspr_zoneid) {
				/*
				 * Parent in different zone.  Save process and
				 * use zone's default pset and sched below
				 */
				tmp = proc;
				proc = list_next(&plist, proc);
				list_remove(&plist, tmp);
				list_insert_tail(&pplist, tmp);
				break;
			}
			/* Parent has unknown pset, Search parent's parent  */
			if (pproc->zspr_psetid == ZS_PSET_ERROR) {
				next = pproc;
				continue;
			}
			/* Found parent with known pset.  Use its info */
			proc->zspr_psetid = pproc->zspr_psetid;
			proc->zspr_sched = pproc->zspr_sched;
			next->zspr_psetid = pproc->zspr_psetid;
			next->zspr_sched = pproc->zspr_sched;
			zone = zsd_lookup_zone_byid(ctl,
			    proc->zspr_zoneid);
			if (zone == NULL) {
				tmp = proc;
				proc = list_next(&plist, proc);
				list_remove(&plist, tmp);
				list_link_init(&tmp->zspr_next);
				break;
			}
			pset = zsd_lookup_pset_byid(ctl,
			    proc->zspr_psetid);
			if (pset == NULL) {
				tmp = proc;
				proc = list_next(&plist, proc);
				list_remove(&plist, tmp);
				list_link_init(&tmp->zspr_next);
				break;
			}
			/* Add the zone's usage to the pset */
			usage = zsd_lookup_insert_usage(ctl, pset, zone);
			if (usage == NULL) {
				tmp = proc;
				proc = list_next(&plist, proc);
				list_remove(&plist, tmp);
				list_link_init(&tmp->zspr_next);
				break;
			}
			zsd_mark_pset_usage_found(usage, proc->zspr_sched);
			zsd_add_usage(ctl, usage, &proc->zspr_usage);
			zsd_flush_proc_info(proc);
			tmp = proc;
			proc = list_next(&plist, proc);
			list_remove(&plist, tmp);
			list_link_init(&tmp->zspr_next);
			break;
		}
	}
	/*
	 * Process has never been seen.  Using zone info to
	 * determine pset and scheduling class.
	 */
	proc = list_head(&pplist);
	while (proc != NULL) {

		zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
		if (zone == NULL)
			goto next;
		if (zone->zsz_psetid != ZS_PSET_ERROR &&
		    zone->zsz_psetid != ZS_PSET_MULTI) {
			prev_psetid = zone->zsz_psetid;
			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
		} else {
			pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
			if (pset != NULL)
				prev_psetid = pset->zsp_id;
		}
		if (pset == NULL)
			goto next;

		sched = zone->zsz_scheds;
		/*
		 * Ignore FX high scheduling class if it is not the
		 * only scheduling class in the zone.
		 */
		if (sched != ZS_SCHED_FX_60)
			sched &= (~ZS_SCHED_FX_60);
		/*
		 * If more than one scheduling class has been found
		 * in the zone, use zone's default scheduling class for
		 * this process.
		 */
		if ((sched & (sched - 1)) != 0)
			sched = zone->zsz_default_sched;

		/* Add the zone's usage to the pset */
		usage = zsd_lookup_insert_usage(ctl, pset, zone);
		if (usage == NULL)
			goto next;

		zsd_mark_pset_usage_found(usage, sched);
		zsd_add_usage(ctl, usage, &proc->zspr_usage);
next:
		/*
		 * Every entry of pplist is visited exactly once and nothing
		 * is unlinked from it, so resetting each link in turn leaves
		 * no entry linked when the walk completes.
		 */
		tmp = proc;
		proc = list_next(&pplist, proc);
		zsd_flush_proc_info(tmp);
		list_link_init(&tmp->zspr_next);
	}
	return;
ea_err:
	/*
	 * Close the next accounting file if we have not transitioned to it
	 * yet.
	 */
	if (ctl->zsctl_proc_open_next) {
		(void) ea_close(&ctl->zsctl_proc_eaf_next);
		ctl->zsctl_proc_open_next = 0;
		ctl->zsctl_proc_fd_next = -1;
	}
	/*
	 * Processes queued for parent lookup before the error are still
	 * linked into plist, which lives on this stack frame.  Unlink them
	 * and drop their saved usage so that no entry is left with an active
	 * link or with stale data from the exited process.
	 */
	while ((proc = list_head(&plist)) != NULL) {
		list_remove(&plist, proc);
		zsd_flush_proc_info(proc);
	}
}

/*
 * getvmusage(2) uses size_t's in the passed data structure, which differ
 * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
 * and zonestatd does not necessarily match the kernel's bitness, marshal
 * results appropriately.
 *
 * buf receives the results as an array of zsd_vmusage64_t, regardless of
 * kernel bitness.  On entry *nres is the number of entries buf can hold; on
 * return it holds the count reported back by the system call.  The return
 * value is that of the underlying system call.
 */
static int
zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
    uint64_t *nres)
{
	zsd_vmusage32_t *vmu32;
	zsd_vmusage64_t *vmu64;
	zsd_vmusage32_t src;
	uint32_t nres32;
	int i;
	int ret;

	if (ctl->zsctl_kern_bits == 32)  {
		nres32 = *nres;
		ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
		    flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
		*nres = nres32;
		if (ret == 0 && buf != NULL) {
			/*
			 * An array of vmusage32_t's has been returned.
			 * Convert it to an array of vmusage64_t's.
			 *
			 * The conversion is done in place, from the last
			 * entry to the first, so that widening entry i never
			 * overwrites a 32 bit entry that has yet to be
			 * converted.
			 */
			vmu32 = (zsd_vmusage32_t *)buf;
			vmu64 = (zsd_vmusage64_t *)buf;
			for (i = nres32 - 1; i >= 0; i--) {
				/*
				 * The 64 bit entry can overlap its own 32 bit
				 * source (always the case for entry 0), so
				 * take a private copy of the source before
				 * storing any widened member.
				 */
				src = vmu32[i];

				vmu64[i].vmu_zoneid = src.vmu_zoneid;
				vmu64[i].vmu_type = src.vmu_type;
				vmu64[i].vmu_id = src.vmu_id;
				vmu64[i].vmu_rss_all = src.vmu_rss_all;
				vmu64[i].vmu_rss_private =
				    src.vmu_rss_private;
				vmu64[i].vmu_rss_shared =
				    src.vmu_rss_shared;
				vmu64[i].vmu_swap_all = src.vmu_swap_all;
				vmu64[i].vmu_swap_private =
				    src.vmu_swap_private;
				vmu64[i].vmu_swap_shared =
				    src.vmu_swap_shared;
			}
		}
		return (ret);
	} else {
		/*
		 * kernel is 64 bit, so use 64 bit structures as zonestat
		 * expects.
		 */
		return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
		    flags, age, (uintptr_t)buf, (uintptr_t)nres));

	}
}

/*
 * Update the current physical, virtual, and locked memory usage of the
 * running zones.
 *
 * Nothing is done on initialization.  Otherwise the following sources are
 * read, in order: the disk swap device list, the unix:0:system_pages and
 * (if present) zfs:0:arcstats kstats, swap reservation info, getvmusage
 * results for the system and all zones, and the per-zone swapresv and
 * lockedmem "caps" kstats.  The system-wide totals in ctl->zsctl_system and
 * the per-zone usage fields are then updated.
 *
 * When a source cannot be read, either the values from the most recent
 * successful read are reused or the function returns early, leaving the
 * remaining statistics unchanged.
 */
static void
zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
{

	uint64_t phys_total;
	uint64_t phys_used;
	uint64_t phys_zones;
	uint64_t phys_zones_overcount;
	uint64_t phys_zones_extra;
	uint64_t phys_zones_credit;

	uint64_t vm_free;
	uint64_t vm_used;

	uint64_t disk_swap_total;
	uint64_t disk_swap_used;	/* disk swap with contents */

	uint64_t physmem;
	uint64_t pp_kernel;
	uint64_t arc_size = 0;
	struct anoninfo ani;

	int num_swap_devices;
	struct swaptable *swt;
	struct swapent *swent;
	size_t swt_size;
	char *path;

	zsd_vmusage64_t *vmusage;
	uint64_t num_vmusage;

	int i, ret;

	zsd_system_t *sys;
	zsd_zone_t *zone;
	int vmu_nzones;

	kstat_t *kstat;
	char kstat_name[KSTAT_STRLEN];
	kstat_named_t *knp;
	kid_t kid;

	if (init)
		return;

	sys = ctl->zsctl_system;

	/*
	 * Defaults used whenever the vmusage data cannot be read or holds no
	 * system-wide entry: keep the previous zone ram total and apply no
	 * credit to the zones.
	 */
	phys_zones = sys->zss_ram_zones;
	phys_zones_credit = 0;

	/* interrogate swap devices to find the amount of disk swap */
disk_swap_again:
	num_swap_devices = swapctl(SC_GETNSWP, NULL);

	if (num_swap_devices == 0) {
		sys->zss_swap_total = disk_swap_total = 0;
		sys->zss_swap_used = disk_swap_used = 0;
		/* No disk swap */
		goto disk_swap_done;
	}
	/* see if swap table needs to be larger */
	if (num_swap_devices > ctl->zsctl_swap_cache_num) {
		swt_size = sizeof (int) +
		    (num_swap_devices * sizeof (struct swapent)) +
		    (num_swap_devices * MAXPATHLEN);
		/*
		 * Discard the old table and forget about it right away, so
		 * that a failed allocation below cannot leave a dangling
		 * cache pointer behind for the next refresh.
		 */
		if (ctl->zsctl_swap_cache != NULL)
			free(ctl->zsctl_swap_cache);
		ctl->zsctl_swap_cache = NULL;
		ctl->zsctl_swap_cache_size = 0;
		ctl->zsctl_swap_cache_num = 0;

		swt = (struct swaptable *)malloc(swt_size);
		if (swt == NULL) {
			/*
			 * Could not allocate to get list of swap devices.
			 * Just use data from the most recent read, which will
			 * be zero if this is the first read.
			 */
			zsd_warn(gettext("Unable to allocate to determine "
			    "virtual memory"));
			disk_swap_total = sys->zss_swap_total;
			disk_swap_used = sys->zss_swap_used;
			goto disk_swap_done;
		}
		/*
		 * The path buffers live in the same allocation, after the
		 * array of swap entries.
		 */
		swent = swt->swt_ent;
		path = (char *)swt + (sizeof (int) +
		    num_swap_devices * sizeof (swapent_t));
		for (i = 0; i < num_swap_devices; i++, swent++) {
			swent->ste_path = path;
			path += MAXPATHLEN;
		}
		swt->swt_n = num_swap_devices;
		ctl->zsctl_swap_cache = swt;
		ctl->zsctl_swap_cache_size = swt_size;
		ctl->zsctl_swap_cache_num = num_swap_devices;
	}
	num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
	if (num_swap_devices < 0) {
		/* More swap devices have arrived */
		if (errno == ENOMEM)
			goto disk_swap_again;

		zsd_warn(gettext("Unable to determine disk swap devices"));
		/* Unexpected error.  Use existing data */
		disk_swap_total = sys->zss_swap_total;
		disk_swap_used = sys->zss_swap_used;
		goto disk_swap_done;
	}

	/* add up the disk swap */
	disk_swap_total = 0;
	disk_swap_used = 0;
	swent = ctl->zsctl_swap_cache->swt_ent;
	for (i = 0; i < num_swap_devices; i++, swent++) {
		disk_swap_total += swent->ste_pages;
		disk_swap_used += (swent->ste_pages - swent->ste_free);
	}
	disk_swap_total *= ctl->zsctl_pagesize;
	disk_swap_used *= ctl->zsctl_pagesize;

	sys->zss_swap_total = disk_swap_total;
	sys->zss_swap_used = disk_swap_used;

disk_swap_done:

	/* get system pages kstat */
	kid = -1;
	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
	if (kstat == NULL)
		zsd_warn(gettext("Unable to lookup system pages kstat"));
	else
		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);

	if (kid == -1) {
		zsd_warn(gettext("Unable to read system pages kstat"));
		return;
	} else {
		/*
		 * Both values are required by the computations below, so give
		 * up if either is missing rather than continue with an
		 * indeterminate value.
		 */
		knp = kstat_data_lookup(kstat, "physmem");
		if (knp == NULL) {
			zsd_warn(gettext("Unable to read physmem"));
			return;
		} else {
			if (knp->data_type == KSTAT_DATA_UINT64)
				physmem = knp->value.ui64;
			else if (knp->data_type == KSTAT_DATA_UINT32)
				physmem = knp->value.ui32;
			else
				return;
		}
		knp = kstat_data_lookup(kstat, "pp_kernel");
		if (knp == NULL) {
			zsd_warn(gettext("Unable to read pp_kernel"));
			return;
		} else {
			if (knp->data_type == KSTAT_DATA_UINT64)
				pp_kernel = knp->value.ui64;
			else if (knp->data_type == KSTAT_DATA_UINT32)
				pp_kernel = knp->value.ui32;
			else
				return;
		}
	}
	physmem *= ctl->zsctl_pagesize;
	pp_kernel *= ctl->zsctl_pagesize;

	/* get the zfs arc size if available */
	arc_size = 0;
	kid = -1;
	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
	if (kstat != NULL)
		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
	if (kid != -1) {
		knp = kstat_data_lookup(kstat, "size");
		if (knp != NULL)
			if (knp->data_type == KSTAT_DATA_UINT64)
				arc_size = knp->value.ui64;
	}

	/* Try to get swap information */
	if (swapctl(SC_AINFO, &ani) < 0) {
		zsd_warn(gettext("Unable to get swap info"));
		return;
	}

vmusage_again:
	/* getvmusage to get physical memory usage */
	vmusage = ctl->zsctl_vmusage_cache;
	num_vmusage = ctl->zsctl_vmusage_cache_num;

	ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
	    vmusage, &num_vmusage);

	if (ret != 0) {
		/* Unexpected error.  Use existing data */
		if (errno != EOVERFLOW) {
			zsd_warn(gettext(
			    "Unable to read physical memory usage"));
			phys_zones = sys->zss_ram_zones;
			goto vmusage_done;
		}
	}
	/* vmusage results cache too small */
	if (num_vmusage > ctl->zsctl_vmusage_cache_num) {

		size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;

		/*
		 * As with the swap table, forget the old cache as soon as it
		 * is freed so that a failed allocation cannot leave a
		 * dangling pointer behind.
		 */
		if (ctl->zsctl_vmusage_cache != NULL)
			free(ctl->zsctl_vmusage_cache);
		ctl->zsctl_vmusage_cache = NULL;
		ctl->zsctl_vmusage_cache_num = 0;
		vmusage = (zsd_vmusage64_t *)malloc(size);
		if (vmusage == NULL) {
			zsd_warn(gettext("Unable to alloc to determine "
			    "physical memory usage"));
			phys_zones = sys->zss_ram_zones;
			goto vmusage_done;
		}
		ctl->zsctl_vmusage_cache = vmusage;
		ctl->zsctl_vmusage_cache_num = num_vmusage;
		goto vmusage_again;
	}

	phys_zones_overcount = 0;
	vmu_nzones = 0;
	for (i = 0; i < num_vmusage; i++) {
		switch (vmusage[i].vmu_type) {
		case VMUSAGE_SYSTEM:
			/* total pages backing user process mappings */
			phys_zones = sys->zss_ram_zones =
			    vmusage[i].vmu_rss_all;
			break;
		case VMUSAGE_ZONE:
			vmu_nzones++;
			phys_zones_overcount += vmusage[i].vmu_rss_all;
			zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
			if (zone != NULL)
				zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
			break;
		default:
			break;
		}
	}
	/*
	 * Figure how much memory was double counted due to text sharing
	 * between zones.  Credit this back so that the sum of the zones
	 * equals the total zone ram usage.
	 *
	 * No credit is given when no zone was reported (which would divide
	 * by zero) or when the zones add up to no more than the system
	 * total (which would wrap the unsigned subtraction).
	 */
	if (vmu_nzones > 0 && phys_zones_overcount > phys_zones) {
		phys_zones_extra = phys_zones_overcount - phys_zones;
		phys_zones_credit = phys_zones_extra / vmu_nzones;
	}

vmusage_done:

	/* walk the zones to get swap and locked kstats.  Fetch ram cap. */
	sys->zss_locked_zones = 0;
	sys->zss_vm_zones = 0;
	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
	    zone = list_next(&ctl->zsctl_zones, zone)) {

		/* If zone halted during interval, show memory usage as none */
		if (zone->zsz_active == B_FALSE ||
		    zone->zsz_deleted == B_TRUE) {
			zone->zsz_usage_ram = 0;
			zone->zsz_usage_vm = 0;
			zone->zsz_usage_locked = 0;
			continue;
		}

		if (phys_zones_credit > 0) {
			if (zone->zsz_usage_ram > phys_zones_credit) {
				zone->zsz_usage_ram -= phys_zones_credit;
			}
		}
		/*
		 * Get zone's swap usage.  Since zone could have halted,
		 * treats as zero if cannot read
		 */
		zone->zsz_usage_vm = 0;
		(void) snprintf(kstat_name, sizeof (kstat_name),
		    "swapresv_zone_%d", zone->zsz_id);
		kid = -1;
		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
		    zone->zsz_id, kstat_name);
		if (kstat != NULL)
			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
		if (kid != -1) {
			knp = kstat_data_lookup(kstat, "usage");
			if (knp != NULL &&
			    knp->data_type == KSTAT_DATA_UINT64) {
				zone->zsz_usage_vm = knp->value.ui64;
				sys->zss_vm_zones += knp->value.ui64;
			}
		}
		/*
		 * Get zone's locked usage.  Since zone could have halted,
		 * treats as zero if cannot read
		 */
		zone->zsz_usage_locked = 0;
		(void) snprintf(kstat_name, sizeof (kstat_name),
		    "lockedmem_zone_%d", zone->zsz_id);
		kid = -1;
		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
		    zone->zsz_id, kstat_name);
		if (kstat != NULL)
			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
		if (kid != -1) {
			knp = kstat_data_lookup(kstat, "usage");
			if (knp != NULL &&
			    knp->data_type == KSTAT_DATA_UINT64) {
				zone->zsz_usage_locked = knp->value.ui64;
				/*
				 * Since locked memory accounting for zones
				 * can double count ddi locked memory, cap each
				 * zone's locked usage at its ram usage.
				 */
				if (zone->zsz_usage_locked >
				    zone->zsz_usage_ram)
					zone->zsz_usage_locked =
					    zone->zsz_usage_ram;
				sys->zss_locked_zones +=
				    zone->zsz_usage_locked;
			}
		}
	}

	phys_total =
	    sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;

	phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
	    * ctl->zsctl_pagesize;

	/* Compute remaining statistics */
	sys->zss_ram_total = phys_total;
	sys->zss_ram_zones = phys_zones;
	sys->zss_ram_kern = phys_used - phys_zones - arc_size;

	/*
	 * The total for kernel locked memory should include
	 * segkp locked pages, but oh well.  The arc size is subtracted,
	 * as that physical memory is reclaimable.
	 */
	sys->zss_locked_kern = pp_kernel - arc_size;
	/*
	 * Add memory used by kernel startup and obp to kernel locked.  Both
	 * operands are unsigned, so compare them directly rather than
	 * testing the sign of their difference.
	 */
	if (phys_total > physmem)
		sys->zss_locked_kern += phys_total - physmem;

	/*
	 * Add in the portion of (RAM+DISK) that is not available as swap,
	 * and consider it swap used by the kernel.
	 */
	sys->zss_vm_total = phys_total + disk_swap_total;
	vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
	vm_used = sys->zss_vm_total - vm_free;
	sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
}

/*
 * Refresh the per-cpu statistics, then roll cpu time up into the processor
 * sets, the zones using them, and the system as a whole.
 *
 * For each pset, the time not attributed to zones is recorded as kernel
 * time, and the pset's elapsed time since the previous refresh is added to
 * every zone using the pset.  This tracks the maximum amount of cpu time
 * that a zone could have used.  Zones with cpu shares or a cpu cap also
 * accumulate the share time / cap time they were entitled to.
 *
 * On initialization only the per-cpu data and the hrtime stamps are
 * recorded.
 */
static void
zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
{
	zsd_system_t *sysdata;
	zsd_zone_t *zp;
	zsd_pset_usage_t *pu;
	zsd_cpu_t *cp;
	zsd_cpu_t *cp_next;
	zsd_pset_t *ps;
	timestruc_t total;
	timestruc_t elapsed;
	uint64_t nsec;

	/*
	 * Update the per-cpu kstat data.  The successor is fetched before
	 * each cpu is updated.
	 */
	for (cp = list_head(&ctl->zsctl_cpus); cp != NULL; cp = cp_next) {
		cp_next = list_next(&ctl->zsctl_cpus, cp);
		zsd_update_cpu_stats(ctl, cp);
	}

	/* Update the elapsed real time */
	nsec = gethrtime();
	if (init) {
		/*
		 * First time around: store hrtime for future comparison.
		 * All psets have zero time, so there is nothing to charge.
		 */
		ctl->zsctl_hrtime = nsec;
		ctl->zsctl_hrtime_prev = nsec;
		return;
	}

	/* Compute increase in hrtime since the most recent read */
	ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
	ctl->zsctl_hrtime = nsec;
	nsec -= ctl->zsctl_hrtime_prev;
	if (nsec > 0)
		TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, nsec);

	for (ps = list_head(&ctl->zsctl_psets); ps != NULL;
	    ps = list_next(&ctl->zsctl_psets, ps)) {

		if (ps->zsp_active == B_FALSE) {
			zsd_warn(gettext("Internal error,inactive pset found"));
			continue;
		}

		/* Used time of the pset: interrupt + kernel + user */
		total.tv_sec = 0;
		total.tv_nsec = 0;
		TIMESTRUC_ADD_TIMESTRUC(total, ps->zsp_intr);
		TIMESTRUC_ADD_TIMESTRUC(total, ps->zsp_kern);
		TIMESTRUC_ADD_TIMESTRUC(total, ps->zsp_user);

		/* Whatever the zones did not use is charged to the kernel */
		TIMESTRUC_DELTA(ps->zsp_usage_kern, total,
		    ps->zsp_usage_zones);
		if (ps->zsp_usage_kern.tv_sec < 0 ||
		    ps->zsp_usage_kern.tv_nsec < 0) {
			ps->zsp_usage_kern.tv_sec = 0;
			ps->zsp_usage_kern.tv_nsec = 0;
		}

		/* Adding idle time gives the pset's total elapsed time */
		TIMESTRUC_ADD_TIMESTRUC(total, ps->zsp_idle);

		/* Time elapsed on this pset since the previous refresh */
		TIMESTRUC_DELTA(elapsed, total, ps->zsp_total_time);

		for (pu = list_head(&ps->zsp_usage_list); pu != NULL;
		    pu = list_next(&ps->zsp_usage_list, pu)) {

			zp = pu->zsu_zone;
			if (!(pu->zsu_cpu_shares == ZS_LIMIT_NONE ||
			    pu->zsu_cpu_shares == ZS_SHARES_UNLIMITED ||
			    pu->zsu_cpu_shares == 0)) {
				/*
				 * The zone's share time is the elapsed pset
				 * time, in nanoseconds, scaled by the zone's
				 * shares over the pset's shares.
				 */
				nsec = elapsed.tv_sec;
				nsec *= NANOSEC;
				nsec += elapsed.tv_nsec;
				nsec *= pu->zsu_cpu_shares;
				nsec /= ps->zsp_cpu_shares;
				TIMESTRUC_ADD_NANOSEC(zp->zsz_share_time,
				    nsec);
			}

			/* Every zone using the pset is given the pset time */
			TIMESTRUC_ADD_TIMESTRUC(zp->zsz_pset_time, elapsed);

			zp->zsz_cpus_online += ps->zsp_online;
		}
		ps->zsp_total_time = total;
	}

	/* Track cap time for each zone that has a cpu cap */
	for (zp = list_head(&ctl->zsctl_zones); zp != NULL;
	    zp = list_next(&ctl->zsctl_zones, zp)) {

		if (zp->zsz_cpu_cap == ZS_LIMIT_NONE)
			continue;

		/* The cap is applied as a percentage of the elapsed hrtime */
		nsec = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
		nsec *= zp->zsz_cpu_cap;
		nsec = nsec / 100;
		TIMESTRUC_ADD_NANOSEC(zp->zsz_cap_time, nsec);
	}

	/* Same roll-up for the system as a whole */
	sysdata = ctl->zsctl_system;
	total.tv_sec = 0;
	total.tv_nsec = 0;
	TIMESTRUC_ADD_TIMESTRUC(total, sysdata->zss_intr);
	TIMESTRUC_ADD_TIMESTRUC(total, sysdata->zss_kern);
	TIMESTRUC_ADD_TIMESTRUC(total, sysdata->zss_user);

	/* kernel time on the system is total used time minus zone time */
	TIMESTRUC_DELTA(sysdata->zss_cpu_usage_kern, total,
	    sysdata->zss_cpu_usage_zones);
	if (sysdata->zss_cpu_usage_kern.tv_sec < 0 ||
	    sysdata->zss_cpu_usage_kern.tv_nsec < 0) {
		sysdata->zss_cpu_usage_kern.tv_sec = 0;
		sysdata->zss_cpu_usage_kern.tv_nsec = 0;
	}

	/* Total system elapsed time is used time plus idle time */
	TIMESTRUC_ADD_TIMESTRUC(total, sysdata->zss_idle);
	sysdata->zss_cpu_total_time = total;
}

/*
 * Saves current usage data to a cache that is read by libzonestat when
 * calling zs_usage_read().
 *
 * The cache is one contiguous allocation laid out in this order:
 *
 *	zs_usage_cache_t
 *	zs_usage_t
 *	zs_system_t
 *	zs_zone_t		one per zone (zsctl_nzones of them)
 *	zs_pset_t		one per pset (zsctl_npsets of them), each
 *				immediately followed by its own
 *	zs_pset_zone_t		entries (zsp_nusage of them)
 *
 * The allocation is zero-filled, so every pointer and every byte that is
 * not explicitly assigned below is NULL/zero.  When libzonestat reads the
 * cached data, it will set the pointers relative to its address space.
 *
 * Both zsuc_size and zsu_size describe the payload that starts at the
 * zs_usage_t, i.e. the allocation size minus the zs_usage_cache_t header.
 * zsd_stat_server() sizes its reply buffer from zsuc_size and returns
 * zsu_size bytes to the client, so the two values must agree.
 *
 * On allocation failure a warning is logged and the previously published
 * cache (if any) is left in place.  Otherwise the new cache replaces
 * g_usage_cache under g_usage_cache_lock, the generation number is
 * advanced, and any waiting clients are woken.
 */
static void
zsd_usage_cache_update(zsd_ctl_t *ctl)
{
	zs_usage_cache_t *cache;
	zs_usage_cache_t *old;
	zs_usage_t *usage;

	zs_system_t *sys;
	zsd_system_t *dsys;
	zs_zone_t *zone = NULL;
	zsd_zone_t *dzone;
	zs_pset_t *pset = NULL;
	zsd_pset_t *dpset;
	zs_pset_zone_t *pusage;
	zsd_pset_usage_t *dpusage;

	char *next;
	uint_t size, i, j;

	size =
	    sizeof (zs_usage_cache_t) +
	    sizeof (zs_usage_t) +
	    sizeof (zs_system_t) +
	    sizeof (zs_zone_t) * ctl->zsctl_nzones +
	    sizeof (zs_pset_t) *  ctl->zsctl_npsets +
	    sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;

	/*
	 * Zero-fill so that fields and padding that are never assigned do
	 * not carry stale heap contents out to clients.
	 */
	cache = (zs_usage_cache_t *)calloc(1, size);
	if (cache == NULL) {
		zsd_warn(gettext("Unable to allocate usage cache\n"));
		return;
	}

	/* "next" walks the allocation as each record is carved out of it */
	next = (char *)cache;
	cache->zsuc_size = size - sizeof (zs_usage_cache_t);
	next += sizeof (zs_usage_cache_t);

	/* LINTED */
	usage = cache->zsuc_usage = (zs_usage_t *)next;
	next += sizeof (zs_usage_t);
	usage->zsu_start = g_start;
	usage->zsu_hrstart = g_hrstart;
	usage->zsu_time = g_now;
	usage->zsu_hrtime = g_hrnow;
	usage->zsu_nzones = ctl->zsctl_nzones;
	usage->zsu_npsets = ctl->zsctl_npsets;
	usage->zsu_system = NULL;

	/* System-wide totals and limits */
	/* LINTED */
	sys = (zs_system_t *)next;
	next += sizeof (zs_system_t);
	dsys = ctl->zsctl_system;
	sys->zss_ram_total = dsys->zss_ram_total;
	sys->zss_ram_kern = dsys->zss_ram_kern;
	sys->zss_ram_zones = dsys->zss_ram_zones;
	sys->zss_locked_kern = dsys->zss_locked_kern;
	sys->zss_locked_zones = dsys->zss_locked_zones;
	sys->zss_vm_total = dsys->zss_vm_total;
	sys->zss_vm_kern = dsys->zss_vm_kern;
	sys->zss_vm_zones = dsys->zss_vm_zones;
	sys->zss_swap_total = dsys->zss_swap_total;
	sys->zss_swap_used = dsys->zss_swap_used;
	sys->zss_ncpus = dsys->zss_ncpus;
	sys->zss_ncpus_online = dsys->zss_ncpus_online;

	/* Note: the process limit reported to clients is the maximum pid */
	sys->zss_processes_max = dsys->zss_maxpid;
	sys->zss_lwps_max = dsys->zss_lwps_max;
	sys->zss_shm_max = dsys->zss_shm_max;
	sys->zss_shmids_max = dsys->zss_shmids_max;
	sys->zss_semids_max = dsys->zss_semids_max;
	sys->zss_msgids_max = dsys->zss_msgids_max;
	sys->zss_lofi_max = dsys->zss_lofi_max;

	sys->zss_processes = dsys->zss_processes;
	sys->zss_lwps = dsys->zss_lwps;
	sys->zss_shm = dsys->zss_shm;
	sys->zss_shmids = dsys->zss_shmids;
	sys->zss_semids = dsys->zss_semids;
	sys->zss_msgids = dsys->zss_msgids;
	sys->zss_lofi = dsys->zss_lofi;

	sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
	sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
	sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;

	/* One zs_zone_t per tracked zone */
	for (i = 0, dzone = list_head(&ctl->zsctl_zones);
	    i < ctl->zsctl_nzones;
	    i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
		/* LINTED */
		zone = (zs_zone_t *)next;
		next += sizeof (zs_zone_t);
		list_link_init(&zone->zsz_next);
		zone->zsz_system = NULL;

		(void) strlcpy(zone->zsz_name, dzone->zsz_name,
		    sizeof (zone->zsz_name));
		(void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
		    sizeof (zone->zsz_pool));
		(void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
		    sizeof (zone->zsz_pset));
		zone->zsz_id = dzone->zsz_id;
		zone->zsz_cputype = dzone->zsz_cputype;
		zone->zsz_iptype = dzone->zsz_iptype;
		zone->zsz_start = dzone->zsz_start;
		zone->zsz_hrstart = dzone->zsz_hrstart;
		zone->zsz_scheds = dzone->zsz_scheds;
		zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
		zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
		zone->zsz_ram_cap = dzone->zsz_ram_cap;
		zone->zsz_vm_cap = dzone->zsz_vm_cap;
		zone->zsz_locked_cap = dzone->zsz_locked_cap;
		zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
		zone->zsz_cpus_online = dzone->zsz_cpus_online;
		zone->zsz_pset_time = dzone->zsz_pset_time;
		zone->zsz_cap_time = dzone->zsz_cap_time;
		zone->zsz_share_time = dzone->zsz_share_time;
		zone->zsz_usage_ram = dzone->zsz_usage_ram;
		zone->zsz_usage_locked = dzone->zsz_usage_locked;
		zone->zsz_usage_vm = dzone->zsz_usage_vm;

		zone->zsz_processes_cap = dzone->zsz_processes_cap;
		zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
		zone->zsz_shm_cap = dzone->zsz_shm_cap;
		zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
		zone->zsz_semids_cap = dzone->zsz_semids_cap;
		zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
		zone->zsz_lofi_cap = dzone->zsz_lofi_cap;

		zone->zsz_processes = dzone->zsz_processes;
		zone->zsz_lwps = dzone->zsz_lwps;
		zone->zsz_shm = dzone->zsz_shm;
		zone->zsz_shmids = dzone->zsz_shmids;
		zone->zsz_semids = dzone->zsz_semids;
		zone->zsz_msgids = dzone->zsz_msgids;
		zone->zsz_lofi = dzone->zsz_lofi;
	}

	/* One zs_pset_t per pset, each followed by its per-zone usages */
	for (i = 0, dpset = list_head(&ctl->zsctl_psets);
	    i < ctl->zsctl_npsets;
	    i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
		/* LINTED */
		pset = (zs_pset_t *)next;
		next += sizeof (zs_pset_t);
		list_link_init(&pset->zsp_next);
		(void) strlcpy(pset->zsp_name, dpset->zsp_name,
		    sizeof (pset->zsp_name));
		pset->zsp_id = dpset->zsp_id;
		pset->zsp_cputype = dpset->zsp_cputype;
		pset->zsp_start = dpset->zsp_start;
		pset->zsp_hrstart = dpset->zsp_hrstart;
		pset->zsp_online = dpset->zsp_online;
		pset->zsp_size = dpset->zsp_size;
		pset->zsp_min = dpset->zsp_min;
		pset->zsp_max = dpset->zsp_max;
		pset->zsp_importance = dpset->zsp_importance;
		pset->zsp_scheds = dpset->zsp_scheds;
		pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
		pset->zsp_total_time = dpset->zsp_total_time;
		pset->zsp_usage_kern = dpset->zsp_usage_kern;
		pset->zsp_usage_zones = dpset->zsp_usage_zones;
		pset->zsp_nusage = dpset->zsp_nusage;
		/* Add pset usages for pset */
		for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
		    j < dpset->zsp_nusage;
		    j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
			/* LINTED */
			pusage = (zs_pset_zone_t *)next;
			next += sizeof (zs_pset_zone_t);
			/* pointers are computed by client */
			pusage->zspz_pset = NULL;
			pusage->zspz_zone = NULL;
			list_link_init(&pusage->zspz_next);
			pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
			pusage->zspz_start = dpusage->zsu_start;
			pusage->zspz_hrstart = dpusage->zsu_hrstart;
			pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
			pusage->zspz_scheds = dpusage->zsu_scheds;
			pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
		}
	}

	/* Update the current cache pointer */
	(void) mutex_lock(&g_usage_cache_lock);
	old = g_usage_cache;
	cache->zsuc_ref = 1;
	cache->zsuc_gen = g_gen_next;
	usage->zsu_gen = g_gen_next;
	/*
	 * Payload size only: the zs_usage_cache_t header precedes "usage"
	 * and is not part of what gets copied to clients.
	 */
	usage->zsu_size = cache->zsuc_size;
	g_usage_cache = cache;
	/* Drop the publisher's reference on the cache being replaced */
	if (old != NULL) {
		old->zsuc_ref--;
		if (old->zsuc_ref == 0)
			free(old);
	}
	g_gen_next++;
	/* Wake up any clients that are waiting for this calculation */
	if (g_usage_cache_kickers > 0) {
		(void) cond_broadcast(&g_usage_cache_wait);
	}
	(void) mutex_unlock(&g_usage_cache_lock);
}

/*
 * Take a reference on the currently published usage cache and return it.
 * Returns NULL, without touching any reference count, if no cache has
 * been published yet.
 *
 * As the name indicates, the caller is expected to already hold
 * g_usage_cache_lock.  A non-NULL result must eventually be released with
 * zsd_usage_cache_rele().
 */
static zs_usage_cache_t *
zsd_usage_cache_hold_locked()
{
	zs_usage_cache_t *ret;

	ret = g_usage_cache;
	if (ret != NULL)
		ret->zsuc_ref++;
	return (ret);
}

/*
 * Drop one reference on a usage cache.  The cache is freed when its
 * reference count reaches zero.  g_usage_cache_lock is acquired here, so
 * the caller must not already hold it.
 */
void
zsd_usage_cache_rele(zs_usage_cache_t *cache)
{
	(void) mutex_lock(&g_usage_cache_lock);
	if (--cache->zsuc_ref == 0) {
		free(cache);
	}
	(void) mutex_unlock(&g_usage_cache_lock);
}

/*
 * Close the handles held by zsd_open() and discard all tracked objects.
 *
 * The kstat handle, the extended accounting file and the pool
 * configuration are closed if they are open.  Every zone, pset and pset
 * usage is unlinked and freed.  Every tracked cpu is unlinked and its
 * structure is zeroed and marked unallocated (it is not freed here).
 * Finally cpu statistics are disabled.  The ctl structure itself is not
 * freed.
 */
void
zsd_close(zsd_ctl_t *ctl)
{
	zsd_zone_t *zp;
	zsd_pset_t *pp;
	zsd_pset_usage_t *up;
	zsd_cpu_t *cp;
	int cpuid;

	if (ctl->zsctl_kstat_ctl != NULL) {
		(void) kstat_close(ctl->zsctl_kstat_ctl);
		ctl->zsctl_kstat_ctl = NULL;
	}
	if (ctl->zsctl_proc_open != 0) {
		(void) ea_close(&ctl->zsctl_proc_eaf);
		ctl->zsctl_proc_open = 0;
		ctl->zsctl_proc_fd = -1;
	}
	if (ctl->zsctl_pool_conf != NULL) {
		if (ctl->zsctl_pool_status == POOL_ENABLED)
			(void) pool_conf_close(ctl->zsctl_pool_conf);
		ctl->zsctl_pool_status = POOL_DISABLED;
	}

	/* Free every zone */
	for (zp = list_head(&ctl->zsctl_zones); zp != NULL;
	    zp = list_head(&ctl->zsctl_zones)) {
		list_remove(&ctl->zsctl_zones, zp);
		free(zp);
		ctl->zsctl_nzones--;
	}

	/* Free every pset, emptying its usage list first */
	for (pp = list_head(&ctl->zsctl_psets); pp != NULL;
	    pp = list_head(&ctl->zsctl_psets)) {
		for (up = list_head(&pp->zsp_usage_list); up != NULL;
		    up = list_head(&pp->zsp_usage_list)) {
			list_remove(&pp->zsp_usage_list, up);
			ctl->zsctl_npset_usages--;
			free(up);
		}
		list_remove(&ctl->zsctl_psets, pp);
		free(pp);
		ctl->zsctl_npsets--;
	}

	/* Release all cpus being tracked, keeping only each cpu's id */
	for (cp = list_head(&ctl->zsctl_cpus); cp != NULL;
	    cp = list_head(&ctl->zsctl_cpus)) {
		list_remove(&ctl->zsctl_cpus, cp);
		cpuid = cp->zsc_id;
		bzero(cp, sizeof (zsd_cpu_t));
		cp->zsc_id = cpuid;
		cp->zsc_allocated = B_FALSE;
		cp->zsc_psetid = ZS_PSET_ERROR;
		cp->zsc_psetid_prev = ZS_PSET_ERROR;
	}

	/* The counters must agree with the now-empty lists */
	assert(ctl->zsctl_npset_usages == 0);
	assert(ctl->zsctl_npsets == 0);
	assert(ctl->zsctl_nzones == 0);
	(void) zsd_disable_cpu_stats();
}


/*
 * Update the utilization data for all zones and processor sets, then
 * publish the result to clients.
 *
 *	ctl		open statistics handle
 *	init		passed through to the memory, process and cpu
 *			refresh routines
 *	do_memory	B_TRUE to also refresh memory usage on this pass
 *
 * Always returns 0.
 */
static int
zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
{
	/* Pick up kstat chain changes and timestamp this pass */
	(void) kstat_chain_update(ctl->zsctl_kstat_ctl);
	(void) gettimeofday(&ctl->zsctl_timeofday, NULL);

	zsd_refresh_system(ctl);

	/*
	 * The memory calculation is expensive, so the caller decides
	 * whether this pass is due for one.
	 */
	if (do_memory == B_TRUE) {
		zsd_refresh_memory(ctl, init);
	}

	zsd_refresh_zones(ctl);
	zsd_refresh_psets(ctl);
	zsd_refresh_procs(ctl, init);
	zsd_refresh_cpu_stats(ctl, init);

	/*
	 * Retire objects that no longer exist.  Pset usages go first
	 * because they point to zone and pset objects.
	 */
	zsd_mark_pset_usages_end(ctl);
	zsd_mark_psets_end(ctl);
	zsd_mark_cpus_end(ctl);
	zsd_mark_zones_end(ctl);

	/* Save results for clients */
	zsd_usage_cache_update(ctl);

	/* Roll process accounting file */
	(void) zsd_roll_exacct();

	return (0);
}

/*
 * Get the system rctl, which is the upper most limit
 */
static uint64_t
zsd_get_system_rctl(char *name)
{
	rctlblk_t *rblk, *rblk_last;

	rblk = (rctlblk_t *)alloca(rctlblk_size());
	rblk_last = (rctlblk_t *)alloca(rctlblk_size());

	if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
		return (ZS_LIMIT_NONE);

	while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
		(void) bcopy(rblk, rblk_last, rctlblk_size());

	return (rctlblk_get_value(rblk_last));
}

/*
 * Open any necessary subsystems for collecting utilization data,
 * allocate and initialize data structures, and get initial utilization.
 *
 * If ctl is NULL a new control structure is allocated; otherwise the
 * given one is reused and only the members that are still NULL are
 * allocated.
 *
 * Returns the control structure on success.  On failure zsd_close() is
 * run on the control structure (if there is one), NULL is returned, and
 * errno is set to one of:
 *
 *	ENOMEM	out of memory
 *	EAGAIN	kstats could not be opened for a reason other than ENOMEM
 *	EINVAL  other error
 */
static zsd_ctl_t *
zsd_open(zsd_ctl_t *ctl)
{
	zsd_system_t *system;

	char path[MAXPATHLEN];
	struct statvfs svfs;
	int ret;
	int i;
	size_t size;
	int err;

	if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
	    sizeof (zsd_ctl_t))) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}

	/*
	 * Create the lists before anything that can fail.  The error path
	 * calls zsd_close(), which walks all three lists, so they must be
	 * valid (empty) lists by the time of the first "goto err".
	 */
	list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
	    offsetof(zsd_zone_t, zsz_next));

	list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
	    offsetof(zsd_pset_t, zsp_next));

	list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
	    offsetof(zsd_cpu_t, zsc_next));

	ctl->zsctl_proc_fd = -1;

	/* open kstats */
	if (ctl->zsctl_kstat_ctl == NULL &&
	    (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
		/* zsd_warn() may clobber errno; preserve it across the call */
		err = errno;
		zsd_warn(gettext("Unable to open kstats"));
		errno = err;
		if (errno != ENOMEM)
			errno = EAGAIN;
		goto err;
	}

	/*
	 * These are set when the accounting file is opened by
	 * zsd_update_procs()
	 */
	ctl->zsctl_proc_fd_next = -1;
	ctl->zsctl_proc_open = 0;
	ctl->zsctl_proc_open_next = 0;

	(void) zsd_enable_cpu_stats();

	/* Create structures to track usage */
	if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
	    calloc(1, sizeof (zsd_system_t))) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	system = ctl->zsctl_system;
	/* get the kernel bitness to know structure layout for getvmusage */
	ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
	if (ret < 0)
		ctl->zsctl_kern_bits = 32;
	else
		ctl->zsctl_kern_bits = 64;
	ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);

	/* The cpu array is indexed by cpu id: ids 0 .. zsctl_maxcpuid */
	size = sysconf(_SC_CPUID_MAX);
	ctl->zsctl_maxcpuid = size;
	if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
	    (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
		ctl->zsctl_cpu_array[i].zsc_id = i;
		ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
		ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
		ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
	}
	if (statvfs("/proc", &svfs) != 0 ||
	    strcmp("/proc", svfs.f_fstr) != 0) {
		zsd_warn(gettext("/proc not a procfs filesystem"));
		errno = EINVAL;
		goto err;
	}

	/*
	 * The proc array has one slot per possible pid, 0 .. MAXPID, so
	 * zsctl_maxproc is the number of elements, not the last index.
	 */
	size = sysconf(_SC_MAXPID) + 1;
	ctl->zsctl_maxproc = size;
	if (ctl->zsctl_proc_array == NULL &&
	    (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
	    sizeof (zsd_proc_t))) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	for (i = 0; i < ctl->zsctl_maxproc; i++) {
		list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
		ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
		ctl->zsctl_proc_array[i].zspr_zoneid = -1;
		ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
		ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
		ctl->zsctl_proc_array[i].zspr_ppid = -1;
	}

	if (ctl->zsctl_pool_conf == NULL &&
	    (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	ctl->zsctl_pool_status = POOL_DISABLED;
	ctl->zsctl_pool_changed = 0;

	/* Two pool values followed by a NULL terminator */
	if (ctl->zsctl_pool_vals[0] == NULL &&
	    (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	if (ctl->zsctl_pool_vals[1] == NULL &&
	    (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
		zsd_warn(gettext("Out of Memory"));
		errno = ENOMEM;
		goto err;
	}
	ctl->zsctl_pool_vals[2] = NULL;

	/*
	 * get system limits
	 */
	system->zss_maxpid = size = sysconf(_SC_MAXPID);
	system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
	system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
	system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
	system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
	system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
	system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
	system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");

	g_gen_next = 1;

	/* Take the initial sample; memory is not calculated on this pass */
	if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
		zsd_warn(gettext("Reading zone statistics failed"));

	return (ctl);
err:
	if (ctl)
		zsd_close(ctl);

	return (NULL);
}

/*
 * Copy utilization data to buffer, filtering data if non-global zone.
 *
 *	zid	zone id of the caller
 *	cache	held usage cache to copy from
 *	usage	destination buffer; zsd_stat_server() supplies one of
 *		cache->zsuc_size bytes
 *	is_gz	B_TRUE if the caller is in the global zone
 *
 * A global zone caller receives the cached payload unmodified.
 *
 * A non-global zone caller receives the system record, its own zone
 * record, and, for each pset in which the zone has a usage entry, the
 * pset record followed by the zone's single usage entry.  Usage that
 * belongs to other zones is folded into the "kern" figures so that the
 * totals stay unchanged, and the cpu counts are rebuilt from only the
 * psets that are reported.
 *
 * If the cache holds no record for zid, no zone record is emitted,
 * zsu_nzones is set to 0, and the zone's own usage is treated as zero.
 *
 * On return usage->zsu_size is the number of bytes written to usage.
 */
static void
zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
    boolean_t is_gz)
{
	zs_usage_t *cusage;
	zs_system_t *sys, *csys;
	zs_zone_t *zone, *czone;
	zs_zone_t nozone;
	zs_pset_t *pset, *cpset;
	zs_pset_zone_t *pz, *cpz, *foundpz;
	size_t size = 0, csize = 0;
	char *start, *cstart;
	int i, j;
	boolean_t found = B_FALSE;
	timestruc_t delta;

	cusage = cache->zsuc_usage;

	/* Privileged users in the global zone get everything */
	if (is_gz) {
		/*
		 * zsuc_size is the length of the payload that starts at
		 * cusage and is also the size of the destination buffer,
		 * so it is the correct bound for both the copy and the
		 * size reported to the client.
		 */
		(void) bcopy(cusage, usage, cache->zsuc_size);
		usage->zsu_size = cache->zsuc_size;
		return;
	}

	/*
	 * Zones just get their own usage.  "size" is the write offset in
	 * the output buffer and "csize" the read offset in the cache.
	 */
	start = (char *)usage;
	cstart = (char *)cusage;
	size += sizeof (zs_usage_t);
	csize += sizeof (zs_usage_t);

	usage->zsu_start = cusage->zsu_start;
	usage->zsu_hrstart = cusage->zsu_hrstart;
	usage->zsu_time = cusage->zsu_time;
	usage->zsu_hrtime = cusage->zsu_hrtime;
	usage->zsu_gen = cusage->zsu_gen;
	usage->zsu_nzones = 1;
	usage->zsu_npsets = 0;

	/* LINTED */
	sys = (zs_system_t *)(start + size);
	/* LINTED */
	csys = (zs_system_t *)(cstart + csize);
	size += sizeof (zs_system_t);
	csize += sizeof (zs_system_t);

	/* Save system limits but not usage */
	*sys = *csys;
	sys->zss_ncpus = 0;
	sys->zss_ncpus_online = 0;

	/* LINTED */
	zone = (zs_zone_t *)(start + size);
	/* LINTED */
	czone = (zs_zone_t *)(cstart + csize);
	/*
	 * Find the matching zone.  Every cached zone is stepped over even
	 * after a match so that csize ends up at the first pset record.
	 */
	for (i = 0; i < cusage->zsu_nzones; i++) {
		if (czone->zsz_id == zid) {
			*zone = *czone;
			size += sizeof (zs_zone_t);
			found = B_TRUE;
		}
		csize += sizeof (zs_zone_t);
		/* LINTED */
		czone = (zs_zone_t *)(cstart + csize);
	}
	if (found == B_FALSE) {
		/*
		 * Nothing was written to the zone slot.  Report no zones
		 * and do the adjustments below against an all-zero record
		 * rather than against uninitialized memory.
		 */
		bzero(&nozone, sizeof (nozone));
		zone = &nozone;
		usage->zsu_nzones = 0;
	}

	/* Everything used by other zones is reported as kernel usage */
	sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
	sys->zss_ram_zones = zone->zsz_usage_ram;

	sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
	sys->zss_vm_zones = zone->zsz_usage_vm;

	sys->zss_locked_kern += (sys->zss_locked_zones -
	    zone->zsz_usage_locked);
	sys->zss_locked_zones = zone->zsz_usage_locked;

	TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
	sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;

	/* LINTED */
	cpset = (zs_pset_t *)(cstart + csize);
	for (i = 0; i < cusage->zsu_npsets; i++) {
		csize += sizeof (zs_pset_t);
		/* LINTED */
		cpz = (zs_pset_zone_t *)(csize + cstart);
		/* Scan this pset's usages for the caller's zone */
		foundpz = NULL;
		for (j = 0; j < cpset->zsp_nusage; j++) {
			if (cpz->zspz_zoneid == zid)
				foundpz = cpz;

			csize += sizeof (zs_pset_zone_t);
			/* LINTED */
			cpz = (zs_pset_zone_t *)(csize + cstart);
		}
		if (foundpz != NULL) {
			/*
			 * Each reported pset gets its own output slot at
			 * the current write offset, followed directly by
			 * the zone's single usage entry.
			 */
			/* LINTED */
			pset = (zs_pset_t *)(start + size);
			size += sizeof (zs_pset_t);
			/* LINTED */
			pz = (zs_pset_zone_t *)(start + size);
			size += sizeof (zs_pset_zone_t);

			*pset = *cpset;
			*pz = *foundpz;

			/* Other zones' time in this pset counts as kernel */
			TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
			    pz->zspz_cpu_usage);
			TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
			pset->zsp_usage_zones = pz->zspz_cpu_usage;
			pset->zsp_nusage = 1;
			usage->zsu_npsets++;
			sys->zss_ncpus += pset->zsp_size;
			sys->zss_ncpus_online += pset->zsp_online;
		}
		/* LINTED */
		cpset = (zs_pset_t *)(cstart + csize);
	}
	usage->zsu_size = size;
}

/*
 * Answer a zsd_server() door call by storing a status code in the second
 * word of the caller's argument buffer and returning that buffer with no
 * descriptors.  The door thread is ended afterwards, so this does not
 * return to its caller.
 */
static void
zsd_server_send_status(char *argp, int status)
{
	/* LINTED */
	int *words = (int *)argp;

	words[1] = status;
	(void) door_return(argp, sizeof (int) * 2, NULL, 0);
	thr_exit(NULL);
}

/*
 * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
 * which reports new zones.
 *
 * The request is two ints: a command and one argument.
 *
 *	ZSD_CMD_CONNECT		argument is the client's ZS_VERSION.  The
 *				reply carries a status in the second word
 *				and, on ZSD_STATUS_OK, a descriptor for the
 *				statistics door.  The caller must have
 *				PRIV_PROC_INFO in its effective set.
 *	ZSD_CMD_NEW_ZONE	argument is passed to zsd_fattach_zone()
 *				along with the server door; the reply is
 *				empty.
 *
 * Any other command is answered with ZSD_STATUS_INTERNAL_ERROR.  Unref
 * notifications and requests of the wrong size get an empty reply.
 */
/* ARGSUSED */
static void
zsd_server(void *cookie, char *argp, size_t arg_size,
    door_desc_t *dp, uint_t n_desc)
{
	int *request, op;
	door_desc_t stat_desc;
	ucred_t *cred;
	const priv_set_t *effective;

	if (argp == DOOR_UNREF_DATA || arg_size != sizeof (op) * 2) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}

	/* LINTED */
	request = (int *)argp;
	op = request[0];

	/* If connection, return door to stat server */
	if (op == ZSD_CMD_CONNECT) {

		/* Verify client compilation version */
		if (request[1] != ZS_VERSION)
			zsd_server_send_status(argp,
			    ZSD_STATUS_VERSION_MISMATCH);

		/* Verify client permission */
		cred = alloca(ucred_size());
		if (door_ucred(&cred) != 0)
			zsd_server_send_status(argp,
			    ZSD_STATUS_INTERNAL_ERROR);

		effective = ucred_getprivset(cred, PRIV_EFFECTIVE);
		if (effective == NULL)
			zsd_server_send_status(argp,
			    ZSD_STATUS_INTERNAL_ERROR);

		if (!priv_ismember(effective, PRIV_PROC_INFO))
			zsd_server_send_status(argp, ZSD_STATUS_PERMISSION);

		/* Return stat server door */
		request[1] = ZSD_STATUS_OK;
		stat_desc.d_attributes = DOOR_DESCRIPTOR;
		stat_desc.d_data.d_desc.d_descriptor = g_stat_door;
		(void) door_return(argp, sizeof (op) * 2, &stat_desc, 1);
		thr_exit(NULL);
	}

	/* Respond to zoneadmd informing zonestatd of a new zone */
	if (op == ZSD_CMD_NEW_ZONE) {
		zsd_fattach_zone(request[1], g_server_door, B_FALSE);
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}

	/* Unknown command */
	zsd_server_send_status(argp, ZSD_STATUS_INTERNAL_ERROR);
}

/*
 * Respond to libzonestat.so clients with the current utilization data.
 *
 * The request is two uint64_t words, the first of which must be
 * ZSD_CMD_READ.  The caller must have PRIV_PROC_INFO in its effective
 * privilege set.  Callers in the global zone receive the full usage
 * data; other callers receive the view produced by zsd_usage_filter()
 * for their zone.
 *
 * Each request kicks the stat thread and waits on g_usage_cache_wait for
 * a fresh calculation before replying.  Any failure is reported to the
 * client as an empty reply.
 *
 * An unref notification (argp == DOOR_UNREF_DATA) clears g_hasclient and
 * kicks the stat thread so that it can notice there are no clients.
 */
/* ARGSUSED */
static void
zsd_stat_server(void *cookie, char *argp, size_t arg_size,
    door_desc_t *dp, uint_t n_desc)
{
	uint64_t *args, cmd;
	zs_usage_cache_t *cache;
	int ret;
	char *rvalp;
	size_t rvals;
	zs_usage_t *usage;
	ucred_t *ucred;
	zoneid_t zoneid;
	const priv_set_t *eset;
	boolean_t is_gz = B_FALSE;

	/* Tell stat thread there are no more clients */
	if (argp == DOOR_UNREF_DATA) {
		(void) mutex_lock(&g_usage_cache_lock);
		g_hasclient = B_FALSE;
		(void) cond_signal(&g_usage_cache_kick);
		(void) mutex_unlock(&g_usage_cache_lock);
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	if (arg_size != sizeof (cmd) * 2) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	/* LINTED */
	args = (uint64_t *)argp;
	cmd = args[0];
	if (cmd != ZSD_CMD_READ) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}

	/* Identify the caller's zone and check its privileges */
	ucred = alloca(ucred_size());
	if (door_ucred(&ucred) != 0) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	zoneid = ucred_getzoneid(ucred);

	if (zoneid == GLOBAL_ZONEID)
		is_gz = B_TRUE;

	eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
	if (eset == NULL) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	if (!priv_ismember(eset, PRIV_PROC_INFO)) {
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	(void) mutex_lock(&g_usage_cache_lock);
	g_hasclient = B_TRUE;

	/*
	 * Force a new cpu calculation for client.  This will force a
	 * new memory calculation if the memory data is older than the
	 * sample period.
	 */
	g_usage_cache_kickers++;
	(void) cond_signal(&g_usage_cache_kick);
	ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
	g_usage_cache_kickers--;
	/* cond_wait() reports errors through its return value, not errno */
	if (ret == EINTR) {
		(void) mutex_unlock(&g_usage_cache_lock);
		zsd_warn(gettext(
		    "Interrupted before writing usage size to client\n"));
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	cache = zsd_usage_cache_hold_locked();
	if (cache == NULL) {
		/* Do not leave the door thread holding the cache lock */
		(void) mutex_unlock(&g_usage_cache_lock);
		zsd_warn(gettext("Usage cache empty.\n"));
		(void) door_return(NULL, 0, NULL, 0);
		thr_exit(NULL);
	}
	(void) mutex_unlock(&g_usage_cache_lock);

	/* Copy current usage data to stack to send to client */
	usage = (zs_usage_t *)alloca(cache->zsuc_size);

	/* Filter out results if caller is non-global zone */
	zsd_usage_filter(zoneid, cache, usage, is_gz);

	/* The reply lives on this stack, so the cache can be released now */
	rvalp = (void *)usage;
	rvals = usage->zsu_size;
	zsd_usage_cache_rele(cache);

	(void) door_return(rvalp, rvals, NULL, 0);
	thr_exit(NULL);
}

/*
 * Shutdown request flag.  Set to B_TRUE by zonestat_quithandler() and by
 * the stat thread when it gives up; polled by main() and by the stat
 * thread's loops.
 */
static volatile boolean_t g_quit;

/*
 * Signal handler that requests daemon shutdown.  main() installs it for
 * SIGINT, SIGTERM and SIGHUP.  It only sets g_quit; the actual teardown
 * is done by the threads that poll the flag.
 */
/* ARGSUSED */
static void
zonestat_quithandler(int sig)
{
	g_quit = B_TRUE;
}

/*
 * The stat thread generates new utilization data when clients request
 * it.  It also manages opening and closing the subsystems used to gather
 * data depending on if clients exist.
 *
 * Each pass of the outer loop waits (in the inner loop) until there is
 * something to do, then opens the statistics if a client exists and they
 * are not open, takes a sample if a read was requested, and closes the
 * statistics if the last client has gone.
 *
 * On every exit path the statistics are closed if they are open and
 * SIGINT is sent to the main thread so that it wakes up and notices
 * g_quit.  Always returns NULL.
 */
/* ARGSUSED */
void *
stat_thread(void *arg)
{
	time_t start;
	time_t now;
	time_t next_memory;
	boolean_t do_memory;
	boolean_t do_read;
	boolean_t do_close;

	start = time(NULL);
	if (start < 0) {
		if (g_quit == B_TRUE)
			goto quit;
		zsd_warn(gettext("Unable to fetch current time"));
		g_quit = B_TRUE;
		goto quit;
	}

	next_memory = start;
	/*
	 * The inner loop can break out before it samples the clock, so
	 * make sure "now" always holds a valid time.
	 */
	now = start;
	while (g_quit == B_FALSE) {
		for (;;) {
			/*
			 * These are used to decide if the most recent memory
			 * calculation was within a sample interval,
			 * and whether or not the usage collection needs to
			 * be opened or closed.
			 */
			do_memory = B_FALSE;
			do_read = B_FALSE;
			do_close = B_FALSE;

			/*
			 * If all clients have gone, close usage collecting
			 */
			(void) mutex_lock(&g_usage_cache_lock);
			if (!g_hasclient && g_open == B_TRUE) {
				do_close = B_TRUE;
				(void) mutex_unlock(&g_usage_cache_lock);
				break;
			}
			if (g_quit == B_TRUE) {
				(void) mutex_unlock(
				    &g_usage_cache_lock);
				break;
			}
			/*
			 * Wait for a usage data request
			 */
			if (g_usage_cache_kickers == 0) {
				(void) cond_wait(&g_usage_cache_kick,
				    &g_usage_cache_lock);
			}
			now = time(NULL);
			if (now < 0) {
				if (g_quit == B_TRUE) {
					(void) mutex_unlock(
					    &g_usage_cache_lock);
					goto quit;
				}
				g_quit = B_TRUE;
				(void) mutex_unlock(&g_usage_cache_lock);
				zsd_warn(gettext(
				    "Unable to fetch current time"));
				goto quit;
			}
			if (g_hasclient) {
				do_read = B_TRUE;
				/* Memory is refreshed once per interval */
				if (now >= next_memory) {
					do_memory = B_TRUE;
					next_memory = now + g_interval;
				}
			} else {
				do_close = B_TRUE;
			}
			(void) mutex_unlock(&g_usage_cache_lock);
			if (do_read || do_close)
				break;
		}
		g_now = now;
		g_hrnow = gethrtime();
		/* First client since the last close: (re)open collection */
		if (g_hasclient && g_open == B_FALSE) {
			g_start = g_now;
			g_hrstart = g_hrnow;
			g_ctl = zsd_open(g_ctl);
			if (g_ctl == NULL)
				zsd_warn(gettext(
				    "Unable to open zone statistics"));
			else
				g_open = B_TRUE;
		}
		if (do_read && g_ctl) {
			if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
				zsd_warn(gettext(
				    "Unable to read zone statistics"));
				g_quit = B_TRUE;
				/*
				 * Leave through the common exit so the
				 * statistics are closed and the main
				 * thread is woken.
				 */
				goto quit;
			}
		}
		(void) mutex_lock(&g_usage_cache_lock);
		if (!g_hasclient && g_open == B_TRUE && g_ctl) {
			(void) mutex_unlock(&g_usage_cache_lock);
			zsd_close(g_ctl);
			g_open = B_FALSE;
		} else {
			(void) mutex_unlock(&g_usage_cache_lock);
		}
	}
quit:
	if (g_open)
		zsd_close(g_ctl);

	/* Interrupt the main thread's pause() so that it sees g_quit */
	(void) thr_kill(g_main, SIGINT);
	thr_exit(NULL);
	return (NULL);
}

void
zsd_set_fx()
{
	pcinfo_t pcinfo;
	pcparms_t pcparms;

	(void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
	if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
		zsd_warn(gettext("cannot get FX class parameters"));
		return;
	}
	pcparms.pc_cid = pcinfo.pc_cid;
	((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
	((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
	((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
	((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
	if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
		zsd_warn(gettext("cannot enter the FX class"));
}

/*
 * Write end of the pipe created by daemonize_start(); the parent process
 * blocks reading the other end.
 */
static int pipe_fd;

/*
 * Tell the waiting parent that the daemon has finished starting up.
 * The single status byte is written to the pipe and the pipe is then
 * closed.  The results of the write and the close are ignored.
 */
static void
daemonize_ready(char status)
{
	int fd = pipe_fd;

	(void) write(fd, &status, sizeof (status));
	(void) close(fd);
}

/*
 * Fork the daemon process.
 *
 * stdin is closed and stdout is redirected to stderr.  A pipe is created
 * and the process forks with fork1().
 *
 * The parent never returns from this function.  It waits for the child
 * to write one status byte through the pipe (see daemonize_ready()) and
 * exits with that byte as its exit status.  If the pipe closes without
 * any data, the parent reaps the child and exits with the child's exit
 * status, or with 1 if the child did not exit normally.
 *
 * The child keeps the write end of the pipe in pipe_fd, starts a new
 * session, clears its umask and returns 0.
 *
 * Returns -1 if the pipe or the fork fails; no descriptors created here
 * are left open in that case.
 */
static int
daemonize_start(void)
{
	char data;
	int status;

	int filedes[2];
	pid_t pid;

	(void) close(0);
	(void) dup2(2, 1);

	if (pipe(filedes) < 0)
		return (-1);

	/* Flush stdio so buffered output is not duplicated by the fork */
	(void) fflush(NULL);

	if ((pid = fork1()) < 0) {
		int saved_errno = errno;

		/* Do not leak the pipe; keep fork1()'s errno for the caller */
		(void) close(filedes[0]);
		(void) close(filedes[1]);
		errno = saved_errno;
		return (-1);
	}

	if (pid != 0) {
		/*
		 * parent
		 */
		struct sigaction act;

		act.sa_handler = SIG_DFL;
		(void) sigemptyset(&act.sa_mask);
		act.sa_flags = 0;

		/* restore the default SIGPIPE disposition */
		(void) sigaction(SIGPIPE, &act, NULL);

		(void) close(filedes[1]);
		if (read(filedes[0], &data, 1) == 1) {
			/* forward ready code via exit status */
			exit(data);
		}
		status = -1;
		(void) wait4(pid, &status, 0, NULL);
		/* daemon process exited before becoming ready */
		if (WIFEXITED(status)) {
			/* assume daemon process printed useful message */
			exit(WEXITSTATUS(status));
		} else {
			zsd_warn(gettext("daemon process killed or died"));
			exit(1);
		}
	}

	/*
	 * child
	 */
	pipe_fd = filedes[1];
	(void) close(filedes[0]);

	/*
	 * generic Unix setup
	 */
	(void) setsid();
	(void) umask(0000);

	return (0);
}

/*
 * Call zsd_fattach_zone() with the server door for every zone currently
 * reported by zone_list().  detach_only is passed through unchanged.
 *
 * The zone count can grow between sizing the array and filling it, so
 * the list is fetched again until it fits in the array allocated for it.
 */
static void
fattach_all_zones(boolean_t detach_only)
{
	zoneid_t *zids;
	uint_t nzids, capacity;
	int idx;

	for (;;) {
		/* First call only asks how many zones there are */
		(void) zone_list(NULL, &nzids);
		capacity = nzids;
		zids = (zoneid_t *)malloc(sizeof (zoneid_t) * capacity);
		if (zids == NULL)
			zsd_error(gettext("Out of memory"));

		(void) zone_list(zids, &nzids);
		if (nzids <= capacity)
			break;

		/* More zones appeared in between; size the array again */
		free(zids);
	}

	for (idx = 0; idx < nzids; idx++)
		zsd_fattach_zone(zids[idx], g_server_door, detach_only);

	free(zids);
}

/*
 * zonestatd entry point.
 *
 * Must be run from the global zone.  With -c, cpu statistics are disabled
 * and the process exits (0 on success, 1 on failure) without starting the
 * daemon.
 *
 * Otherwise the sample interval is read from the "config/sample_interval"
 * property of svc:/system/zones-monitoring:default, the process
 * daemonizes, the server and statistics doors are created and attached
 * for every zone, and the statistics thread is started.  The main thread
 * then sleeps until a quit is requested, detaches and revokes the doors,
 * kicks the statistics thread and waits for it to finish.
 */
int
main(int argc, char *argv[])
{

	int arg;
	thread_t tid;
	scf_simple_prop_t *prop;
	uint64_t *intervalp;
	boolean_t opt_cleanup = B_FALSE;

	g_main = thr_self();
	g_quit = B_FALSE;
	(void) signal(SIGINT, zonestat_quithandler);
	(void) signal(SIGTERM, zonestat_quithandler);
	(void) signal(SIGHUP, zonestat_quithandler);
	(void) sigignore(SIGPIPE);

	if (getzoneid() != GLOBAL_ZONEID)
		zsd_error(gettext("Must be run from global zone only"));

	while ((arg = getopt(argc, argv, "c"))
	    != EOF) {
		switch (arg) {
		case 'c':
			opt_cleanup = B_TRUE;
			break;
		default:
			zsd_error(gettext("Invalid option"));
		}
	}

	if (opt_cleanup) {
		if (zsd_disable_cpu_stats() != 0)
			exit(1);
		else
			exit(0);
	}

	/* Get the configured sample interval */
	prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
	    "config", "sample_interval");
	if (prop == NULL)
		zsd_error(gettext("Unable to fetch SMF property "
		    "\"config/sample_interval\""));

	if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
		zsd_error(gettext("Malformed SMF property "
		    "\"config/sample_interval\".  Must be of type \"count\""));

	/* The property may exist without holding any value */
	intervalp = scf_simple_prop_next_count(prop);
	if (intervalp == NULL)
		zsd_error(gettext("Unable to fetch SMF property "
		    "\"config/sample_interval\""));

	g_interval = *intervalp;
	if (g_interval == 0)
		zsd_error(gettext("Malformed SMF property "
		    "\"config/sample_interval\".  Must be greater than zero"));

	scf_simple_prop_free(prop);

	if (daemonize_start() < 0)
		zsd_error(gettext("Unable to start daemon\n"));

	/* Run at high priority */
	zsd_set_fx();

	(void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
	(void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
	(void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);

	g_server_door = door_create(zsd_server, NULL,
	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
	if (g_server_door < 0)
		zsd_error(gettext("Unable to create server door\n"));


	g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
	if (g_stat_door < 0)
		zsd_error(gettext("Unable to create statistics door\n"));

	fattach_all_zones(B_FALSE);

	if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
		zsd_error(gettext("Unable to create statistics thread\n"));

	/* Startup is complete; release the parent with exit status 0 */
	daemonize_ready(0);

	/* Wait for signal to quit */
	while (g_quit == B_FALSE)
		(void) pause();

	/* detach doors */
	fattach_all_zones(B_TRUE);

	(void) door_revoke(g_server_door);
	(void) door_revoke(g_stat_door);

	/* kick stat thread and wait for it to close the statistics */
	(void) mutex_lock(&g_usage_cache_lock);
	g_quit = B_TRUE;
	(void) cond_signal(&g_usage_cache_kick);
	(void) mutex_unlock(&g_usage_cache_lock);

	(void) thr_join(tid, NULL, NULL);
	return (0);
}