xref: /illumos-gate/usr/src/cmd/zonestat/zonestatd/zonestatd.c (revision 2d9a5a52c758e1dbaee1569f0d91634a0f5cbe39)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 #include <alloca.h>
26 #include <assert.h>
27 #include <dirent.h>
28 #include <dlfcn.h>
29 #include <door.h>
30 #include <errno.h>
31 #include <exacct.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <kstat.h>
35 #include <libcontract.h>
36 #include <libintl.h>
37 #include <libscf.h>
38 #include <zonestat.h>
39 #include <zonestat_impl.h>
40 #include <limits.h>
41 #include <pool.h>
42 #include <procfs.h>
43 #include <rctl.h>
44 #include <thread.h>
45 #include <signal.h>
46 #include <stdarg.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <strings.h>
51 #include <synch.h>
52 #include <sys/acctctl.h>
53 #include <sys/contract/process.h>
54 #include <sys/ctfs.h>
55 #include <sys/fork.h>
56 #include <sys/param.h>
57 #include <sys/priocntl.h>
58 #include <sys/fxpriocntl.h>
59 #include <sys/processor.h>
60 #include <sys/pset.h>
61 #include <sys/socket.h>
62 #include <sys/stat.h>
63 #include <sys/statvfs.h>
64 #include <sys/swap.h>
65 #include <sys/systeminfo.h>
66 #include <thread.h>
67 #include <sys/list.h>
68 #include <sys/time.h>
69 #include <sys/types.h>
70 #include <sys/vm_usage.h>
71 #include <sys/wait.h>
72 #include <sys/zone.h>
73 #include <time.h>
74 #include <ucred.h>
75 #include <unistd.h>
76 #include <vm/anon.h>
77 #include <zone.h>
78 #include <zonestat.h>
79 
80 #define	MAX_PSET_NAME	1024	/* Taken from PV_NAME_MAX_LEN */
81 #define	ZSD_PSET_UNLIMITED	UINT16_MAX	/* NOTE(review): presumably a "no limit" marker for psets; confirm at its users */
82 #define	ZONESTAT_EXACCT_FILE	"/var/adm/exacct/zonestat-process"	/* accounting file set by zonestatd when none can be fetched */
83 
84 /*
85  * zonestatd implements gathering cpu and memory utilization data for
86  * running zones.  It has these components:
87  *
88  * zsd_server:
89  *	Door server to respond to client connections.  Each client
90  *	will connect using libzonestat.so, which will open and
91  *	call /var/tmp/.zonestat_door.  Each connecting client is given
92  *	a file descriptor to the stat server.
93  *
94  *	The zsd_server also responds to zoneadmd, which reports when a
95  *	new zone is booted.  This is used to fattach the zsd_server door
96  *	into the new zone.
97  *
98  * zsd_stat_server:
99  *	Receives client requests for the current utilization data.  Each
100  *	client request will cause zonestatd to update the current utilization
101  *	data by kicking the stat_thread.
102  *
103  *	If the client is in a non-global zone, the utilization data will
104  *	be filtered to only show the given zone.  The usage by all other zones
105  *	will be added to the system utilization.
106  *
107  * stat_thread:
108  *	The stat thread implements querying the system to determine the
109  *	current utilization data for each running zone.  This includes
110  *	inspecting the system's processor set configuration, as well as details
111  *	of each zone, such as their configured limits, and which processor
112  *	sets they are running in.
113  *
114  *	The stat_thread will only update memory utilization data as often as
115  *	the configured config/sample_interval on the zones-monitoring service.
116  */
117 
118 /*
119  * The private vmusage structure unfortunately uses size_t types, and assumes
120  * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
121  * system call is contracted, and zonestatd is 32 bit, the following structures
122  * are used to interact with a 32 bit or a 64 bit kernel.
123  */
124 typedef struct zsd_vmusage32 {	/* variant with 32 bit wide counters */
125 	id_t vmu_zoneid;
126 	uint_t vmu_type;
127 	id_t vmu_id;
128 
129 	uint32_t vmu_rss_all;	/* fixed 32 bit width in place of size_t */
130 	uint32_t vmu_rss_private;
131 	uint32_t vmu_rss_shared;
132 	uint32_t vmu_swap_all;
133 	uint32_t vmu_swap_private;
134 	uint32_t vmu_swap_shared;
135 } zsd_vmusage32_t;
136 
137 typedef struct zsd_vmusage64 {	/* variant with 64 bit wide counters */
138 	id_t vmu_zoneid;
139 	uint_t vmu_type;
140 	id_t vmu_id;
141 	/*
142 	 * An amd64 kernel will align the following uint64_t members, but a
143 	 * 32bit i386 process will not without help.
144 	 */
145 	int vmu_align_next_members_on_8_bytes;	/* explicit padding only */
146 	uint64_t vmu_rss_all;	/* fixed 64 bit width in place of size_t */
147 	uint64_t vmu_rss_private;
148 	uint64_t vmu_rss_shared;
149 	uint64_t vmu_swap_all;
150 	uint64_t vmu_swap_private;
151 	uint64_t vmu_swap_shared;
152 } zsd_vmusage64_t;
153 
154 struct zsd_zone;	/* defined further down; only pointers are needed here */
155 
156 /* Used to store a zone's usage of a pset */
157 typedef struct zsd_pset_usage {
158 	struct zsd_zone	*zsu_zone;	/* zone this usage is charged to */
159 	struct zsd_pset	*zsu_pset;	/* pset this usage is charged to */
160 
161 	list_node_t	zsu_next;	/* list linkage; usages are walked via zsp_usage_list */
162 
163 	zoneid_t	zsu_zoneid;
164 	boolean_t	zsu_found;	/* zone bound at end of interval */
165 	boolean_t	zsu_active;	/* zone was bound during interval */
166 	boolean_t	zsu_new;	/* zone newly bound in this interval */
167 	boolean_t	zsu_deleted;	/* zone was unbound in this interval */
168 	boolean_t	zsu_empty;	/* no procs in pset in this interval */
169 	time_t		zsu_start;	/* time_t when zone was found in pset */
170 	hrtime_t	zsu_hrstart;	/* hrtime when zone was found in pset */
171 	uint64_t	zsu_cpu_shares;
172 	uint_t		zsu_scheds;	/* schedulers found in this pass */
173 	timestruc_t	zsu_cpu_usage;	/* cpu time used */
174 } zsd_pset_usage_t;
175 
176 /* Used to store a pset's utilization */
177 typedef struct zsd_pset {
178 	psetid_t	zsp_id;
179 	list_node_t	zsp_next;	/* list linkage; psets are walked via zsctl_psets */
180 	char		zsp_name[ZS_PSETNAME_MAX];
181 
182 	uint_t		zsp_cputype;	/* default, dedicated or shared */
183 	boolean_t	zsp_found;	/* pset found at end of interval */
184 	boolean_t	zsp_new;	/* pset new in this interval */
185 	boolean_t	zsp_deleted;	/* pset deleted in this interval */
186 	boolean_t	zsp_active;	/* pset existed during interval */
187 	boolean_t	zsp_empty;	/* no processes in pset */
188 	time_t		zsp_start;
189 	hrtime_t	zsp_hrstart;
190 
191 	uint64_t	zsp_online;	/* online cpus in interval */
192 	uint64_t	zsp_size;	/* size in this interval */
193 	uint64_t	zsp_min;	/* configured min in this interval */
194 	uint64_t	zsp_max;	/* configured max in this interval */
195 	int64_t		zsp_importance;	/* configured importance in this interval */
196 
197 	uint_t		zsp_scheds;	/* scheds of processes found in pset */
198 	uint64_t	zsp_cpu_shares;	/* total shares in this interval */
199 
200 	timestruc_t	zsp_total_time;
201 	timestruc_t	zsp_usage_kern;
202 	timestruc_t	zsp_usage_zones;	/* summed cpu time charged to zones */
203 
204 	/* Individual zone usages of pset */
205 	list_t		zsp_usage_list;
206 	int		zsp_nusage;	/* number of entries on zsp_usage_list */
207 
208 	/* Summed kstat values from individual cpus in pset */
209 	timestruc_t	zsp_idle;
210 	timestruc_t	zsp_intr;
211 	timestruc_t	zsp_kern;
212 	timestruc_t	zsp_user;
213 
214 } zsd_pset_t;
215 
216 /* Used to track an individual cpu's utilization as reported by kstats */
217 typedef struct zsd_cpu {
218 	processorid_t	zsc_id;
219 	list_node_t	zsc_next;
220 	psetid_t	zsc_psetid;
221 	psetid_t	zsc_psetid_prev;	/* NOTE(review): presumably the pset id seen in the previous interval */
222 	zsd_pset_t	*zsc_pset;
223 
224 	boolean_t	zsc_found;	/* cpu online in this interval */
225 	boolean_t	zsc_onlined;	/* cpu onlined during this interval */
226 	boolean_t	zsc_offlined;	/* cpu offlined during this interval */
227 	boolean_t	zsc_active;	/* cpu online during this interval */
228 	boolean_t	zsc_allocated;	/* True if cpu has ever been found */
229 
230 	/* kstats this interval */
231 	uint64_t	zsc_nsec_idle;
232 	uint64_t	zsc_nsec_intr;
233 	uint64_t	zsc_nsec_kern;
234 	uint64_t	zsc_nsec_user;
235 
236 	/* kstats in most recent interval */
237 	uint64_t	zsc_nsec_idle_prev;
238 	uint64_t	zsc_nsec_intr_prev;
239 	uint64_t	zsc_nsec_kern_prev;
240 	uint64_t	zsc_nsec_user_prev;
241 
242 	/* Total kstat increases since zonestatd started reading kstats */
243 	timestruc_t	zsc_idle;
244 	timestruc_t	zsc_intr;
245 	timestruc_t	zsc_kern;
246 	timestruc_t	zsc_user;
247 
248 } zsd_cpu_t;
249 
250 /* Used to describe an individual zone and its utilization */
251 typedef struct zsd_zone {
252 	zoneid_t	zsz_id;
253 	list_node_t	zsz_next;	/* list linkage; zones are walked via zsctl_zones */
254 	char		zsz_name[ZS_ZONENAME_MAX];
255 	uint_t		zsz_cputype;
256 	uint_t		zsz_iptype;
257 	time_t		zsz_start;	/* set to g_now when the zone is allocated */
258 	hrtime_t	zsz_hrstart;	/* set to g_hrnow when the zone is allocated */
259 
260 	char		zsz_pool[ZS_POOLNAME_MAX];
261 	char		zsz_pset[ZS_PSETNAME_MAX];
262 	int		zsz_default_sched;
263 	/* These are deduced by inspecting processes */
264 	psetid_t	zsz_psetid;
265 	uint_t		zsz_scheds;
266 
267 	boolean_t	zsz_new;	/* zone booted during this interval */
268 	boolean_t	zsz_deleted;	/* halted during this interval */
269 	boolean_t	zsz_active;	/* running in this interval */
270 	boolean_t	zsz_empty;	/* no processes in this interval */
271 	boolean_t	zsz_gone;	/* not installed in this interval */
272 	boolean_t	zsz_found;	/* Running at end of this interval */
273 
274 	uint64_t	zsz_cpu_shares;
275 	uint64_t	zsz_cpu_cap;
276 	uint64_t	zsz_ram_cap;
277 	uint64_t	zsz_locked_cap;
278 	uint64_t	zsz_vm_cap;
279 
280 	uint64_t	zsz_cpus_online;
281 	timestruc_t	zsz_cpu_usage;	/* cpu time charged to the zone */
282 	timestruc_t	zsz_cap_time;	/* cpu time of cpu cap */
283 	timestruc_t	zsz_share_time; /* cpu time of share of cpu */
284 	timestruc_t	zsz_pset_time;  /* time of all psets zone is bound to */
285 
286 	uint64_t	zsz_usage_ram;
287 	uint64_t	zsz_usage_locked;
288 	uint64_t	zsz_usage_vm;
289 
290 	uint64_t	zsz_processes_cap;	/* caps start out as ZS_LIMIT_NONE */
291 	uint64_t	zsz_lwps_cap;
292 	uint64_t	zsz_shm_cap;
293 	uint64_t	zsz_shmids_cap;
294 	uint64_t	zsz_semids_cap;
295 	uint64_t	zsz_msgids_cap;
296 	uint64_t	zsz_lofi_cap;
297 
298 	uint64_t	zsz_processes;
299 	uint64_t	zsz_lwps;
300 	uint64_t	zsz_shm;
301 	uint64_t	zsz_shmids;
302 	uint64_t	zsz_semids;
303 	uint64_t	zsz_msgids;
304 	uint64_t	zsz_lofi;
305 
306 } zsd_zone_t;
307 
308 /*
309  * Used to track the cpu usage of individual processes.
310  *
311  * zonestatd sweeps /proc each interval and charges the cpu usage of processes
312  * to their zone.  As processes exit, their extended accounting records are
313  * read and the difference of their total and known usage is charged to their
314  * zone.
315  *
316  * If a process is never seen in /proc, the total usage on its extended
317  * accounting record will be charged to its zone.
318  */
319 typedef struct zsd_proc {
320 	list_node_t	zspr_next;
321 	pid_t		zspr_ppid;
322 	psetid_t	zspr_psetid;
323 	zoneid_t	zspr_zoneid;
324 	int		zspr_sched;
325 	timestruc_t	zspr_usage;	/* NOTE(review): presumably the "known usage" described above */
326 } zsd_proc_t;
327 
328 /* Used to track the overall resource usage of the system */
329 typedef struct zsd_system {
330 
331 	uint64_t zss_ram_total;
332 	uint64_t zss_ram_kern;
333 	uint64_t zss_ram_zones;
334 
335 	uint64_t zss_locked_kern;
336 	uint64_t zss_locked_zones;
337 
338 	uint64_t zss_vm_total;
339 	uint64_t zss_vm_kern;
340 	uint64_t zss_vm_zones;
341 
342 	uint64_t zss_swap_total;
343 	uint64_t zss_swap_used;
344 
345 	timestruc_t zss_idle;
346 	timestruc_t zss_intr;
347 	timestruc_t zss_kern;
348 	timestruc_t zss_user;
349 
350 	timestruc_t zss_cpu_total_time;
351 	timestruc_t zss_cpu_usage_kern;
352 	timestruc_t zss_cpu_usage_zones;	/* summed cpu time charged to zones */
353 
354 	uint64_t zss_maxpid;
355 	uint64_t zss_processes_max;
356 	uint64_t zss_lwps_max;
357 	uint64_t zss_shm_max;
358 	uint64_t zss_shmids_max;
359 	uint64_t zss_semids_max;
360 	uint64_t zss_msgids_max;
361 	uint64_t zss_lofi_max;
362 
363 	uint64_t zss_processes;
364 	uint64_t zss_lwps;
365 	uint64_t zss_shm;
366 	uint64_t zss_shmids;
367 	uint64_t zss_semids;
368 	uint64_t zss_msgids;
369 	uint64_t zss_lofi;
370 
371 	uint64_t zss_ncpus;
372 	uint64_t zss_ncpus_online;
373 
374 } zsd_system_t;
375 
376 /*
377  * A dumping ground for various information and structures used to compute
378  * utilization.
379  *
380  * This structure is used to track the system while clients are connected.
381  * When the first client connects, a zsd_ctl is allocated and configured by
382  * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
383  */
384 typedef struct zsd_ctl {
385 	kstat_ctl_t	*zsctl_kstat_ctl;
386 
387 	/* To track extended accounting */
388 	int		zsctl_proc_fd;		/* Log currently being used */
389 	ea_file_t	zsctl_proc_eaf;
390 	struct stat64	zsctl_proc_stat;
391 	int		zsctl_proc_open;
392 	int		zsctl_proc_fd_next;	/* Log file to use next */
393 	ea_file_t	zsctl_proc_eaf_next;
394 	struct stat64	zsctl_proc_stat_next;
395 	int		zsctl_proc_open_next;
396 
397 	/* pool configuration handle */
398 	pool_conf_t	*zsctl_pool_conf;
399 	int		zsctl_pool_status;
400 	int		zsctl_pool_changed;
401 
402 	/* The above usage tracking structures */
403 	zsd_system_t	*zsctl_system;
404 	list_t		zsctl_zones;	/* zsd_zone_t entries, inserted sorted by zone name */
405 	list_t		zsctl_psets;	/* zsd_pset_t entries */
406 	list_t		zsctl_cpus;
407 	zsd_cpu_t	*zsctl_cpu_array;
408 	zsd_proc_t	*zsctl_proc_array;
409 
410 	/* Various system info */
411 	uint64_t	zsctl_maxcpuid;
412 	uint64_t	zsctl_maxproc;
413 	uint64_t	zsctl_kern_bits;
414 	uint64_t	zsctl_pagesize;
415 
416 	/* Used to track time available under a cpu cap. */
417 	uint64_t	zsctl_hrtime;
418 	uint64_t	zsctl_hrtime_prev;
419 	timestruc_t	zsctl_hrtime_total;
420 
421 	struct timeval	zsctl_timeofday;
422 
423 	/* Caches for arrays allocated for use by various system calls */
424 	psetid_t	*zsctl_pset_cache;
425 	uint_t		zsctl_pset_ncache;
426 	processorid_t	*zsctl_cpu_cache;
427 	uint_t		zsctl_cpu_ncache;
428 	zoneid_t	*zsctl_zone_cache;
429 	uint_t		zsctl_zone_ncache;
430 	struct swaptable *zsctl_swap_cache;
431 	uint64_t	zsctl_swap_cache_size;
432 	uint64_t	zsctl_swap_cache_num;
433 	zsd_vmusage64_t	*zsctl_vmusage_cache;
434 	uint64_t	zsctl_vmusage_cache_num;
435 
436 	/* Info about procfs for scanning /proc.  NOTE(review): the member below holds pool values; this comment may be stale */
437 	pool_value_t	*zsctl_pool_vals[3];
438 
439 	/* Counts on tracked entities */
440 	uint_t		zsctl_nzones;	/* bumped each time a zsd_zone_t is allocated */
441 	uint_t		zsctl_npsets;
442 	uint_t		zsctl_npset_usages;
443 } zsd_ctl_t;
444 
445 zsd_ctl_t		*g_ctl;		/* tracking state, see zsd_ctl_t */
446 boolean_t		g_open;		/* True if g_ctl is open */
447 int			g_hasclient;	/* True if any clients are connected */
448 
449 /*
450  * The usage cache is updated by the stat_thread, and copied to clients by
451  * the zsd_stat_server.  Mutex and cond are to synchronize between the
452  * stat_thread and the stat_server.
453  */
454 zs_usage_cache_t	*g_usage_cache;
455 mutex_t			g_usage_cache_lock;
456 cond_t			g_usage_cache_kick;
457 uint_t			g_usage_cache_kickers;
458 cond_t			g_usage_cache_wait;
459 char			*g_usage_cache_buf;
460 uint_t			g_usage_cache_bufsz;
461 uint64_t		g_gen_next;	/* NOTE(review): presumably the next usage generation number; confirm at its users */
462 
463 /* fds of door servers */
464 int			g_server_door;
465 int			g_stat_door;
466 
467 /*
468  * Starting and current time.  Used to throttle memory calculation, and to
469  * mark new zones and psets with their boot and creation time.
470  */
471 time_t			g_now;
472 time_t			g_start;
473 hrtime_t		g_hrnow;
474 hrtime_t		g_hrstart;
475 uint64_t		g_interval;
476 
477 /*
478  * main() thread.
479  */
480 thread_t		g_main;
481 
/*
 * Print a warning on stderr: the localized "zonestat: Warning: " prefix,
 * followed by the caller's printf-style message and a newline.  The caller's
 * format string is used as given; it is not passed through gettext() here.
 */
/* PRINTFLIKE1 */
static void
zsd_warn(const char *fmt, ...)
{
	va_list args;

	(void) fprintf(stderr, gettext("zonestat: Warning: "));

	va_start(args, fmt);
	(void) vfprintf(stderr, fmt, args);
	va_end(args);

	(void) fprintf(stderr, "\n");
}
495 
/*
 * Print an error on stderr (localized "zonestat: Error: " prefix, the
 * caller's printf-style message, a newline) and terminate the process with
 * exit status 1.  This function does not return.
 */
/* PRINTFLIKE1 */
static void
zsd_error(const char *fmt, ...)
{
	va_list args;

	(void) fprintf(stderr, gettext("zonestat: Error: "));

	va_start(args, fmt);
	(void) vfprintf(stderr, fmt, args);
	va_end(args);

	(void) fprintf(stderr, "\n");
	exit(1);
}
510 
511 /* Turns on extended accounting if not configured externally */
512 int
513 zsd_enable_cpu_stats()
514 {
515 	char *path = ZONESTAT_EXACCT_FILE;
516 	char oldfile[MAXPATHLEN];
517 	int ret, state = AC_ON;
518 	ac_res_t res[6];
519 
520 	/*
521 	 * Start a new accounting file  if accounting not configured
522 	 * externally.
523 	 */
524 
525 	res[0].ar_id = AC_PROC_PID;
526 	res[0].ar_state = AC_ON;
527 	res[1].ar_id = AC_PROC_ANCPID;
528 	res[1].ar_state = AC_ON;
529 	res[2].ar_id = AC_PROC_CPU;
530 	res[2].ar_state = AC_ON;
531 	res[3].ar_id = AC_PROC_TIME;
532 	res[3].ar_state = AC_ON;
533 	res[4].ar_id = AC_PROC_ZONENAME;
534 	res[4].ar_state = AC_ON;
535 	res[5].ar_id = AC_NONE;
536 	res[5].ar_state = AC_ON;
537 	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
538 		zsd_warn(gettext("Unable to set accounting resources"));
539 		return (-1);
540 	}
541 	/* Only set accounting file if none is configured */
542 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
543 	if (ret < 0) {
544 
545 		(void) unlink(path);
546 		if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
547 		    == -1) {
548 			zsd_warn(gettext("Unable to set accounting file"));
549 			return (-1);
550 		}
551 	}
552 	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
553 		zsd_warn(gettext("Unable to enable accounting"));
554 		return (-1);
555 	}
556 	return (0);
557 }
558 
559 /* Turns off extended accounting if not configured externally */
560 int
561 zsd_disable_cpu_stats()
562 {
563 	char *path = ZONESTAT_EXACCT_FILE;
564 	int ret, state = AC_OFF;
565 	ac_res_t res[6];
566 	char oldfile[MAXPATHLEN];
567 
568 	/* If accounting file is externally configured, leave it alone */
569 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
570 	if (ret == 0 && strcmp(oldfile, path) != 0)
571 		return (0);
572 
573 	res[0].ar_id = AC_PROC_PID;
574 	res[0].ar_state = AC_OFF;
575 	res[1].ar_id = AC_PROC_ANCPID;
576 	res[1].ar_state = AC_OFF;
577 	res[2].ar_id = AC_PROC_CPU;
578 	res[2].ar_state = AC_OFF;
579 	res[3].ar_id = AC_PROC_TIME;
580 	res[3].ar_state = AC_OFF;
581 	res[4].ar_id = AC_PROC_ZONENAME;
582 	res[4].ar_state = AC_OFF;
583 	res[5].ar_id = AC_NONE;
584 	res[5].ar_state = AC_OFF;
585 	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
586 		zsd_warn(gettext("Unable to clear accounting resources"));
587 		return (-1);
588 	}
589 	if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
590 		zsd_warn(gettext("Unable to clear accounting file"));
591 		return (-1);
592 	}
593 	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
594 		zsd_warn(gettext("Unable to diable accounting"));
595 		return (-1);
596 	}
597 
598 	(void) unlink(path);
599 	return (0);
600 }
601 
602 /*
603  * If not configured externally, deletes the current extended accounting file
604  * and starts a new one.
605  *
606  * Since the stat_thread holds an open handle to the accounting file, it will
607  * read all remaining entries from the old file before switching to
608  * read the new one.
609  */
610 int
611 zsd_roll_exacct(void)
612 {
613 	int ret;
614 	char *path = ZONESTAT_EXACCT_FILE;
615 	char oldfile[MAXPATHLEN];
616 
617 	/* If accounting file is externally configured, leave it alone */
618 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
619 	if (ret == 0 && strcmp(oldfile, path) != 0)
620 		return (0);
621 
622 	if (unlink(path) != 0)
623 		/* Roll it next time */
624 		return (0);
625 
626 	if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
627 		zsd_warn(gettext("Unable to set accounting file"));
628 		return (-1);
629 	}
630 	return (0);
631 }
632 
633 /* Contract stuff for zone_enter() */
634 int
635 init_template(void)
636 {
637 	int fd;
638 	int err = 0;
639 
640 	fd = open64(CTFS_ROOT "/process/template", O_RDWR);
641 	if (fd == -1)
642 		return (-1);
643 
644 	/*
645 	 * For now, zoneadmd doesn't do anything with the contract.
646 	 * Deliver no events, don't inherit, and allow it to be orphaned.
647 	 */
648 	err |= ct_tmpl_set_critical(fd, 0);
649 	err |= ct_tmpl_set_informative(fd, 0);
650 	err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
651 	err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
652 	if (err || ct_tmpl_activate(fd)) {
653 		(void) close(fd);
654 		return (-1);
655 	}
656 
657 	return (fd);
658 }
659 
660 /*
661  * Contract stuff for zone_enter()
662  */
663 int
664 contract_latest(ctid_t *id)
665 {
666 	int cfd, r;
667 	ct_stathdl_t st;
668 	ctid_t result;
669 
670 	if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
671 		return (errno);
672 
673 	if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
674 		(void) close(cfd);
675 		return (r);
676 	}
677 
678 	result = ct_status_get_id(st);
679 	ct_status_free(st);
680 	(void) close(cfd);
681 
682 	*id = result;
683 	return (0);
684 }
685 
/*
 * Set FD_CLOEXEC on fd while keeping its other descriptor flags.
 * Returns 0 on success, -1 if either fcntl() call fails.
 */
static int
close_on_exec(int fd)
{
	int fdflags;

	fdflags = fcntl(fd, F_GETFD, 0);
	if (fdflags == -1)
		return (-1);

	if (fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC) == -1)
		return (-1);

	return (0);
}
694 
695 int
696 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
697 {
698 	char path[PATH_MAX];
699 	int n, fd;
700 
701 	if (type == NULL)
702 		type = "all";
703 
704 	n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
705 	if (n >= sizeof (path)) {
706 		errno = ENAMETOOLONG;
707 		return (-1);
708 	}
709 
710 	fd = open64(path, oflag);
711 	if (fd != -1) {
712 		if (close_on_exec(fd) == -1) {
713 			int err = errno;
714 			(void) close(fd);
715 			errno = err;
716 			return (-1);
717 		}
718 	}
719 	return (fd);
720 }
721 
722 int
723 contract_abandon_id(ctid_t ctid)
724 {
725 	int fd, err;
726 
727 	fd = contract_open(ctid, "all", "ctl", O_WRONLY);
728 	if (fd == -1)
729 		return (errno);
730 
731 	err = ct_ctl_abandon(fd);
732 	(void) close(fd);
733 
734 	return (err);
735 }
736 /*
737  * Attach the zsd_server to a zone.  Called for each zone when zonestatd
738  * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
739  *
740  * Zone_enter is used to avoid reaching into zone to fattach door.
741  */
742 static void
743 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
744 {
745 	char *path = ZS_DOOR_PATH;
746 	int fd, pid, stat, tmpl_fd;
747 	ctid_t ct;
748 
749 	if ((tmpl_fd = init_template()) == -1) {
750 		zsd_warn("Unable to init template");
751 		return;
752 	}
753 
754 	pid = forkx(0);
755 	if (pid < 0) {
756 		(void) ct_tmpl_clear(tmpl_fd);
757 		zsd_warn(gettext(
758 		    "Unable to fork to add zonestat to zoneid %d\n"), zid);
759 		return;
760 	}
761 
762 	if (pid == 0) {
763 		(void) ct_tmpl_clear(tmpl_fd);
764 		(void) close(tmpl_fd);
765 		if (zid != 0 && zone_enter(zid) != 0) {
766 			if (errno == EINVAL) {
767 				_exit(0);
768 			}
769 			_exit(1);
770 		}
771 		(void) fdetach(path);
772 		(void) unlink(path);
773 		if (detach_only)
774 			_exit(0);
775 		fd = open(path, O_CREAT|O_RDWR, 0644);
776 		if (fd < 0)
777 			_exit(2);
778 		if (fattach(door, path) != 0)
779 			_exit(3);
780 		_exit(0);
781 	}
782 	if (contract_latest(&ct) == -1)
783 		ct = -1;
784 	(void) ct_tmpl_clear(tmpl_fd);
785 	(void) close(tmpl_fd);
786 	(void) contract_abandon_id(ct);
787 	while (waitpid(pid, &stat, 0) != pid)
788 		;
789 	if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
790 		return;
791 
792 	zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
793 
794 	if (WEXITSTATUS(stat) == 1)
795 		zsd_warn(gettext("Cannot entering zone"));
796 	else if (WEXITSTATUS(stat) == 2)
797 		zsd_warn(gettext("Unable to create door file: %s"), path);
798 	else if (WEXITSTATUS(stat) == 3)
799 		zsd_warn(gettext("Unable to fattach file: %s"), path);
800 
801 	zsd_warn(gettext("Internal error entering zone: %d"), zid);
802 }
803 
804 /*
805  * Zone lookup and allocation functions to manage list of currently running
806  * zones.
807  */
808 static zsd_zone_t *
809 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
810 {
811 	zsd_zone_t *zone;
812 
813 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
814 	    zone = list_next(&ctl->zsctl_zones, zone)) {
815 		if (strcmp(zone->zsz_name, zonename) == 0) {
816 			if (zoneid != -1)
817 				zone->zsz_id = zoneid;
818 			return (zone);
819 		}
820 	}
821 	return (NULL);
822 }
823 
824 static zsd_zone_t *
825 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
826 {
827 	zsd_zone_t *zone;
828 
829 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
830 	    zone = list_next(&ctl->zsctl_zones, zone)) {
831 		if (zone->zsz_id == zoneid)
832 			return (zone);
833 	}
834 	return (NULL);
835 }
836 
837 static zsd_zone_t *
838 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
839 {
840 	zsd_zone_t *zone;
841 
842 	if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
843 		return (NULL);
844 
845 	(void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
846 	zone->zsz_id = zoneid;
847 	zone->zsz_found = B_FALSE;
848 
849 	/*
850 	 * Allocate as deleted so if not found in first pass, zone is deleted
851 	 * from list.  This can happen if zone is returned by zone_list, but
852 	 * exits before first attempt to fetch zone details.
853 	 */
854 	zone->zsz_start = g_now;
855 	zone->zsz_hrstart = g_hrnow;
856 	zone->zsz_deleted = B_TRUE;
857 
858 	zone->zsz_cpu_shares = ZS_LIMIT_NONE;
859 	zone->zsz_cpu_cap = ZS_LIMIT_NONE;
860 	zone->zsz_ram_cap = ZS_LIMIT_NONE;
861 	zone->zsz_locked_cap = ZS_LIMIT_NONE;
862 	zone->zsz_vm_cap = ZS_LIMIT_NONE;
863 
864 	zone->zsz_processes_cap = ZS_LIMIT_NONE;
865 	zone->zsz_lwps_cap = ZS_LIMIT_NONE;
866 	zone->zsz_shm_cap = ZS_LIMIT_NONE;
867 	zone->zsz_shmids_cap = ZS_LIMIT_NONE;
868 	zone->zsz_semids_cap = ZS_LIMIT_NONE;
869 	zone->zsz_msgids_cap = ZS_LIMIT_NONE;
870 	zone->zsz_lofi_cap = ZS_LIMIT_NONE;
871 
872 	ctl->zsctl_nzones++;
873 
874 	return (zone);
875 }
876 
877 static zsd_zone_t *
878 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
879 {
880 	zsd_zone_t *zone, *tmp;
881 
882 	if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
883 		return (zone);
884 
885 	if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
886 		return (NULL);
887 
888 	/* Insert sorted by zonename */
889 	tmp = list_head(&ctl->zsctl_zones);
890 	while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
891 		tmp = list_next(&ctl->zsctl_zones, tmp);
892 
893 	list_insert_before(&ctl->zsctl_zones, tmp, zone);
894 	return (zone);
895 }
896 
897 /*
898  * Mark all zones as not existing.  As zones are found, they will
899  * be marked as existing.  If a zone is not found, then it must have
900  * halted.
901  */
902 static void
903 zsd_mark_zones_start(zsd_ctl_t *ctl)
904 {
905 
906 	zsd_zone_t *zone;
907 
908 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
909 	    zone = list_next(&ctl->zsctl_zones, zone)) {
910 		zone->zsz_found = B_FALSE;
911 	}
912 }
913 
914 /*
915  * Mark each zone as not using pset.  If processes are found using the
916  * pset, the zone will remain bound to the pset.  If none of a zones
917  * processes are bound to the pset, the zone's usage of the pset will
918  * be deleted.
919  *
920  */
921 static void
922 zsd_mark_pset_usage_start(zsd_pset_t *pset)
923 {
924 	zsd_pset_usage_t *usage;
925 
926 	for (usage = list_head(&pset->zsp_usage_list);
927 	    usage != NULL;
928 	    usage = list_next(&pset->zsp_usage_list, usage)) {
929 		usage->zsu_found = B_FALSE;
930 		usage->zsu_empty = B_TRUE;
931 	}
932 }
933 
934 /*
935  * Mark each pset as not existing.  If a pset is found, it will be marked
936  * as existing.  If a pset is not found, it wil be deleted.
937  */
938 static void
939 zsd_mark_psets_start(zsd_ctl_t *ctl)
940 {
941 	zsd_pset_t *pset;
942 
943 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
944 	    pset = list_next(&ctl->zsctl_psets, pset)) {
945 		pset->zsp_found = B_FALSE;
946 		zsd_mark_pset_usage_start(pset);
947 	}
948 }
949 
950 /*
951  * A pset was found.  Update its information
952  */
953 static void
954 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
955     uint64_t size, uint64_t min, uint64_t max, int64_t importance)
956 {
957 	pset->zsp_empty = B_TRUE;
958 	pset->zsp_deleted = B_FALSE;
959 
960 	assert(pset->zsp_found == B_FALSE);
961 
962 	/* update pset flags */
963 	if (pset->zsp_active == B_FALSE)
964 		/* pset not seen on previous interval.  It is new. */
965 		pset->zsp_new = B_TRUE;
966 	else
967 		pset->zsp_new = B_FALSE;
968 
969 	pset->zsp_found = B_TRUE;
970 	pset->zsp_cputype = type;
971 	pset->zsp_online = online;
972 	pset->zsp_size = size;
973 	pset->zsp_min = min;
974 	pset->zsp_max = max;
975 	pset->zsp_importance = importance;
976 	pset->zsp_cpu_shares = 0;
977 	pset->zsp_scheds = 0;
978 	pset->zsp_active = B_TRUE;
979 }
980 
981 /*
982  * A zone's process was found using a pset. Charge the process to the pset and
983  * the per-zone data for the pset.
984  */
985 static void
986 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
987 {
988 	zsd_zone_t *zone = usage->zsu_zone;
989 	zsd_pset_t *pset = usage->zsu_pset;
990 
991 	/* Nothing to do if already found */
992 	if (usage->zsu_found == B_TRUE)
993 		goto add_stats;
994 
995 	usage->zsu_found = B_TRUE;
996 	usage->zsu_empty = B_FALSE;
997 
998 	usage->zsu_deleted = B_FALSE;
999 	/* update usage flags */
1000 	if (usage->zsu_active == B_FALSE)
1001 		usage->zsu_new = B_TRUE;
1002 	else
1003 		usage->zsu_new = B_FALSE;
1004 
1005 	usage->zsu_scheds = 0;
1006 	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1007 	usage->zsu_active = B_TRUE;
1008 	pset->zsp_empty = B_FALSE;
1009 	zone->zsz_empty = B_FALSE;
1010 
1011 add_stats:
1012 	/* Detect zone's pset id, and if it is bound to multiple psets */
1013 	if (zone->zsz_psetid == ZS_PSET_ERROR)
1014 		zone->zsz_psetid = pset->zsp_id;
1015 	else if (zone->zsz_psetid != pset->zsp_id)
1016 		zone->zsz_psetid = ZS_PSET_MULTI;
1017 
1018 	usage->zsu_scheds |= sched;
1019 	pset->zsp_scheds |= sched;
1020 	zone->zsz_scheds |= sched;
1021 
1022 	/* Record if FSS is co-habitating with conflicting scheduler */
1023 	if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1024 	    usage->zsu_scheds & (
1025 	    ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1026 		usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1027 
1028 		pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1029 	}
1030 
1031 }
1032 
1033 /* Add cpu time for a process to a pset, zone, and system totals */
1034 static void
1035 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1036 {
1037 	zsd_system_t *system = ctl->zsctl_system;
1038 	zsd_zone_t *zone = usage->zsu_zone;
1039 	zsd_pset_t *pset = usage->zsu_pset;
1040 
1041 	TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1042 	TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1043 	TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1044 	TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1045 }
1046 
1047 /* Determine which processor sets have been deleted */
1048 static void
1049 zsd_mark_psets_end(zsd_ctl_t *ctl)
1050 {
1051 	zsd_pset_t *pset, *tmp;
1052 
1053 	/*
1054 	 * Mark pset as not exists, and deleted if it existed
1055 	 * previous interval.
1056 	 */
1057 	pset = list_head(&ctl->zsctl_psets);
1058 	while (pset != NULL) {
1059 		if (pset->zsp_found == B_FALSE) {
1060 			pset->zsp_empty = B_TRUE;
1061 			if (pset->zsp_deleted == B_TRUE) {
1062 				tmp = pset;
1063 				pset = list_next(&ctl->zsctl_psets, pset);
1064 				list_remove(&ctl->zsctl_psets, tmp);
1065 				free(tmp);
1066 				ctl->zsctl_npsets--;
1067 				continue;
1068 			} else {
1069 				/* Pset vanished during this interval */
1070 				pset->zsp_new = B_FALSE;
1071 				pset->zsp_deleted = B_TRUE;
1072 				pset->zsp_active = B_TRUE;
1073 			}
1074 		}
1075 		pset = list_next(&ctl->zsctl_psets, pset);
1076 	}
1077 }
1078 
1079 /* Determine which zones are no longer bound to processor sets */
1080 static void
1081 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1082 {
1083 	zsd_pset_t *pset;
1084 	zsd_zone_t *zone;
1085 	zsd_pset_usage_t *usage, *tmp;
1086 
1087 	/*
1088 	 * Mark pset as not exists, and deleted if it existed previous
1089 	 * interval.
1090 	 */
1091 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1092 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1093 		usage = list_head(&pset->zsp_usage_list);
1094 		while (usage != NULL) {
1095 			/*
1096 			 * Mark pset as not exists, and deleted if it existed
1097 			 * previous interval.
1098 			 */
1099 			if (usage->zsu_found == B_FALSE ||
1100 			    usage->zsu_zone->zsz_deleted == B_TRUE ||
1101 			    usage->zsu_pset->zsp_deleted == B_TRUE) {
1102 				tmp = usage;
1103 				usage = list_next(&pset->zsp_usage_list,
1104 				    usage);
1105 				list_remove(&pset->zsp_usage_list, tmp);
1106 				free(tmp);
1107 				pset->zsp_nusage--;
1108 				ctl->zsctl_npset_usages--;
1109 				continue;
1110 			} else {
1111 				usage->zsu_new = B_FALSE;
1112 				usage->zsu_deleted = B_TRUE;
1113 				usage->zsu_active = B_TRUE;
1114 			}
1115 			/* Add cpu shares for usages that are in FSS */
1116 			zone = usage->zsu_zone;
1117 			if (usage->zsu_scheds & ZS_SCHED_FSS &&
1118 			    zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1119 			    zone->zsz_cpu_shares != 0) {
1120 				zone = usage->zsu_zone;
1121 				usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1122 				pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1123 			}
1124 			usage = list_next(&pset->zsp_usage_list,
1125 			    usage);
1126 		}
1127 	}
1128 }
1129 
1130 /* A zone has been found.  Update its information */
1131 static void
1132 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1133     uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1134     uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1135     uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1136     uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1137     uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1138     uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1139     uint_t iptype)
1140 {
1141 	zsd_system_t *sys = ctl->zsctl_system;
1142 
1143 	assert(zone->zsz_found == B_FALSE);
1144 
1145 	/*
1146 	 * Mark zone as exists, and new if it did not exist in previous
1147 	 * interval.
1148 	 */
1149 	zone->zsz_found = B_TRUE;
1150 	zone->zsz_empty = B_TRUE;
1151 	zone->zsz_deleted = B_FALSE;
1152 
1153 	/*
1154 	 * Zone is new.  Assume zone's properties are the same over entire
1155 	 * interval.
1156 	 */
1157 	if (zone->zsz_active == B_FALSE)
1158 		zone->zsz_new = B_TRUE;
1159 	else
1160 		zone->zsz_new = B_FALSE;
1161 
1162 	(void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1163 	(void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1164 	zone->zsz_default_sched = sched;
1165 
1166 	/* Schedulers updated later as processes are found */
1167 	zone->zsz_scheds = 0;
1168 
1169 	/* Cpus updated later as psets bound are identified */
1170 	zone->zsz_cpus_online = 0;
1171 
1172 	zone->zsz_cputype = cputype;
1173 	zone->zsz_iptype = iptype;
1174 	zone->zsz_psetid = ZS_PSET_ERROR;
1175 	zone->zsz_cpu_cap = cpu_cap;
1176 	zone->zsz_cpu_shares = cpu_shares;
1177 	zone->zsz_ram_cap = ram_cap;
1178 	zone->zsz_locked_cap = locked_cap;
1179 	zone->zsz_vm_cap = vm_cap;
1180 	zone->zsz_processes_cap = processes_cap;
1181 	zone->zsz_processes = processes;
1182 	zone->zsz_lwps_cap = lwps_cap;
1183 	zone->zsz_lwps = lwps;
1184 	zone->zsz_shm_cap = shm_cap;
1185 	zone->zsz_shm = shm;
1186 	zone->zsz_shmids_cap = shmids_cap;
1187 	zone->zsz_shmids = shmids;
1188 	zone->zsz_semids_cap = semids_cap;
1189 	zone->zsz_semids = semids;
1190 	zone->zsz_msgids_cap = msgids_cap;
1191 	zone->zsz_msgids = msgids;
1192 	zone->zsz_lofi_cap = lofi_cap;
1193 	zone->zsz_lofi = lofi;
1194 
1195 	sys->zss_processes += processes;
1196 	sys->zss_lwps += lwps;
1197 	sys->zss_shm += shm;
1198 	sys->zss_shmids += shmids;
1199 	sys->zss_semids += semids;
1200 	sys->zss_msgids += msgids;
1201 	sys->zss_lofi += lofi;
1202 	zone->zsz_active = B_TRUE;
1203 }
1204 
1205 
1206 /* Determine which zones have halted */
1207 static void
1208 zsd_mark_zones_end(zsd_ctl_t *ctl)
1209 {
1210 	zsd_zone_t *zone, *tmp;
1211 
1212 	/*
1213 	 * Mark zone as not existing, or delete if it did not exist in
1214 	 * previous interval.
1215 	 */
1216 	zone = list_head(&ctl->zsctl_zones);
1217 	while (zone != NULL) {
1218 		if (zone->zsz_found == B_FALSE) {
1219 			zone->zsz_empty = B_TRUE;
1220 			if (zone->zsz_deleted == B_TRUE) {
1221 				/*
1222 				 * Zone deleted in prior interval,
1223 				 * so it no longer exists.
1224 				 */
1225 				tmp = zone;
1226 				zone = list_next(&ctl->zsctl_zones, zone);
1227 				list_remove(&ctl->zsctl_zones, tmp);
1228 				free(tmp);
1229 				ctl->zsctl_nzones--;
1230 				continue;
1231 			} else {
1232 				zone->zsz_new = B_FALSE;
1233 				zone->zsz_deleted = B_TRUE;
1234 				zone->zsz_active = B_TRUE;
1235 			}
1236 		}
1237 		zone = list_next(&ctl->zsctl_zones, zone);
1238 	}
1239 }
1240 
1241 /*
1242  * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1243  * a cpu is not found, then it must have gone offline, so it will be
1244  * deleted.
1245  *
1246  * The kstat tracking data is rolled so that the usage since the previous
1247  * interval can be determined.
1248  */
1249 static void
1250 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1251 {
1252 	zsd_cpu_t *cpu;
1253 
1254 	/*
1255 	 * Mark all cpus as not existing.  As cpus are found, they will
1256 	 * be marked as existing.
1257 	 */
1258 	for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1259 	    cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1260 		cpu->zsc_found = B_FALSE;
1261 		if (cpu->zsc_active == B_TRUE && roll) {
1262 			cpu->zsc_psetid_prev = cpu->zsc_psetid;
1263 			cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1264 			cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1265 			cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1266 			cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1267 		}
1268 	}
1269 }
1270 
1271 /*
1272  * An array the size of the maximum number of cpus is kept.  Within this array
1273  * a list of the online cpus is maintained.
1274  */
1275 zsd_cpu_t *
1276 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1277 {
1278 	zsd_cpu_t *cpu;
1279 
1280 	assert(cpuid < ctl->zsctl_maxcpuid);
1281 	cpu = &(ctl->zsctl_cpu_array[cpuid]);
1282 	assert(cpuid == cpu->zsc_id);
1283 
1284 	if (cpu->zsc_allocated == B_FALSE) {
1285 		cpu->zsc_allocated = B_TRUE;
1286 		list_insert_tail(&ctl->zsctl_cpus, cpu);
1287 	}
1288 	return (cpu);
1289 }
1290 
1291 /* A cpu has been found.  Update its information */
1292 static void
1293 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1294 {
1295 	/*
1296 	 * legacy processor sets, the cpu may move while zonestatd is
1297 	 * inspecting, causing it to be found twice.  In this case, just
1298 	 * leave cpu in the first processor set in which it was found.
1299 	 */
1300 	if (cpu->zsc_found == B_TRUE)
1301 		return;
1302 
1303 	/* Mark cpu as online */
1304 	cpu->zsc_found = B_TRUE;
1305 	cpu->zsc_offlined = B_FALSE;
1306 	cpu->zsc_pset = pset;
1307 	/*
1308 	 * cpu is newly online.
1309 	 */
1310 	if (cpu->zsc_active == B_FALSE) {
1311 		/*
1312 		 * Cpu is newly online.
1313 		 */
1314 		cpu->zsc_onlined = B_TRUE;
1315 		cpu->zsc_psetid = psetid;
1316 		cpu->zsc_psetid_prev = psetid;
1317 	} else {
1318 		/*
1319 		 * cpu online during previous interval.  Save properties at
1320 		 * start of interval
1321 		 */
1322 		cpu->zsc_onlined = B_FALSE;
1323 		cpu->zsc_psetid = psetid;
1324 
1325 	}
1326 	cpu->zsc_active = B_TRUE;
1327 }
1328 
1329 /* Remove all offlined cpus from the list of tracked cpus */
1330 static void
1331 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1332 {
1333 	zsd_cpu_t *cpu, *tmp;
1334 	int id;
1335 
1336 	/* Mark cpu as online or offline */
1337 	cpu = list_head(&ctl->zsctl_cpus);
1338 	while (cpu != NULL) {
1339 		if (cpu->zsc_found == B_FALSE) {
1340 			if (cpu->zsc_offlined == B_TRUE) {
1341 				/*
1342 				 * cpu offlined in prior interval. It is gone.
1343 				 */
1344 				tmp = cpu;
1345 				cpu = list_next(&ctl->zsctl_cpus, cpu);
1346 				list_remove(&ctl->zsctl_cpus, tmp);
1347 				/* Clear structure for future use */
1348 				id = tmp->zsc_id;
1349 				bzero(tmp, sizeof (zsd_cpu_t));
1350 				tmp->zsc_id = id;
1351 				tmp->zsc_allocated = B_FALSE;
1352 				tmp->zsc_psetid = ZS_PSET_ERROR;
1353 				tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1354 
1355 			} else {
1356 				/*
1357 				 * cpu online at start of interval.  Treat
1358 				 * as still online, since it was online for
1359 				 * some portion of the interval.
1360 				 */
1361 				cpu->zsc_offlined = B_TRUE;
1362 				cpu->zsc_onlined = B_FALSE;
1363 				cpu->zsc_active = B_TRUE;
1364 				cpu->zsc_psetid = cpu->zsc_psetid_prev;
1365 				cpu->zsc_pset = NULL;
1366 			}
1367 		}
1368 		cpu = list_next(&ctl->zsctl_cpus, cpu);
1369 	}
1370 }
1371 
1372 /* Some utility functions for managing the list of processor sets */
1373 static zsd_pset_t *
1374 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1375 {
1376 	zsd_pset_t *pset;
1377 
1378 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1379 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1380 		if (pset->zsp_id == psetid)
1381 			return (pset);
1382 	}
1383 	return (NULL);
1384 }
1385 
1386 static zsd_pset_t *
1387 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1388 {
1389 	zsd_pset_t *pset;
1390 
1391 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1392 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1393 		if (strcmp(pset->zsp_name, psetname) == 0) {
1394 			if (psetid != -1)
1395 				pset->zsp_id = psetid;
1396 			return (pset);
1397 		}
1398 	}
1399 	return (NULL);
1400 }
1401 
1402 static zsd_pset_t *
1403 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1404 {
1405 	zsd_pset_t *pset;
1406 
1407 	if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1408 		return (NULL);
1409 
1410 	(void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1411 	pset->zsp_id = psetid;
1412 	pset->zsp_found = B_FALSE;
1413 	/*
1414 	 * Allocate as deleted so if not found in first pass, pset is deleted
1415 	 * from list.  This can happen if pset is returned by pset_list, but
1416 	 * is destroyed before first attempt to fetch pset details.
1417 	 */
1418 	list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1419 	    offsetof(zsd_pset_usage_t, zsu_next));
1420 
1421 	pset->zsp_hrstart = g_hrnow;
1422 	pset->zsp_deleted = B_TRUE;
1423 	pset->zsp_empty = B_TRUE;
1424 	ctl->zsctl_npsets++;
1425 
1426 	return (pset);
1427 }
1428 
1429 static zsd_pset_t *
1430 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1431 {
1432 	zsd_pset_t *pset, *tmp;
1433 
1434 	if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1435 		return (pset);
1436 
1437 	if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1438 		return (NULL);
1439 
1440 	/* Insert sorted by psetname */
1441 	tmp = list_head(&ctl->zsctl_psets);
1442 	while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1443 		tmp = list_next(&ctl->zsctl_psets, tmp);
1444 
1445 	list_insert_before(&ctl->zsctl_psets, tmp, pset);
1446 	return (pset);
1447 }
1448 
1449 /* Some utility functions for managing the list of zones using each pset */
1450 static zsd_pset_usage_t *
1451 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1452 {
1453 	zsd_pset_usage_t *usage;
1454 
1455 	for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1456 	    usage = list_next(&pset->zsp_usage_list, usage))
1457 		if (usage->zsu_zone == zone)
1458 			return (usage);
1459 
1460 	return (NULL);
1461 }
1462 
1463 static zsd_pset_usage_t *
1464 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1465 {
1466 	zsd_pset_usage_t *usage;
1467 
1468 	if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1469 	    == NULL)
1470 		return (NULL);
1471 
1472 	list_link_init(&usage->zsu_next);
1473 	usage->zsu_zone = zone;
1474 	usage->zsu_zoneid = zone->zsz_id;
1475 	usage->zsu_pset = pset;
1476 	usage->zsu_found = B_FALSE;
1477 	usage->zsu_active = B_FALSE;
1478 	usage->zsu_new = B_FALSE;
1479 	/*
1480 	 * Allocate as not deleted.  If a process is found in a pset for
1481 	 * a zone, the usage will not be deleted until at least the next
1482 	 * interval.
1483 	 */
1484 	usage->zsu_start = g_now;
1485 	usage->zsu_hrstart = g_hrnow;
1486 	usage->zsu_deleted = B_FALSE;
1487 	usage->zsu_empty = B_TRUE;
1488 	usage->zsu_scheds = 0;
1489 	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1490 
1491 	ctl->zsctl_npset_usages++;
1492 	pset->zsp_nusage++;
1493 
1494 	return (usage);
1495 }
1496 
1497 static zsd_pset_usage_t *
1498 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1499 {
1500 	zsd_pset_usage_t *usage, *tmp;
1501 
1502 	if ((usage = zsd_lookup_usage(pset, zone))
1503 	    != NULL)
1504 		return (usage);
1505 
1506 	if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1507 		return (NULL);
1508 
1509 	tmp = list_head(&pset->zsp_usage_list);
1510 	while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1511 	    > 0)
1512 		tmp = list_next(&pset->zsp_usage_list, tmp);
1513 
1514 	list_insert_before(&pset->zsp_usage_list, tmp, usage);
1515 	return (usage);
1516 }
1517 
1518 static void
1519 zsd_refresh_system(zsd_ctl_t *ctl)
1520 {
1521 	zsd_system_t *system = ctl->zsctl_system;
1522 
1523 	/* Re-count these values each interval */
1524 	system->zss_processes = 0;
1525 	system->zss_lwps = 0;
1526 	system->zss_shm = 0;
1527 	system->zss_shmids = 0;
1528 	system->zss_semids = 0;
1529 	system->zss_msgids = 0;
1530 	system->zss_lofi = 0;
1531 }
1532 
1533 
1534 /* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1535 static void
1536 zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1537 {
1538 	zsd_system_t *sys;
1539 	processorid_t cpuid;
1540 	zsd_pset_t *pset_prev;
1541 	zsd_pset_t *pset;
1542 	kstat_t *kstat;
1543 	kstat_named_t *knp;
1544 	kid_t kid;
1545 	uint64_t idle, intr, kern, user;
1546 
1547 	sys = ctl->zsctl_system;
1548 	pset = cpu->zsc_pset;
1549 	knp = NULL;
1550 	kid = -1;
1551 	cpuid = cpu->zsc_id;
1552 
1553 	/* Get the cpu time totals for this cpu */
1554 	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1555 	if (kstat == NULL)
1556 		return;
1557 
1558 	kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1559 	if (kid == -1)
1560 		return;
1561 
1562 	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1563 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1564 		return;
1565 
1566 	idle = knp->value.ui64;
1567 
1568 	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1569 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1570 		return;
1571 
1572 	kern = knp->value.ui64;
1573 
1574 	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1575 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1576 		return;
1577 
1578 	user = knp->value.ui64;
1579 
1580 	/*
1581 	 * Tracking intr time per cpu just exists for future enhancements.
1582 	 * The value is presently always zero.
1583 	 */
1584 	intr = 0;
1585 	cpu->zsc_nsec_idle = idle;
1586 	cpu->zsc_nsec_intr = intr;
1587 	cpu->zsc_nsec_kern = kern;
1588 	cpu->zsc_nsec_user = user;
1589 
1590 	if (cpu->zsc_onlined == B_TRUE) {
1591 		/*
1592 		 * cpu is newly online.  There is no reference value,
1593 		 * so just record its current stats for comparison
1594 		 * on next stat read.
1595 		 */
1596 		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1597 		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1598 		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1599 		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1600 		return;
1601 	}
1602 
1603 	/*
1604 	 * Calculate relative time since previous refresh.
1605 	 * Paranoia.  Don't let time  go backwards.
1606 	 */
1607 	idle = intr = kern = user = 0;
1608 	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1609 		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1610 
1611 	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1612 		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1613 
1614 	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1615 		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1616 
1617 	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1618 		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1619 
1620 	/* Update totals for cpu usage */
1621 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1622 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1623 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1624 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1625 
1626 	/*
1627 	 * Add cpu's stats to its pset if it is known to be in
1628 	 * the pset since previous read.
1629 	 */
1630 	if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1631 	    cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1632 	    (pset_prev = zsd_lookup_pset_byid(ctl,
1633 	    cpu->zsc_psetid_prev)) == NULL) {
1634 		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1635 		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1636 		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1637 		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1638 	} else {
1639 		/*
1640 		 * Last pset was different than current pset.
1641 		 * Best guess is to split usage between the two.
1642 		 */
1643 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1644 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1645 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1646 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1647 
1648 		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1649 		    (idle / 2) + (idle % 2));
1650 		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1651 		    (intr / 2) + (intr % 2));
1652 		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1653 		    (kern / 2) + (kern % 2));
1654 		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1655 		    (user / 2) + (user % 2));
1656 	}
1657 	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1658 	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1659 	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1660 	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1661 }
1662 
1663 /* Determine the details of a processor set by pset_id */
1664 static int
1665 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1666     size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1667     uint64_t *min, uint64_t *max, int64_t *importance)
1668 {
1669 	uint_t old, num;
1670 
1671 	pool_conf_t *conf = ctl->zsctl_pool_conf;
1672 	pool_value_t **vals = ctl->zsctl_pool_vals;
1673 	pool_resource_t **res_list = NULL;
1674 	pool_resource_t *pset;
1675 	pool_component_t **cpus = NULL;
1676 	processorid_t *cache;
1677 	const char *string;
1678 	uint64_t uint64;
1679 	int64_t int64;
1680 	int i, ret, type;
1681 
1682 	if (ctl->zsctl_pool_status == POOL_DISABLED) {
1683 
1684 		/*
1685 		 * Inspect legacy psets
1686 		 */
1687 		for (;;) {
1688 			old = num = ctl->zsctl_cpu_ncache;
1689 			ret = pset_info(psetid, &type, &num,
1690 			    ctl->zsctl_cpu_cache);
1691 			if (ret < 0) {
1692 				/* pset is gone.  Tell caller to retry */
1693 				errno = EINTR;
1694 				return (-1);
1695 			}
1696 			if (num <= old) {
1697 			/* Success */
1698 				break;
1699 			}
1700 			if ((cache = (processorid_t *)realloc(
1701 			    ctl->zsctl_cpu_cache, num *
1702 			    sizeof (processorid_t))) != NULL) {
1703 				ctl->zsctl_cpu_ncache = num;
1704 				ctl->zsctl_cpu_cache = cache;
1705 			} else {
1706 				/*
1707 				 * Could not allocate to get new cpu list.
1708 				 */
1709 				zsd_warn(gettext(
1710 				    "Could not allocate for cpu list"));
1711 				errno = ENOMEM;
1712 				return (-1);
1713 			}
1714 		}
1715 		/*
1716 		 * Old school pset.  Just make min and max equal
1717 		 * to its size
1718 		 */
1719 		if (psetid == ZS_PSET_DEFAULT) {
1720 			*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1721 			(void) strlcpy(psetname, "pset_default", namelen);
1722 		} else {
1723 			*cputype = ZS_CPUTYPE_PSRSET_PSET;
1724 			(void) snprintf(psetname, namelen,
1725 			    "SUNWlegacy_pset_%d", psetid);
1726 		}
1727 
1728 		/*
1729 		 * Just treat legacy pset as a simple pool pset
1730 		 */
1731 		*online = num;
1732 		*size = num;
1733 		*min = num;
1734 		*max = num;
1735 		*importance = 1;
1736 
1737 		return (0);
1738 	}
1739 
1740 	/* Look up the pool pset using the pset id */
1741 	res_list = NULL;
1742 	pool_value_set_int64(vals[1], psetid);
1743 	if (pool_value_set_name(vals[1], "pset.sys_id")
1744 	    != PO_SUCCESS)
1745 		goto err;
1746 
1747 	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1748 		goto err;
1749 	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1750 		goto err;
1751 	if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1752 		goto err;
1753 	if (num != 1)
1754 		goto err;
1755 	pset = res_list[0];
1756 	free(res_list);
1757 	res_list = NULL;
1758 	if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1759 	    "pset.name", vals[0]) != POC_STRING ||
1760 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1761 		goto err;
1762 
1763 	(void) strlcpy(psetname, string, namelen);
1764 	if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1765 		*cputype = ZS_CPUTYPE_DEDICATED;
1766 	else if (psetid == ZS_PSET_DEFAULT)
1767 		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1768 	else
1769 		*cputype = ZS_CPUTYPE_POOL_PSET;
1770 
1771 	/* Get size, min, max, and importance */
1772 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1773 	    pset), "pset.size", vals[0]) == POC_UINT &&
1774 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1775 		*size = uint64;
1776 	else
1777 		*size = 0;
1778 
1779 		/* Get size, min, max, and importance */
1780 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1781 	    pset), "pset.min", vals[0]) == POC_UINT &&
1782 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1783 		*min = uint64;
1784 	else
1785 		*min = 0;
1786 	if (*min >= ZSD_PSET_UNLIMITED)
1787 		*min = ZS_LIMIT_NONE;
1788 
1789 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1790 	    pset), "pset.max", vals[0]) == POC_UINT &&
1791 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1792 		*max = uint64;
1793 	else
1794 		*max = ZS_LIMIT_NONE;
1795 
1796 	if (*max >= ZSD_PSET_UNLIMITED)
1797 		*max = ZS_LIMIT_NONE;
1798 
1799 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1800 	    pset), "pset.importance", vals[0]) == POC_INT &&
1801 	    pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1802 		*importance = int64;
1803 	else
1804 		*importance = (uint64_t)1;
1805 
1806 	*online = 0;
1807 	if (*size == 0)
1808 		return (0);
1809 
1810 	/* get cpus */
1811 	cpus = pool_query_resource_components(conf, pset, &num, NULL);
1812 	if (cpus == NULL)
1813 		goto err;
1814 
1815 	/* Make sure there is space for cpu id list */
1816 	if (num > ctl->zsctl_cpu_ncache) {
1817 		if ((cache = (processorid_t *)realloc(
1818 		    ctl->zsctl_cpu_cache, num *
1819 		    sizeof (processorid_t))) != NULL) {
1820 			ctl->zsctl_cpu_ncache = num;
1821 			ctl->zsctl_cpu_cache = cache;
1822 		} else {
1823 			/*
1824 			 * Could not allocate to get new cpu list.
1825 			 */
1826 			zsd_warn(gettext(
1827 			    "Could not allocate for cpu list"));
1828 			goto err;
1829 		}
1830 	}
1831 
1832 	/* count the online cpus */
1833 	for (i = 0; i < num; i++) {
1834 		if (pool_get_property(conf, pool_component_to_elem(
1835 		    conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1836 		    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1837 			goto err;
1838 
1839 		if (strcmp(string, "on-line") != 0 &&
1840 		    strcmp(string, "no-intr") != 0)
1841 			continue;
1842 
1843 		if (pool_get_property(conf, pool_component_to_elem(
1844 		    conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1845 		    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1846 			goto err;
1847 
1848 		(*online)++;
1849 		ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1850 	}
1851 	free(cpus);
1852 	return (0);
1853 err:
1854 	if (res_list != NULL)
1855 		free(res_list);
1856 	if (cpus != NULL)
1857 		free(cpus);
1858 
1859 	/*
1860 	 * The pools operations should succeed since the conf is a consistent
1861 	 * snapshot.  Tell caller there is no need to retry.
1862 	 */
1863 	errno = EINVAL;
1864 	return (-1);
1865 }
1866 
1867 /*
1868  * Update the current list of processor sets.
1869  * This also updates the list of online cpus, and each cpu's pset membership.
1870  */
1871 static void
1872 zsd_refresh_psets(zsd_ctl_t *ctl)
1873 {
1874 	int i, j, ret, state;
1875 	uint_t old, num;
1876 	uint_t cputype;
1877 	int64_t sys_id, importance;
1878 	uint64_t online, size, min, max;
1879 	zsd_system_t *system;
1880 	zsd_pset_t *pset;
1881 	zsd_cpu_t *cpu;
1882 	psetid_t *cache;
1883 	char psetname[ZS_PSETNAME_MAX];
1884 	processorid_t cpuid;
1885 	pool_value_t *pv_save = NULL;
1886 	pool_resource_t **res_list = NULL;
1887 	pool_resource_t *res;
1888 	pool_value_t **vals;
1889 	pool_conf_t *conf;
1890 	boolean_t roll_cpus = B_TRUE;
1891 
1892 	/* Zero cpu counters to recount them */
1893 	system = ctl->zsctl_system;
1894 	system->zss_ncpus = 0;
1895 	system->zss_ncpus_online = 0;
1896 retry:
1897 	ret = pool_get_status(&state);
1898 	if (ret == 0 && state == POOL_ENABLED) {
1899 
1900 		conf = ctl->zsctl_pool_conf;
1901 		vals = ctl->zsctl_pool_vals;
1902 		pv_save = vals[1];
1903 		vals[1] = NULL;
1904 
1905 		if (ctl->zsctl_pool_status == POOL_DISABLED) {
1906 			if (pool_conf_open(ctl->zsctl_pool_conf,
1907 			    pool_dynamic_location(), PO_RDONLY) == 0) {
1908 				ctl->zsctl_pool_status = POOL_ENABLED;
1909 				ctl->zsctl_pool_changed = POU_PSET;
1910 			}
1911 		} else {
1912 			ctl->zsctl_pool_changed = 0;
1913 			ret = pool_conf_update(ctl->zsctl_pool_conf,
1914 			    &(ctl->zsctl_pool_changed));
1915 			if (ret < 0) {
1916 				/* Pools must have become disabled */
1917 				(void) pool_conf_close(ctl->zsctl_pool_conf);
1918 				ctl->zsctl_pool_status = POOL_DISABLED;
1919 				if (pool_error() == POE_SYSTEM && errno ==
1920 				    ENOTACTIVE)
1921 					goto retry;
1922 
1923 				zsd_warn(gettext(
1924 				    "Unable to update pool configuration"));
1925 				/* Not able to get pool info.  Don't update. */
1926 				goto err;
1927 			}
1928 		}
1929 		/* Get the list of psets using libpool */
1930 		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1931 			goto err;
1932 
1933 		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1934 			goto err;
1935 		if ((res_list = pool_query_resources(conf, &num, vals))
1936 		    == NULL)
1937 			goto err;
1938 
1939 		if (num > ctl->zsctl_pset_ncache)  {
1940 			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1941 			    (num) * sizeof (psetid_t))) == NULL) {
1942 				goto err;
1943 			}
1944 			ctl->zsctl_pset_ncache = num;
1945 			ctl->zsctl_pset_cache = cache;
1946 		}
1947 		/* Save the pset id of each pset */
1948 		for (i = 0; i < num; i++) {
1949 			res = res_list[i];
1950 			if (pool_get_property(conf, pool_resource_to_elem(conf,
1951 			    res), "pset.sys_id", vals[0]) != POC_INT ||
1952 			    pool_value_get_int64(vals[0], &sys_id)
1953 			    != PO_SUCCESS)
1954 				goto err;
1955 			ctl->zsctl_pset_cache[i] = (int)sys_id;
1956 		}
1957 		vals[1] = pv_save;
1958 		pv_save = NULL;
1959 	} else {
1960 		if (ctl->zsctl_pool_status == POOL_ENABLED) {
1961 			(void) pool_conf_close(ctl->zsctl_pool_conf);
1962 			ctl->zsctl_pool_status = POOL_DISABLED;
1963 		}
1964 		/* Get the pset list using legacy psets */
1965 		for (;;) {
1966 			old = num = ctl->zsctl_pset_ncache;
1967 			(void) pset_list(ctl->zsctl_pset_cache, &num);
1968 			if ((num + 1) <= old) {
1969 				break;
1970 			}
1971 			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1972 			    (num + 1) * sizeof (psetid_t))) != NULL) {
1973 				ctl->zsctl_pset_ncache = num + 1;
1974 				ctl->zsctl_pset_cache = cache;
1975 			} else {
1976 				/*
1977 				 * Could not allocate to get new pset list.
1978 				 * Give up
1979 				 */
1980 				return;
1981 			}
1982 		}
1983 		/* Add the default pset to list */
1984 		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1985 		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1986 		num++;
1987 	}
1988 psets_changed:
1989 	zsd_mark_cpus_start(ctl, roll_cpus);
1990 	zsd_mark_psets_start(ctl);
1991 	roll_cpus = B_FALSE;
1992 
1993 	/* Refresh cpu membership of all psets */
1994 	for (i = 0; i < num; i++) {
1995 
1996 		/* Get pool pset information */
1997 		sys_id = ctl->zsctl_pset_cache[i];
1998 		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
1999 		    &cputype, &online, &size, &min, &max, &importance)
2000 		    != 0) {
2001 			if (errno == EINTR)
2002 				goto psets_changed;
2003 			zsd_warn(gettext("Failed to get info for pset %d"),
2004 			    sys_id);
2005 			continue;
2006 		}
2007 
2008 		system->zss_ncpus += size;
2009 		system->zss_ncpus_online += online;
2010 
2011 		pset = zsd_lookup_insert_pset(ctl, psetname,
2012 		    ctl->zsctl_pset_cache[i]);
2013 
2014 		/* update pset info */
2015 		zsd_mark_pset_found(pset, cputype, online, size, min,
2016 		    max, importance);
2017 
2018 		/* update each cpu in pset */
2019 		for (j = 0; j < pset->zsp_online; j++) {
2020 			cpuid = ctl->zsctl_cpu_cache[j];
2021 			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2022 			zsd_mark_cpu_found(cpu, pset, sys_id);
2023 		}
2024 	}
2025 err:
2026 	if (res_list != NULL)
2027 		free(res_list);
2028 	if (pv_save != NULL)
2029 		vals[1] = pv_save;
2030 }
2031 
2032 
2033 
2034 /*
2035  * Fetch the current pool and pset name for the given zone.
2036  */
2037 static void
2038 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2039     char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2040 {
2041 	poolid_t poolid;
2042 	pool_t **pools = NULL;
2043 	pool_resource_t **res_list = NULL;
2044 	char poolname[ZS_POOLNAME_MAX];
2045 	char psetname[ZS_PSETNAME_MAX];
2046 	pool_conf_t *conf = ctl->zsctl_pool_conf;
2047 	pool_value_t *pv_save = NULL;
2048 	pool_value_t **vals = ctl->zsctl_pool_vals;
2049 	const char *string;
2050 	int ret;
2051 	int64_t int64;
2052 	uint_t num;
2053 
2054 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2055 	    &poolid, sizeof (poolid));
2056 	if (ret < 0)
2057 		goto lookup_done;
2058 
2059 	pv_save = vals[1];
2060 	vals[1] = NULL;
2061 	pools = NULL;
2062 	res_list = NULL;
2063 
2064 	/* Default values if lookup fails */
2065 	(void) strlcpy(poolname, "pool_default", sizeof (poolname));
2066 	(void) strlcpy(psetname, "pset_default", sizeof (poolname));
2067 	*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2068 
2069 	/* no dedicated cpu if pools are disabled */
2070 	if (ctl->zsctl_pool_status == POOL_DISABLED)
2071 		goto lookup_done;
2072 
2073 	/* Get the pool name using the id */
2074 	pool_value_set_int64(vals[0], poolid);
2075 	if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2076 		goto lookup_done;
2077 
2078 	if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2079 		goto lookup_done;
2080 
2081 	if (num != 1)
2082 		goto lookup_done;
2083 
2084 	if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2085 	    "pool.name", vals[0]) != POC_STRING ||
2086 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2087 		goto lookup_done;
2088 	(void) strlcpy(poolname, (char *)string, sizeof (poolname));
2089 
2090 	/* Get the name of the pset for the pool */
2091 	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2092 		goto lookup_done;
2093 
2094 	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2095 		goto lookup_done;
2096 
2097 	if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2098 	    == NULL)
2099 		goto lookup_done;
2100 
2101 	if (num != 1)
2102 		goto lookup_done;
2103 
2104 	if (pool_get_property(conf, pool_resource_to_elem(conf,
2105 	    res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2106 	    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2107 		goto lookup_done;
2108 
2109 	if (int64 == ZS_PSET_DEFAULT)
2110 		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2111 
2112 	if (pool_get_property(conf, pool_resource_to_elem(conf,
2113 	    res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2114 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2115 		goto lookup_done;
2116 
2117 	(void) strlcpy(psetname, (char *)string, sizeof (psetname));
2118 
2119 	if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2120 		*cputype = ZS_CPUTYPE_DEDICATED;
2121 	if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2122 		*cputype = ZS_CPUTYPE_PSRSET_PSET;
2123 	else
2124 		*cputype = ZS_CPUTYPE_POOL_PSET;
2125 
2126 lookup_done:
2127 
2128 	if (pv_save != NULL)
2129 		vals[1] = pv_save;
2130 
2131 	if (res_list)
2132 		free(res_list);
2133 	if (pools)
2134 		free(pools);
2135 
2136 	(void) strlcpy(pool, poolname, poollen);
2137 	(void) strlcpy(pset, psetname, psetlen);
2138 }
2139 
2140 /* Convert scheduler names to ZS_* scheduler flags */
2141 static uint_t
2142 zsd_schedname2int(char *clname, int pri)
2143 {
2144 	uint_t sched = 0;
2145 
2146 	if (strcmp(clname, "TS") == 0) {
2147 		sched = ZS_SCHED_TS;
2148 	} else if (strcmp(clname, "IA") == 0) {
2149 		sched = ZS_SCHED_IA;
2150 	} else if (strcmp(clname, "FX") == 0) {
2151 		if (pri > 59) {
2152 			sched = ZS_SCHED_FX_60;
2153 		} else {
2154 			sched = ZS_SCHED_FX;
2155 		}
2156 	} else if (strcmp(clname, "RT") == 0) {
2157 		sched = ZS_SCHED_RT;
2158 
2159 	} else if (strcmp(clname, "FSS") == 0) {
2160 		sched = ZS_SCHED_FSS;
2161 	}
2162 	return (sched);
2163 }
2164 
2165 static uint64_t
2166 zsd_get_zone_rctl_limit(char *name)
2167 {
2168 	rctlblk_t *rblk;
2169 
2170 	rblk = (rctlblk_t *)alloca(rctlblk_size());
2171 	if (getrctl(name, NULL, rblk, RCTL_FIRST)
2172 	    != 0) {
2173 		return (ZS_LIMIT_NONE);
2174 	}
2175 	return (rctlblk_get_value(rblk));
2176 }
2177 
2178 static uint64_t
2179 zsd_get_zone_rctl_usage(char *name)
2180 {
2181 	rctlblk_t *rblk;
2182 
2183 	rblk = (rctlblk_t *)alloca(rctlblk_size());
2184 	if (getrctl(name, NULL, rblk, RCTL_USAGE)
2185 	    != 0) {
2186 		return (0);
2187 	}
2188 	return (rctlblk_get_value(rblk));
2189 }
2190 
2191 #define	ZSD_NUM_RCTL_VALS 19
2192 
2193 /*
2194  * Fetch the limit information for a zone.  This uses zone_enter() as the
2195  * getrctl(2) system call only returns rctl information for the zone of
2196  * the caller.
2197  */
2198 static int
2199 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2200     uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2201     uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2202     uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2203     uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2204     uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2205     uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2206 {
2207 	int p[2], pid, tmpl_fd, ret;
2208 	ctid_t ct;
2209 	char class[PC_CLNMSZ];
2210 	uint64_t vals[ZSD_NUM_RCTL_VALS];
2211 	zsd_system_t *sys = ctl->zsctl_system;
2212 	int i = 0;
2213 	int res = 0;
2214 
2215 	/* Treat all caps as no cap on error */
2216 	*cpu_shares = ZS_LIMIT_NONE;
2217 	*cpu_cap = ZS_LIMIT_NONE;
2218 	*ram_cap = ZS_LIMIT_NONE;
2219 	*locked_cap = ZS_LIMIT_NONE;
2220 	*vm_cap = ZS_LIMIT_NONE;
2221 
2222 	*processes_cap = ZS_LIMIT_NONE;
2223 	*lwps_cap = ZS_LIMIT_NONE;
2224 	*shm_cap = ZS_LIMIT_NONE;
2225 	*shmids_cap = ZS_LIMIT_NONE;
2226 	*semids_cap = ZS_LIMIT_NONE;
2227 	*msgids_cap = ZS_LIMIT_NONE;
2228 	*lofi_cap = ZS_LIMIT_NONE;
2229 
2230 	*processes = 0;
2231 	*lwps = 0;
2232 	*shm = 0;
2233 	*shmids = 0;
2234 	*semids = 0;
2235 	*msgids = 0;
2236 	*lofi = 0;
2237 
2238 	/* Get the ram cap first since it is a zone attr */
2239 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2240 	    ram_cap, sizeof (*ram_cap));
2241 	if (ret < 0 || *ram_cap == 0)
2242 		*ram_cap = ZS_LIMIT_NONE;
2243 
2244 	/* Get the zone's default scheduling class */
2245 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2246 	    class, sizeof (class));
2247 	if (ret < 0)
2248 		return (-1);
2249 
2250 	*sched = zsd_schedname2int(class, 0);
2251 
2252 	/* rctl caps must be fetched from within the zone */
2253 	if (pipe(p) != 0)
2254 		return (-1);
2255 
2256 	if ((tmpl_fd = init_template()) == -1) {
2257 		(void) close(p[0]);
2258 		(void) close(p[1]);
2259 		return (-1);
2260 	}
2261 	pid = forkx(0);
2262 	if (pid < 0) {
2263 		(void) ct_tmpl_clear(tmpl_fd);
2264 		(void) close(p[0]);
2265 		(void) close(p[1]);
2266 		return (-1);
2267 	}
2268 	if (pid == 0) {
2269 
2270 		(void) ct_tmpl_clear(tmpl_fd);
2271 		(void) close(tmpl_fd);
2272 		(void) close(p[0]);
2273 		if (zone->zsz_id != getzoneid()) {
2274 			if (zone_enter(zone->zsz_id) < 0) {
2275 				(void) close(p[1]);
2276 				_exit(0);
2277 			}
2278 		}
2279 
2280 		/* Get caps for zone, and write them to zonestatd parent. */
2281 		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2282 		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2283 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2284 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2285 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2286 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2287 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2288 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2289 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2290 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2291 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2292 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2293 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2294 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2295 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2296 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2297 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2298 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2299 
2300 		if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2301 		    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2302 			(void) close(p[1]);
2303 			_exit(1);
2304 		}
2305 
2306 		(void) close(p[1]);
2307 		_exit(0);
2308 	}
2309 	if (contract_latest(&ct) == -1)
2310 		ct = -1;
2311 
2312 	(void) ct_tmpl_clear(tmpl_fd);
2313 	(void) close(tmpl_fd);
2314 	(void) close(p[1]);
2315 	while (waitpid(pid, NULL, 0) != pid)
2316 		;
2317 
2318 	/* Read cap from child in zone */
2319 	if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2320 	    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2321 		res = -1;
2322 		goto cleanup;
2323 	}
2324 	i = 0;
2325 	*cpu_shares = vals[i++];
2326 	*cpu_cap = vals[i++];
2327 	*locked_cap = vals[i++];
2328 	*vm_cap = vals[i++];
2329 	*processes_cap = vals[i++];
2330 	*processes = vals[i++];
2331 	*lwps_cap = vals[i++];
2332 	*lwps = vals[i++];
2333 	*shm_cap = vals[i++];
2334 	*shm = vals[i++];
2335 	*shmids_cap = vals[i++];
2336 	*shmids = vals[i++];
2337 	*semids_cap = vals[i++];
2338 	*semids = vals[i++];
2339 	*msgids_cap = vals[i++];
2340 	*msgids = vals[i++];
2341 	*lofi_cap = vals[i++];
2342 	*lofi = vals[i++];
2343 
2344 	/* Interpret maximum values as no cap */
2345 	if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2346 		*cpu_cap = ZS_LIMIT_NONE;
2347 	if (*processes_cap == sys->zss_processes_max)
2348 		*processes_cap = ZS_LIMIT_NONE;
2349 	if (*lwps_cap == sys->zss_lwps_max)
2350 		*lwps_cap = ZS_LIMIT_NONE;
2351 	if (*shm_cap == sys->zss_shm_max)
2352 		*shm_cap = ZS_LIMIT_NONE;
2353 	if (*shmids_cap == sys->zss_shmids_max)
2354 		*shmids_cap = ZS_LIMIT_NONE;
2355 	if (*semids_cap == sys->zss_semids_max)
2356 		*semids_cap = ZS_LIMIT_NONE;
2357 	if (*msgids_cap == sys->zss_msgids_max)
2358 		*msgids_cap = ZS_LIMIT_NONE;
2359 	if (*lofi_cap == sys->zss_lofi_max)
2360 		*lofi_cap = ZS_LIMIT_NONE;
2361 
2362 
2363 cleanup:
2364 	(void) close(p[0]);
2365 	(void) ct_tmpl_clear(tmpl_fd);
2366 	(void) close(tmpl_fd);
2367 	(void) contract_abandon_id(ct);
2368 
2369 	return (res);
2370 }
2371 
2372 /* Update the current list of running zones */
2373 static void
2374 zsd_refresh_zones(zsd_ctl_t *ctl)
2375 {
2376 	zsd_zone_t *zone;
2377 	uint_t old, num;
2378 	ushort_t flags;
2379 	int i, ret;
2380 	zoneid_t *cache;
2381 	uint64_t cpu_shares;
2382 	uint64_t cpu_cap;
2383 	uint64_t ram_cap;
2384 	uint64_t locked_cap;
2385 	uint64_t vm_cap;
2386 	uint64_t processes_cap;
2387 	uint64_t processes;
2388 	uint64_t lwps_cap;
2389 	uint64_t lwps;
2390 	uint64_t shm_cap;
2391 	uint64_t shm;
2392 	uint64_t shmids_cap;
2393 	uint64_t shmids;
2394 	uint64_t semids_cap;
2395 	uint64_t semids;
2396 	uint64_t msgids_cap;
2397 	uint64_t msgids;
2398 	uint64_t lofi_cap;
2399 	uint64_t lofi;
2400 
2401 	char zonename[ZS_ZONENAME_MAX];
2402 	char poolname[ZS_POOLNAME_MAX];
2403 	char psetname[ZS_PSETNAME_MAX];
2404 	uint_t sched;
2405 	uint_t cputype;
2406 	uint_t iptype;
2407 
2408 	/* Get the current list of running zones */
2409 	for (;;) {
2410 		old = num = ctl->zsctl_zone_ncache;
2411 		(void) zone_list(ctl->zsctl_zone_cache, &num);
2412 		if (num <= old)
2413 			break;
2414 		if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2415 		    (num) * sizeof (zoneid_t))) != NULL) {
2416 			ctl->zsctl_zone_ncache = num;
2417 			ctl->zsctl_zone_cache = cache;
2418 		} else {
2419 			/* Could not allocate to get new zone list.  Give up */
2420 			return;
2421 		}
2422 	}
2423 
2424 	zsd_mark_zones_start(ctl);
2425 
2426 	for (i = 0; i < num; i++) {
2427 
2428 		ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2429 		    zonename, sizeof (zonename));
2430 		if (ret < 0)
2431 			continue;
2432 
2433 		zone = zsd_lookup_insert_zone(ctl, zonename,
2434 		    ctl->zsctl_zone_cache[i]);
2435 
2436 		ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2437 		    &flags, sizeof (flags));
2438 		if (ret < 0)
2439 			continue;
2440 
2441 		if (flags & ZF_NET_EXCL)
2442 			iptype = ZS_IPTYPE_EXCLUSIVE;
2443 		else
2444 			iptype = ZS_IPTYPE_SHARED;
2445 
2446 		zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2447 		    psetname, sizeof (psetname), &cputype);
2448 
2449 		if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2450 		    &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2451 		    &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2452 		    &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2453 		    &lofi, &sched) != 0)
2454 			continue;
2455 
2456 		zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2457 		    locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2458 		    lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2459 		    semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2460 		    psetname, sched, cputype, iptype);
2461 	}
2462 }
2463 
2464 /* Fetch the details of a process from its psinfo_t */
2465 static void
2466 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2467     psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2468     timestruc_t *delta, uint_t *sched)
2469 {
2470 	timestruc_t d;
2471 	zsd_proc_t *proc;
2472 
2473 	/* Get cached data for proc */
2474 	proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2475 	*psetid = psinfo->pr_lwp.pr_bindpset;
2476 
2477 	if (proc->zspr_psetid == ZS_PSET_ERROR)
2478 		*prev_psetid = *psetid;
2479 	else
2480 		*prev_psetid = proc->zspr_psetid;
2481 
2482 	*zoneid = psinfo->pr_zoneid;
2483 	if (proc->zspr_zoneid == -1)
2484 		*prev_zoneid = *zoneid;
2485 	else
2486 		*prev_zoneid = proc->zspr_zoneid;
2487 
2488 	TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2489 	*delta = d;
2490 
2491 	*sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2492 	    psinfo->pr_lwp.pr_pri);
2493 
2494 	/* Update cached data for proc */
2495 	proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2496 	proc->zspr_zoneid = psinfo->pr_zoneid;
2497 	proc->zspr_sched = *sched;
2498 	proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2499 	proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2500 	proc->zspr_ppid = psinfo->pr_ppid;
2501 }
2502 
2503 /*
2504  * Reset the known cpu usage of a process. This is done after a process
2505  * exits so that if the pid is recycled, data from its previous life is
2506  * not reused
2507  */
2508 static void
2509 zsd_flush_proc_info(zsd_proc_t *proc)
2510 {
2511 	proc->zspr_usage.tv_sec = 0;
2512 	proc->zspr_usage.tv_nsec = 0;
2513 }
2514 
2515 /*
2516  * Open the current extended accounting file.  On initialization, open the
2517  * file as the current file to be used.  Otherwise, open the file as the
2518  * next file to use of the current file reaches EOF.
2519  */
2520 static int
2521 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2522 {
2523 	int ret, oret, state, trys = 0, flags;
2524 	int *fd, *open;
2525 	ea_file_t *eaf;
2526 	struct stat64 *stat;
2527 	char path[MAXPATHLEN];
2528 
2529 	/*
2530 	 * The accounting file is first opened at the tail.  Following
2531 	 * opens to new accounting files are opened at the head.
2532 	 */
2533 	if (init == B_TRUE) {
2534 		flags = EO_NO_VALID_HDR | EO_TAIL;
2535 		fd = &ctl->zsctl_proc_fd;
2536 		eaf = &ctl->zsctl_proc_eaf;
2537 		stat = &ctl->zsctl_proc_stat;
2538 		open = &ctl->zsctl_proc_open;
2539 	} else {
2540 		flags = EO_NO_VALID_HDR | EO_HEAD;
2541 		fd = &ctl->zsctl_proc_fd_next;
2542 		eaf = &ctl->zsctl_proc_eaf_next;
2543 		stat = &ctl->zsctl_proc_stat_next;
2544 		open = &ctl->zsctl_proc_open_next;
2545 	}
2546 
2547 	*fd = -1;
2548 	*open = 0;
2549 retry:
2550 	/* open accounting files for cpu consumption */
2551 	ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2552 	if (ret != 0) {
2553 		zsd_warn(gettext("Unable to get process accounting state"));
2554 		goto err;
2555 	}
2556 	if (state != AC_ON) {
2557 		if (trys > 0) {
2558 			zsd_warn(gettext(
2559 			    "Unable to enable process accounting"));
2560 			goto err;
2561 		}
2562 		(void) zsd_enable_cpu_stats();
2563 		trys++;
2564 		goto retry;
2565 	}
2566 
2567 	ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2568 	if (ret != 0) {
2569 		zsd_warn(gettext("Unable to get process accounting file"));
2570 		goto err;
2571 	}
2572 
2573 	if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2574 	    (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2575 		ret = fstat64(*fd, stat);
2576 
2577 	if (*fd < 0 || oret < 0 || ret < 0) {
2578 		struct timespec ts;
2579 
2580 		/*
2581 		 * It is possible the accounting file is momentarily unavailable
2582 		 * because it is being rolled.  Try for up to half a second.
2583 		 *
2584 		 * If failure to open accounting file persists, give up.
2585 		 */
2586 		if (oret == 0)
2587 			(void) ea_close(eaf);
2588 		else if (*fd >= 0)
2589 			(void) close(*fd);
2590 		if (trys > 500) {
2591 			zsd_warn(gettext(
2592 			    "Unable to open process accounting file"));
2593 			goto err;
2594 		}
2595 		/* wait one millisecond */
2596 		ts.tv_sec = 0;
2597 		ts.tv_nsec = NANOSEC / 1000;
2598 		(void) nanosleep(&ts, NULL);
2599 		goto retry;
2600 	}
2601 	*open = 1;
2602 	return (0);
2603 err:
2604 	if (*fd >= 0)
2605 		(void) close(*fd);
2606 	*open = 0;
2607 	*fd = -1;
2608 	return (-1);
2609 }
2610 
2611 /*
2612  * Walk /proc and charge each process to its zone and processor set.
2613  * Then read exacct data for exited processes, and charge them as well.
2614  */
2615 static void
2616 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2617 {
2618 	DIR *dir;
2619 	struct dirent *dent;
2620 	psinfo_t psinfo;
2621 	int fd, ret;
2622 	zsd_proc_t *proc, *pproc, *tmp, *next;
2623 	list_t pplist, plist;
2624 	zsd_zone_t *zone, *prev_zone;
2625 	zsd_pset_t *pset, *prev_pset;
2626 	psetid_t psetid, prev_psetid;
2627 	zoneid_t zoneid, prev_zoneid;
2628 	zsd_pset_usage_t *usage, *prev_usage;
2629 	char path[MAXPATHLEN];
2630 
2631 	ea_object_t object;
2632 	ea_object_t pobject;
2633 	boolean_t hrtime_expired = B_FALSE;
2634 	struct timeval interval_end;
2635 
2636 	timestruc_t delta, d1, d2;
2637 	uint_t sched = 0;
2638 
2639 	/*
2640 	 * Get the current accounting file.  The current accounting file
2641 	 * may be different than the file in use, as the accounting file
2642 	 * may have been rolled, or manually changed by an admin.
2643 	 */
2644 	ret = zsd_open_exacct(ctl, init);
2645 	if (ret != 0) {
2646 		zsd_warn(gettext("Unable to track process accounting"));
2647 		return;
2648 	}
2649 
2650 	/*
2651 	 * Mark the current time as the interval end time.  Don't track
2652 	 * processes that exit after this time.
2653 	 */
2654 	(void) gettimeofday(&interval_end, NULL);
2655 
2656 	dir = opendir("/proc");
2657 	if (dir == NULL) {
2658 		zsd_warn(gettext("Unable to open /proc"));
2659 		return;
2660 	}
2661 
2662 	/* Walk all processes and compute each zone's usage on each pset. */
2663 	while ((dent = readdir(dir)) != NULL) {
2664 
2665 		if (strcmp(dent->d_name, ".") == 0 ||
2666 		    strcmp(dent->d_name, "..") == 0)
2667 			continue;
2668 
2669 		(void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2670 		    dent->d_name);
2671 
2672 		fd = open(path, O_RDONLY);
2673 		if (fd < 0)
2674 			continue;
2675 
2676 		if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2677 			(void) close(fd);
2678 			continue;
2679 		}
2680 		(void) close(fd);
2681 
2682 		zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2683 		    &zoneid, &prev_zoneid, &delta, &sched);
2684 
2685 		d1.tv_sec = delta.tv_sec / 2;
2686 		d1.tv_nsec = delta.tv_nsec / 2;
2687 		d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2688 		d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2689 
2690 		/* Get the zone and pset this process is running in */
2691 		zone = zsd_lookup_zone_byid(ctl, zoneid);
2692 		if (zone == NULL)
2693 			continue;
2694 		pset = zsd_lookup_pset_byid(ctl, psetid);
2695 		if (pset == NULL)
2696 			continue;
2697 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2698 		if (usage == NULL)
2699 			continue;
2700 
2701 		/*
2702 		 * Get the usage of the previous zone and pset if they were
2703 		 * different.
2704 		 */
2705 		if (zoneid != prev_zoneid)
2706 			prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2707 		else
2708 			prev_zone = NULL;
2709 
2710 		if (psetid != prev_psetid)
2711 			prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2712 		else
2713 			prev_pset = NULL;
2714 
2715 		prev_usage = NULL;
2716 		if (prev_zone != NULL || prev_pset != NULL) {
2717 			if (prev_zone == NULL)
2718 				prev_zone = zone;
2719 			if (prev_pset == NULL)
2720 				prev_pset = pset;
2721 
2722 			prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2723 			    prev_zone);
2724 		}
2725 
2726 		/* Update the usage with the processes info */
2727 		if (prev_usage == NULL) {
2728 			zsd_mark_pset_usage_found(usage, sched);
2729 		} else {
2730 			zsd_mark_pset_usage_found(usage, sched);
2731 			zsd_mark_pset_usage_found(prev_usage, sched);
2732 		}
2733 
2734 		/*
2735 		 * First time around is just to get a starting point.  All
2736 		 * usages will be zero.
2737 		 */
2738 		if (init == B_TRUE)
2739 			continue;
2740 
2741 		if (prev_usage == NULL) {
2742 			zsd_add_usage(ctl, usage, &delta);
2743 		} else {
2744 			zsd_add_usage(ctl, usage, &d1);
2745 			zsd_add_usage(ctl, prev_usage, &d2);
2746 		}
2747 	}
2748 	(void) closedir(dir);
2749 
2750 	/*
2751 	 * No need to collect exited proc data on initialization.  Just
2752 	 * caching the usage of the known processes to get a zero starting
2753 	 * point.
2754 	 */
2755 	if (init == B_TRUE)
2756 		return;
2757 
2758 	/*
2759 	 * Add accounting records to account for processes which have
2760 	 * exited.
2761 	 */
2762 	list_create(&plist, sizeof (zsd_proc_t),
2763 	    offsetof(zsd_proc_t, zspr_next));
2764 	list_create(&pplist, sizeof (zsd_proc_t),
2765 	    offsetof(zsd_proc_t, zspr_next));
2766 
2767 	for (;;) {
2768 		pid_t pid;
2769 		pid_t ppid;
2770 		timestruc_t user, sys, proc_usage;
2771 		timestruc_t finish;
2772 		int numfound = 0;
2773 
2774 		bzero(&object, sizeof (object));
2775 		proc = NULL;
2776 		zone = NULL;
2777 		pset = NULL;
2778 		usage = NULL;
2779 		ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2780 		if (ret == EO_ERROR) {
2781 			if (ea_error() == EXR_EOF) {
2782 
2783 				struct stat64 *stat;
2784 				struct stat64 *stat_next;
2785 
2786 				/*
2787 				 * See if the next accounting file is the
2788 				 * same as the current accounting file.
2789 				 */
2790 				stat = &(ctl->zsctl_proc_stat);
2791 				stat_next = &(ctl->zsctl_proc_stat_next);
2792 				if (stat->st_ino == stat_next->st_ino &&
2793 				    stat->st_dev == stat_next->st_dev) {
2794 					/*
2795 					 * End of current accounting file is
2796 					 * reached, so finished.  Clear EOF
2797 					 * bit for next time around.
2798 					 */
2799 					ea_clear(&ctl->zsctl_proc_eaf);
2800 					break;
2801 				} else {
2802 					/*
2803 					 * Accounting file has changed.  Move
2804 					 * to current accounting file.
2805 					 */
2806 					(void) ea_close(&ctl->zsctl_proc_eaf);
2807 
2808 					ctl->zsctl_proc_fd =
2809 					    ctl->zsctl_proc_fd_next;
2810 					ctl->zsctl_proc_eaf =
2811 					    ctl->zsctl_proc_eaf_next;
2812 					ctl->zsctl_proc_stat =
2813 					    ctl->zsctl_proc_stat_next;
2814 
2815 					ctl->zsctl_proc_fd_next = -1;
2816 					ctl->zsctl_proc_open_next = 0;
2817 					continue;
2818 				}
2819 			} else {
2820 				/*
2821 				 * Other accounting error.  Give up on
2822 				 * accounting.
2823 				 */
2824 				goto ea_err;
2825 			}
2826 		}
2827 		/* Skip if not a process group */
2828 		if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2829 		    (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2830 			(void) ea_free_item(&object, EUP_ALLOC);
2831 			continue;
2832 		}
2833 
2834 		/* The process group entry should be complete */
2835 		while (numfound < 9) {
2836 			bzero(&pobject, sizeof (pobject));
2837 			ret = ea_get_object(&ctl->zsctl_proc_eaf,
2838 			    &pobject);
2839 			if (ret < 0) {
2840 				(void) ea_free_item(&object, EUP_ALLOC);
2841 				zsd_warn(
2842 				    "unable to get process accounting data");
2843 				goto ea_err;
2844 			}
2845 			/* Next entries should be process data */
2846 			if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2847 			    EXT_GROUP) {
2848 				(void) ea_free_item(&object, EUP_ALLOC);
2849 				(void) ea_free_item(&pobject, EUP_ALLOC);
2850 				zsd_warn(
2851 				    "process data of wrong type");
2852 				goto ea_err;
2853 			}
2854 			switch (pobject.eo_catalog & EXD_DATA_MASK) {
2855 			case EXD_PROC_PID:
2856 				pid = pobject.eo_item.ei_uint32;
2857 				proc = &(ctl->zsctl_proc_array[pid]);
2858 				/*
2859 				 * This process should not be currently in
2860 				 * the list of processes to process.
2861 				 */
2862 				assert(!list_link_active(&proc->zspr_next));
2863 				numfound++;
2864 				break;
2865 			case EXD_PROC_ANCPID:
2866 				ppid = pobject.eo_item.ei_uint32;
2867 				pproc = &(ctl->zsctl_proc_array[ppid]);
2868 				numfound++;
2869 				break;
2870 			case EXD_PROC_ZONENAME:
2871 				zone = zsd_lookup_zone(ctl,
2872 				    pobject.eo_item.ei_string, -1);
2873 				numfound++;
2874 				break;
2875 			case EXD_PROC_CPU_USER_SEC:
2876 				user.tv_sec =
2877 				    pobject.eo_item.ei_uint64;
2878 				numfound++;
2879 				break;
2880 			case EXD_PROC_CPU_USER_NSEC:
2881 				user.tv_nsec =
2882 				    pobject.eo_item.ei_uint64;
2883 				numfound++;
2884 				break;
2885 			case EXD_PROC_CPU_SYS_SEC:
2886 				sys.tv_sec =
2887 				    pobject.eo_item.ei_uint64;
2888 				numfound++;
2889 				break;
2890 			case EXD_PROC_CPU_SYS_NSEC:
2891 				sys.tv_nsec =
2892 				    pobject.eo_item.ei_uint64;
2893 				numfound++;
2894 				break;
2895 			case EXD_PROC_FINISH_SEC:
2896 				finish.tv_sec =
2897 				    pobject.eo_item.ei_uint64;
2898 				numfound++;
2899 				break;
2900 			case EXD_PROC_FINISH_NSEC:
2901 				finish.tv_nsec =
2902 				    pobject.eo_item.ei_uint64;
2903 				numfound++;
2904 				break;
2905 			}
2906 			(void) ea_free_item(&pobject, EUP_ALLOC);
2907 		}
2908 		(void) ea_free_item(&object, EUP_ALLOC);
2909 		if (numfound != 9) {
2910 			zsd_warn(gettext(
2911 			    "Malformed process accounting entry found"));
2912 			goto proc_done;
2913 		}
2914 
2915 		if (finish.tv_sec > interval_end.tv_sec ||
2916 		    (finish.tv_sec == interval_end.tv_sec &&
2917 		    finish.tv_nsec > (interval_end.tv_usec * 1000)))
2918 			hrtime_expired = B_TRUE;
2919 
2920 		/*
2921 		 * Try to identify the zone and pset to which this
2922 		 * exited process belongs.
2923 		 */
2924 		if (zone == NULL)
2925 			goto proc_done;
2926 
2927 		/* Save proc info */
2928 		proc->zspr_ppid = ppid;
2929 		proc->zspr_zoneid = zone->zsz_id;
2930 
2931 		prev_psetid = ZS_PSET_ERROR;
2932 		sched = 0;
2933 
2934 		/*
2935 		 * The following tries to deduce the processes pset.
2936 		 *
2937 		 * First choose pset and sched using cached value from the
2938 		 * most recent time the process has been seen.
2939 		 *
2940 		 * pset and sched can change across zone_enter, so make sure
2941 		 * most recent sighting of this process was in the same
2942 		 * zone before using most recent known value.
2943 		 *
2944 		 * If there is no known value, use value of processes
2945 		 * parent.  If parent is unknown, walk parents until a known
2946 		 * parent is found.
2947 		 *
2948 		 * If no parent in the zone is found, use the zone's default
2949 		 * pset and scheduling class.
2950 		 */
2951 		if (proc->zspr_psetid != ZS_PSET_ERROR) {
2952 			prev_psetid = proc->zspr_psetid;
2953 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2954 			sched = proc->zspr_sched;
2955 		} else if (pproc->zspr_zoneid == zone->zsz_id &&
2956 		    pproc->zspr_psetid != ZS_PSET_ERROR) {
2957 			prev_psetid = pproc->zspr_psetid;
2958 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2959 			sched = pproc->zspr_sched;
2960 		}
2961 
2962 		if (pset == NULL) {
2963 			/*
2964 			 * Process or processes parent has never been seen.
2965 			 * Save to deduce a known parent later.
2966 			 */
2967 			proc_usage = sys;
2968 			TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2969 			TIMESTRUC_DELTA(delta, proc_usage,
2970 			    proc->zspr_usage);
2971 			proc->zspr_usage = delta;
2972 			list_insert_tail(&plist, proc);
2973 			continue;
2974 		}
2975 
2976 		/* Add the zone's usage to the pset */
2977 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2978 		if (usage == NULL)
2979 			goto proc_done;
2980 
2981 		zsd_mark_pset_usage_found(usage, sched);
2982 
2983 		/* compute the usage to add for the exited proc */
2984 		proc_usage = sys;
2985 		TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2986 		TIMESTRUC_DELTA(delta, proc_usage,
2987 		    proc->zspr_usage);
2988 
2989 		zsd_add_usage(ctl, usage, &delta);
2990 proc_done:
2991 		zsd_flush_proc_info(proc);
2992 
2993 		if (hrtime_expired == B_TRUE)
2994 			break;
2995 	}
2996 	/*
2997 	 * close next accounting file.
2998 	 */
2999 	if (ctl->zsctl_proc_open_next) {
3000 		(void) ea_close(
3001 		    &ctl->zsctl_proc_eaf_next);
3002 		ctl->zsctl_proc_open_next = 0;
3003 		ctl->zsctl_proc_fd_next = -1;
3004 	}
3005 
3006 	/* For the remaining processes, use pset and sched of a known parent */
3007 	proc = list_head(&plist);
3008 	while (proc != NULL) {
3009 		next = proc;
3010 		for (;;) {
3011 			if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3012 				/*
3013 				 * Kernel process, or parent is unknown, skip
3014 				 * process, remove from process list.
3015 				 */
3016 				tmp = proc;
3017 				proc = list_next(&plist, proc);
3018 				list_link_init(&tmp->zspr_next);
3019 				break;
3020 			}
3021 			pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3022 			if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3023 				/*
3024 				 * Parent in different zone.  Save process and
3025 				 * use zone's default pset and sched below
3026 				 */
3027 				tmp = proc;
3028 				proc = list_next(&plist, proc);
3029 				list_remove(&plist, tmp);
3030 				list_insert_tail(&pplist, tmp);
3031 				break;
3032 			}
3033 			/* Parent has unknown pset, Search parent's parent  */
3034 			if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3035 				next = pproc;
3036 				continue;
3037 			}
3038 			/* Found parent with known pset.  Use its info */
3039 			proc->zspr_psetid = pproc->zspr_psetid;
3040 			proc->zspr_sched = pproc->zspr_sched;
3041 			next->zspr_psetid = pproc->zspr_psetid;
3042 			next->zspr_sched = pproc->zspr_sched;
3043 			zone = zsd_lookup_zone_byid(ctl,
3044 			    proc->zspr_zoneid);
3045 			if (zone == NULL) {
3046 				tmp = proc;
3047 				proc = list_next(&plist, proc);
3048 				list_remove(&plist, tmp);
3049 				list_link_init(&tmp->zspr_next);
3050 				break;
3051 			}
3052 			pset = zsd_lookup_pset_byid(ctl,
3053 			    proc->zspr_psetid);
3054 			if (pset == NULL) {
3055 				tmp = proc;
3056 				proc = list_next(&plist, proc);
3057 				list_remove(&plist, tmp);
3058 				list_link_init(&tmp->zspr_next);
3059 				break;
3060 			}
3061 			/* Add the zone's usage to the pset */
3062 			usage = zsd_lookup_insert_usage(ctl, pset, zone);
3063 			if (usage == NULL) {
3064 				tmp = proc;
3065 				proc = list_next(&plist, proc);
3066 				list_remove(&plist, tmp);
3067 				list_link_init(&tmp->zspr_next);
3068 				break;
3069 			}
3070 			zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3071 			zsd_add_usage(ctl, usage, &proc->zspr_usage);
3072 			zsd_flush_proc_info(proc);
3073 			tmp = proc;
3074 			proc = list_next(&plist, proc);
3075 			list_remove(&plist, tmp);
3076 			list_link_init(&tmp->zspr_next);
3077 			break;
3078 		}
3079 	}
3080 	/*
3081 	 * Process has never been seen.  Using zone info to
3082 	 * determine pset and scheduling class.
3083 	 */
3084 	proc = list_head(&pplist);
3085 	while (proc != NULL) {
3086 
3087 		zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3088 		if (zone == NULL)
3089 			goto next;
3090 		if (zone->zsz_psetid != ZS_PSET_ERROR &&
3091 		    zone->zsz_psetid != ZS_PSET_MULTI) {
3092 			prev_psetid = zone->zsz_psetid;
3093 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3094 		} else {
3095 			pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3096 			if (pset != NULL)
3097 				prev_psetid = pset->zsp_id;
3098 		}
3099 		if (pset == NULL)
3100 			goto next;
3101 
3102 		sched = zone->zsz_scheds;
3103 		/*
3104 		 * Ignore FX high scheduling class if it is not the
3105 		 * only scheduling class in the zone.
3106 		 */
3107 		if (sched != ZS_SCHED_FX_60)
3108 			sched &= (~ZS_SCHED_FX_60);
3109 		/*
3110 		 * If more than one scheduling class has been found
3111 		 * in the zone, use zone's default scheduling class for
3112 		 * this process.
3113 		 */
3114 		if ((sched & (sched - 1)) != 0)
3115 			sched = zone->zsz_default_sched;
3116 
3117 		/* Add the zone's usage to the pset */
3118 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
3119 		if (usage == NULL)
3120 			goto next;
3121 
3122 		zsd_mark_pset_usage_found(usage, sched);
3123 		zsd_add_usage(ctl, usage, &proc->zspr_usage);
3124 next:
3125 		tmp = proc;
3126 		proc = list_next(&pplist, proc);
3127 		zsd_flush_proc_info(tmp);
3128 		list_link_init(&tmp->zspr_next);
3129 	}
3130 	return;
3131 ea_err:
3132 	/*
3133 	 * Close the next accounting file if we have not transitioned to it
3134 	 * yet.
3135 	 */
3136 	if (ctl->zsctl_proc_open_next) {
3137 		(void) ea_close(&ctl->zsctl_proc_eaf_next);
3138 		ctl->zsctl_proc_open_next = 0;
3139 		ctl->zsctl_proc_fd_next = -1;
3140 	}
3141 }
3142 
3143 /*
3144  * getvmusage(2) uses size_t's in the passwd data structure, which differ
3145  * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
3146  * and zonestatd does not necessarily match the kernel's bitness, marshal
3147  * results appropriately.
3148  */
3149 static int
3150 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3151     uint64_t *nres)
3152 {
3153 	zsd_vmusage32_t *vmu32;
3154 	zsd_vmusage64_t *vmu64;
3155 	uint32_t nres32;
3156 	int i;
3157 	int ret;
3158 
3159 	if (ctl->zsctl_kern_bits == 32)  {
3160 		nres32 = *nres;
3161 		ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3162 		    flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3163 		*nres = nres32;
3164 		if (ret == 0 && buf != NULL) {
3165 			/*
3166 			 * An array of vmusage32_t's has been returned.
3167 			 * Convert it to an array of vmusage64_t's.
3168 			 */
3169 			vmu32 = (zsd_vmusage32_t *)buf;
3170 			vmu64 = (zsd_vmusage64_t *)buf;
3171 			for (i = nres32 - 1; i >= 0; i--) {
3172 
3173 				vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3174 				vmu64[i].vmu_type = vmu32[i].vmu_type;
3175 				vmu64[i].vmu_type = vmu32[i].vmu_type;
3176 				vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3177 				vmu64[i].vmu_rss_private =
3178 				    vmu32[i].vmu_rss_private;
3179 				vmu64[i].vmu_rss_shared =
3180 				    vmu32[i].vmu_rss_shared;
3181 				vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3182 				vmu64[i].vmu_swap_private =
3183 				    vmu32[i].vmu_swap_private;
3184 				vmu64[i].vmu_swap_shared =
3185 				    vmu32[i].vmu_swap_shared;
3186 			}
3187 		}
3188 		return (ret);
3189 	} else {
3190 		/*
3191 		 * kernel is 64 bit, so use 64 bit structures as zonestat
3192 		 * expects.
3193 		 */
3194 		return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3195 		    flags, age, (uintptr_t)buf, (uintptr_t)nres));
3196 
3197 	}
3198 }
3199 
3200 /*
3201  * Update the current physical, virtual, and locked memory usage of the
3202  * running zones.
3203  */
3204 static void
3205 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3206 {
3207 
3208 	uint64_t phys_total;
3209 	uint64_t phys_used;
3210 	uint64_t phys_zones;
3211 	uint64_t phys_zones_overcount;
3212 	uint64_t phys_zones_extra;
3213 	uint64_t phys_zones_credit;
3214 
3215 	uint64_t vm_free;
3216 	uint64_t vm_used;
3217 
3218 	uint64_t disk_swap_total;
3219 	uint64_t disk_swap_used;	/* disk swap with contents */
3220 
3221 	uint64_t physmem;
3222 	uint64_t pp_kernel;
3223 	uint64_t arc_size = 0;
3224 	struct anoninfo ani;
3225 
3226 	int num_swap_devices;
3227 	struct swaptable *swt;
3228 	struct swapent *swent;
3229 	size_t swt_size;
3230 	char *path;
3231 
3232 	zsd_vmusage64_t *vmusage;
3233 	uint64_t num_vmusage;
3234 
3235 	int i, ret;
3236 
3237 	zsd_system_t *sys;
3238 	zsd_zone_t *zone;
3239 	int vmu_nzones;
3240 
3241 	kstat_t *kstat;
3242 	char kstat_name[KSTAT_STRLEN];
3243 	kstat_named_t *knp;
3244 	kid_t kid;
3245 
3246 	if (init)
3247 		return;
3248 
3249 	sys = ctl->zsctl_system;
3250 
3251 	/* interrogate swap devices to find the amount of disk swap */
3252 disk_swap_again:
3253 	num_swap_devices = swapctl(SC_GETNSWP, NULL);
3254 
3255 	if (num_swap_devices == 0) {
3256 		sys->zss_swap_total = disk_swap_total = 0;
3257 		sys->zss_swap_used = disk_swap_used = 0;
3258 		/* No disk swap */
3259 		goto disk_swap_done;
3260 	}
3261 	/* see if swap table needs to be larger */
3262 	if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3263 		swt_size = sizeof (int) +
3264 		    (num_swap_devices * sizeof (struct swapent)) +
3265 		    (num_swap_devices * MAXPATHLEN);
3266 		if (ctl->zsctl_swap_cache != NULL)
3267 			free(ctl->zsctl_swap_cache);
3268 
3269 		swt = (struct swaptable *)malloc(swt_size);
3270 		if (swt == NULL) {
3271 			/*
3272 			 * Could not allocate to get list of swap devices.
3273 			 * Just use data from the most recent read, which will
3274 			 * be zero if this is the first read.
3275 			 */
3276 			zsd_warn(gettext("Unable to allocate to determine "
3277 			    "virtual memory"));
3278 			disk_swap_total = sys->zss_swap_total;
3279 			disk_swap_used = sys->zss_swap_used;
3280 			goto disk_swap_done;
3281 		}
3282 		swent = swt->swt_ent;
3283 		path = (char *)swt + (sizeof (int) +
3284 		    num_swap_devices * sizeof (swapent_t));
3285 		for (i = 0; i < num_swap_devices; i++, swent++) {
3286 			swent->ste_path = path;
3287 			path += MAXPATHLEN;
3288 		}
3289 		swt->swt_n = num_swap_devices;
3290 		ctl->zsctl_swap_cache = swt;
3291 		ctl->zsctl_swap_cache_size = swt_size;
3292 		ctl->zsctl_swap_cache_num = num_swap_devices;
3293 	}
3294 	num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3295 	if (num_swap_devices < 0) {
3296 		/* More swap devices have arrived */
3297 		if (errno == ENOMEM)
3298 			goto disk_swap_again;
3299 
3300 		zsd_warn(gettext("Unable to determine disk swap devices"));
3301 		/* Unexpected error.  Use existing data */
3302 		disk_swap_total = sys->zss_swap_total;
3303 		disk_swap_used = sys->zss_swap_used;
3304 		goto disk_swap_done;
3305 	}
3306 
3307 	/* add up the disk swap */
3308 	disk_swap_total = 0;
3309 	disk_swap_used = 0;
3310 	swent = ctl->zsctl_swap_cache->swt_ent;
3311 	for (i = 0; i < num_swap_devices; i++, swent++) {
3312 		disk_swap_total += swent->ste_pages;
3313 		disk_swap_used += (swent->ste_pages - swent->ste_free);
3314 	}
3315 	disk_swap_total *= ctl->zsctl_pagesize;
3316 	disk_swap_used *= ctl->zsctl_pagesize;
3317 
3318 	sys->zss_swap_total = disk_swap_total;
3319 	sys->zss_swap_used = disk_swap_used;
3320 
3321 disk_swap_done:
3322 
3323 	/* get system pages kstat */
3324 	kid = -1;
3325 	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3326 	if (kstat == NULL)
3327 		zsd_warn(gettext("Unable to lookup system pages kstat"));
3328 	else
3329 		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3330 
3331 	if (kid == -1) {
3332 		zsd_warn(gettext("Unable to read system pages kstat"));
3333 		return;
3334 	} else {
3335 		knp = kstat_data_lookup(kstat, "physmem");
3336 		if (knp == NULL) {
3337 			zsd_warn(gettext("Unable to read physmem"));
3338 		} else {
3339 			if (knp->data_type == KSTAT_DATA_UINT64)
3340 				physmem = knp->value.ui64;
3341 			else if (knp->data_type == KSTAT_DATA_UINT32)
3342 				physmem = knp->value.ui32;
3343 			else
3344 				return;
3345 		}
3346 		knp = kstat_data_lookup(kstat, "pp_kernel");
3347 		if (knp == NULL) {
3348 			zsd_warn(gettext("Unable to read pp_kernel"));
3349 		} else {
3350 			if (knp->data_type == KSTAT_DATA_UINT64)
3351 				pp_kernel = knp->value.ui64;
3352 			else if (knp->data_type == KSTAT_DATA_UINT32)
3353 				pp_kernel = knp->value.ui32;
3354 			else
3355 				return;
3356 		}
3357 	}
3358 	physmem *= ctl->zsctl_pagesize;
3359 	pp_kernel *= ctl->zsctl_pagesize;
3360 
3361 	/* get the zfs arc size if available */
3362 	arc_size = 0;
3363 	kid = -1;
3364 	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3365 	if (kstat != NULL)
3366 		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3367 	if (kid != -1) {
3368 		knp = kstat_data_lookup(kstat, "size");
3369 		if (knp != NULL)
3370 			if (knp->data_type == KSTAT_DATA_UINT64)
3371 				arc_size = knp->value.ui64;
3372 	}
3373 
3374 	/* Try to get swap information */
3375 	if (swapctl(SC_AINFO, &ani) < 0) {
3376 		zsd_warn(gettext("Unable to get swap info"));
3377 		return;
3378 	}
3379 
3380 vmusage_again:
3381 	/* getvmusage to get physical memory usage */
3382 	vmusage = ctl->zsctl_vmusage_cache;
3383 	num_vmusage = ctl->zsctl_vmusage_cache_num;
3384 
3385 	ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3386 	    vmusage, &num_vmusage);
3387 
3388 	if (ret != 0) {
3389 		/* Unexpected error.  Use existing data */
3390 		if (errno != EOVERFLOW) {
3391 			zsd_warn(gettext(
3392 			    "Unable to read physical memory usage"));
3393 			phys_zones = sys->zss_ram_zones;
3394 			goto vmusage_done;
3395 		}
3396 	}
3397 	/* vmusage results cache too small */
3398 	if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3399 
3400 		size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3401 
3402 		if (ctl->zsctl_vmusage_cache != NULL)
3403 			free(ctl->zsctl_vmusage_cache);
3404 		vmusage = (zsd_vmusage64_t *)malloc(size);
3405 		if (vmusage == NULL) {
3406 			zsd_warn(gettext("Unable to alloc to determine "
3407 			    "physical memory usage"));
3408 			phys_zones = sys->zss_ram_zones;
3409 			goto vmusage_done;
3410 		}
3411 		ctl->zsctl_vmusage_cache = vmusage;
3412 		ctl->zsctl_vmusage_cache_num = num_vmusage;
3413 		goto vmusage_again;
3414 	}
3415 
3416 	phys_zones_overcount = 0;
3417 	vmu_nzones = 0;
3418 	for (i = 0; i < num_vmusage; i++) {
3419 		switch (vmusage[i].vmu_type) {
3420 		case VMUSAGE_SYSTEM:
3421 			/* total pages backing user process mappings */
3422 			phys_zones = sys->zss_ram_zones =
3423 			    vmusage[i].vmu_rss_all;
3424 			break;
3425 		case VMUSAGE_ZONE:
3426 			vmu_nzones++;
3427 			phys_zones_overcount += vmusage[i].vmu_rss_all;
3428 			zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3429 			if (zone != NULL)
3430 				zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3431 			break;
3432 		default:
3433 			break;
3434 		}
3435 	}
3436 	/*
3437 	 * Figure how much memory was double counted due to text sharing
3438 	 * between zones.  Credit this back so that the sum of the zones
3439 	 * equals the total zone ram usage;
3440 	 */
3441 	phys_zones_extra = phys_zones_overcount - phys_zones;
3442 	phys_zones_credit = phys_zones_extra / vmu_nzones;
3443 
3444 vmusage_done:
3445 
3446 	/* walk the zones to get swap and locked kstats.  Fetch ram cap. */
3447 	sys->zss_locked_zones = 0;
3448 	sys->zss_vm_zones = 0;
3449 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3450 	    zone = list_next(&ctl->zsctl_zones, zone)) {
3451 
3452 		/* If zone halted during interval, show memory usage as none */
3453 		if (zone->zsz_active == B_FALSE ||
3454 		    zone->zsz_deleted == B_TRUE) {
3455 			zone->zsz_usage_ram = 0;
3456 			zone->zsz_usage_vm = 0;
3457 			zone->zsz_usage_locked = 0;
3458 			continue;
3459 		}
3460 
3461 		if (phys_zones_credit > 0) {
3462 			if (zone->zsz_usage_ram > phys_zones_credit) {
3463 				zone->zsz_usage_ram -= phys_zones_credit;
3464 			}
3465 		}
3466 		/*
3467 		 * Get zone's swap usage.  Since zone could have halted,
3468 		 * treats as zero if cannot read
3469 		 */
3470 		zone->zsz_usage_vm = 0;
3471 		(void) snprintf(kstat_name, sizeof (kstat_name),
3472 		    "swapresv_zone_%d", zone->zsz_id);
3473 		kid = -1;
3474 		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3475 		    zone->zsz_id, kstat_name);
3476 		if (kstat != NULL)
3477 			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3478 		if (kid != -1) {
3479 			knp = kstat_data_lookup(kstat, "usage");
3480 			if (knp != NULL &&
3481 			    knp->data_type == KSTAT_DATA_UINT64) {
3482 				zone->zsz_usage_vm = knp->value.ui64;
3483 				sys->zss_vm_zones += knp->value.ui64;
3484 			}
3485 		}
3486 		/*
3487 		 * Get zone's locked usage.  Since zone could have halted,
3488 		 * treats as zero if cannot read
3489 		 */
3490 		zone->zsz_usage_locked = 0;
3491 		(void) snprintf(kstat_name, sizeof (kstat_name),
3492 		    "lockedmem_zone_%d", zone->zsz_id);
3493 		kid = -1;
3494 		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3495 		    zone->zsz_id, kstat_name);
3496 		if (kstat != NULL)
3497 			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3498 		if (kid != -1) {
3499 			knp = kstat_data_lookup(kstat, "usage");
3500 			if (knp != NULL &&
3501 			    knp->data_type == KSTAT_DATA_UINT64) {
3502 				zone->zsz_usage_locked = knp->value.ui64;
3503 				/*
3504 				 * Since locked memory accounting for zones
3505 				 * can double count ddi locked memory, cap each
3506 				 * zone's locked usage at its ram usage.
3507 				 */
3508 				if (zone->zsz_usage_locked >
3509 				    zone->zsz_usage_ram)
3510 					zone->zsz_usage_locked =
3511 					    zone->zsz_usage_ram;
3512 				sys->zss_locked_zones +=
3513 				    zone->zsz_usage_locked;
3514 			}
3515 		}
3516 	}
3517 
3518 	phys_total =
3519 	    sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3520 
3521 	phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3522 	    * ctl->zsctl_pagesize;
3523 
3524 	/* Compute remaining statistics */
3525 	sys->zss_ram_total = phys_total;
3526 	sys->zss_ram_zones = phys_zones;
3527 	sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3528 
3529 	/*
3530 	 * The total for kernel locked memory should include
3531 	 * segkp locked pages, but oh well.  The arc size is subtracted,
3532 	 * as that physical memory is reclaimable.
3533 	 */
3534 	sys->zss_locked_kern = pp_kernel - arc_size;
3535 	/* Add memory used by kernel startup and obp to kernel locked */
3536 	if ((phys_total - physmem) > 0)
3537 		sys->zss_locked_kern += phys_total - physmem;
3538 
3539 	/*
3540 	 * Add in the portion of (RAM+DISK) that is not available as swap,
3541 	 * and consider it swap used by the kernel.
3542 	 */
3543 	sys->zss_vm_total = phys_total + disk_swap_total;
3544 	vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3545 	vm_used = sys->zss_vm_total - vm_free;
3546 	sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3547 }
3548 
3549 /*
3550  * Charge each cpu's usage to its processor sets.  Also add the cpu's total
3551  * time to each zone using the processor set.  This tracks the maximum
3552  * amount of cpu time that a zone could have used.
3553  */
3554 static void
3555 zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3556 {
3557 	zsd_system_t *sys;
3558 	zsd_zone_t *zone;
3559 	zsd_pset_usage_t *usage;
3560 	zsd_cpu_t *cpu;
3561 	zsd_cpu_t *cpu_next;
3562 	zsd_pset_t *pset;
3563 	timestruc_t ts;
3564 	uint64_t hrtime;
3565 	timestruc_t delta;
3566 
3567 	/* Update the per-cpu kstat data */
3568 	cpu_next = list_head(&ctl->zsctl_cpus);
3569 	while (cpu_next != NULL) {
3570 		cpu = cpu_next;
3571 		cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3572 		zsd_update_cpu_stats(ctl, cpu);
3573 	}
3574 	/* Update the elapsed real time */
3575 	hrtime = gethrtime();
3576 	if (init) {
3577 		/* first time around, store hrtime for future comparision */
3578 		ctl->zsctl_hrtime = hrtime;
3579 		ctl->zsctl_hrtime_prev = hrtime;
3580 
3581 	} else {
3582 		/* Compute increase in hrtime since the most recent read */
3583 		ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3584 		ctl->zsctl_hrtime = hrtime;
3585 		if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3586 			TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3587 	}
3588 
3589 	/* On initialization, all psets have zero time  */
3590 	if (init)
3591 		return;
3592 
3593 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3594 	    pset = list_next(&ctl->zsctl_psets, pset)) {
3595 
3596 		if (pset->zsp_active == B_FALSE) {
3597 			zsd_warn(gettext("Internal error,inactive pset found"));
3598 			continue;
3599 		}
3600 
3601 		/* sum total used time for pset */
3602 		ts.tv_sec = 0;
3603 		ts.tv_nsec = 0;
3604 		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3605 		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3606 		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3607 		/* kernel time in pset is total time minus zone time */
3608 		TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3609 		    pset->zsp_usage_zones);
3610 		if (pset->zsp_usage_kern.tv_sec < 0 ||
3611 		    pset->zsp_usage_kern.tv_nsec < 0) {
3612 			pset->zsp_usage_kern.tv_sec = 0;
3613 			pset->zsp_usage_kern.tv_nsec = 0;
3614 		}
3615 		/* Total pset elapsed time is used time plus idle time */
3616 		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
3617 
3618 		TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3619 
3620 		for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3621 		    usage = list_next(&pset->zsp_usage_list, usage)) {
3622 
3623 			zone = usage->zsu_zone;
3624 			if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3625 			    usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3626 			    usage->zsu_cpu_shares != 0) {
3627 				/*
3628 				 * Figure out how many nanoseconds of share time
3629 				 * to give to the zone
3630 				 */
3631 				hrtime = delta.tv_sec;
3632 				hrtime *= NANOSEC;
3633 				hrtime += delta.tv_nsec;
3634 				hrtime *= usage->zsu_cpu_shares;
3635 				hrtime /= pset->zsp_cpu_shares;
3636 				TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3637 				    hrtime);
3638 			}
3639 			/* Add pset time to each zone using pset */
3640 			TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3641 
3642 			zone->zsz_cpus_online += pset->zsp_online;
3643 		}
3644 		pset->zsp_total_time = ts;
3645 	}
3646 
3647 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3648 	    zone = list_next(&ctl->zsctl_zones, zone)) {
3649 
3650 		/* update cpu cap tracking if the zone has a cpu cap */
3651 		if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3652 			uint64_t elapsed;
3653 
3654 			elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
3655 			elapsed *= zone->zsz_cpu_cap;
3656 			elapsed = elapsed / 100;
3657 			TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
3658 		}
3659 	}
3660 	sys = ctl->zsctl_system;
3661 	ts.tv_sec = 0;
3662 	ts.tv_nsec = 0;
3663 	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3664 	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3665 	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3666 
3667 	/* kernel time in pset is total time minus zone time */
3668 	TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3669 	    sys->zss_cpu_usage_zones);
3670 	if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3671 	    sys->zss_cpu_usage_kern.tv_nsec < 0) {
3672 		sys->zss_cpu_usage_kern.tv_sec = 0;
3673 		sys->zss_cpu_usage_kern.tv_nsec = 0;
3674 	}
3675 	/* Total pset elapsed time is used time plus idle time */
3676 	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3677 	sys->zss_cpu_total_time = ts;
3678 }
3679 
3680 /*
3681  * Saves current usage data to a cache that is read by libzonestat when
3682  * calling zs_usage_read().
3683  *
3684  * All pointers in the cached data structure are set to NULL.  When
3685  * libzonestat reads the cached data, it will set the pointers relative to
3686  * its address space.
3687  */
3688 static void
3689 zsd_usage_cache_update(zsd_ctl_t *ctl)
3690 {
3691 	zs_usage_cache_t *cache;
3692 	zs_usage_cache_t *old;
3693 	zs_usage_t *usage;
3694 
3695 	zs_system_t *sys;
3696 	zsd_system_t *dsys;
3697 	zs_zone_t *zone = NULL;
3698 	zsd_zone_t *dzone;
3699 	zs_pset_t *pset = NULL;
3700 	zsd_pset_t *dpset;
3701 	zs_pset_zone_t *pusage;
3702 	zsd_pset_usage_t *dpusage;
3703 
3704 	char *next;
3705 	uint_t size, i, j;
3706 
3707 	size =
3708 	    sizeof (zs_usage_cache_t) +
3709 	    sizeof (zs_usage_t) +
3710 	    sizeof (zs_system_t) +
3711 	    sizeof (zs_zone_t) * ctl->zsctl_nzones +
3712 	    sizeof (zs_pset_t) *  ctl->zsctl_npsets +
3713 	    sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3714 
3715 	cache = (zs_usage_cache_t *)malloc(size);
3716 	if (cache == NULL) {
3717 		zsd_warn(gettext("Unable to allocate usage cache\n"));
3718 		return;
3719 	}
3720 
3721 	next = (char *)cache;
3722 	cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3723 	next += sizeof (zs_usage_cache_t);
3724 
3725 	/* LINTED */
3726 	usage = cache->zsuc_usage = (zs_usage_t *)next;
3727 	next += sizeof (zs_usage_t);
3728 	usage->zsu_start = g_start;
3729 	usage->zsu_hrstart = g_hrstart;
3730 	usage->zsu_time = g_now;
3731 	usage->zsu_hrtime = g_hrnow;
3732 	usage->zsu_nzones = ctl->zsctl_nzones;
3733 	usage->zsu_npsets = ctl->zsctl_npsets;
3734 	usage->zsu_system = NULL;
3735 
3736 	/* LINTED */
3737 	sys = (zs_system_t *)next;
3738 	next += sizeof (zs_system_t);
3739 	dsys = ctl->zsctl_system;
3740 	sys->zss_ram_total = dsys->zss_ram_total;
3741 	sys->zss_ram_kern = dsys->zss_ram_kern;
3742 	sys->zss_ram_zones = dsys->zss_ram_zones;
3743 	sys->zss_locked_kern = dsys->zss_locked_kern;
3744 	sys->zss_locked_zones = dsys->zss_locked_zones;
3745 	sys->zss_vm_total = dsys->zss_vm_total;
3746 	sys->zss_vm_kern = dsys->zss_vm_kern;
3747 	sys->zss_vm_zones = dsys->zss_vm_zones;
3748 	sys->zss_swap_total = dsys->zss_swap_total;
3749 	sys->zss_swap_used = dsys->zss_swap_used;
3750 	sys->zss_ncpus = dsys->zss_ncpus;
3751 	sys->zss_ncpus_online = dsys->zss_ncpus_online;
3752 
3753 	sys->zss_processes_max = dsys->zss_maxpid;
3754 	sys->zss_lwps_max = dsys->zss_lwps_max;
3755 	sys->zss_shm_max = dsys->zss_shm_max;
3756 	sys->zss_shmids_max = dsys->zss_shmids_max;
3757 	sys->zss_semids_max = dsys->zss_semids_max;
3758 	sys->zss_msgids_max = dsys->zss_msgids_max;
3759 	sys->zss_lofi_max = dsys->zss_lofi_max;
3760 
3761 	sys->zss_processes = dsys->zss_processes;
3762 	sys->zss_lwps = dsys->zss_lwps;
3763 	sys->zss_shm = dsys->zss_shm;
3764 	sys->zss_shmids = dsys->zss_shmids;
3765 	sys->zss_semids = dsys->zss_semids;
3766 	sys->zss_msgids = dsys->zss_msgids;
3767 	sys->zss_lofi = dsys->zss_lofi;
3768 
3769 	sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3770 	sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3771 	sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3772 
3773 	for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3774 	    i < ctl->zsctl_nzones;
3775 	    i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3776 		/* LINTED */
3777 		zone = (zs_zone_t *)next;
3778 		next += sizeof (zs_zone_t);
3779 		list_link_init(&zone->zsz_next);
3780 		zone->zsz_system = NULL;
3781 
3782 		(void) strlcpy(zone->zsz_name, dzone->zsz_name,
3783 		    sizeof (zone->zsz_name));
3784 		(void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3785 		    sizeof (zone->zsz_pool));
3786 		(void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3787 		    sizeof (zone->zsz_pset));
3788 		zone->zsz_id = dzone->zsz_id;
3789 		zone->zsz_cputype = dzone->zsz_cputype;
3790 		zone->zsz_iptype = dzone->zsz_iptype;
3791 		zone->zsz_start = dzone->zsz_start;
3792 		zone->zsz_hrstart = dzone->zsz_hrstart;
3793 		zone->zsz_scheds = dzone->zsz_scheds;
3794 		zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3795 		zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3796 		zone->zsz_ram_cap = dzone->zsz_ram_cap;
3797 		zone->zsz_vm_cap = dzone->zsz_vm_cap;
3798 		zone->zsz_locked_cap = dzone->zsz_locked_cap;
3799 		zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3800 		zone->zsz_cpus_online = dzone->zsz_cpus_online;
3801 		zone->zsz_pset_time = dzone->zsz_pset_time;
3802 		zone->zsz_cap_time = dzone->zsz_cap_time;
3803 		zone->zsz_share_time = dzone->zsz_share_time;
3804 		zone->zsz_usage_ram = dzone->zsz_usage_ram;
3805 		zone->zsz_usage_locked = dzone->zsz_usage_locked;
3806 		zone->zsz_usage_vm = dzone->zsz_usage_vm;
3807 
3808 		zone->zsz_processes_cap = dzone->zsz_processes_cap;
3809 		zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3810 		zone->zsz_shm_cap = dzone->zsz_shm_cap;
3811 		zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3812 		zone->zsz_semids_cap = dzone->zsz_semids_cap;
3813 		zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3814 		zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3815 
3816 		zone->zsz_processes = dzone->zsz_processes;
3817 		zone->zsz_lwps = dzone->zsz_lwps;
3818 		zone->zsz_shm = dzone->zsz_shm;
3819 		zone->zsz_shmids = dzone->zsz_shmids;
3820 		zone->zsz_semids = dzone->zsz_semids;
3821 		zone->zsz_msgids = dzone->zsz_msgids;
3822 		zone->zsz_lofi = dzone->zsz_lofi;
3823 	}
3824 
3825 	for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3826 	    i < ctl->zsctl_npsets;
3827 	    i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3828 		/* LINTED */
3829 		pset = (zs_pset_t *)next;
3830 		next += sizeof (zs_pset_t);
3831 		list_link_init(&pset->zsp_next);
3832 		(void) strlcpy(pset->zsp_name, dpset->zsp_name,
3833 		    sizeof (pset->zsp_name));
3834 		pset->zsp_id = dpset->zsp_id;
3835 		pset->zsp_cputype = dpset->zsp_cputype;
3836 		pset->zsp_start = dpset->zsp_start;
3837 		pset->zsp_hrstart = dpset->zsp_hrstart;
3838 		pset->zsp_online = dpset->zsp_online;
3839 		pset->zsp_size = dpset->zsp_size;
3840 		pset->zsp_min = dpset->zsp_min;
3841 		pset->zsp_max = dpset->zsp_max;
3842 		pset->zsp_importance = dpset->zsp_importance;
3843 		pset->zsp_scheds = dpset->zsp_scheds;
3844 		pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3845 		pset->zsp_total_time = dpset->zsp_total_time;
3846 		pset->zsp_usage_kern = dpset->zsp_usage_kern;
3847 		pset->zsp_usage_zones = dpset->zsp_usage_zones;
3848 		pset->zsp_nusage = dpset->zsp_nusage;
3849 		/* Add pset usages for pset */
3850 		for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3851 		    j < dpset->zsp_nusage;
3852 		    j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3853 			/* LINTED */
3854 			pusage = (zs_pset_zone_t *)next;
3855 			next += sizeof (zs_pset_zone_t);
3856 			/* pointers are computed by client */
3857 			pusage->zspz_pset = NULL;
3858 			pusage->zspz_zone = NULL;
3859 			list_link_init(&pusage->zspz_next);
3860 			pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3861 			pusage->zspz_start = dpusage->zsu_start;
3862 			pusage->zspz_hrstart = dpusage->zsu_hrstart;
3863 			pusage->zspz_hrstart = dpusage->zsu_hrstart;
3864 			pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3865 			pusage->zspz_scheds = dpusage->zsu_scheds;
3866 			pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3867 		}
3868 	}
3869 
3870 	/* Update the current cache pointer */
3871 	(void) mutex_lock(&g_usage_cache_lock);
3872 	old = g_usage_cache;
3873 	cache->zsuc_ref = 1;
3874 	cache->zsuc_gen = g_gen_next;
3875 	usage->zsu_gen = g_gen_next;
3876 	usage->zsu_size = size;
3877 	g_usage_cache = cache;
3878 	if (old != NULL) {
3879 		old->zsuc_ref--;
3880 		if (old->zsuc_ref == 0)
3881 			free(old);
3882 	}
3883 	g_gen_next++;
3884 	/* Wake up any clients that are waiting for this calculation */
3885 	if (g_usage_cache_kickers > 0) {
3886 		(void) cond_broadcast(&g_usage_cache_wait);
3887 	}
3888 	(void) mutex_unlock(&g_usage_cache_lock);
3889 }
3890 
3891 static zs_usage_cache_t *
3892 zsd_usage_cache_hold_locked()
3893 {
3894 	zs_usage_cache_t *ret;
3895 
3896 	ret = g_usage_cache;
3897 	ret->zsuc_ref++;
3898 	return (ret);
3899 }
3900 
3901 void
3902 zsd_usage_cache_rele(zs_usage_cache_t *cache)
3903 {
3904 	(void) mutex_lock(&g_usage_cache_lock);
3905 	cache->zsuc_ref--;
3906 	if (cache->zsuc_ref == 0)
3907 		free(cache);
3908 	(void) mutex_unlock(&g_usage_cache_lock);
3909 }
3910 
3911 /* Close the handles held by zsd_open() */
3912 void
3913 zsd_close(zsd_ctl_t *ctl)
3914 {
3915 	zsd_zone_t *zone;
3916 	zsd_pset_t *pset;
3917 	zsd_pset_usage_t *usage;
3918 	zsd_cpu_t *cpu;
3919 	int id;
3920 
3921 	if (ctl->zsctl_kstat_ctl) {
3922 		(void) kstat_close(ctl->zsctl_kstat_ctl);
3923 		ctl->zsctl_kstat_ctl = NULL;
3924 	}
3925 	if (ctl->zsctl_proc_open) {
3926 		(void) ea_close(&ctl->zsctl_proc_eaf);
3927 		ctl->zsctl_proc_open = 0;
3928 		ctl->zsctl_proc_fd = -1;
3929 	}
3930 	if (ctl->zsctl_pool_conf) {
3931 		if (ctl->zsctl_pool_status == POOL_ENABLED)
3932 			(void) pool_conf_close(ctl->zsctl_pool_conf);
3933 		ctl->zsctl_pool_status = POOL_DISABLED;
3934 	}
3935 
3936 	while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3937 		list_remove(&ctl->zsctl_zones, zone);
3938 		free(zone);
3939 		ctl->zsctl_nzones--;
3940 	}
3941 
3942 	while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3943 		while ((usage = list_head(&pset->zsp_usage_list))
3944 		    != NULL) {
3945 			list_remove(&pset->zsp_usage_list, usage);
3946 			ctl->zsctl_npset_usages--;
3947 			free(usage);
3948 		}
3949 		list_remove(&ctl->zsctl_psets, pset);
3950 		free(pset);
3951 		ctl->zsctl_npsets--;
3952 	}
3953 
3954 	/* Release all cpus being tracked */
3955 	while (cpu = list_head(&ctl->zsctl_cpus)) {
3956 		list_remove(&ctl->zsctl_cpus, cpu);
3957 		id = cpu->zsc_id;
3958 		bzero(cpu, sizeof (zsd_cpu_t));
3959 		cpu->zsc_id = id;
3960 		cpu->zsc_allocated = B_FALSE;
3961 		cpu->zsc_psetid = ZS_PSET_ERROR;
3962 		cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3963 	}
3964 
3965 	assert(ctl->zsctl_npset_usages == 0);
3966 	assert(ctl->zsctl_npsets == 0);
3967 	assert(ctl->zsctl_nzones == 0);
3968 	(void) zsd_disable_cpu_stats();
3969 }
3970 
3971 
3972 /*
3973  * Update the utilization data for all zones and processor sets.
3974  */
3975 static int
3976 zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3977 {
3978 	(void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3979 	(void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3980 
3981 	zsd_refresh_system(ctl);
3982 
3983 	/*
3984 	 * Memory calculation is expensive.  Only update it on sample
3985 	 * intervals.
3986 	 */
3987 	if (do_memory == B_TRUE)
3988 		zsd_refresh_memory(ctl, init);
3989 	zsd_refresh_zones(ctl);
3990 	zsd_refresh_psets(ctl);
3991 	zsd_refresh_procs(ctl, init);
3992 	zsd_refresh_cpu_stats(ctl, init);
3993 
3994 	/*
3995 	 * Delete objects that no longer exist.
3996 	 * Pset usages must be deleted first as they point to zone and
3997 	 * pset objects.
3998 	 */
3999 	zsd_mark_pset_usages_end(ctl);
4000 	zsd_mark_psets_end(ctl);
4001 	zsd_mark_cpus_end(ctl);
4002 	zsd_mark_zones_end(ctl);
4003 
4004 	/*
4005 	 * Save results for clients.
4006 	 */
4007 	zsd_usage_cache_update(ctl);
4008 
4009 	/*
4010 	 * Roll process accounting file.
4011 	 */
4012 	(void) zsd_roll_exacct();
4013 	return (0);
4014 }
4015 
4016 /*
4017  * Get the system rctl, which is the upper most limit
4018  */
4019 static uint64_t
4020 zsd_get_system_rctl(char *name)
4021 {
4022 	rctlblk_t *rblk, *rblk_last;
4023 
4024 	rblk = (rctlblk_t *)alloca(rctlblk_size());
4025 	rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4026 
4027 	if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4028 		return (ZS_LIMIT_NONE);
4029 
4030 	while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4031 		(void) bcopy(rblk, rblk_last, rctlblk_size());
4032 
4033 	return (rctlblk_get_value(rblk_last));
4034 }
4035 
4036 /*
4037  * Open any necessary subsystems for collecting utilization data,
4038  * allocate and initialize data structures, and get initial utilization.
4039  *
4040  * Errors:
4041  *	ENOMEM	out of memory
4042  *	EINVAL  other error
4043  */
4044 static zsd_ctl_t *
4045 zsd_open(zsd_ctl_t *ctl)
4046 {
4047 	zsd_system_t *system;
4048 
4049 	char path[MAXPATHLEN];
4050 	struct statvfs svfs;
4051 	int ret;
4052 	int i;
4053 	size_t size;
4054 	int err;
4055 
4056 	if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4057 	    sizeof (zsd_ctl_t))) == NULL) {
4058 			zsd_warn(gettext("Out of Memory"));
4059 			errno = ENOMEM;
4060 			goto err;
4061 	}
4062 	ctl->zsctl_proc_fd = -1;
4063 
4064 	/* open kstats */
4065 	if (ctl->zsctl_kstat_ctl == NULL &&
4066 	    (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4067 		err = errno;
4068 		zsd_warn(gettext("Unable to open kstats"));
4069 		errno = err;
4070 		if (errno != ENOMEM)
4071 			errno = EAGAIN;
4072 		goto err;
4073 	}
4074 
4075 	/*
4076 	 * These are set when the accounting file is opened by
4077 	 * zsd_update_procs()
4078 	 */
4079 	ctl->zsctl_proc_fd = -1;
4080 	ctl->zsctl_proc_fd_next = -1;
4081 	ctl->zsctl_proc_open = 0;
4082 	ctl->zsctl_proc_open_next = 0;
4083 
4084 check_exacct:
4085 	(void) zsd_enable_cpu_stats();
4086 
4087 	/* Create structures to track usage */
4088 	if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4089 	    calloc(1, sizeof (zsd_system_t))) == NULL) {
4090 		ret = -1;
4091 		zsd_warn(gettext("Out of Memory"));
4092 		errno = ENOMEM;
4093 		goto err;
4094 	}
4095 	system = ctl->zsctl_system;
4096 	/* get the kernel bitness to know structure layout for getvmusage */
4097 	ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4098 	if (ret < 0)
4099 		ctl->zsctl_kern_bits = 32;
4100 	else
4101 		ctl->zsctl_kern_bits = 64;
4102 	ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4103 
4104 	size = sysconf(_SC_CPUID_MAX);
4105 	ctl->zsctl_maxcpuid = size;
4106 	if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4107 	    (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4108 		zsd_warn(gettext("Out of Memory"));
4109 		errno = ENOMEM;
4110 		goto err;
4111 	}
4112 	for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4113 		ctl->zsctl_cpu_array[i].zsc_id = i;
4114 		ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4115 		ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4116 		ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4117 	}
4118 	if (statvfs("/proc", &svfs) != 0 ||
4119 	    strcmp("/proc", svfs.f_fstr) != 0) {
4120 		zsd_warn(gettext("/proc not a procfs filesystem"));
4121 		errno = EINVAL;
4122 		goto err;
4123 	}
4124 
4125 	size = sysconf(_SC_MAXPID) + 1;
4126 	ctl->zsctl_maxproc = size;
4127 	if (ctl->zsctl_proc_array == NULL &&
4128 	    (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4129 	    sizeof (zsd_proc_t))) == NULL) {
4130 		zsd_warn(gettext("Out of Memory"));
4131 		errno = ENOMEM;
4132 		goto err;
4133 	}
4134 	for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4135 		list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4136 		ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4137 		ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4138 		ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4139 		ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4140 		ctl->zsctl_proc_array[i].zspr_ppid = -1;
4141 	}
4142 
4143 	list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4144 	    offsetof(zsd_zone_t, zsz_next));
4145 
4146 	list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4147 	    offsetof(zsd_pset_t, zsp_next));
4148 
4149 	list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4150 	    offsetof(zsd_cpu_t, zsc_next));
4151 
4152 	if (ctl->zsctl_pool_conf == NULL &&
4153 	    (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4154 		zsd_warn(gettext("Out of Memory"));
4155 		errno = ENOMEM;
4156 		goto err;
4157 	}
4158 	ctl->zsctl_pool_status = POOL_DISABLED;
4159 	ctl->zsctl_pool_changed = 0;
4160 
4161 	if (ctl->zsctl_pool_vals[0] == NULL &&
4162 	    (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4163 		zsd_warn(gettext("Out of Memory"));
4164 		errno = ENOMEM;
4165 		goto err;
4166 	}
4167 	if (ctl->zsctl_pool_vals[1] == NULL &&
4168 	    (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4169 		zsd_warn(gettext("Out of Memory"));
4170 		errno = ENOMEM;
4171 		goto err;
4172 	}
4173 	ctl->zsctl_pool_vals[2] = NULL;
4174 
4175 	/*
4176 	 * get system limits
4177 	 */
4178 	system->zss_maxpid = size = sysconf(_SC_MAXPID);
4179 	system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4180 	system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4181 	system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4182 	system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4183 	system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4184 	system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4185 	system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4186 
4187 	g_gen_next = 1;
4188 
4189 	if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4190 		zsd_warn(gettext("Reading zone statistics failed"));
4191 
4192 	return (ctl);
4193 err:
4194 	if (ctl)
4195 		zsd_close(ctl);
4196 
4197 	return (NULL);
4198 }
4199 
4200 /* Copy utilization data to buffer, filtering data if non-global zone. */
4201 static void
4202 zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4203     boolean_t is_gz)
4204 {
4205 	zs_usage_t *cusage;
4206 	zs_system_t *sys, *csys;
4207 	zs_zone_t *zone, *czone;
4208 	zs_pset_t *pset, *cpset;
4209 	zs_pset_zone_t *pz, *cpz, *foundpz;
4210 	size_t size = 0, csize = 0;
4211 	char *start, *cstart;
4212 	int i, j;
4213 	timestruc_t delta;
4214 
4215 	/* Privileged users in the global zone get everything */
4216 	if (is_gz) {
4217 		cusage = cache->zsuc_usage;
4218 		(void) bcopy(cusage, usage, cusage->zsu_size);
4219 		return;
4220 	}
4221 
4222 	/* Zones just get their own usage */
4223 	cusage = cache->zsuc_usage;
4224 
4225 	start = (char *)usage;
4226 	cstart = (char *)cusage;
4227 	size += sizeof (zs_usage_t);
4228 	csize += sizeof (zs_usage_t);
4229 
4230 	usage->zsu_start = cusage->zsu_start;
4231 	usage->zsu_hrstart = cusage->zsu_hrstart;
4232 	usage->zsu_time = cusage->zsu_time;
4233 	usage->zsu_hrtime = cusage->zsu_hrtime;
4234 	usage->zsu_gen = cusage->zsu_gen;
4235 	usage->zsu_nzones = 1;
4236 	usage->zsu_npsets = 0;
4237 
4238 	/* LINTED */
4239 	sys = (zs_system_t *)(start + size);
4240 	/* LINTED */
4241 	csys = (zs_system_t *)(cstart + csize);
4242 	size += sizeof (zs_system_t);
4243 	csize += sizeof (zs_system_t);
4244 
4245 	/* Save system limits but not usage */
4246 	*sys = *csys;
4247 	sys->zss_ncpus = 0;
4248 	sys->zss_ncpus_online = 0;
4249 
4250 	/* LINTED */
4251 	zone = (zs_zone_t *)(start + size);
4252 	/* LINTED */
4253 	czone = (zs_zone_t *)(cstart + csize);
4254 	/* Find the matching zone */
4255 	for (i = 0; i < cusage->zsu_nzones; i++) {
4256 		if (czone->zsz_id == zid) {
4257 			*zone = *czone;
4258 			size += sizeof (zs_zone_t);
4259 		}
4260 		csize += sizeof (zs_zone_t);
4261 		/* LINTED */
4262 		czone = (zs_zone_t *)(cstart + csize);
4263 	}
4264 	sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4265 	sys->zss_ram_zones = zone->zsz_usage_ram;
4266 
4267 	sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4268 	sys->zss_vm_zones = zone->zsz_usage_vm;
4269 
4270 	sys->zss_locked_kern += (sys->zss_locked_zones -
4271 	    zone->zsz_usage_locked);
4272 	sys->zss_locked_zones = zone->zsz_usage_locked;
4273 
4274 	TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4275 	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4276 	sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
4277 
4278 	/* LINTED */
4279 	pset = (zs_pset_t *)(start + size);
4280 	/* LINTED */
4281 	cpset = (zs_pset_t *)(cstart + csize);
4282 	for (i = 0; i < cusage->zsu_npsets; i++) {
4283 		csize += sizeof (zs_pset_t);
4284 		/* LINTED */
4285 		cpz = (zs_pset_zone_t *)(csize + cstart);
4286 		foundpz = NULL;
4287 		for (j = 0; j < cpset->zsp_nusage; j++) {
4288 			if (cpz->zspz_zoneid == zid)
4289 				foundpz = cpz;
4290 
4291 			csize += sizeof (zs_pset_zone_t);
4292 			/* LINTED */
4293 			cpz = (zs_pset_zone_t *)(csize + cstart);
4294 		}
4295 		if (foundpz != NULL) {
4296 			size += sizeof (zs_pset_t);
4297 			/* LINTED */
4298 			pz = (zs_pset_zone_t *)(start + size);
4299 			size += sizeof (zs_pset_zone_t);
4300 
4301 			*pset = *cpset;
4302 			*pz = *foundpz;
4303 
4304 			TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4305 			    pz->zspz_cpu_usage);
4306 			TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4307 			pset->zsp_usage_zones = pz->zspz_cpu_usage;
4308 			pset->zsp_nusage = 1;
4309 			usage->zsu_npsets++;
4310 			sys->zss_ncpus += pset->zsp_size;
4311 			sys->zss_ncpus_online += pset->zsp_online;
4312 		}
4313 		/* LINTED */
4314 		cpset = (zs_pset_t *)(cstart + csize);
4315 	}
4316 	usage->zsu_size = size;
4317 }
4318 
4319 /*
4320  * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
4321  * which reports new zones.
4322  */
4323 /* ARGSUSED */
4324 static void
4325 zsd_server(void *cookie, char *argp, size_t arg_size,
4326     door_desc_t *dp, uint_t n_desc)
4327 {
4328 	int *args, cmd;
4329 	door_desc_t door;
4330 	ucred_t *ucred;
4331 	const priv_set_t *eset;
4332 
4333 	if (argp == DOOR_UNREF_DATA) {
4334 		(void) door_return(NULL, 0, NULL, 0);
4335 		thr_exit(NULL);
4336 	}
4337 
4338 	if (arg_size != sizeof (cmd) * 2) {
4339 		(void) door_return(NULL, 0, NULL, 0);
4340 		thr_exit(NULL);
4341 	}
4342 
4343 	/* LINTED */
4344 	args = (int *)argp;
4345 	cmd = args[0];
4346 
4347 	/* If connection, return door to stat server */
4348 	if (cmd == ZSD_CMD_CONNECT) {
4349 
4350 		/* Verify client compilation version */
4351 		if (args[1] != ZS_VERSION) {
4352 			args[1] = ZSD_STATUS_VERSION_MISMATCH;
4353 			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4354 			thr_exit(NULL);
4355 		}
4356 		ucred = alloca(ucred_size());
4357 		/* Verify client permission */
4358 		if (door_ucred(&ucred) != 0) {
4359 			args[1] = ZSD_STATUS_INTERNAL_ERROR;
4360 			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4361 			thr_exit(NULL);
4362 		}
4363 
4364 		eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4365 		if (eset == NULL) {
4366 			args[1] = ZSD_STATUS_INTERNAL_ERROR;
4367 			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4368 			thr_exit(NULL);
4369 		}
4370 		if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4371 			args[1] = ZSD_STATUS_PERMISSION;
4372 			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4373 			thr_exit(NULL);
4374 		}
4375 
4376 		/* Return stat server door */
4377 		args[1] = ZSD_STATUS_OK;
4378 		door.d_attributes = DOOR_DESCRIPTOR;
4379 		door.d_data.d_desc.d_descriptor = g_stat_door;
4380 		(void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4381 		thr_exit(NULL);
4382 	}
4383 
4384 	/* Respond to zoneadmd informing zonestatd of a new zone */
4385 	if (cmd == ZSD_CMD_NEW_ZONE) {
4386 		zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4387 		(void) door_return(NULL, 0, NULL, 0);
4388 		thr_exit(NULL);
4389 	}
4390 
4391 	args[1] = ZSD_STATUS_INTERNAL_ERROR;
4392 	(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4393 	thr_exit(NULL);
4394 }
4395 
4396 /*
4397  * Respond to libzonestat.so clients with the current utlilzation data.
4398  */
4399 /* ARGSUSED */
4400 static void
4401 zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4402     door_desc_t *dp, uint_t n_desc)
4403 {
4404 	uint64_t *args, cmd;
4405 	zs_usage_cache_t *cache;
4406 	int ret;
4407 	char *rvalp;
4408 	size_t rvals;
4409 	zs_usage_t *usage;
4410 	ucred_t *ucred;
4411 	zoneid_t zoneid;
4412 	const priv_set_t *eset;
4413 	boolean_t is_gz = B_FALSE;
4414 
4415 	/* Tell stat thread there are no more clients */
4416 	if (argp == DOOR_UNREF_DATA) {
4417 		(void) mutex_lock(&g_usage_cache_lock);
4418 		g_hasclient = B_FALSE;
4419 		(void) cond_signal(&g_usage_cache_kick);
4420 		(void) mutex_unlock(&g_usage_cache_lock);
4421 		(void) door_return(NULL, 0, NULL, 0);
4422 		thr_exit(NULL);
4423 	}
4424 	if (arg_size != sizeof (cmd) * 2) {
4425 		(void) door_return(NULL, 0, NULL, 0);
4426 		thr_exit(NULL);
4427 	}
4428 	/* LINTED */
4429 	args = (uint64_t *)argp;
4430 	cmd = args[0];
4431 	if (cmd != ZSD_CMD_READ) {
4432 		(void) door_return(NULL, 0, NULL, 0);
4433 		thr_exit(NULL);
4434 	}
4435 	ucred = alloca(ucred_size());
4436 	if (door_ucred(&ucred) != 0) {
4437 		(void) door_return(NULL, 0, NULL, 0);
4438 		thr_exit(NULL);
4439 	}
4440 	zoneid = ucred_getzoneid(ucred);
4441 
4442 	if (zoneid == GLOBAL_ZONEID)
4443 		is_gz = B_TRUE;
4444 
4445 	eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4446 	if (eset == NULL) {
4447 		(void) door_return(NULL, 0, NULL, 0);
4448 		thr_exit(NULL);
4449 	}
4450 	if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4451 		(void) door_return(NULL, 0, NULL, 0);
4452 		thr_exit(NULL);
4453 	}
4454 	(void) mutex_lock(&g_usage_cache_lock);
4455 	g_hasclient = B_TRUE;
4456 
4457 	/*
4458 	 * Force a new cpu calculation for client.  This will force a
4459 	 * new memory calculation if the memory data is older than the
4460 	 * sample period.
4461 	 */
4462 	g_usage_cache_kickers++;
4463 	(void) cond_signal(&g_usage_cache_kick);
4464 	ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4465 	g_usage_cache_kickers--;
4466 	if (ret != 0 && errno == EINTR) {
4467 		(void) mutex_unlock(&g_usage_cache_lock);
4468 		zsd_warn(gettext(
4469 		    "Interrupted before writing usage size to client\n"));
4470 		(void) door_return(NULL, 0, NULL, 0);
4471 		thr_exit(NULL);
4472 	}
4473 	cache = zsd_usage_cache_hold_locked();
4474 	if (cache == NULL) {
4475 		zsd_warn(gettext("Usage cache empty.\n"));
4476 		(void) door_return(NULL, 0, NULL, 0);
4477 		thr_exit(NULL);
4478 	}
4479 	(void) mutex_unlock(&g_usage_cache_lock);
4480 
4481 	/* Copy current usage data to stack to send to client */
4482 	usage = (zs_usage_t *)alloca(cache->zsuc_size);
4483 
4484 	/* Filter out results if caller is non-global zone */
4485 	zsd_usage_filter(zoneid, cache, usage, is_gz);
4486 
4487 	rvalp = (void *)usage;
4488 	rvals = usage->zsu_size;
4489 	zsd_usage_cache_rele(cache);
4490 
4491 	(void) door_return(rvalp, rvals, NULL, 0);
4492 	thr_exit(NULL);
4493 }
4494 
4495 static volatile boolean_t g_quit;
4496 
4497 /* ARGSUSED */
4498 static void
4499 zonestat_quithandler(int sig)
4500 {
4501 	g_quit = B_TRUE;
4502 }
4503 
4504 /*
4505  * The stat thread generates new utilization data when clients request
4506  * it.  It also manages opening and closing the subsystems used to gather
4507  * data depending on if clients exist.
4508  */
4509 /* ARGSUSED */
4510 void *
4511 stat_thread(void *arg)
4512 {
4513 	time_t start;
4514 	time_t now;
4515 	time_t next_memory;
4516 	boolean_t do_memory;
4517 	boolean_t do_read;
4518 	boolean_t do_close;
4519 
4520 	start = time(NULL);
4521 	if (start < 0) {
4522 		if (g_quit == B_TRUE)
4523 			goto quit;
4524 		zsd_warn(gettext("Unable to fetch current time"));
4525 		g_quit = B_TRUE;
4526 		goto quit;
4527 	}
4528 
4529 	next_memory = start;
4530 	while (g_quit == B_FALSE) {
4531 		for (;;) {
4532 			/*
4533 			 * These are used to decide if the most recent memory
4534 			 * calculation was within a sample interval,
4535 			 * and weather or not the usage collection needs to
4536 			 * be opened or closed.
4537 			 */
4538 			do_memory = B_FALSE;
4539 			do_read = B_FALSE;
4540 			do_close = B_FALSE;
4541 
4542 			/*
4543 			 * If all clients have gone, close usage collecting
4544 			 */
4545 			(void) mutex_lock(&g_usage_cache_lock);
4546 			if (!g_hasclient && g_open == B_TRUE) {
4547 				do_close = B_TRUE;
4548 				(void) mutex_unlock(&g_usage_cache_lock);
4549 				break;
4550 			}
4551 			if (g_quit == B_TRUE) {
4552 				(void) mutex_unlock(
4553 				    &g_usage_cache_lock);
4554 				break;
4555 			}
4556 			/*
4557 			 * Wait for a usage data request
4558 			 */
4559 			if (g_usage_cache_kickers == 0) {
4560 				(void) cond_wait(&g_usage_cache_kick,
4561 				    &g_usage_cache_lock);
4562 			}
4563 			now = time(NULL);
4564 			if (now < 0) {
4565 				if (g_quit == B_TRUE) {
4566 					(void) mutex_unlock(
4567 					    &g_usage_cache_lock);
4568 					goto quit;
4569 				}
4570 				g_quit = B_TRUE;
4571 				(void) mutex_unlock(&g_usage_cache_lock);
4572 				zsd_warn(gettext(
4573 				    "Unable to fetch current time"));
4574 				goto quit;
4575 			}
4576 			if (g_hasclient) {
4577 				do_read = B_TRUE;
4578 				if (now >= next_memory) {
4579 					do_memory = B_TRUE;
4580 					next_memory = now + g_interval;
4581 				}
4582 			} else {
4583 				do_close = B_TRUE;
4584 			}
4585 			(void) mutex_unlock(&g_usage_cache_lock);
4586 			if (do_read || do_close)
4587 				break;
4588 		}
4589 		g_now = now;
4590 		g_hrnow = gethrtime();
4591 		if (g_hasclient && g_open == B_FALSE) {
4592 			g_start = g_now;
4593 			g_hrstart = g_hrnow;
4594 			g_ctl = zsd_open(g_ctl);
4595 			if (g_ctl == NULL)
4596 				zsd_warn(gettext(
4597 				    "Unable to open zone statistics"));
4598 			else
4599 				g_open = B_TRUE;
4600 		}
4601 		if (do_read && g_ctl) {
4602 			if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4603 				zsd_warn(gettext(
4604 				    "Unable to read zone statistics"));
4605 				g_quit = B_TRUE;
4606 				return (NULL);
4607 			}
4608 		}
4609 		(void) mutex_lock(&g_usage_cache_lock);
4610 		if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4611 			(void) mutex_unlock(&g_usage_cache_lock);
4612 			zsd_close(g_ctl);
4613 			g_open = B_FALSE;
4614 		} else {
4615 			(void) mutex_unlock(&g_usage_cache_lock);
4616 		}
4617 	}
4618 quit:
4619 	if (g_open)
4620 		zsd_close(g_ctl);
4621 
4622 	(void) thr_kill(g_main, SIGINT);
4623 	thr_exit(NULL);
4624 	return (NULL);
4625 }
4626 
4627 void
4628 zsd_set_fx()
4629 {
4630 	pcinfo_t pcinfo;
4631 	pcparms_t pcparms;
4632 
4633 	(void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4634 	if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4635 		zsd_warn(gettext("cannot get FX class parameters"));
4636 		return;
4637 	}
4638 	pcparms.pc_cid = pcinfo.pc_cid;
4639 	((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4640 	((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4641 	((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4642 	((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4643 	if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4644 		zsd_warn(gettext("cannot enter the FX class"));
4645 }
4646 
/*
 * Write end of the pipe created by daemonize_start(); the original
 * (parent) process blocks reading the other end.
 */
static int pipe_fd;

/*
 * Report the daemon's startup status to the waiting parent by writing
 * one status byte to the pipe, then close the pipe.  The parent uses
 * the byte as its exit status.
 */
static void
daemonize_ready(char status)
{
	(void) write(pipe_fd, &status, sizeof (status));
	(void) close(pipe_fd);
}
4658 
4659 static int
4660 daemonize_start(void)
4661 {
4662 	char data;
4663 	int status;
4664 
4665 	int filedes[2];
4666 	pid_t pid;
4667 
4668 	(void) close(0);
4669 	(void) dup2(2, 1);
4670 
4671 	if (pipe(filedes) < 0)
4672 		return (-1);
4673 
4674 	(void) fflush(NULL);
4675 
4676 	if ((pid = fork1()) < 0)
4677 		return (-1);
4678 
4679 	if (pid != 0) {
4680 		/*
4681 		 * parent
4682 		 */
4683 		struct sigaction act;
4684 
4685 		act.sa_handler = SIG_DFL;
4686 		(void) sigemptyset(&act.sa_mask);
4687 		act.sa_flags = 0;
4688 
4689 		(void) sigaction(SIGPIPE, &act, NULL);  /* ignore SIGPIPE */
4690 
4691 		(void) close(filedes[1]);
4692 		if (read(filedes[0], &data, 1) == 1) {
4693 			/* forward ready code via exit status */
4694 			exit(data);
4695 		}
4696 		status = -1;
4697 		(void) wait4(pid, &status, 0, NULL);
4698 		/* daemon process exited before becoming ready */
4699 		if (WIFEXITED(status)) {
4700 			/* assume daemon process printed useful message */
4701 			exit(WEXITSTATUS(status));
4702 		} else {
4703 			zsd_warn(gettext("daemon process killed or died"));
4704 			exit(1);
4705 		}
4706 	}
4707 
4708 	/*
4709 	 * child
4710 	 */
4711 	pipe_fd = filedes[1];
4712 	(void) close(filedes[0]);
4713 
4714 	/*
4715 	 * generic Unix setup
4716 	 */
4717 	(void) setsid();
4718 	(void) umask(0000);
4719 
4720 	return (0);
4721 }
4722 
4723 static void
4724 fattach_all_zones(boolean_t detach_only)
4725 {
4726 	zoneid_t *zids;
4727 	uint_t nzids, nzids_last;
4728 	int i;
4729 
4730 again:
4731 	(void) zone_list(NULL, &nzids);
4732 	nzids_last = nzids;
4733 	zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4734 	if (zids == NULL)
4735 		zsd_error(gettext("Out of memory"));
4736 
4737 	(void) zone_list(zids, &nzids);
4738 	if (nzids > nzids_last) {
4739 		free(zids);
4740 		goto again;
4741 	}
4742 	for (i = 0; i < nzids; i++)
4743 		zsd_fattach_zone(zids[i], g_server_door, detach_only);
4744 
4745 	free(zids);
4746 }
4747 
4748 int
4749 main(int argc, char *argv[])
4750 {
4751 
4752 	int arg;
4753 	thread_t tid;
4754 	scf_simple_prop_t *prop;
4755 	uint64_t *intervalp;
4756 	boolean_t opt_cleanup = B_FALSE;
4757 
4758 	g_main = thr_self();
4759 	g_quit = B_FALSE;
4760 	(void) signal(SIGINT, zonestat_quithandler);
4761 	(void) signal(SIGTERM, zonestat_quithandler);
4762 	(void) signal(SIGHUP, zonestat_quithandler);
4763 /*	(void) sigignore(SIGCHLD); */
4764 	(void) sigignore(SIGPIPE);
4765 
4766 	if (getzoneid() != GLOBAL_ZONEID)
4767 		zsd_error(gettext("Must be run from global zone only"));
4768 
4769 	while ((arg = getopt(argc, argv, "c"))
4770 	    != EOF) {
4771 		switch (arg) {
4772 		case 'c':
4773 			opt_cleanup = B_TRUE;
4774 			break;
4775 		default:
4776 			zsd_error(gettext("Invalid option"));
4777 		}
4778 	}
4779 
4780 	if (opt_cleanup) {
4781 		if (zsd_disable_cpu_stats() != 0)
4782 			exit(1);
4783 		else
4784 			exit(0);
4785 	}
4786 
4787 	/* Get the configured sample interval */
4788 	prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4789 	    "config", "sample_interval");
4790 	if (prop == NULL)
4791 		zsd_error(gettext("Unable to fetch SMF property "
4792 		    "\"config/sample_interval\""));
4793 
4794 	if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4795 		zsd_error(gettext("Malformed SMF property "
4796 		    "\"config/sample_interval\".  Must be of type \"count\""));
4797 
4798 	intervalp = scf_simple_prop_next_count(prop);
4799 	g_interval = *intervalp;
4800 	if (g_interval == 0)
4801 		zsd_error(gettext("Malformed SMF property "
4802 		    "\"config/sample_interval\".  Must be greater than zero"));
4803 
4804 	scf_simple_prop_free(prop);
4805 
4806 	if (daemonize_start() < 0)
4807 		zsd_error(gettext("Unable to start daemon\n"));
4808 
4809 	/* Run at high priority */
4810 	zsd_set_fx();
4811 
4812 	(void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4813 	(void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4814 	(void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4815 
4816 	g_server_door = door_create(zsd_server, NULL,
4817 	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4818 	if (g_server_door < 0)
4819 		zsd_error(gettext("Unable to create server door\n"));
4820 
4821 
4822 	g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4823 	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4824 	if (g_stat_door < 0)
4825 		zsd_error(gettext("Unable to create statistics door\n"));
4826 
4827 	fattach_all_zones(B_FALSE);
4828 
4829 	if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4830 		zsd_error(gettext("Unable to create statistics thread\n"));
4831 
4832 	daemonize_ready(0);
4833 
4834 	/* Wait for signal to quit */
4835 	while (g_quit == B_FALSE)
4836 		(void) pause();
4837 
4838 	/* detach doors */
4839 	fattach_all_zones(B_TRUE);
4840 
4841 	(void) door_revoke(g_server_door);
4842 	(void) door_revoke(g_stat_door);
4843 
4844 	/* kick stat thread and wait for it to close the statistics */
4845 	(void) mutex_lock(&g_usage_cache_lock);
4846 	g_quit = B_TRUE;
4847 	(void) cond_signal(&g_usage_cache_kick);
4848 	(void) mutex_unlock(&g_usage_cache_lock);
4849 end:
4850 	(void) thr_join(tid, NULL, NULL);
4851 	return (0);
4852 }
4853