1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25 #include <alloca.h>
26 #include <assert.h>
27 #include <dirent.h>
28 #include <dlfcn.h>
29 #include <door.h>
30 #include <errno.h>
31 #include <exacct.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <kstat.h>
35 #include <libcontract.h>
36 #include <libintl.h>
37 #include <libscf.h>
38 #include <zonestat.h>
39 #include <zonestat_impl.h>
40 #include <limits.h>
41 #include <pool.h>
42 #include <procfs.h>
43 #include <rctl.h>
44 #include <thread.h>
45 #include <signal.h>
46 #include <stdarg.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <strings.h>
51 #include <synch.h>
52 #include <sys/acctctl.h>
53 #include <sys/contract/process.h>
54 #include <sys/ctfs.h>
55 #include <sys/fork.h>
56 #include <sys/param.h>
57 #include <sys/priocntl.h>
58 #include <sys/fxpriocntl.h>
59 #include <sys/processor.h>
60 #include <sys/pset.h>
61 #include <sys/socket.h>
62 #include <sys/stat.h>
63 #include <sys/statvfs.h>
64 #include <sys/swap.h>
65 #include <sys/systeminfo.h>
66 #include <thread.h>
67 #include <sys/list.h>
68 #include <sys/time.h>
69 #include <sys/types.h>
70 #include <sys/vm_usage.h>
71 #include <sys/wait.h>
72 #include <sys/zone.h>
73 #include <time.h>
74 #include <ucred.h>
75 #include <unistd.h>
76 #include <vm/anon.h>
77 #include <zone.h>
78 #include <zonestat.h>
79
#define	MAX_PSET_NAME	1024	/* Taken from PV_NAME_MAX_LEN */
#define	ZSD_PSET_UNLIMITED	UINT16_MAX	/* sentinel: no pset cpu limit */
#define	ZONESTAT_EXACCT_FILE	"/var/adm/exacct/zonestat-process"
83
84 /*
85 * zonestatd implements gathering cpu and memory utilization data for
86 * running zones. It has these components:
87 *
88 * zsd_server:
89 * Door server to respond to client connections. Each client
90 * will connect using libzonestat.so, which will open and
91 * call /var/tmp/.zonestat_door. Each connecting client is given
92 * a file descriptor to the stat server.
93 *
94 * The zsd_server also responds to zoneadmd, which reports when a
95 * new zone is booted. This is used to fattach the zsd_server door
96 * into the new zone.
97 *
98 * zsd_stat_server:
99 * Receives client requests for the current utilization data. Each
100 * client request will cause zonestatd to update the current utilization
101 * data by kicking the stat_thread.
102 *
103 * If the client is in a non-global zone, the utilization data will
104 * be filtered to only show the given zone. The usage by all other zones
105 * will be added to the system utilization.
106 *
107 * stat_thread:
108 * The stat thread implements querying the system to determine the
109 * current utilization data for each running zone. This includes
110 * inspecting the system's processor set configuration, as well as details
111 * of each zone, such as their configured limits, and which processor
112 * sets they are running in.
113 *
114 * The stat_thread will only update memory utilization data as often as
115 * the configured config/sample_interval on the zones-monitoring service.
116 */
117
/*
 * The private vmusage structure unfortunately uses size_t types, and assumes
 * the caller's bitness matches the kernel's bitness. Since the getvmusage()
 * system call is contracted, and zonestatd is 32 bit, the following structures
 * are used to interact with a 32bit or 64 bit kernel.
 */
/* Result layout as returned by a 32 bit kernel (all counters 32 bit). */
typedef struct zsd_vmusage32 {
	id_t vmu_zoneid;		/* zone the usage belongs to */
	uint_t vmu_type;		/* entity type queried */
	id_t vmu_id;			/* entity id */

	uint32_t vmu_rss_all;		/* resident set: total */
	uint32_t vmu_rss_private;	/* resident set: private pages */
	uint32_t vmu_rss_shared;	/* resident set: shared pages */
	uint32_t vmu_swap_all;		/* swap reserved: total */
	uint32_t vmu_swap_private;	/* swap reserved: private */
	uint32_t vmu_swap_shared;	/* swap reserved: shared */
} zsd_vmusage32_t;
136
/* Result layout as returned by a 64 bit kernel (all counters 64 bit). */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;		/* zone the usage belongs to */
	uint_t vmu_type;		/* entity type queried */
	id_t vmu_id;			/* entity id */
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;	/* explicit pad for alignment */
	uint64_t vmu_rss_all;		/* resident set: total */
	uint64_t vmu_rss_private;	/* resident set: private pages */
	uint64_t vmu_rss_shared;	/* resident set: shared pages */
	uint64_t vmu_swap_all;		/* swap reserved: total */
	uint64_t vmu_swap_private;	/* swap reserved: private */
	uint64_t vmu_swap_shared;	/* swap reserved: shared */
} zsd_vmusage64_t;
153
struct zsd_zone;

/*
 * Used to store a zone's usage of a pset.  One of these exists per
 * (zone, pset) pair the zone is bound to, linked on the pset's usage list.
 */
typedef struct zsd_pset_usage {
	struct zsd_zone *zsu_zone;	/* back pointer to owning zone */
	struct zsd_pset *zsu_pset;	/* back pointer to the pset */

	list_node_t zsu_next;		/* linkage on pset's zsp_usage_list */

	zoneid_t zsu_zoneid;
	boolean_t zsu_found;	/* zone bound at end of interval */
	boolean_t zsu_active;	/* zone was bound during interval */
	boolean_t zsu_new;	/* zone newly bound in this interval */
	boolean_t zsu_deleted;	/* zone was unbound in this interval */
	boolean_t zsu_empty;	/* no procs in pset in this interval */
	time_t zsu_start;	/* time when zone was found in pset */
	hrtime_t zsu_hrstart;	/* hrtime when zone was found in pset */
	uint64_t zsu_cpu_shares;	/* FSS shares, or ZS_LIMIT_NONE */
	uint_t zsu_scheds;	/* schedulers found in this pass */
	timestruc_t zsu_cpu_usage;	/* cpu time used */
} zsd_pset_usage_t;
175
/* Used to store a pset's utilization */
typedef struct zsd_pset {
	psetid_t zsp_id;
	list_node_t zsp_next;		/* linkage on zsctl_psets list */
	char zsp_name[ZS_PSETNAME_MAX];

	uint_t zsp_cputype;	/* default, dedicated or shared */
	boolean_t zsp_found;	/* pset found at end of interval */
	boolean_t zsp_new;	/* pset new in this interval */
	boolean_t zsp_deleted;	/* pset deleted in this interval */
	boolean_t zsp_active;	/* pset existed during interval */
	boolean_t zsp_empty;	/* no processes in pset */
	time_t zsp_start;	/* time when pset was first found */
	hrtime_t zsp_hrstart;	/* hrtime when pset was first found */

	uint64_t zsp_online;	/* online cpus in interval */
	uint64_t zsp_size;	/* size in this interval */
	uint64_t zsp_min;	/* configured min in this interval */
	uint64_t zsp_max;	/* configured max in this interval */
	int64_t zsp_importance;	/* configured importance in this interval */

	uint_t zsp_scheds;	/* scheds of processes found in pset */
	uint64_t zsp_cpu_shares;	/* total shares in this interval */

	timestruc_t zsp_total_time;	/* available cpu time */
	timestruc_t zsp_usage_kern;	/* cpu time charged to the kernel */
	timestruc_t zsp_usage_zones;	/* cpu time charged to zones */

	/* Individual zone usages of pset */
	list_t zsp_usage_list;
	int zsp_nusage;		/* number of entries on zsp_usage_list */

	/* Summed kstat values from individual cpus in pset */
	timestruc_t zsp_idle;
	timestruc_t zsp_intr;
	timestruc_t zsp_kern;
	timestruc_t zsp_user;

} zsd_pset_t;
215
/* Used to track an individual cpu's utilization as reported by kstats */
typedef struct zsd_cpu {
	processorid_t zsc_id;
	list_node_t zsc_next;		/* linkage on zsctl_cpus list */
	psetid_t zsc_psetid;		/* pset cpu belongs to now */
	psetid_t zsc_psetid_prev;	/* pset cpu belonged to last pass */
	zsd_pset_t *zsc_pset;		/* pset cpu belongs to now */

	boolean_t zsc_found;	/* cpu online in this interval */
	boolean_t zsc_onlined;	/* cpu onlined during this interval */
	boolean_t zsc_offlined;	/* cpu offlined during this interval */
	boolean_t zsc_active;	/* cpu online during this interval */
	boolean_t zsc_allocated;	/* True if cpu has ever been found */

	/* kstats this interval */
	uint64_t zsc_nsec_idle;
	uint64_t zsc_nsec_intr;
	uint64_t zsc_nsec_kern;
	uint64_t zsc_nsec_user;

	/* kstats in most recent interval */
	uint64_t zsc_nsec_idle_prev;
	uint64_t zsc_nsec_intr_prev;
	uint64_t zsc_nsec_kern_prev;
	uint64_t zsc_nsec_user_prev;

	/* Total kstat increases since zonestatd started reading kstats */
	timestruc_t zsc_idle;
	timestruc_t zsc_intr;
	timestruc_t zsc_kern;
	timestruc_t zsc_user;

} zsd_cpu_t;
249
/* Used to describe an individual zone and its utilization */
typedef struct zsd_zone {
	zoneid_t zsz_id;
	list_node_t zsz_next;		/* linkage on zsctl_zones list */
	char zsz_name[ZS_ZONENAME_MAX];
	uint_t zsz_cputype;		/* cpu mgmt: default/dedicated/shared */
	uint_t zsz_iptype;		/* ip type of the zone */
	time_t zsz_start;		/* time zone was first found */
	hrtime_t zsz_hrstart;		/* hrtime zone was first found */

	char zsz_pool[ZS_POOLNAME_MAX];
	char zsz_pset[ZS_PSETNAME_MAX];
	int zsz_default_sched;		/* zone's default scheduling class */
	/* These are deduced by inspecting processes */
	psetid_t zsz_psetid;		/* ZS_PSET_MULTI if bound to many */
	uint_t zsz_scheds;

	boolean_t zsz_new;	/* zone booted during this interval */
	boolean_t zsz_deleted;	/* halted during this interval */
	boolean_t zsz_active;	/* running in this interval */
	boolean_t zsz_empty;	/* no processes in this interval */
	boolean_t zsz_gone;	/* not installed in this interval */
	boolean_t zsz_found;	/* Running at end of this interval */

	/* Configured limits; ZS_LIMIT_NONE when uncapped */
	uint64_t zsz_cpu_shares;
	uint64_t zsz_cpu_cap;
	uint64_t zsz_ram_cap;
	uint64_t zsz_locked_cap;
	uint64_t zsz_vm_cap;

	uint64_t zsz_cpus_online;
	timestruc_t zsz_cpu_usage;	/* cpu time used by the zone */
	timestruc_t zsz_cap_time;	/* cpu time of cpu cap */
	timestruc_t zsz_share_time;	/* cpu time of share of cpu */
	timestruc_t zsz_pset_time;	/* time of all psets zone is bound to */

	/* Memory usage as reported by getvmusage() */
	uint64_t zsz_usage_ram;
	uint64_t zsz_usage_locked;
	uint64_t zsz_usage_vm;

	/* rctl caps; ZS_LIMIT_NONE when uncapped */
	uint64_t zsz_processes_cap;
	uint64_t zsz_lwps_cap;
	uint64_t zsz_shm_cap;
	uint64_t zsz_shmids_cap;
	uint64_t zsz_semids_cap;
	uint64_t zsz_msgids_cap;
	uint64_t zsz_lofi_cap;

	/* Current usage of the corresponding rctls */
	uint64_t zsz_processes;
	uint64_t zsz_lwps;
	uint64_t zsz_shm;
	uint64_t zsz_shmids;
	uint64_t zsz_semids;
	uint64_t zsz_msgids;
	uint64_t zsz_lofi;

} zsd_zone_t;
307
/*
 * Used to track the cpu usage of an individual processes.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes.
 * to their zone. As processes exit, their extended accounting records are
 * read and the difference of their total and known usage is charged to their
 * zone.
 *
 * If a process is never seen in /proc, the total usage on its extended
 * accounting record will be charged to its zone.
 */
typedef struct zsd_proc {
	list_node_t zspr_next;		/* list linkage */
	pid_t zspr_ppid;		/* parent pid, for ancestry lookups */
	psetid_t zspr_psetid;		/* pset process was last seen in */
	zoneid_t zspr_zoneid;		/* zone the process belongs to */
	int zspr_sched;			/* scheduling class */
	timestruc_t zspr_usage;		/* cpu time already charged */
} zsd_proc_t;
327
/* Used to track the overall resource usage of the system */
typedef struct zsd_system {

	/* Physical memory, in bytes */
	uint64_t zss_ram_total;
	uint64_t zss_ram_kern;
	uint64_t zss_ram_zones;

	/* Locked memory, in bytes */
	uint64_t zss_locked_kern;
	uint64_t zss_locked_zones;

	/* Virtual memory, in bytes */
	uint64_t zss_vm_total;
	uint64_t zss_vm_kern;
	uint64_t zss_vm_zones;

	/* Swap, in bytes */
	uint64_t zss_swap_total;
	uint64_t zss_swap_used;

	/* Cpu time broken down by state, summed over all cpus */
	timestruc_t zss_idle;
	timestruc_t zss_intr;
	timestruc_t zss_kern;
	timestruc_t zss_user;

	timestruc_t zss_cpu_total_time;
	timestruc_t zss_cpu_usage_kern;
	timestruc_t zss_cpu_usage_zones;

	/* System-wide rctl maximums */
	uint64_t zss_maxpid;
	uint64_t zss_processes_max;
	uint64_t zss_lwps_max;
	uint64_t zss_shm_max;
	uint64_t zss_shmids_max;
	uint64_t zss_semids_max;
	uint64_t zss_msgids_max;
	uint64_t zss_lofi_max;

	/* System-wide current usage of the corresponding resources */
	uint64_t zss_processes;
	uint64_t zss_lwps;
	uint64_t zss_shm;
	uint64_t zss_shmids;
	uint64_t zss_semids;
	uint64_t zss_msgids;
	uint64_t zss_lofi;

	uint64_t zss_ncpus;
	uint64_t zss_ncpus_online;

} zsd_system_t;
375
/*
 * A dumping ground for various information and structures used to compute
 * utilization.
 *
 * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
 * zsd_open(). When all clients disconnect, the zsd_ctl is closed.
 */
typedef struct zsd_ctl {
	kstat_ctl_t *zsctl_kstat_ctl;	/* handle for kstat reads */

	/* To track extended accounting */
	int zsctl_proc_fd;		/* Log currently being used */
	ea_file_t zsctl_proc_eaf;
	struct stat64 zsctl_proc_stat;
	int zsctl_proc_open;
	int zsctl_proc_fd_next;		/* Log file to use next */
	ea_file_t zsctl_proc_eaf_next;
	struct stat64 zsctl_proc_stat_next;
	int zsctl_proc_open_next;

	/* pool configuration handle */
	pool_conf_t *zsctl_pool_conf;
	int zsctl_pool_status;
	int zsctl_pool_changed;

	/* The above usage tacking structures */
	zsd_system_t *zsctl_system;
	list_t zsctl_zones;		/* list of zsd_zone_t */
	list_t zsctl_psets;		/* list of zsd_pset_t */
	list_t zsctl_cpus;		/* list of zsd_cpu_t */
	zsd_cpu_t *zsctl_cpu_array;	/* indexed by cpu id */
	zsd_proc_t *zsctl_proc_array;	/* indexed by pid */

	/* Various system info */
	uint64_t zsctl_maxcpuid;
	uint64_t zsctl_maxproc;
	uint64_t zsctl_kern_bits;
	uint64_t zsctl_pagesize;

	/* Used to track time available under a cpu cap. */
	uint64_t zsctl_hrtime;
	uint64_t zsctl_hrtime_prev;
	timestruc_t zsctl_hrtime_total;

	struct timeval zsctl_timeofday;

	/* Caches for arrays allocated for use by various system calls */
	psetid_t *zsctl_pset_cache;
	uint_t zsctl_pset_ncache;
	processorid_t *zsctl_cpu_cache;
	uint_t zsctl_cpu_ncache;
	zoneid_t *zsctl_zone_cache;
	uint_t zsctl_zone_ncache;
	struct swaptable *zsctl_swap_cache;
	uint64_t zsctl_swap_cache_size;
	uint64_t zsctl_swap_cache_num;
	zsd_vmusage64_t *zsctl_vmusage_cache;
	uint64_t zsctl_vmusage_cache_num;

	/*
	 * Scratch pool values for libpool queries.
	 * NOTE(review): old comment said "procfs"; these are pool_value_t
	 * handles, which belong to the pools framework -- confirm intent.
	 */
	pool_value_t *zsctl_pool_vals[3];

	/* Counts on tracked entities */
	uint_t zsctl_nzones;
	uint_t zsctl_npsets;
	uint_t zsctl_npset_usages;
} zsd_ctl_t;
444
/* Global control structure, shared by the door servers and stat thread */
zsd_ctl_t *g_ctl;
boolean_t g_open;		/* True if g_ctl is open */
int g_hasclient;		/* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server.  Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
 */
zs_usage_cache_t *g_usage_cache;
mutex_t g_usage_cache_lock;	/* protects all g_usage_cache_* state */
cond_t g_usage_cache_kick;	/* signaled to wake the stat_thread */
uint_t g_usage_cache_kickers;	/* count of waiting client requests */
cond_t g_usage_cache_wait;	/* signaled when a new cache is ready */
char *g_usage_cache_buf;
uint_t g_usage_cache_bufsz;
uint64_t g_gen_next;		/* generation of the next usage snapshot */

/* fds of door servers */
int g_server_door;
int g_stat_door;

/*
 * Starting and current time.  Used to throttle memory calculation, and to
 * mark new zones and psets with their boot and creation time.
 */
time_t g_now;
time_t g_start;
hrtime_t g_hrnow;
hrtime_t g_hrstart;
uint64_t g_interval;		/* memory sample interval, seconds */

/*
 * main() thread.
 */
thread_t g_main;
481
/*
 * Print a localized warning message to stderr, prefixed with
 * "zonestat: Warning: " and terminated with a newline.
 */
/* PRINTFLIKE1 */
static void
zsd_warn(const char *fmt, ...)
{
	va_list ap;

	(void) fprintf(stderr, gettext("zonestat: Warning: "));
	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");
}
495
/*
 * Print a localized error message to stderr, prefixed with
 * "zonestat: Error: ", then exit the daemon with status 1.
 */
/* PRINTFLIKE1 */
static void
zsd_error(const char *fmt, ...)
{
	va_list ap;

	(void) fprintf(stderr, gettext("zonestat: Error: "));
	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");
	exit(1);
}
510
511 /* Turns on extended accounting if not configured externally */
512 int
zsd_enable_cpu_stats()513 zsd_enable_cpu_stats()
514 {
515 char *path = ZONESTAT_EXACCT_FILE;
516 char oldfile[MAXPATHLEN];
517 int ret, state = AC_ON;
518 ac_res_t res[6];
519
520 /*
521 * Start a new accounting file if accounting not configured
522 * externally.
523 */
524
525 res[0].ar_id = AC_PROC_PID;
526 res[0].ar_state = AC_ON;
527 res[1].ar_id = AC_PROC_ANCPID;
528 res[1].ar_state = AC_ON;
529 res[2].ar_id = AC_PROC_CPU;
530 res[2].ar_state = AC_ON;
531 res[3].ar_id = AC_PROC_TIME;
532 res[3].ar_state = AC_ON;
533 res[4].ar_id = AC_PROC_ZONENAME;
534 res[4].ar_state = AC_ON;
535 res[5].ar_id = AC_NONE;
536 res[5].ar_state = AC_ON;
537 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
538 zsd_warn(gettext("Unable to set accounting resources"));
539 return (-1);
540 }
541 /* Only set accounting file if none is configured */
542 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
543 if (ret < 0) {
544
545 (void) unlink(path);
546 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
547 == -1) {
548 zsd_warn(gettext("Unable to set accounting file"));
549 return (-1);
550 }
551 }
552 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
553 zsd_warn(gettext("Unable to enable accounting"));
554 return (-1);
555 }
556 return (0);
557 }
558
559 /* Turns off extended accounting if not configured externally */
560 int
zsd_disable_cpu_stats()561 zsd_disable_cpu_stats()
562 {
563 char *path = ZONESTAT_EXACCT_FILE;
564 int ret, state = AC_OFF;
565 ac_res_t res[6];
566 char oldfile[MAXPATHLEN];
567
568 /* If accounting file is externally configured, leave it alone */
569 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
570 if (ret == 0 && strcmp(oldfile, path) != 0)
571 return (0);
572
573 res[0].ar_id = AC_PROC_PID;
574 res[0].ar_state = AC_OFF;
575 res[1].ar_id = AC_PROC_ANCPID;
576 res[1].ar_state = AC_OFF;
577 res[2].ar_id = AC_PROC_CPU;
578 res[2].ar_state = AC_OFF;
579 res[3].ar_id = AC_PROC_TIME;
580 res[3].ar_state = AC_OFF;
581 res[4].ar_id = AC_PROC_ZONENAME;
582 res[4].ar_state = AC_OFF;
583 res[5].ar_id = AC_NONE;
584 res[5].ar_state = AC_OFF;
585 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
586 zsd_warn(gettext("Unable to clear accounting resources"));
587 return (-1);
588 }
589 if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
590 zsd_warn(gettext("Unable to clear accounting file"));
591 return (-1);
592 }
593 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
594 zsd_warn(gettext("Unable to diable accounting"));
595 return (-1);
596 }
597
598 (void) unlink(path);
599 return (0);
600 }
601
602 /*
603 * If not configured externally, deletes the current extended accounting file
604 * and starts a new one.
605 *
606 * Since the stat_thread holds an open handle to the accounting file, it will
607 * read all remaining entries from the old file before switching to
608 * read the new one.
609 */
610 int
zsd_roll_exacct(void)611 zsd_roll_exacct(void)
612 {
613 int ret;
614 char *path = ZONESTAT_EXACCT_FILE;
615 char oldfile[MAXPATHLEN];
616
617 /* If accounting file is externally configured, leave it alone */
618 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
619 if (ret == 0 && strcmp(oldfile, path) != 0)
620 return (0);
621
622 if (unlink(path) != 0)
623 /* Roll it next time */
624 return (0);
625
626 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
627 zsd_warn(gettext("Unable to set accounting file"));
628 return (-1);
629 }
630 return (0);
631 }
632
633 /* Contract stuff for zone_enter() */
634 int
init_template(void)635 init_template(void)
636 {
637 int fd;
638 int err = 0;
639
640 fd = open64(CTFS_ROOT "/process/template", O_RDWR);
641 if (fd == -1)
642 return (-1);
643
644 /*
645 * For now, zoneadmd doesn't do anything with the contract.
646 * Deliver no events, don't inherit, and allow it to be orphaned.
647 */
648 err |= ct_tmpl_set_critical(fd, 0);
649 err |= ct_tmpl_set_informative(fd, 0);
650 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
651 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
652 if (err || ct_tmpl_activate(fd)) {
653 (void) close(fd);
654 return (-1);
655 }
656
657 return (fd);
658 }
659
660 /*
661 * Contract stuff for zone_enter()
662 */
663 int
contract_latest(ctid_t * id)664 contract_latest(ctid_t *id)
665 {
666 int cfd, r;
667 ct_stathdl_t st;
668 ctid_t result;
669
670 if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
671 return (errno);
672
673 if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
674 (void) close(cfd);
675 return (r);
676 }
677
678 result = ct_status_get_id(st);
679 ct_status_free(st);
680 (void) close(cfd);
681
682 *id = result;
683 return (0);
684 }
685
/*
 * Set FD_CLOEXEC on the given fd, preserving any other descriptor flags.
 * Returns 0 on success, -1 on failure.
 */
static int
close_on_exec(int fd)
{
	int flags;

	if ((flags = fcntl(fd, F_GETFD, 0)) == -1)
		return (-1);
	if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1)
		return (-1);
	return (0);
}
694
695 int
contract_open(ctid_t ctid,const char * type,const char * file,int oflag)696 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
697 {
698 char path[PATH_MAX];
699 int n, fd;
700
701 if (type == NULL)
702 type = "all";
703
704 n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
705 if (n >= sizeof (path)) {
706 errno = ENAMETOOLONG;
707 return (-1);
708 }
709
710 fd = open64(path, oflag);
711 if (fd != -1) {
712 if (close_on_exec(fd) == -1) {
713 int err = errno;
714 (void) close(fd);
715 errno = err;
716 return (-1);
717 }
718 }
719 return (fd);
720 }
721
722 int
contract_abandon_id(ctid_t ctid)723 contract_abandon_id(ctid_t ctid)
724 {
725 int fd, err;
726
727 fd = contract_open(ctid, "all", "ctl", O_WRONLY);
728 if (fd == -1)
729 return (errno);
730
731 err = ct_ctl_abandon(fd);
732 (void) close(fd);
733
734 return (err);
735 }
736 /*
737 * Attach the zsd_server to a zone. Called for each zone when zonestatd
738 * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
739 *
740 * Zone_enter is used to avoid reaching into zone to fattach door.
741 */
742 static void
zsd_fattach_zone(zoneid_t zid,int door,boolean_t detach_only)743 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
744 {
745 char *path = ZS_DOOR_PATH;
746 int fd, pid, stat, tmpl_fd;
747 ctid_t ct;
748
749 if ((tmpl_fd = init_template()) == -1) {
750 zsd_warn("Unable to init template");
751 return;
752 }
753
754 pid = forkx(0);
755 if (pid < 0) {
756 (void) ct_tmpl_clear(tmpl_fd);
757 zsd_warn(gettext(
758 "Unable to fork to add zonestat to zoneid %d\n"), zid);
759 return;
760 }
761
762 if (pid == 0) {
763 (void) ct_tmpl_clear(tmpl_fd);
764 (void) close(tmpl_fd);
765 if (zid != 0 && zone_enter(zid) != 0) {
766 if (errno == EINVAL) {
767 _exit(0);
768 }
769 _exit(1);
770 }
771 (void) fdetach(path);
772 (void) unlink(path);
773 if (detach_only)
774 _exit(0);
775 fd = open(path, O_CREAT|O_RDWR, 0644);
776 if (fd < 0)
777 _exit(2);
778 if (fattach(door, path) != 0)
779 _exit(3);
780 _exit(0);
781 }
782 if (contract_latest(&ct) == -1)
783 ct = -1;
784 (void) ct_tmpl_clear(tmpl_fd);
785 (void) close(tmpl_fd);
786 (void) contract_abandon_id(ct);
787 while (waitpid(pid, &stat, 0) != pid)
788 ;
789 if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
790 return;
791
792 zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
793
794 if (WEXITSTATUS(stat) == 1)
795 zsd_warn(gettext("Cannot entering zone"));
796 else if (WEXITSTATUS(stat) == 2)
797 zsd_warn(gettext("Unable to create door file: %s"), path);
798 else if (WEXITSTATUS(stat) == 3)
799 zsd_warn(gettext("Unable to fattach file: %s"), path);
800
801 zsd_warn(gettext("Internal error entering zone: %d"), zid);
802 }
803
804 /*
805 * Zone lookup and allocation functions to manage list of currently running
806 * zones.
807 */
808 static zsd_zone_t *
zsd_lookup_zone(zsd_ctl_t * ctl,char * zonename,zoneid_t zoneid)809 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
810 {
811 zsd_zone_t *zone;
812
813 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
814 zone = list_next(&ctl->zsctl_zones, zone)) {
815 if (strcmp(zone->zsz_name, zonename) == 0) {
816 if (zoneid != -1)
817 zone->zsz_id = zoneid;
818 return (zone);
819 }
820 }
821 return (NULL);
822 }
823
824 static zsd_zone_t *
zsd_lookup_zone_byid(zsd_ctl_t * ctl,zoneid_t zoneid)825 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
826 {
827 zsd_zone_t *zone;
828
829 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
830 zone = list_next(&ctl->zsctl_zones, zone)) {
831 if (zone->zsz_id == zoneid)
832 return (zone);
833 }
834 return (NULL);
835 }
836
837 static zsd_zone_t *
zsd_allocate_zone(zsd_ctl_t * ctl,char * zonename,zoneid_t zoneid)838 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
839 {
840 zsd_zone_t *zone;
841
842 if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
843 return (NULL);
844
845 (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
846 zone->zsz_id = zoneid;
847 zone->zsz_found = B_FALSE;
848
849 /*
850 * Allocate as deleted so if not found in first pass, zone is deleted
851 * from list. This can happen if zone is returned by zone_list, but
852 * exits before first attempt to fetch zone details.
853 */
854 zone->zsz_start = g_now;
855 zone->zsz_hrstart = g_hrnow;
856 zone->zsz_deleted = B_TRUE;
857
858 zone->zsz_cpu_shares = ZS_LIMIT_NONE;
859 zone->zsz_cpu_cap = ZS_LIMIT_NONE;
860 zone->zsz_ram_cap = ZS_LIMIT_NONE;
861 zone->zsz_locked_cap = ZS_LIMIT_NONE;
862 zone->zsz_vm_cap = ZS_LIMIT_NONE;
863
864 zone->zsz_processes_cap = ZS_LIMIT_NONE;
865 zone->zsz_lwps_cap = ZS_LIMIT_NONE;
866 zone->zsz_shm_cap = ZS_LIMIT_NONE;
867 zone->zsz_shmids_cap = ZS_LIMIT_NONE;
868 zone->zsz_semids_cap = ZS_LIMIT_NONE;
869 zone->zsz_msgids_cap = ZS_LIMIT_NONE;
870 zone->zsz_lofi_cap = ZS_LIMIT_NONE;
871
872 ctl->zsctl_nzones++;
873
874 return (zone);
875 }
876
877 static zsd_zone_t *
zsd_lookup_insert_zone(zsd_ctl_t * ctl,char * zonename,zoneid_t zoneid)878 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
879 {
880 zsd_zone_t *zone, *tmp;
881
882 if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
883 return (zone);
884
885 if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
886 return (NULL);
887
888 /* Insert sorted by zonename */
889 tmp = list_head(&ctl->zsctl_zones);
890 while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
891 tmp = list_next(&ctl->zsctl_zones, tmp);
892
893 list_insert_before(&ctl->zsctl_zones, tmp, zone);
894 return (zone);
895 }
896
897 /*
898 * Mark all zones as not existing. As zones are found, they will
899 * be marked as existing. If a zone is not found, then it must have
900 * halted.
901 */
902 static void
zsd_mark_zones_start(zsd_ctl_t * ctl)903 zsd_mark_zones_start(zsd_ctl_t *ctl)
904 {
905
906 zsd_zone_t *zone;
907
908 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
909 zone = list_next(&ctl->zsctl_zones, zone)) {
910 zone->zsz_found = B_FALSE;
911 }
912 }
913
914 /*
915 * Mark each zone as not using pset. If processes are found using the
916 * pset, the zone will remain bound to the pset. If none of a zones
917 * processes are bound to the pset, the zone's usage of the pset will
918 * be deleted.
919 *
920 */
921 static void
zsd_mark_pset_usage_start(zsd_pset_t * pset)922 zsd_mark_pset_usage_start(zsd_pset_t *pset)
923 {
924 zsd_pset_usage_t *usage;
925
926 for (usage = list_head(&pset->zsp_usage_list);
927 usage != NULL;
928 usage = list_next(&pset->zsp_usage_list, usage)) {
929 usage->zsu_found = B_FALSE;
930 usage->zsu_empty = B_TRUE;
931 }
932 }
933
934 /*
935 * Mark each pset as not existing. If a pset is found, it will be marked
936 * as existing. If a pset is not found, it wil be deleted.
937 */
938 static void
zsd_mark_psets_start(zsd_ctl_t * ctl)939 zsd_mark_psets_start(zsd_ctl_t *ctl)
940 {
941 zsd_pset_t *pset;
942
943 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
944 pset = list_next(&ctl->zsctl_psets, pset)) {
945 pset->zsp_found = B_FALSE;
946 zsd_mark_pset_usage_start(pset);
947 }
948 }
949
950 /*
951 * A pset was found. Update its information
952 */
953 static void
zsd_mark_pset_found(zsd_pset_t * pset,uint_t type,uint64_t online,uint64_t size,uint64_t min,uint64_t max,int64_t importance)954 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
955 uint64_t size, uint64_t min, uint64_t max, int64_t importance)
956 {
957 pset->zsp_empty = B_TRUE;
958 pset->zsp_deleted = B_FALSE;
959
960 assert(pset->zsp_found == B_FALSE);
961
962 /* update pset flags */
963 if (pset->zsp_active == B_FALSE)
964 /* pset not seen on previous interval. It is new. */
965 pset->zsp_new = B_TRUE;
966 else
967 pset->zsp_new = B_FALSE;
968
969 pset->zsp_found = B_TRUE;
970 pset->zsp_cputype = type;
971 pset->zsp_online = online;
972 pset->zsp_size = size;
973 pset->zsp_min = min;
974 pset->zsp_max = max;
975 pset->zsp_importance = importance;
976 pset->zsp_cpu_shares = 0;
977 pset->zsp_scheds = 0;
978 pset->zsp_active = B_TRUE;
979 }
980
/*
 * A zone's process was found using a pset. Charge the process to the pset and
 * the per-zone data for the pset.
 */
static void
zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
{
	zsd_zone_t *zone = usage->zsu_zone;
	zsd_pset_t *pset = usage->zsu_pset;

	/* Nothing to do if already found */
	if (usage->zsu_found == B_TRUE)
		goto add_stats;

	/* First process seen for this (zone, pset) pair this interval */
	usage->zsu_found = B_TRUE;
	usage->zsu_empty = B_FALSE;

	usage->zsu_deleted = B_FALSE;
	/* update usage flags */
	if (usage->zsu_active == B_FALSE)
		/* binding not seen previous interval: it is new */
		usage->zsu_new = B_TRUE;
	else
		usage->zsu_new = B_FALSE;

	/* Reset per-interval accumulators */
	usage->zsu_scheds = 0;
	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
	usage->zsu_active = B_TRUE;
	pset->zsp_empty = B_FALSE;
	zone->zsz_empty = B_FALSE;

add_stats:
	/* Detect zone's pset id, and if it is bound to multiple psets */
	if (zone->zsz_psetid == ZS_PSET_ERROR)
		zone->zsz_psetid = pset->zsp_id;
	else if (zone->zsz_psetid != pset->zsp_id)
		zone->zsz_psetid = ZS_PSET_MULTI;

	/* Accumulate scheduler classes seen at usage, pset and zone level */
	usage->zsu_scheds |= sched;
	pset->zsp_scheds |= sched;
	zone->zsz_scheds |= sched;

	/* Record if FSS is co-habitating with conflicting scheduler */
	if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
	    usage->zsu_scheds & (
	    ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
		usage->zsu_scheds |= ZS_SCHED_CONFLICT;

		pset->zsp_scheds |= ZS_SCHED_CONFLICT;
	}

}
1032
1033 /* Add cpu time for a process to a pset, zone, and system totals */
1034 static void
zsd_add_usage(zsd_ctl_t * ctl,zsd_pset_usage_t * usage,timestruc_t * delta)1035 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1036 {
1037 zsd_system_t *system = ctl->zsctl_system;
1038 zsd_zone_t *zone = usage->zsu_zone;
1039 zsd_pset_t *pset = usage->zsu_pset;
1040
1041 TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1042 TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1043 TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1044 TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1045 }
1046
/* Determine which processor sets have been deleted */
static void
zsd_mark_psets_end(zsd_ctl_t *ctl)
{
	zsd_pset_t *pset, *tmp;

	/*
	 * Mark pset as not exists, and deleted if it existed
	 * previous interval.
	 */
	pset = list_head(&ctl->zsctl_psets);
	while (pset != NULL) {
		if (pset->zsp_found == B_FALSE) {
			pset->zsp_empty = B_TRUE;
			if (pset->zsp_deleted == B_TRUE) {
				/*
				 * Already flagged deleted last interval and
				 * still absent: remove it for good.  Advance
				 * the cursor before freeing the node.
				 */
				tmp = pset;
				pset = list_next(&ctl->zsctl_psets, pset);
				list_remove(&ctl->zsctl_psets, tmp);
				free(tmp);
				ctl->zsctl_npsets--;
				continue;
			} else {
				/* Pset vanished during this interval */
				pset->zsp_new = B_FALSE;
				pset->zsp_deleted = B_TRUE;
				pset->zsp_active = B_TRUE;
			}
		}
		pset = list_next(&ctl->zsctl_psets, pset);
	}
}
1078
1079 /* Determine which zones are no longer bound to processor sets */
1080 static void
zsd_mark_pset_usages_end(zsd_ctl_t * ctl)1081 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1082 {
1083 zsd_pset_t *pset;
1084 zsd_zone_t *zone;
1085 zsd_pset_usage_t *usage, *tmp;
1086
1087 /*
1088 * Mark pset as not exists, and deleted if it existed previous
1089 * interval.
1090 */
1091 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1092 pset = list_next(&ctl->zsctl_psets, pset)) {
1093 usage = list_head(&pset->zsp_usage_list);
1094 while (usage != NULL) {
1095 /*
1096 * Mark pset as not exists, and deleted if it existed
1097 * previous interval.
1098 */
1099 if (usage->zsu_found == B_FALSE ||
1100 usage->zsu_zone->zsz_deleted == B_TRUE ||
1101 usage->zsu_pset->zsp_deleted == B_TRUE) {
1102 tmp = usage;
1103 usage = list_next(&pset->zsp_usage_list,
1104 usage);
1105 list_remove(&pset->zsp_usage_list, tmp);
1106 free(tmp);
1107 pset->zsp_nusage--;
1108 ctl->zsctl_npset_usages--;
1109 continue;
1110 } else {
1111 usage->zsu_new = B_FALSE;
1112 usage->zsu_deleted = B_TRUE;
1113 usage->zsu_active = B_TRUE;
1114 }
1115 /* Add cpu shares for usages that are in FSS */
1116 zone = usage->zsu_zone;
1117 if (usage->zsu_scheds & ZS_SCHED_FSS &&
1118 zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1119 zone->zsz_cpu_shares != 0) {
1120 zone = usage->zsu_zone;
1121 usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1122 pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1123 }
1124 usage = list_next(&pset->zsp_usage_list,
1125 usage);
1126 }
1127 }
1128 }
1129
/*
 * A zone has been found.  Record its current configuration (pool, pset,
 * scheduler, ip-type), its resource-control caps, and its current object
 * counts, and add the counts to the system-wide totals.  Called once per
 * zone per interval; the assert enforces that.
 */
static void
zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
    uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
    uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
    uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
    uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
    uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
    uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
    uint_t iptype)
{
	zsd_system_t *sys = ctl->zsctl_system;

	/* Each zone may be marked found at most once per interval */
	assert(zone->zsz_found == B_FALSE);

	/*
	 * Mark zone as exists, and new if it did not exist in previous
	 * interval.
	 */
	zone->zsz_found = B_TRUE;
	zone->zsz_empty = B_TRUE;
	zone->zsz_deleted = B_FALSE;

	/*
	 * Zone is new.  Assume zone's properties are the same over entire
	 * interval.
	 */
	if (zone->zsz_active == B_FALSE)
		zone->zsz_new = B_TRUE;
	else
		zone->zsz_new = B_FALSE;

	(void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
	(void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
	zone->zsz_default_sched = sched;

	/* Schedulers updated later as processes are found */
	zone->zsz_scheds = 0;

	/* Cpus updated later as psets bound are identified */
	zone->zsz_cpus_online = 0;

	zone->zsz_cputype = cputype;
	zone->zsz_iptype = iptype;
	/* Pset binding resolved later; start from the error sentinel */
	zone->zsz_psetid = ZS_PSET_ERROR;

	/* Snapshot the caps and current usage counts for this interval */
	zone->zsz_cpu_cap = cpu_cap;
	zone->zsz_cpu_shares = cpu_shares;
	zone->zsz_ram_cap = ram_cap;
	zone->zsz_locked_cap = locked_cap;
	zone->zsz_vm_cap = vm_cap;
	zone->zsz_processes_cap = processes_cap;
	zone->zsz_processes = processes;
	zone->zsz_lwps_cap = lwps_cap;
	zone->zsz_lwps = lwps;
	zone->zsz_shm_cap = shm_cap;
	zone->zsz_shm = shm;
	zone->zsz_shmids_cap = shmids_cap;
	zone->zsz_shmids = shmids;
	zone->zsz_semids_cap = semids_cap;
	zone->zsz_semids = semids;
	zone->zsz_msgids_cap = msgids_cap;
	zone->zsz_msgids = msgids;
	zone->zsz_lofi_cap = lofi_cap;
	zone->zsz_lofi = lofi;

	/* Roll this zone's counts into the system-wide totals */
	sys->zss_processes += processes;
	sys->zss_lwps += lwps;
	sys->zss_shm += shm;
	sys->zss_shmids += shmids;
	sys->zss_semids += semids;
	sys->zss_msgids += msgids;
	sys->zss_lofi += lofi;
	zone->zsz_active = B_TRUE;
}
1204
1205
1206 /* Determine which zones have halted */
1207 static void
zsd_mark_zones_end(zsd_ctl_t * ctl)1208 zsd_mark_zones_end(zsd_ctl_t *ctl)
1209 {
1210 zsd_zone_t *zone, *tmp;
1211
1212 /*
1213 * Mark zone as not existing, or delete if it did not exist in
1214 * previous interval.
1215 */
1216 zone = list_head(&ctl->zsctl_zones);
1217 while (zone != NULL) {
1218 if (zone->zsz_found == B_FALSE) {
1219 zone->zsz_empty = B_TRUE;
1220 if (zone->zsz_deleted == B_TRUE) {
1221 /*
1222 * Zone deleted in prior interval,
1223 * so it no longer exists.
1224 */
1225 tmp = zone;
1226 zone = list_next(&ctl->zsctl_zones, zone);
1227 list_remove(&ctl->zsctl_zones, tmp);
1228 free(tmp);
1229 ctl->zsctl_nzones--;
1230 continue;
1231 } else {
1232 zone->zsz_new = B_FALSE;
1233 zone->zsz_deleted = B_TRUE;
1234 zone->zsz_active = B_TRUE;
1235 }
1236 }
1237 zone = list_next(&ctl->zsctl_zones, zone);
1238 }
1239 }
1240
1241 /*
1242 * Mark cpus as not existing. If a cpu is found, it will be updated. If
1243 * a cpu is not found, then it must have gone offline, so it will be
1244 * deleted.
1245 *
1246 * The kstat tracking data is rolled so that the usage since the previous
1247 * interval can be determined.
1248 */
1249 static void
zsd_mark_cpus_start(zsd_ctl_t * ctl,boolean_t roll)1250 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1251 {
1252 zsd_cpu_t *cpu;
1253
1254 /*
1255 * Mark all cpus as not existing. As cpus are found, they will
1256 * be marked as existing.
1257 */
1258 for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1259 cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1260 cpu->zsc_found = B_FALSE;
1261 if (cpu->zsc_active == B_TRUE && roll) {
1262 cpu->zsc_psetid_prev = cpu->zsc_psetid;
1263 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1264 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1265 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1266 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1267 }
1268 }
1269 }
1270
1271 /*
1272 * An array the size of the maximum number of cpus is kept. Within this array
1273 * a list of the online cpus is maintained.
1274 */
1275 zsd_cpu_t *
zsd_lookup_insert_cpu(zsd_ctl_t * ctl,processorid_t cpuid)1276 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1277 {
1278 zsd_cpu_t *cpu;
1279
1280 assert(cpuid < ctl->zsctl_maxcpuid);
1281 cpu = &(ctl->zsctl_cpu_array[cpuid]);
1282 assert(cpuid == cpu->zsc_id);
1283
1284 if (cpu->zsc_allocated == B_FALSE) {
1285 cpu->zsc_allocated = B_TRUE;
1286 list_insert_tail(&ctl->zsctl_cpus, cpu);
1287 }
1288 return (cpu);
1289 }
1290
1291 /* A cpu has been found. Update its information */
1292 static void
zsd_mark_cpu_found(zsd_cpu_t * cpu,zsd_pset_t * pset,psetid_t psetid)1293 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1294 {
1295 /*
1296 * legacy processor sets, the cpu may move while zonestatd is
1297 * inspecting, causing it to be found twice. In this case, just
1298 * leave cpu in the first processor set in which it was found.
1299 */
1300 if (cpu->zsc_found == B_TRUE)
1301 return;
1302
1303 /* Mark cpu as online */
1304 cpu->zsc_found = B_TRUE;
1305 cpu->zsc_offlined = B_FALSE;
1306 cpu->zsc_pset = pset;
1307 /*
1308 * cpu is newly online.
1309 */
1310 if (cpu->zsc_active == B_FALSE) {
1311 /*
1312 * Cpu is newly online.
1313 */
1314 cpu->zsc_onlined = B_TRUE;
1315 cpu->zsc_psetid = psetid;
1316 cpu->zsc_psetid_prev = psetid;
1317 } else {
1318 /*
1319 * cpu online during previous interval. Save properties at
1320 * start of interval
1321 */
1322 cpu->zsc_onlined = B_FALSE;
1323 cpu->zsc_psetid = psetid;
1324
1325 }
1326 cpu->zsc_active = B_TRUE;
1327 }
1328
1329 /* Remove all offlined cpus from the list of tracked cpus */
1330 static void
zsd_mark_cpus_end(zsd_ctl_t * ctl)1331 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1332 {
1333 zsd_cpu_t *cpu, *tmp;
1334 int id;
1335
1336 /* Mark cpu as online or offline */
1337 cpu = list_head(&ctl->zsctl_cpus);
1338 while (cpu != NULL) {
1339 if (cpu->zsc_found == B_FALSE) {
1340 if (cpu->zsc_offlined == B_TRUE) {
1341 /*
1342 * cpu offlined in prior interval. It is gone.
1343 */
1344 tmp = cpu;
1345 cpu = list_next(&ctl->zsctl_cpus, cpu);
1346 list_remove(&ctl->zsctl_cpus, tmp);
1347 /* Clear structure for future use */
1348 id = tmp->zsc_id;
1349 bzero(tmp, sizeof (zsd_cpu_t));
1350 tmp->zsc_id = id;
1351 tmp->zsc_allocated = B_FALSE;
1352 tmp->zsc_psetid = ZS_PSET_ERROR;
1353 tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1354
1355 } else {
1356 /*
1357 * cpu online at start of interval. Treat
1358 * as still online, since it was online for
1359 * some portion of the interval.
1360 */
1361 cpu->zsc_offlined = B_TRUE;
1362 cpu->zsc_onlined = B_FALSE;
1363 cpu->zsc_active = B_TRUE;
1364 cpu->zsc_psetid = cpu->zsc_psetid_prev;
1365 cpu->zsc_pset = NULL;
1366 }
1367 }
1368 cpu = list_next(&ctl->zsctl_cpus, cpu);
1369 }
1370 }
1371
1372 /* Some utility functions for managing the list of processor sets */
1373 static zsd_pset_t *
zsd_lookup_pset_byid(zsd_ctl_t * ctl,psetid_t psetid)1374 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1375 {
1376 zsd_pset_t *pset;
1377
1378 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1379 pset = list_next(&ctl->zsctl_psets, pset)) {
1380 if (pset->zsp_id == psetid)
1381 return (pset);
1382 }
1383 return (NULL);
1384 }
1385
1386 static zsd_pset_t *
zsd_lookup_pset(zsd_ctl_t * ctl,char * psetname,psetid_t psetid)1387 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1388 {
1389 zsd_pset_t *pset;
1390
1391 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1392 pset = list_next(&ctl->zsctl_psets, pset)) {
1393 if (strcmp(pset->zsp_name, psetname) == 0) {
1394 if (psetid != -1)
1395 pset->zsp_id = psetid;
1396 return (pset);
1397 }
1398 }
1399 return (NULL);
1400 }
1401
1402 static zsd_pset_t *
zsd_allocate_pset(zsd_ctl_t * ctl,char * psetname,psetid_t psetid)1403 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1404 {
1405 zsd_pset_t *pset;
1406
1407 if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1408 return (NULL);
1409
1410 (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1411 pset->zsp_id = psetid;
1412 pset->zsp_found = B_FALSE;
1413 /*
1414 * Allocate as deleted so if not found in first pass, pset is deleted
1415 * from list. This can happen if pset is returned by pset_list, but
1416 * is destroyed before first attempt to fetch pset details.
1417 */
1418 list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1419 offsetof(zsd_pset_usage_t, zsu_next));
1420
1421 pset->zsp_hrstart = g_hrnow;
1422 pset->zsp_deleted = B_TRUE;
1423 pset->zsp_empty = B_TRUE;
1424 ctl->zsctl_npsets++;
1425
1426 return (pset);
1427 }
1428
1429 static zsd_pset_t *
zsd_lookup_insert_pset(zsd_ctl_t * ctl,char * psetname,psetid_t psetid)1430 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1431 {
1432 zsd_pset_t *pset, *tmp;
1433
1434 if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1435 return (pset);
1436
1437 if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1438 return (NULL);
1439
1440 /* Insert sorted by psetname */
1441 tmp = list_head(&ctl->zsctl_psets);
1442 while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1443 tmp = list_next(&ctl->zsctl_psets, tmp);
1444
1445 list_insert_before(&ctl->zsctl_psets, tmp, pset);
1446 return (pset);
1447 }
1448
1449 /* Some utility functions for managing the list of zones using each pset */
1450 static zsd_pset_usage_t *
zsd_lookup_usage(zsd_pset_t * pset,zsd_zone_t * zone)1451 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1452 {
1453 zsd_pset_usage_t *usage;
1454
1455 for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1456 usage = list_next(&pset->zsp_usage_list, usage))
1457 if (usage->zsu_zone == zone)
1458 return (usage);
1459
1460 return (NULL);
1461 }
1462
1463 static zsd_pset_usage_t *
zsd_allocate_pset_usage(zsd_ctl_t * ctl,zsd_pset_t * pset,zsd_zone_t * zone)1464 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1465 {
1466 zsd_pset_usage_t *usage;
1467
1468 if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1469 == NULL)
1470 return (NULL);
1471
1472 list_link_init(&usage->zsu_next);
1473 usage->zsu_zone = zone;
1474 usage->zsu_zoneid = zone->zsz_id;
1475 usage->zsu_pset = pset;
1476 usage->zsu_found = B_FALSE;
1477 usage->zsu_active = B_FALSE;
1478 usage->zsu_new = B_FALSE;
1479 /*
1480 * Allocate as not deleted. If a process is found in a pset for
1481 * a zone, the usage will not be deleted until at least the next
1482 * interval.
1483 */
1484 usage->zsu_start = g_now;
1485 usage->zsu_hrstart = g_hrnow;
1486 usage->zsu_deleted = B_FALSE;
1487 usage->zsu_empty = B_TRUE;
1488 usage->zsu_scheds = 0;
1489 usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1490
1491 ctl->zsctl_npset_usages++;
1492 pset->zsp_nusage++;
1493
1494 return (usage);
1495 }
1496
1497 static zsd_pset_usage_t *
zsd_lookup_insert_usage(zsd_ctl_t * ctl,zsd_pset_t * pset,zsd_zone_t * zone)1498 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1499 {
1500 zsd_pset_usage_t *usage, *tmp;
1501
1502 if ((usage = zsd_lookup_usage(pset, zone))
1503 != NULL)
1504 return (usage);
1505
1506 if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1507 return (NULL);
1508
1509 tmp = list_head(&pset->zsp_usage_list);
1510 while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1511 > 0)
1512 tmp = list_next(&pset->zsp_usage_list, tmp);
1513
1514 list_insert_before(&pset->zsp_usage_list, tmp, usage);
1515 return (usage);
1516 }
1517
1518 static void
zsd_refresh_system(zsd_ctl_t * ctl)1519 zsd_refresh_system(zsd_ctl_t *ctl)
1520 {
1521 zsd_system_t *system = ctl->zsctl_system;
1522
1523 /* Re-count these values each interval */
1524 system->zss_processes = 0;
1525 system->zss_lwps = 0;
1526 system->zss_shm = 0;
1527 system->zss_shmids = 0;
1528 system->zss_semids = 0;
1529 system->zss_msgids = 0;
1530 system->zss_lofi = 0;
1531 }
1532
1533
/*
 * Reads each cpu's kstats, and adds the usage to the cpu's pset.
 *
 * Computes the idle/intr/kern/user nanoseconds accumulated since the
 * previous read (the *_prev values rolled in zsd_mark_cpus_start), and
 * adds the deltas to the cpu, its pset(s), and the system totals.  If a
 * kstat lookup/read fails the cpu is silently skipped for this interval
 * (presumably it went offline mid-scan — the next pass will reconcile).
 */
static void
zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
{
	zsd_system_t *sys;
	processorid_t cpuid;
	zsd_pset_t *pset_prev;
	zsd_pset_t *pset;
	kstat_t *kstat;
	kstat_named_t *knp;
	kid_t kid;
	uint64_t idle, intr, kern, user;

	sys = ctl->zsctl_system;
	pset = cpu->zsc_pset;
	knp = NULL;
	kid = -1;
	cpuid = cpu->zsc_id;

	/* Get the cpu time totals for this cpu */
	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
	if (kstat == NULL)
		return;

	kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
	if (kid == -1)
		return;

	/* Each counter must exist and be a uint64, else skip this cpu */
	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	idle = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	kern = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	user = knp->value.ui64;

	/*
	 * Tracking intr time per cpu just exists for future enhancements.
	 * The value is presently always zero.
	 */
	intr = 0;
	cpu->zsc_nsec_idle = idle;
	cpu->zsc_nsec_intr = intr;
	cpu->zsc_nsec_kern = kern;
	cpu->zsc_nsec_user = user;

	if (cpu->zsc_onlined == B_TRUE) {
		/*
		 * cpu is newly online.  There is no reference value,
		 * so just record its current stats for comparison
		 * on next stat read.
		 */
		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
		return;
	}

	/*
	 * Calculate relative time since previous refresh.
	 * Paranoia.  Don't let time go backwards.
	 */
	idle = intr = kern = user = 0;
	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;

	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;

	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;

	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;

	/* Update totals for cpu usage */
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);

	/*
	 * Add cpu's stats to its pset if it is known to be in
	 * the pset since previous read.
	 */
	if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
	    cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
	    (pset_prev = zsd_lookup_pset_byid(ctl,
	    cpu->zsc_psetid_prev)) == NULL) {
		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
	} else {
		/*
		 * Last pset was different than current pset.
		 * Best guess is to split usage between the two.
		 * The current pset gets the odd remainder so no
		 * nanoseconds are dropped.
		 */
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);

		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
		    (idle / 2) + (idle % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
		    (intr / 2) + (intr % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
		    (kern / 2) + (kern % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
		    (user / 2) + (user % 2));
	}
	/* System totals always get the full delta */
	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
}
1662
1663 /* Determine the details of a processor set by pset_id */
1664 static int
zsd_get_pool_pset(zsd_ctl_t * ctl,psetid_t psetid,char * psetname,size_t namelen,uint_t * cputype,uint64_t * online,uint64_t * size,uint64_t * min,uint64_t * max,int64_t * importance)1665 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1666 size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1667 uint64_t *min, uint64_t *max, int64_t *importance)
1668 {
1669 uint_t old, num;
1670
1671 pool_conf_t *conf = ctl->zsctl_pool_conf;
1672 pool_value_t **vals = ctl->zsctl_pool_vals;
1673 pool_resource_t **res_list = NULL;
1674 pool_resource_t *pset;
1675 pool_component_t **cpus = NULL;
1676 processorid_t *cache;
1677 const char *string;
1678 uint64_t uint64;
1679 int64_t int64;
1680 int i, ret, type;
1681
1682 if (ctl->zsctl_pool_status == POOL_DISABLED) {
1683
1684 /*
1685 * Inspect legacy psets
1686 */
1687 for (;;) {
1688 old = num = ctl->zsctl_cpu_ncache;
1689 ret = pset_info(psetid, &type, &num,
1690 ctl->zsctl_cpu_cache);
1691 if (ret < 0) {
1692 /* pset is gone. Tell caller to retry */
1693 errno = EINTR;
1694 return (-1);
1695 }
1696 if (num <= old) {
1697 /* Success */
1698 break;
1699 }
1700 if ((cache = (processorid_t *)realloc(
1701 ctl->zsctl_cpu_cache, num *
1702 sizeof (processorid_t))) != NULL) {
1703 ctl->zsctl_cpu_ncache = num;
1704 ctl->zsctl_cpu_cache = cache;
1705 } else {
1706 /*
1707 * Could not allocate to get new cpu list.
1708 */
1709 zsd_warn(gettext(
1710 "Could not allocate for cpu list"));
1711 errno = ENOMEM;
1712 return (-1);
1713 }
1714 }
1715 /*
1716 * Old school pset. Just make min and max equal
1717 * to its size
1718 */
1719 if (psetid == ZS_PSET_DEFAULT) {
1720 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1721 (void) strlcpy(psetname, "pset_default", namelen);
1722 } else {
1723 *cputype = ZS_CPUTYPE_PSRSET_PSET;
1724 (void) snprintf(psetname, namelen,
1725 "SUNWlegacy_pset_%d", psetid);
1726 }
1727
1728 /*
1729 * Just treat legacy pset as a simple pool pset
1730 */
1731 *online = num;
1732 *size = num;
1733 *min = num;
1734 *max = num;
1735 *importance = 1;
1736
1737 return (0);
1738 }
1739
1740 /* Look up the pool pset using the pset id */
1741 res_list = NULL;
1742 pool_value_set_int64(vals[1], psetid);
1743 if (pool_value_set_name(vals[1], "pset.sys_id")
1744 != PO_SUCCESS)
1745 goto err;
1746
1747 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1748 goto err;
1749 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1750 goto err;
1751 if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1752 goto err;
1753 if (num != 1)
1754 goto err;
1755 pset = res_list[0];
1756 free(res_list);
1757 res_list = NULL;
1758 if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1759 "pset.name", vals[0]) != POC_STRING ||
1760 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1761 goto err;
1762
1763 (void) strlcpy(psetname, string, namelen);
1764 if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1765 *cputype = ZS_CPUTYPE_DEDICATED;
1766 else if (psetid == ZS_PSET_DEFAULT)
1767 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1768 else
1769 *cputype = ZS_CPUTYPE_POOL_PSET;
1770
1771 /* Get size, min, max, and importance */
1772 if (pool_get_property(conf, pool_resource_to_elem(conf,
1773 pset), "pset.size", vals[0]) == POC_UINT &&
1774 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1775 *size = uint64;
1776 else
1777 *size = 0;
1778
1779 /* Get size, min, max, and importance */
1780 if (pool_get_property(conf, pool_resource_to_elem(conf,
1781 pset), "pset.min", vals[0]) == POC_UINT &&
1782 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1783 *min = uint64;
1784 else
1785 *min = 0;
1786 if (*min >= ZSD_PSET_UNLIMITED)
1787 *min = ZS_LIMIT_NONE;
1788
1789 if (pool_get_property(conf, pool_resource_to_elem(conf,
1790 pset), "pset.max", vals[0]) == POC_UINT &&
1791 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1792 *max = uint64;
1793 else
1794 *max = ZS_LIMIT_NONE;
1795
1796 if (*max >= ZSD_PSET_UNLIMITED)
1797 *max = ZS_LIMIT_NONE;
1798
1799 if (pool_get_property(conf, pool_resource_to_elem(conf,
1800 pset), "pset.importance", vals[0]) == POC_INT &&
1801 pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1802 *importance = int64;
1803 else
1804 *importance = (uint64_t)1;
1805
1806 *online = 0;
1807 if (*size == 0)
1808 return (0);
1809
1810 /* get cpus */
1811 cpus = pool_query_resource_components(conf, pset, &num, NULL);
1812 if (cpus == NULL)
1813 goto err;
1814
1815 /* Make sure there is space for cpu id list */
1816 if (num > ctl->zsctl_cpu_ncache) {
1817 if ((cache = (processorid_t *)realloc(
1818 ctl->zsctl_cpu_cache, num *
1819 sizeof (processorid_t))) != NULL) {
1820 ctl->zsctl_cpu_ncache = num;
1821 ctl->zsctl_cpu_cache = cache;
1822 } else {
1823 /*
1824 * Could not allocate to get new cpu list.
1825 */
1826 zsd_warn(gettext(
1827 "Could not allocate for cpu list"));
1828 goto err;
1829 }
1830 }
1831
1832 /* count the online cpus */
1833 for (i = 0; i < num; i++) {
1834 if (pool_get_property(conf, pool_component_to_elem(
1835 conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1836 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1837 goto err;
1838
1839 if (strcmp(string, "on-line") != 0 &&
1840 strcmp(string, "no-intr") != 0)
1841 continue;
1842
1843 if (pool_get_property(conf, pool_component_to_elem(
1844 conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1845 pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1846 goto err;
1847
1848 (*online)++;
1849 ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1850 }
1851 free(cpus);
1852 return (0);
1853 err:
1854 if (res_list != NULL)
1855 free(res_list);
1856 if (cpus != NULL)
1857 free(cpus);
1858
1859 /*
1860 * The pools operations should succeed since the conf is a consistent
1861 * snapshot. Tell caller there is no need to retry.
1862 */
1863 errno = EINVAL;
1864 return (-1);
1865 }
1866
/*
 * Update the current list of processor sets.
 * This also updates the list of online cpus, and each cpu's pset membership.
 *
 * Two enumeration strategies: when pools are enabled, pset ids come from
 * libpool queries against the dynamic configuration; otherwise the legacy
 * pset_list(2) interface is used (with the default pset prepended).  Either
 * way, every pset found is refreshed via zsd_get_pool_pset() and its online
 * cpus are marked found.
 */
static void
zsd_refresh_psets(zsd_ctl_t *ctl)
{
	int i, j, ret, state;
	uint_t old, num;
	uint_t cputype;
	int64_t sys_id, importance;
	uint64_t online, size, min, max;
	zsd_system_t *system;
	zsd_pset_t *pset;
	zsd_cpu_t *cpu;
	psetid_t *cache;
	char psetname[ZS_PSETNAME_MAX];
	processorid_t cpuid;
	pool_value_t *pv_save = NULL;
	pool_resource_t **res_list = NULL;
	pool_resource_t *res;
	pool_value_t **vals;
	pool_conf_t *conf;
	/* Only roll cpu kstat baselines on the first marking pass below */
	boolean_t roll_cpus = B_TRUE;

	/* Zero cpu counters to recount them */
	system = ctl->zsctl_system;
	system->zss_ncpus = 0;
	system->zss_ncpus_online = 0;
retry:
	ret = pool_get_status(&state);
	if (ret == 0 && state == POOL_ENABLED) {

		conf = ctl->zsctl_pool_conf;
		vals = ctl->zsctl_pool_vals;
		/* vals[1] is stashed so queries below use a 1-entry list */
		pv_save = vals[1];
		vals[1] = NULL;

		if (ctl->zsctl_pool_status == POOL_DISABLED) {
			/* Pools just became enabled; open the dynamic conf */
			if (pool_conf_open(ctl->zsctl_pool_conf,
			    pool_dynamic_location(), PO_RDONLY) == 0) {
				ctl->zsctl_pool_status = POOL_ENABLED;
				ctl->zsctl_pool_changed = POU_PSET;
			}
		} else {
			ctl->zsctl_pool_changed = 0;
			ret = pool_conf_update(ctl->zsctl_pool_conf,
			    &(ctl->zsctl_pool_changed));
			if (ret < 0) {
				/* Pools must have become disabled */
				(void) pool_conf_close(ctl->zsctl_pool_conf);
				ctl->zsctl_pool_status = POOL_DISABLED;
				if (pool_error() == POE_SYSTEM && errno ==
				    ENOTACTIVE)
					goto retry;

				zsd_warn(gettext(
				    "Unable to update pool configuration"));
				/* Not able to get pool info.  Don't update. */
				goto err;
			}
		}
		/* Get the list of psets using libpool */
		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
			goto err;

		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
			goto err;
		if ((res_list = pool_query_resources(conf, &num, vals))
		    == NULL)
			goto err;

		/* Grow the pset id cache if this query returned more */
		if (num > ctl->zsctl_pset_ncache) {
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num) * sizeof (psetid_t))) == NULL) {
				goto err;
			}
			ctl->zsctl_pset_ncache = num;
			ctl->zsctl_pset_cache = cache;
		}
		/* Save the pset id of each pset */
		for (i = 0; i < num; i++) {
			res = res_list[i];
			if (pool_get_property(conf, pool_resource_to_elem(conf,
			    res), "pset.sys_id", vals[0]) != POC_INT ||
			    pool_value_get_int64(vals[0], &sys_id)
			    != PO_SUCCESS)
				goto err;
			ctl->zsctl_pset_cache[i] = (int)sys_id;
		}
		/* Restore the stashed second query value */
		vals[1] = pv_save;
		pv_save = NULL;
	} else {
		if (ctl->zsctl_pool_status == POOL_ENABLED) {
			/* Pools just became disabled; drop the conf */
			(void) pool_conf_close(ctl->zsctl_pool_conf);
			ctl->zsctl_pool_status = POOL_DISABLED;
		}
		/* Get the pset list using legacy psets */
		for (;;) {
			old = num = ctl->zsctl_pset_ncache;
			(void) pset_list(ctl->zsctl_pset_cache, &num);
			/* +1 reserves the slot for the default pset below */
			if ((num + 1) <= old) {
				break;
			}
			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
			    (num + 1) * sizeof (psetid_t))) != NULL) {
				ctl->zsctl_pset_ncache = num + 1;
				ctl->zsctl_pset_cache = cache;
			} else {
				/*
				 * Could not allocate to get new pset list.
				 * Give up
				 */
				return;
			}
		}
		/* Add the default pset to list */
		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
		num++;
	}
psets_changed:
	/*
	 * (Re)mark everything not-found before scanning.  kstat baselines
	 * are only rolled once even if we restart via psets_changed.
	 */
	zsd_mark_cpus_start(ctl, roll_cpus);
	zsd_mark_psets_start(ctl);
	roll_cpus = B_FALSE;

	/* Refresh cpu membership of all psets */
	for (i = 0; i < num; i++) {

		/* Get pool pset information */
		sys_id = ctl->zsctl_pset_cache[i];
		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
		    &cputype, &online, &size, &min, &max, &importance)
		    != 0) {
			/* EINTR means a pset vanished mid-scan; rescan all */
			if (errno == EINTR)
				goto psets_changed;
			zsd_warn(gettext("Failed to get info for pset %d"),
			    sys_id);
			continue;
		}

		system->zss_ncpus += size;
		system->zss_ncpus_online += online;

		pset = zsd_lookup_insert_pset(ctl, psetname,
		    ctl->zsctl_pset_cache[i]);

		/* update pset info */
		zsd_mark_pset_found(pset, cputype, online, size, min,
		    max, importance);

		/*
		 * Update each cpu in pset.  zsd_get_pool_pset left the
		 * online cpu ids in the front of zsctl_cpu_cache.
		 */
		for (j = 0; j < pset->zsp_online; j++) {
			cpuid = ctl->zsctl_cpu_cache[j];
			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
			zsd_mark_cpu_found(cpu, pset, sys_id);
		}
	}
err:
	/*
	 * Reached on both success and failure.  pv_save is non-NULL only
	 * when the pools-enabled path set it, so vals is valid whenever
	 * it is dereferenced here.
	 */
	if (res_list != NULL)
		free(res_list);
	if (pv_save != NULL)
		vals[1] = pv_save;
}
2031
2032
2033
2034 /*
2035 * Fetch the current pool and pset name for the given zone.
2036 */
2037 static void
zsd_get_zone_pool_pset(zsd_ctl_t * ctl,zsd_zone_t * zone,char * pool,int poollen,char * pset,int psetlen,uint_t * cputype)2038 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2039 char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2040 {
2041 poolid_t poolid;
2042 pool_t **pools = NULL;
2043 pool_resource_t **res_list = NULL;
2044 char poolname[ZS_POOLNAME_MAX];
2045 char psetname[ZS_PSETNAME_MAX];
2046 pool_conf_t *conf = ctl->zsctl_pool_conf;
2047 pool_value_t *pv_save = NULL;
2048 pool_value_t **vals = ctl->zsctl_pool_vals;
2049 const char *string;
2050 int ret;
2051 int64_t int64;
2052 uint_t num;
2053
2054 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2055 &poolid, sizeof (poolid));
2056 if (ret < 0)
2057 goto lookup_done;
2058
2059 pv_save = vals[1];
2060 vals[1] = NULL;
2061 pools = NULL;
2062 res_list = NULL;
2063
2064 /* Default values if lookup fails */
2065 (void) strlcpy(poolname, "pool_default", sizeof (poolname));
2066 (void) strlcpy(psetname, "pset_default", sizeof (poolname));
2067 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2068
2069 /* no dedicated cpu if pools are disabled */
2070 if (ctl->zsctl_pool_status == POOL_DISABLED)
2071 goto lookup_done;
2072
2073 /* Get the pool name using the id */
2074 pool_value_set_int64(vals[0], poolid);
2075 if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2076 goto lookup_done;
2077
2078 if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2079 goto lookup_done;
2080
2081 if (num != 1)
2082 goto lookup_done;
2083
2084 if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2085 "pool.name", vals[0]) != POC_STRING ||
2086 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2087 goto lookup_done;
2088 (void) strlcpy(poolname, (char *)string, sizeof (poolname));
2089
2090 /* Get the name of the pset for the pool */
2091 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2092 goto lookup_done;
2093
2094 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2095 goto lookup_done;
2096
2097 if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2098 == NULL)
2099 goto lookup_done;
2100
2101 if (num != 1)
2102 goto lookup_done;
2103
2104 if (pool_get_property(conf, pool_resource_to_elem(conf,
2105 res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2106 pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2107 goto lookup_done;
2108
2109 if (int64 == ZS_PSET_DEFAULT)
2110 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2111
2112 if (pool_get_property(conf, pool_resource_to_elem(conf,
2113 res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2114 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2115 goto lookup_done;
2116
2117 (void) strlcpy(psetname, (char *)string, sizeof (psetname));
2118
2119 if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2120 *cputype = ZS_CPUTYPE_DEDICATED;
2121 if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2122 *cputype = ZS_CPUTYPE_PSRSET_PSET;
2123 else
2124 *cputype = ZS_CPUTYPE_POOL_PSET;
2125
2126 lookup_done:
2127
2128 if (pv_save != NULL)
2129 vals[1] = pv_save;
2130
2131 if (res_list)
2132 free(res_list);
2133 if (pools)
2134 free(pools);
2135
2136 (void) strlcpy(pool, poolname, poollen);
2137 (void) strlcpy(pset, psetname, psetlen);
2138 }
2139
2140 /* Convert scheduler names to ZS_* scheduler flags */
2141 static uint_t
zsd_schedname2int(char * clname,int pri)2142 zsd_schedname2int(char *clname, int pri)
2143 {
2144 uint_t sched = 0;
2145
2146 if (strcmp(clname, "TS") == 0) {
2147 sched = ZS_SCHED_TS;
2148 } else if (strcmp(clname, "IA") == 0) {
2149 sched = ZS_SCHED_IA;
2150 } else if (strcmp(clname, "FX") == 0) {
2151 if (pri > 59) {
2152 sched = ZS_SCHED_FX_60;
2153 } else {
2154 sched = ZS_SCHED_FX;
2155 }
2156 } else if (strcmp(clname, "RT") == 0) {
2157 sched = ZS_SCHED_RT;
2158
2159 } else if (strcmp(clname, "FSS") == 0) {
2160 sched = ZS_SCHED_FSS;
2161 }
2162 return (sched);
2163 }
2164
2165 static uint64_t
zsd_get_zone_rctl_limit(char * name)2166 zsd_get_zone_rctl_limit(char *name)
2167 {
2168 rctlblk_t *rblk;
2169
2170 rblk = (rctlblk_t *)alloca(rctlblk_size());
2171 if (getrctl(name, NULL, rblk, RCTL_FIRST)
2172 != 0) {
2173 return (ZS_LIMIT_NONE);
2174 }
2175 return (rctlblk_get_value(rblk));
2176 }
2177
2178 static uint64_t
zsd_get_zone_rctl_usage(char * name)2179 zsd_get_zone_rctl_usage(char *name)
2180 {
2181 rctlblk_t *rblk;
2182
2183 rblk = (rctlblk_t *)alloca(rctlblk_size());
2184 if (getrctl(name, NULL, rblk, RCTL_USAGE)
2185 != 0) {
2186 return (0);
2187 }
2188 return (rctlblk_get_value(rblk));
2189 }
2190
2191 #define ZSD_NUM_RCTL_VALS 19
2192
/*
 * Fetch the limit information for a zone.  This uses zone_enter() because
 * the getrctl(2) system call only returns rctl information for the zone
 * of the caller.
 */
2198 static int
zsd_get_zone_caps(zsd_ctl_t * ctl,zsd_zone_t * zone,uint64_t * cpu_shares,uint64_t * cpu_cap,uint64_t * ram_cap,uint64_t * locked_cap,uint64_t * vm_cap,uint64_t * processes_cap,uint64_t * processes,uint64_t * lwps_cap,uint64_t * lwps,uint64_t * shm_cap,uint64_t * shm,uint64_t * shmids_cap,uint64_t * shmids,uint64_t * semids_cap,uint64_t * semids,uint64_t * msgids_cap,uint64_t * msgids,uint64_t * lofi_cap,uint64_t * lofi,uint_t * sched)2199 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2200 uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2201 uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2202 uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2203 uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2204 uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2205 uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2206 {
2207 int p[2], pid, tmpl_fd, ret;
2208 ctid_t ct;
2209 char class[PC_CLNMSZ];
2210 uint64_t vals[ZSD_NUM_RCTL_VALS];
2211 zsd_system_t *sys = ctl->zsctl_system;
2212 int i = 0;
2213 int res = 0;
2214
2215 /* Treat all caps as no cap on error */
2216 *cpu_shares = ZS_LIMIT_NONE;
2217 *cpu_cap = ZS_LIMIT_NONE;
2218 *ram_cap = ZS_LIMIT_NONE;
2219 *locked_cap = ZS_LIMIT_NONE;
2220 *vm_cap = ZS_LIMIT_NONE;
2221
2222 *processes_cap = ZS_LIMIT_NONE;
2223 *lwps_cap = ZS_LIMIT_NONE;
2224 *shm_cap = ZS_LIMIT_NONE;
2225 *shmids_cap = ZS_LIMIT_NONE;
2226 *semids_cap = ZS_LIMIT_NONE;
2227 *msgids_cap = ZS_LIMIT_NONE;
2228 *lofi_cap = ZS_LIMIT_NONE;
2229
2230 *processes = 0;
2231 *lwps = 0;
2232 *shm = 0;
2233 *shmids = 0;
2234 *semids = 0;
2235 *msgids = 0;
2236 *lofi = 0;
2237
2238 /* Get the ram cap first since it is a zone attr */
2239 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2240 ram_cap, sizeof (*ram_cap));
2241 if (ret < 0 || *ram_cap == 0)
2242 *ram_cap = ZS_LIMIT_NONE;
2243
2244 /* Get the zone's default scheduling class */
2245 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2246 class, sizeof (class));
2247 if (ret < 0)
2248 return (-1);
2249
2250 *sched = zsd_schedname2int(class, 0);
2251
2252 /* rctl caps must be fetched from within the zone */
2253 if (pipe(p) != 0)
2254 return (-1);
2255
2256 if ((tmpl_fd = init_template()) == -1) {
2257 (void) close(p[0]);
2258 (void) close(p[1]);
2259 return (-1);
2260 }
2261 pid = forkx(0);
2262 if (pid < 0) {
2263 (void) ct_tmpl_clear(tmpl_fd);
2264 (void) close(p[0]);
2265 (void) close(p[1]);
2266 return (-1);
2267 }
2268 if (pid == 0) {
2269
2270 (void) ct_tmpl_clear(tmpl_fd);
2271 (void) close(tmpl_fd);
2272 (void) close(p[0]);
2273 if (zone->zsz_id != getzoneid()) {
2274 if (zone_enter(zone->zsz_id) < 0) {
2275 (void) close(p[1]);
2276 _exit(0);
2277 }
2278 }
2279
2280 /* Get caps for zone, and write them to zonestatd parent. */
2281 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2282 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2283 vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2284 vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2285 vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2286 vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2287 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2288 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2289 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2290 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2291 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2292 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2293 vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2294 vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2295 vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2296 vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2297 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2298 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2299
2300 if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2301 ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2302 (void) close(p[1]);
2303 _exit(1);
2304 }
2305
2306 (void) close(p[1]);
2307 _exit(0);
2308 }
2309 if (contract_latest(&ct) == -1)
2310 ct = -1;
2311
2312 (void) ct_tmpl_clear(tmpl_fd);
2313 (void) close(tmpl_fd);
2314 (void) close(p[1]);
2315 while (waitpid(pid, NULL, 0) != pid)
2316 ;
2317
2318 /* Read cap from child in zone */
2319 if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2320 ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2321 res = -1;
2322 goto cleanup;
2323 }
2324 i = 0;
2325 *cpu_shares = vals[i++];
2326 *cpu_cap = vals[i++];
2327 *locked_cap = vals[i++];
2328 *vm_cap = vals[i++];
2329 *processes_cap = vals[i++];
2330 *processes = vals[i++];
2331 *lwps_cap = vals[i++];
2332 *lwps = vals[i++];
2333 *shm_cap = vals[i++];
2334 *shm = vals[i++];
2335 *shmids_cap = vals[i++];
2336 *shmids = vals[i++];
2337 *semids_cap = vals[i++];
2338 *semids = vals[i++];
2339 *msgids_cap = vals[i++];
2340 *msgids = vals[i++];
2341 *lofi_cap = vals[i++];
2342 *lofi = vals[i++];
2343
2344 /* Interpret maximum values as no cap */
2345 if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2346 *cpu_cap = ZS_LIMIT_NONE;
2347 if (*processes_cap == sys->zss_processes_max)
2348 *processes_cap = ZS_LIMIT_NONE;
2349 if (*lwps_cap == sys->zss_lwps_max)
2350 *lwps_cap = ZS_LIMIT_NONE;
2351 if (*shm_cap == sys->zss_shm_max)
2352 *shm_cap = ZS_LIMIT_NONE;
2353 if (*shmids_cap == sys->zss_shmids_max)
2354 *shmids_cap = ZS_LIMIT_NONE;
2355 if (*semids_cap == sys->zss_semids_max)
2356 *semids_cap = ZS_LIMIT_NONE;
2357 if (*msgids_cap == sys->zss_msgids_max)
2358 *msgids_cap = ZS_LIMIT_NONE;
2359 if (*lofi_cap == sys->zss_lofi_max)
2360 *lofi_cap = ZS_LIMIT_NONE;
2361
2362
2363 cleanup:
2364 (void) close(p[0]);
2365 (void) ct_tmpl_clear(tmpl_fd);
2366 (void) close(tmpl_fd);
2367 (void) contract_abandon_id(ct);
2368
2369 return (res);
2370 }
2371
2372 /* Update the current list of running zones */
2373 static void
zsd_refresh_zones(zsd_ctl_t * ctl)2374 zsd_refresh_zones(zsd_ctl_t *ctl)
2375 {
2376 zsd_zone_t *zone;
2377 uint_t old, num;
2378 ushort_t flags;
2379 int i, ret;
2380 zoneid_t *cache;
2381 uint64_t cpu_shares;
2382 uint64_t cpu_cap;
2383 uint64_t ram_cap;
2384 uint64_t locked_cap;
2385 uint64_t vm_cap;
2386 uint64_t processes_cap;
2387 uint64_t processes;
2388 uint64_t lwps_cap;
2389 uint64_t lwps;
2390 uint64_t shm_cap;
2391 uint64_t shm;
2392 uint64_t shmids_cap;
2393 uint64_t shmids;
2394 uint64_t semids_cap;
2395 uint64_t semids;
2396 uint64_t msgids_cap;
2397 uint64_t msgids;
2398 uint64_t lofi_cap;
2399 uint64_t lofi;
2400
2401 char zonename[ZS_ZONENAME_MAX];
2402 char poolname[ZS_POOLNAME_MAX];
2403 char psetname[ZS_PSETNAME_MAX];
2404 uint_t sched;
2405 uint_t cputype;
2406 uint_t iptype;
2407
2408 /* Get the current list of running zones */
2409 for (;;) {
2410 old = num = ctl->zsctl_zone_ncache;
2411 (void) zone_list(ctl->zsctl_zone_cache, &num);
2412 if (num <= old)
2413 break;
2414 if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2415 (num) * sizeof (zoneid_t))) != NULL) {
2416 ctl->zsctl_zone_ncache = num;
2417 ctl->zsctl_zone_cache = cache;
2418 } else {
2419 /* Could not allocate to get new zone list. Give up */
2420 return;
2421 }
2422 }
2423
2424 zsd_mark_zones_start(ctl);
2425
2426 for (i = 0; i < num; i++) {
2427
2428 ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2429 zonename, sizeof (zonename));
2430 if (ret < 0)
2431 continue;
2432
2433 zone = zsd_lookup_insert_zone(ctl, zonename,
2434 ctl->zsctl_zone_cache[i]);
2435
2436 ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2437 &flags, sizeof (flags));
2438 if (ret < 0)
2439 continue;
2440
2441 if (flags & ZF_NET_EXCL)
2442 iptype = ZS_IPTYPE_EXCLUSIVE;
2443 else
2444 iptype = ZS_IPTYPE_SHARED;
2445
2446 zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2447 psetname, sizeof (psetname), &cputype);
2448
2449 if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2450 &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2451 &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2452 &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2453 &lofi, &sched) != 0)
2454 continue;
2455
2456 zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2457 locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2458 lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2459 semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2460 psetname, sched, cputype, iptype);
2461 }
2462 }
2463
2464 /* Fetch the details of a process from its psinfo_t */
2465 static void
zsd_get_proc_info(zsd_ctl_t * ctl,psinfo_t * psinfo,psetid_t * psetid,psetid_t * prev_psetid,zoneid_t * zoneid,zoneid_t * prev_zoneid,timestruc_t * delta,uint_t * sched)2466 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2467 psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2468 timestruc_t *delta, uint_t *sched)
2469 {
2470 timestruc_t d;
2471 zsd_proc_t *proc;
2472
2473 /* Get cached data for proc */
2474 proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2475 *psetid = psinfo->pr_lwp.pr_bindpset;
2476
2477 if (proc->zspr_psetid == ZS_PSET_ERROR)
2478 *prev_psetid = *psetid;
2479 else
2480 *prev_psetid = proc->zspr_psetid;
2481
2482 *zoneid = psinfo->pr_zoneid;
2483 if (proc->zspr_zoneid == -1)
2484 *prev_zoneid = *zoneid;
2485 else
2486 *prev_zoneid = proc->zspr_zoneid;
2487
2488 TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2489 *delta = d;
2490
2491 *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2492 psinfo->pr_lwp.pr_pri);
2493
2494 /* Update cached data for proc */
2495 proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2496 proc->zspr_zoneid = psinfo->pr_zoneid;
2497 proc->zspr_sched = *sched;
2498 proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2499 proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2500 proc->zspr_ppid = psinfo->pr_ppid;
2501 }
2502
2503 /*
2504 * Reset the known cpu usage of a process. This is done after a process
2505 * exits so that if the pid is recycled, data from its previous life is
2506 * not reused
2507 */
2508 static void
zsd_flush_proc_info(zsd_proc_t * proc)2509 zsd_flush_proc_info(zsd_proc_t *proc)
2510 {
2511 proc->zspr_usage.tv_sec = 0;
2512 proc->zspr_usage.tv_nsec = 0;
2513 }
2514
/*
 * Open the current extended accounting file.  On initialization, open the
 * file as the current file to be used.  Otherwise, open the file as the
 * next file to use if the current file reaches EOF.
 */
2520 static int
zsd_open_exacct(zsd_ctl_t * ctl,boolean_t init)2521 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2522 {
2523 int ret, oret, state, trys = 0, flags;
2524 int *fd, *open;
2525 ea_file_t *eaf;
2526 struct stat64 *stat;
2527 char path[MAXPATHLEN];
2528
2529 /*
2530 * The accounting file is first opened at the tail. Following
2531 * opens to new accounting files are opened at the head.
2532 */
2533 if (init == B_TRUE) {
2534 flags = EO_NO_VALID_HDR | EO_TAIL;
2535 fd = &ctl->zsctl_proc_fd;
2536 eaf = &ctl->zsctl_proc_eaf;
2537 stat = &ctl->zsctl_proc_stat;
2538 open = &ctl->zsctl_proc_open;
2539 } else {
2540 flags = EO_NO_VALID_HDR | EO_HEAD;
2541 fd = &ctl->zsctl_proc_fd_next;
2542 eaf = &ctl->zsctl_proc_eaf_next;
2543 stat = &ctl->zsctl_proc_stat_next;
2544 open = &ctl->zsctl_proc_open_next;
2545 }
2546
2547 *fd = -1;
2548 *open = 0;
2549 retry:
2550 /* open accounting files for cpu consumption */
2551 ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2552 if (ret != 0) {
2553 zsd_warn(gettext("Unable to get process accounting state"));
2554 goto err;
2555 }
2556 if (state != AC_ON) {
2557 if (trys > 0) {
2558 zsd_warn(gettext(
2559 "Unable to enable process accounting"));
2560 goto err;
2561 }
2562 (void) zsd_enable_cpu_stats();
2563 trys++;
2564 goto retry;
2565 }
2566
2567 ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2568 if (ret != 0) {
2569 zsd_warn(gettext("Unable to get process accounting file"));
2570 goto err;
2571 }
2572
2573 if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2574 (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2575 ret = fstat64(*fd, stat);
2576
2577 if (*fd < 0 || oret < 0 || ret < 0) {
2578 struct timespec ts;
2579
2580 /*
2581 * It is possible the accounting file is momentarily unavailable
2582 * because it is being rolled. Try for up to half a second.
2583 *
2584 * If failure to open accounting file persists, give up.
2585 */
2586 if (oret == 0)
2587 (void) ea_close(eaf);
2588 else if (*fd >= 0)
2589 (void) close(*fd);
2590 if (trys > 500) {
2591 zsd_warn(gettext(
2592 "Unable to open process accounting file"));
2593 goto err;
2594 }
2595 /* wait one millisecond */
2596 ts.tv_sec = 0;
2597 ts.tv_nsec = NANOSEC / 1000;
2598 (void) nanosleep(&ts, NULL);
2599 goto retry;
2600 }
2601 *open = 1;
2602 return (0);
2603 err:
2604 if (*fd >= 0)
2605 (void) close(*fd);
2606 *open = 0;
2607 *fd = -1;
2608 return (-1);
2609 }
2610
2611 /*
2612 * Walk /proc and charge each process to its zone and processor set.
2613 * Then read exacct data for exited processes, and charge them as well.
2614 */
2615 static void
zsd_refresh_procs(zsd_ctl_t * ctl,boolean_t init)2616 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2617 {
2618 DIR *dir;
2619 struct dirent *dent;
2620 psinfo_t psinfo;
2621 int fd, ret;
2622 zsd_proc_t *proc, *pproc, *tmp, *next;
2623 list_t pplist, plist;
2624 zsd_zone_t *zone, *prev_zone;
2625 zsd_pset_t *pset, *prev_pset;
2626 psetid_t psetid, prev_psetid;
2627 zoneid_t zoneid, prev_zoneid;
2628 zsd_pset_usage_t *usage, *prev_usage;
2629 char path[MAXPATHLEN];
2630
2631 ea_object_t object;
2632 ea_object_t pobject;
2633 boolean_t hrtime_expired = B_FALSE;
2634 struct timeval interval_end;
2635
2636 timestruc_t delta, d1, d2;
2637 uint_t sched = 0;
2638
2639 /*
2640 * Get the current accounting file. The current accounting file
2641 * may be different than the file in use, as the accounting file
2642 * may have been rolled, or manually changed by an admin.
2643 */
2644 ret = zsd_open_exacct(ctl, init);
2645 if (ret != 0) {
2646 zsd_warn(gettext("Unable to track process accounting"));
2647 return;
2648 }
2649
2650 /*
2651 * Mark the current time as the interval end time. Don't track
2652 * processes that exit after this time.
2653 */
2654 (void) gettimeofday(&interval_end, NULL);
2655
2656 dir = opendir("/proc");
2657 if (dir == NULL) {
2658 zsd_warn(gettext("Unable to open /proc"));
2659 return;
2660 }
2661
2662 /* Walk all processes and compute each zone's usage on each pset. */
2663 while ((dent = readdir(dir)) != NULL) {
2664
2665 if (strcmp(dent->d_name, ".") == 0 ||
2666 strcmp(dent->d_name, "..") == 0)
2667 continue;
2668
2669 (void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2670 dent->d_name);
2671
2672 fd = open(path, O_RDONLY);
2673 if (fd < 0)
2674 continue;
2675
2676 if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2677 (void) close(fd);
2678 continue;
2679 }
2680 (void) close(fd);
2681
2682 zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2683 &zoneid, &prev_zoneid, &delta, &sched);
2684
2685 d1.tv_sec = delta.tv_sec / 2;
2686 d1.tv_nsec = delta.tv_nsec / 2;
2687 d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2688 d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2689
2690 /* Get the zone and pset this process is running in */
2691 zone = zsd_lookup_zone_byid(ctl, zoneid);
2692 if (zone == NULL)
2693 continue;
2694 pset = zsd_lookup_pset_byid(ctl, psetid);
2695 if (pset == NULL)
2696 continue;
2697 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2698 if (usage == NULL)
2699 continue;
2700
2701 /*
2702 * Get the usage of the previous zone and pset if they were
2703 * different.
2704 */
2705 if (zoneid != prev_zoneid)
2706 prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2707 else
2708 prev_zone = NULL;
2709
2710 if (psetid != prev_psetid)
2711 prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2712 else
2713 prev_pset = NULL;
2714
2715 prev_usage = NULL;
2716 if (prev_zone != NULL || prev_pset != NULL) {
2717 if (prev_zone == NULL)
2718 prev_zone = zone;
2719 if (prev_pset == NULL)
2720 prev_pset = pset;
2721
2722 prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2723 prev_zone);
2724 }
2725
2726 /* Update the usage with the processes info */
2727 if (prev_usage == NULL) {
2728 zsd_mark_pset_usage_found(usage, sched);
2729 } else {
2730 zsd_mark_pset_usage_found(usage, sched);
2731 zsd_mark_pset_usage_found(prev_usage, sched);
2732 }
2733
2734 /*
2735 * First time around is just to get a starting point. All
2736 * usages will be zero.
2737 */
2738 if (init == B_TRUE)
2739 continue;
2740
2741 if (prev_usage == NULL) {
2742 zsd_add_usage(ctl, usage, &delta);
2743 } else {
2744 zsd_add_usage(ctl, usage, &d1);
2745 zsd_add_usage(ctl, prev_usage, &d2);
2746 }
2747 }
2748 (void) closedir(dir);
2749
2750 /*
2751 * No need to collect exited proc data on initialization. Just
2752 * caching the usage of the known processes to get a zero starting
2753 * point.
2754 */
2755 if (init == B_TRUE)
2756 return;
2757
2758 /*
2759 * Add accounting records to account for processes which have
2760 * exited.
2761 */
2762 list_create(&plist, sizeof (zsd_proc_t),
2763 offsetof(zsd_proc_t, zspr_next));
2764 list_create(&pplist, sizeof (zsd_proc_t),
2765 offsetof(zsd_proc_t, zspr_next));
2766
2767 for (;;) {
2768 pid_t pid;
2769 pid_t ppid;
2770 timestruc_t user, sys, proc_usage;
2771 timestruc_t finish;
2772 int numfound = 0;
2773
2774 bzero(&object, sizeof (object));
2775 proc = NULL;
2776 zone = NULL;
2777 pset = NULL;
2778 usage = NULL;
2779 ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2780 if (ret == EO_ERROR) {
2781 if (ea_error() == EXR_EOF) {
2782
2783 struct stat64 *stat;
2784 struct stat64 *stat_next;
2785
2786 /*
2787 * See if the next accounting file is the
2788 * same as the current accounting file.
2789 */
2790 stat = &(ctl->zsctl_proc_stat);
2791 stat_next = &(ctl->zsctl_proc_stat_next);
2792 if (stat->st_ino == stat_next->st_ino &&
2793 stat->st_dev == stat_next->st_dev) {
2794 /*
2795 * End of current accounting file is
2796 * reached, so finished. Clear EOF
2797 * bit for next time around.
2798 */
2799 ea_clear(&ctl->zsctl_proc_eaf);
2800 break;
2801 } else {
2802 /*
2803 * Accounting file has changed. Move
2804 * to current accounting file.
2805 */
2806 (void) ea_close(&ctl->zsctl_proc_eaf);
2807
2808 ctl->zsctl_proc_fd =
2809 ctl->zsctl_proc_fd_next;
2810 ctl->zsctl_proc_eaf =
2811 ctl->zsctl_proc_eaf_next;
2812 ctl->zsctl_proc_stat =
2813 ctl->zsctl_proc_stat_next;
2814
2815 ctl->zsctl_proc_fd_next = -1;
2816 ctl->zsctl_proc_open_next = 0;
2817 continue;
2818 }
2819 } else {
2820 /*
2821 * Other accounting error. Give up on
2822 * accounting.
2823 */
2824 goto ea_err;
2825 }
2826 }
2827 /* Skip if not a process group */
2828 if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2829 (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2830 (void) ea_free_item(&object, EUP_ALLOC);
2831 continue;
2832 }
2833
2834 /* The process group entry should be complete */
2835 while (numfound < 9) {
2836 bzero(&pobject, sizeof (pobject));
2837 ret = ea_get_object(&ctl->zsctl_proc_eaf,
2838 &pobject);
2839 if (ret < 0) {
2840 (void) ea_free_item(&object, EUP_ALLOC);
2841 zsd_warn(
2842 "unable to get process accounting data");
2843 goto ea_err;
2844 }
2845 /* Next entries should be process data */
2846 if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2847 EXT_GROUP) {
2848 (void) ea_free_item(&object, EUP_ALLOC);
2849 (void) ea_free_item(&pobject, EUP_ALLOC);
2850 zsd_warn(
2851 "process data of wrong type");
2852 goto ea_err;
2853 }
2854 switch (pobject.eo_catalog & EXD_DATA_MASK) {
2855 case EXD_PROC_PID:
2856 pid = pobject.eo_item.ei_uint32;
2857 proc = &(ctl->zsctl_proc_array[pid]);
2858 /*
2859 * This process should not be currently in
2860 * the list of processes to process.
2861 */
2862 assert(!list_link_active(&proc->zspr_next));
2863 numfound++;
2864 break;
2865 case EXD_PROC_ANCPID:
2866 ppid = pobject.eo_item.ei_uint32;
2867 pproc = &(ctl->zsctl_proc_array[ppid]);
2868 numfound++;
2869 break;
2870 case EXD_PROC_ZONENAME:
2871 zone = zsd_lookup_zone(ctl,
2872 pobject.eo_item.ei_string, -1);
2873 numfound++;
2874 break;
2875 case EXD_PROC_CPU_USER_SEC:
2876 user.tv_sec =
2877 pobject.eo_item.ei_uint64;
2878 numfound++;
2879 break;
2880 case EXD_PROC_CPU_USER_NSEC:
2881 user.tv_nsec =
2882 pobject.eo_item.ei_uint64;
2883 numfound++;
2884 break;
2885 case EXD_PROC_CPU_SYS_SEC:
2886 sys.tv_sec =
2887 pobject.eo_item.ei_uint64;
2888 numfound++;
2889 break;
2890 case EXD_PROC_CPU_SYS_NSEC:
2891 sys.tv_nsec =
2892 pobject.eo_item.ei_uint64;
2893 numfound++;
2894 break;
2895 case EXD_PROC_FINISH_SEC:
2896 finish.tv_sec =
2897 pobject.eo_item.ei_uint64;
2898 numfound++;
2899 break;
2900 case EXD_PROC_FINISH_NSEC:
2901 finish.tv_nsec =
2902 pobject.eo_item.ei_uint64;
2903 numfound++;
2904 break;
2905 }
2906 (void) ea_free_item(&pobject, EUP_ALLOC);
2907 }
2908 (void) ea_free_item(&object, EUP_ALLOC);
2909 if (numfound != 9) {
2910 zsd_warn(gettext(
2911 "Malformed process accounting entry found"));
2912 goto proc_done;
2913 }
2914
2915 if (finish.tv_sec > interval_end.tv_sec ||
2916 (finish.tv_sec == interval_end.tv_sec &&
2917 finish.tv_nsec > (interval_end.tv_usec * 1000)))
2918 hrtime_expired = B_TRUE;
2919
2920 /*
2921 * Try to identify the zone and pset to which this
2922 * exited process belongs.
2923 */
2924 if (zone == NULL)
2925 goto proc_done;
2926
2927 /* Save proc info */
2928 proc->zspr_ppid = ppid;
2929 proc->zspr_zoneid = zone->zsz_id;
2930
2931 prev_psetid = ZS_PSET_ERROR;
2932 sched = 0;
2933
2934 /*
2935 * The following tries to deduce the processes pset.
2936 *
2937 * First choose pset and sched using cached value from the
2938 * most recent time the process has been seen.
2939 *
2940 * pset and sched can change across zone_enter, so make sure
2941 * most recent sighting of this process was in the same
2942 * zone before using most recent known value.
2943 *
2944 * If there is no known value, use value of processes
2945 * parent. If parent is unknown, walk parents until a known
2946 * parent is found.
2947 *
2948 * If no parent in the zone is found, use the zone's default
2949 * pset and scheduling class.
2950 */
2951 if (proc->zspr_psetid != ZS_PSET_ERROR) {
2952 prev_psetid = proc->zspr_psetid;
2953 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2954 sched = proc->zspr_sched;
2955 } else if (pproc->zspr_zoneid == zone->zsz_id &&
2956 pproc->zspr_psetid != ZS_PSET_ERROR) {
2957 prev_psetid = pproc->zspr_psetid;
2958 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2959 sched = pproc->zspr_sched;
2960 }
2961
2962 if (pset == NULL) {
2963 /*
2964 * Process or processes parent has never been seen.
2965 * Save to deduce a known parent later.
2966 */
2967 proc_usage = sys;
2968 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2969 TIMESTRUC_DELTA(delta, proc_usage,
2970 proc->zspr_usage);
2971 proc->zspr_usage = delta;
2972 list_insert_tail(&plist, proc);
2973 continue;
2974 }
2975
2976 /* Add the zone's usage to the pset */
2977 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2978 if (usage == NULL)
2979 goto proc_done;
2980
2981 zsd_mark_pset_usage_found(usage, sched);
2982
2983 /* compute the usage to add for the exited proc */
2984 proc_usage = sys;
2985 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2986 TIMESTRUC_DELTA(delta, proc_usage,
2987 proc->zspr_usage);
2988
2989 zsd_add_usage(ctl, usage, &delta);
2990 proc_done:
2991 zsd_flush_proc_info(proc);
2992
2993 if (hrtime_expired == B_TRUE)
2994 break;
2995 }
2996 /*
2997 * close next accounting file.
2998 */
2999 if (ctl->zsctl_proc_open_next) {
3000 (void) ea_close(
3001 &ctl->zsctl_proc_eaf_next);
3002 ctl->zsctl_proc_open_next = 0;
3003 ctl->zsctl_proc_fd_next = -1;
3004 }
3005
3006 /* For the remaining processes, use pset and sched of a known parent */
3007 proc = list_head(&plist);
3008 while (proc != NULL) {
3009 next = proc;
3010 for (;;) {
3011 if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3012 /*
3013 * Kernel process, or parent is unknown, skip
3014 * process, remove from process list.
3015 */
3016 tmp = proc;
3017 proc = list_next(&plist, proc);
3018 list_link_init(&tmp->zspr_next);
3019 break;
3020 }
3021 pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3022 if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3023 /*
3024 * Parent in different zone. Save process and
3025 * use zone's default pset and sched below
3026 */
3027 tmp = proc;
3028 proc = list_next(&plist, proc);
3029 list_remove(&plist, tmp);
3030 list_insert_tail(&pplist, tmp);
3031 break;
3032 }
3033 /* Parent has unknown pset, Search parent's parent */
3034 if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3035 next = pproc;
3036 continue;
3037 }
3038 /* Found parent with known pset. Use its info */
3039 proc->zspr_psetid = pproc->zspr_psetid;
3040 proc->zspr_sched = pproc->zspr_sched;
3041 next->zspr_psetid = pproc->zspr_psetid;
3042 next->zspr_sched = pproc->zspr_sched;
3043 zone = zsd_lookup_zone_byid(ctl,
3044 proc->zspr_zoneid);
3045 if (zone == NULL) {
3046 tmp = proc;
3047 proc = list_next(&plist, proc);
3048 list_remove(&plist, tmp);
3049 list_link_init(&tmp->zspr_next);
3050 break;
3051 }
3052 pset = zsd_lookup_pset_byid(ctl,
3053 proc->zspr_psetid);
3054 if (pset == NULL) {
3055 tmp = proc;
3056 proc = list_next(&plist, proc);
3057 list_remove(&plist, tmp);
3058 list_link_init(&tmp->zspr_next);
3059 break;
3060 }
3061 /* Add the zone's usage to the pset */
3062 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3063 if (usage == NULL) {
3064 tmp = proc;
3065 proc = list_next(&plist, proc);
3066 list_remove(&plist, tmp);
3067 list_link_init(&tmp->zspr_next);
3068 break;
3069 }
3070 zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3071 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3072 zsd_flush_proc_info(proc);
3073 tmp = proc;
3074 proc = list_next(&plist, proc);
3075 list_remove(&plist, tmp);
3076 list_link_init(&tmp->zspr_next);
3077 break;
3078 }
3079 }
3080 /*
3081 * Process has never been seen. Using zone info to
3082 * determine pset and scheduling class.
3083 */
3084 proc = list_head(&pplist);
3085 while (proc != NULL) {
3086
3087 zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3088 if (zone == NULL)
3089 goto next;
3090 if (zone->zsz_psetid != ZS_PSET_ERROR &&
3091 zone->zsz_psetid != ZS_PSET_MULTI) {
3092 prev_psetid = zone->zsz_psetid;
3093 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3094 } else {
3095 pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3096 if (pset != NULL)
3097 prev_psetid = pset->zsp_id;
3098 }
3099 if (pset == NULL)
3100 goto next;
3101
3102 sched = zone->zsz_scheds;
3103 /*
3104 * Ignore FX high scheduling class if it is not the
3105 * only scheduling class in the zone.
3106 */
3107 if (sched != ZS_SCHED_FX_60)
3108 sched &= (~ZS_SCHED_FX_60);
3109 /*
3110 * If more than one scheduling class has been found
3111 * in the zone, use zone's default scheduling class for
3112 * this process.
3113 */
3114 if ((sched & (sched - 1)) != 0)
3115 sched = zone->zsz_default_sched;
3116
3117 /* Add the zone's usage to the pset */
3118 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3119 if (usage == NULL)
3120 goto next;
3121
3122 zsd_mark_pset_usage_found(usage, sched);
3123 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3124 next:
3125 tmp = proc;
3126 proc = list_next(&pplist, proc);
3127 zsd_flush_proc_info(tmp);
3128 list_link_init(&tmp->zspr_next);
3129 }
3130 return;
3131 ea_err:
3132 /*
3133 * Close the next accounting file if we have not transitioned to it
3134 * yet.
3135 */
3136 if (ctl->zsctl_proc_open_next) {
3137 (void) ea_close(&ctl->zsctl_proc_eaf_next);
3138 ctl->zsctl_proc_open_next = 0;
3139 ctl->zsctl_proc_fd_next = -1;
3140 }
3141 }
3142
3143 /*
3144 * getvmusage(2) uses size_t's in the passwd data structure, which differ
3145 * in size for 32bit and 64 bit kernels. Since this is a contracted interface,
3146 * and zonestatd does not necessarily match the kernel's bitness, marshal
3147 * results appropriately.
3148 */
3149 static int
zsd_getvmusage(zsd_ctl_t * ctl,uint_t flags,time_t age,zsd_vmusage64_t * buf,uint64_t * nres)3150 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3151 uint64_t *nres)
3152 {
3153 zsd_vmusage32_t *vmu32;
3154 zsd_vmusage64_t *vmu64;
3155 uint32_t nres32;
3156 int i;
3157 int ret;
3158
3159 if (ctl->zsctl_kern_bits == 32) {
3160 nres32 = *nres;
3161 ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3162 flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3163 *nres = nres32;
3164 if (ret == 0 && buf != NULL) {
3165 /*
3166 * An array of vmusage32_t's has been returned.
3167 * Convert it to an array of vmusage64_t's.
3168 */
3169 vmu32 = (zsd_vmusage32_t *)buf;
3170 vmu64 = (zsd_vmusage64_t *)buf;
3171 for (i = nres32 - 1; i >= 0; i--) {
3172
3173 vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3174 vmu64[i].vmu_type = vmu32[i].vmu_type;
3175 vmu64[i].vmu_type = vmu32[i].vmu_type;
3176 vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3177 vmu64[i].vmu_rss_private =
3178 vmu32[i].vmu_rss_private;
3179 vmu64[i].vmu_rss_shared =
3180 vmu32[i].vmu_rss_shared;
3181 vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3182 vmu64[i].vmu_swap_private =
3183 vmu32[i].vmu_swap_private;
3184 vmu64[i].vmu_swap_shared =
3185 vmu32[i].vmu_swap_shared;
3186 }
3187 }
3188 return (ret);
3189 } else {
3190 /*
3191 * kernel is 64 bit, so use 64 bit structures as zonestat
3192 * expects.
3193 */
3194 return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3195 flags, age, (uintptr_t)buf, (uintptr_t)nres));
3196
3197 }
3198 }
3199
3200 /*
3201 * Update the current physical, virtual, and locked memory usage of the
3202 * running zones.
3203 */
3204 static void
zsd_refresh_memory(zsd_ctl_t * ctl,boolean_t init)3205 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3206 {
3207
3208 uint64_t phys_total;
3209 uint64_t phys_used;
3210 uint64_t phys_zones;
3211 uint64_t phys_zones_overcount;
3212 uint64_t phys_zones_extra;
3213 uint64_t phys_zones_credit;
3214
3215 uint64_t vm_free;
3216 uint64_t vm_used;
3217
3218 uint64_t disk_swap_total;
3219 uint64_t disk_swap_used; /* disk swap with contents */
3220
3221 uint64_t physmem;
3222 uint64_t pp_kernel;
3223 uint64_t arc_size = 0;
3224 struct anoninfo ani;
3225
3226 int num_swap_devices;
3227 struct swaptable *swt;
3228 struct swapent *swent;
3229 size_t swt_size;
3230 char *path;
3231
3232 zsd_vmusage64_t *vmusage;
3233 uint64_t num_vmusage;
3234
3235 int i, ret;
3236
3237 zsd_system_t *sys;
3238 zsd_zone_t *zone;
3239 int vmu_nzones;
3240
3241 kstat_t *kstat;
3242 char kstat_name[KSTAT_STRLEN];
3243 kstat_named_t *knp;
3244 kid_t kid;
3245
3246 if (init)
3247 return;
3248
3249 sys = ctl->zsctl_system;
3250
3251 /* interrogate swap devices to find the amount of disk swap */
3252 disk_swap_again:
3253 num_swap_devices = swapctl(SC_GETNSWP, NULL);
3254
3255 if (num_swap_devices == 0) {
3256 sys->zss_swap_total = disk_swap_total = 0;
3257 sys->zss_swap_used = disk_swap_used = 0;
3258 /* No disk swap */
3259 goto disk_swap_done;
3260 }
3261 /* see if swap table needs to be larger */
3262 if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3263 swt_size = sizeof (int) +
3264 (num_swap_devices * sizeof (struct swapent)) +
3265 (num_swap_devices * MAXPATHLEN);
3266 if (ctl->zsctl_swap_cache != NULL)
3267 free(ctl->zsctl_swap_cache);
3268
3269 swt = (struct swaptable *)malloc(swt_size);
3270 if (swt == NULL) {
3271 /*
3272 * Could not allocate to get list of swap devices.
3273 * Just use data from the most recent read, which will
3274 * be zero if this is the first read.
3275 */
3276 zsd_warn(gettext("Unable to allocate to determine "
3277 "virtual memory"));
3278 disk_swap_total = sys->zss_swap_total;
3279 disk_swap_used = sys->zss_swap_used;
3280 goto disk_swap_done;
3281 }
3282 swent = swt->swt_ent;
3283 path = (char *)swt + (sizeof (int) +
3284 num_swap_devices * sizeof (swapent_t));
3285 for (i = 0; i < num_swap_devices; i++, swent++) {
3286 swent->ste_path = path;
3287 path += MAXPATHLEN;
3288 }
3289 swt->swt_n = num_swap_devices;
3290 ctl->zsctl_swap_cache = swt;
3291 ctl->zsctl_swap_cache_size = swt_size;
3292 ctl->zsctl_swap_cache_num = num_swap_devices;
3293 }
3294 num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3295 if (num_swap_devices < 0) {
3296 /* More swap devices have arrived */
3297 if (errno == ENOMEM)
3298 goto disk_swap_again;
3299
3300 zsd_warn(gettext("Unable to determine disk swap devices"));
3301 /* Unexpected error. Use existing data */
3302 disk_swap_total = sys->zss_swap_total;
3303 disk_swap_used = sys->zss_swap_used;
3304 goto disk_swap_done;
3305 }
3306
3307 /* add up the disk swap */
3308 disk_swap_total = 0;
3309 disk_swap_used = 0;
3310 swent = ctl->zsctl_swap_cache->swt_ent;
3311 for (i = 0; i < num_swap_devices; i++, swent++) {
3312 disk_swap_total += swent->ste_pages;
3313 disk_swap_used += (swent->ste_pages - swent->ste_free);
3314 }
3315 disk_swap_total *= ctl->zsctl_pagesize;
3316 disk_swap_used *= ctl->zsctl_pagesize;
3317
3318 sys->zss_swap_total = disk_swap_total;
3319 sys->zss_swap_used = disk_swap_used;
3320
3321 disk_swap_done:
3322
3323 /* get system pages kstat */
3324 kid = -1;
3325 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3326 if (kstat == NULL)
3327 zsd_warn(gettext("Unable to lookup system pages kstat"));
3328 else
3329 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3330
3331 if (kid == -1) {
3332 zsd_warn(gettext("Unable to read system pages kstat"));
3333 return;
3334 } else {
3335 knp = kstat_data_lookup(kstat, "physmem");
3336 if (knp == NULL) {
3337 zsd_warn(gettext("Unable to read physmem"));
3338 } else {
3339 if (knp->data_type == KSTAT_DATA_UINT64)
3340 physmem = knp->value.ui64;
3341 else if (knp->data_type == KSTAT_DATA_UINT32)
3342 physmem = knp->value.ui32;
3343 else
3344 return;
3345 }
3346 knp = kstat_data_lookup(kstat, "pp_kernel");
3347 if (knp == NULL) {
3348 zsd_warn(gettext("Unable to read pp_kernel"));
3349 } else {
3350 if (knp->data_type == KSTAT_DATA_UINT64)
3351 pp_kernel = knp->value.ui64;
3352 else if (knp->data_type == KSTAT_DATA_UINT32)
3353 pp_kernel = knp->value.ui32;
3354 else
3355 return;
3356 }
3357 }
3358 physmem *= ctl->zsctl_pagesize;
3359 pp_kernel *= ctl->zsctl_pagesize;
3360
3361 /* get the zfs arc size if available */
3362 arc_size = 0;
3363 kid = -1;
3364 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3365 if (kstat != NULL)
3366 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3367 if (kid != -1) {
3368 knp = kstat_data_lookup(kstat, "size");
3369 if (knp != NULL)
3370 if (knp->data_type == KSTAT_DATA_UINT64)
3371 arc_size = knp->value.ui64;
3372 }
3373
3374 /* Try to get swap information */
3375 if (swapctl(SC_AINFO, &ani) < 0) {
3376 zsd_warn(gettext("Unable to get swap info"));
3377 return;
3378 }
3379
3380 vmusage_again:
3381 /* getvmusage to get physical memory usage */
3382 vmusage = ctl->zsctl_vmusage_cache;
3383 num_vmusage = ctl->zsctl_vmusage_cache_num;
3384
3385 ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3386 vmusage, &num_vmusage);
3387
3388 if (ret != 0) {
3389 /* Unexpected error. Use existing data */
3390 if (errno != EOVERFLOW) {
3391 zsd_warn(gettext(
3392 "Unable to read physical memory usage"));
3393 phys_zones = sys->zss_ram_zones;
3394 goto vmusage_done;
3395 }
3396 }
3397 /* vmusage results cache too small */
3398 if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3399
3400 size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3401
3402 if (ctl->zsctl_vmusage_cache != NULL)
3403 free(ctl->zsctl_vmusage_cache);
3404 vmusage = (zsd_vmusage64_t *)malloc(size);
3405 if (vmusage == NULL) {
3406 zsd_warn(gettext("Unable to alloc to determine "
3407 "physical memory usage"));
3408 phys_zones = sys->zss_ram_zones;
3409 goto vmusage_done;
3410 }
3411 ctl->zsctl_vmusage_cache = vmusage;
3412 ctl->zsctl_vmusage_cache_num = num_vmusage;
3413 goto vmusage_again;
3414 }
3415
3416 phys_zones_overcount = 0;
3417 vmu_nzones = 0;
3418 for (i = 0; i < num_vmusage; i++) {
3419 switch (vmusage[i].vmu_type) {
3420 case VMUSAGE_SYSTEM:
3421 /* total pages backing user process mappings */
3422 phys_zones = sys->zss_ram_zones =
3423 vmusage[i].vmu_rss_all;
3424 break;
3425 case VMUSAGE_ZONE:
3426 vmu_nzones++;
3427 phys_zones_overcount += vmusage[i].vmu_rss_all;
3428 zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3429 if (zone != NULL)
3430 zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3431 break;
3432 default:
3433 break;
3434 }
3435 }
3436 /*
3437 * Figure how much memory was double counted due to text sharing
3438 * between zones. Credit this back so that the sum of the zones
3439 * equals the total zone ram usage;
3440 */
3441 phys_zones_extra = phys_zones_overcount - phys_zones;
3442 phys_zones_credit = phys_zones_extra / vmu_nzones;
3443
3444 vmusage_done:
3445
3446 /* walk the zones to get swap and locked kstats. Fetch ram cap. */
3447 sys->zss_locked_zones = 0;
3448 sys->zss_vm_zones = 0;
3449 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3450 zone = list_next(&ctl->zsctl_zones, zone)) {
3451
3452 /* If zone halted during interval, show memory usage as none */
3453 if (zone->zsz_active == B_FALSE ||
3454 zone->zsz_deleted == B_TRUE) {
3455 zone->zsz_usage_ram = 0;
3456 zone->zsz_usage_vm = 0;
3457 zone->zsz_usage_locked = 0;
3458 continue;
3459 }
3460
3461 if (phys_zones_credit > 0) {
3462 if (zone->zsz_usage_ram > phys_zones_credit) {
3463 zone->zsz_usage_ram -= phys_zones_credit;
3464 }
3465 }
3466 /*
3467 * Get zone's swap usage. Since zone could have halted,
3468 * treats as zero if cannot read
3469 */
3470 zone->zsz_usage_vm = 0;
3471 (void) snprintf(kstat_name, sizeof (kstat_name),
3472 "swapresv_zone_%d", zone->zsz_id);
3473 kid = -1;
3474 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3475 zone->zsz_id, kstat_name);
3476 if (kstat != NULL)
3477 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3478 if (kid != -1) {
3479 knp = kstat_data_lookup(kstat, "usage");
3480 if (knp != NULL &&
3481 knp->data_type == KSTAT_DATA_UINT64) {
3482 zone->zsz_usage_vm = knp->value.ui64;
3483 sys->zss_vm_zones += knp->value.ui64;
3484 }
3485 }
3486 /*
3487 * Get zone's locked usage. Since zone could have halted,
3488 * treats as zero if cannot read
3489 */
3490 zone->zsz_usage_locked = 0;
3491 (void) snprintf(kstat_name, sizeof (kstat_name),
3492 "lockedmem_zone_%d", zone->zsz_id);
3493 kid = -1;
3494 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3495 zone->zsz_id, kstat_name);
3496 if (kstat != NULL)
3497 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3498 if (kid != -1) {
3499 knp = kstat_data_lookup(kstat, "usage");
3500 if (knp != NULL &&
3501 knp->data_type == KSTAT_DATA_UINT64) {
3502 zone->zsz_usage_locked = knp->value.ui64;
3503 /*
3504 * Since locked memory accounting for zones
3505 * can double count ddi locked memory, cap each
3506 * zone's locked usage at its ram usage.
3507 */
3508 if (zone->zsz_usage_locked >
3509 zone->zsz_usage_ram)
3510 zone->zsz_usage_locked =
3511 zone->zsz_usage_ram;
3512 sys->zss_locked_zones +=
3513 zone->zsz_usage_locked;
3514 }
3515 }
3516 }
3517
3518 phys_total =
3519 sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3520
3521 phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3522 * ctl->zsctl_pagesize;
3523
3524 /* Compute remaining statistics */
3525 sys->zss_ram_total = phys_total;
3526 sys->zss_ram_zones = phys_zones;
3527 sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3528
3529 /*
3530 * The total for kernel locked memory should include
3531 * segkp locked pages, but oh well. The arc size is subtracted,
3532 * as that physical memory is reclaimable.
3533 */
3534 sys->zss_locked_kern = pp_kernel - arc_size;
3535 /* Add memory used by kernel startup and obp to kernel locked */
3536 if ((phys_total - physmem) > 0)
3537 sys->zss_locked_kern += phys_total - physmem;
3538
3539 /*
3540 * Add in the portion of (RAM+DISK) that is not available as swap,
3541 * and consider it swap used by the kernel.
3542 */
3543 sys->zss_vm_total = phys_total + disk_swap_total;
3544 vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3545 vm_used = sys->zss_vm_total - vm_free;
3546 sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3547 }
3548
/*
 * Charge each cpu's usage to its processor sets. Also add the cpu's total
 * time to each zone using the processor set. This tracks the maximum
 * amount of cpu time that a zone could have used.
 *
 * On the init pass only the baseline hrtime is recorded; no usage is
 * charged.  Subsequent passes charge the elapsed interval to psets,
 * zones, share time, and cap time.
 */
static void
zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
{
	zsd_system_t *sys;
	zsd_zone_t *zone;
	zsd_pset_usage_t *usage;
	zsd_cpu_t *cpu;
	zsd_cpu_t *cpu_next;
	zsd_pset_t *pset;
	timestruc_t ts;
	uint64_t hrtime;
	timestruc_t delta;

	/* Update the per-cpu kstat data */
	cpu_next = list_head(&ctl->zsctl_cpus);
	while (cpu_next != NULL) {
		cpu = cpu_next;
		cpu_next = list_next(&ctl->zsctl_cpus, cpu);
		zsd_update_cpu_stats(ctl, cpu);
	}
	/* Update the elapsed real time */
	hrtime = gethrtime();
	if (init) {
		/* first time around, store hrtime for future comparision */
		ctl->zsctl_hrtime = hrtime;
		ctl->zsctl_hrtime_prev = hrtime;

	} else {
		/* Compute increase in hrtime since the most recent read */
		ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
		ctl->zsctl_hrtime = hrtime;
		/* hrtime is reused here to hold the interval delta */
		if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
			TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
	}

	/* On initialization, all psets have zero time */
	if (init)
		return;

	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
	    pset = list_next(&ctl->zsctl_psets, pset)) {

		if (pset->zsp_active == B_FALSE) {
			zsd_warn(gettext("Internal error,inactive pset found"));
			continue;
		}

		/* sum total used time for pset */
		ts.tv_sec = 0;
		ts.tv_nsec = 0;
		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
		/* kernel time in pset is total time minus zone time */
		TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
		    pset->zsp_usage_zones);
		/* clamp a negative difference (clock skew) to zero */
		if (pset->zsp_usage_kern.tv_sec < 0 ||
		    pset->zsp_usage_kern.tv_nsec < 0) {
			pset->zsp_usage_kern.tv_sec = 0;
			pset->zsp_usage_kern.tv_nsec = 0;
		}
		/* Total pset elapsed time is used time plus idle time */
		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);

		/* delta = pset elapsed time during this interval */
		TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);

		for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
		    usage = list_next(&pset->zsp_usage_list, usage)) {

			zone = usage->zsu_zone;
			if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
			    usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
			    usage->zsu_cpu_shares != 0) {
				/*
				 * Figure out how many nanoseconds of share time
				 * to give to the zone: the interval scaled by
				 * the zone's fraction of the pset's shares.
				 * NOTE(review): assumes zsp_cpu_shares is
				 * non-zero whenever a usage has shares set —
				 * confirm against pset setup code.
				 */
				hrtime = delta.tv_sec;
				hrtime *= NANOSEC;
				hrtime += delta.tv_nsec;
				hrtime *= usage->zsu_cpu_shares;
				hrtime /= pset->zsp_cpu_shares;
				TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
				    hrtime);
			}
			/* Add pset time to each zone using pset */
			TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);

			zone->zsz_cpus_online += pset->zsp_online;
		}
		pset->zsp_total_time = ts;
	}

	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
	    zone = list_next(&ctl->zsctl_zones, zone)) {

		/* update cpu cap tracking if the zone has a cpu cap */
		if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
			uint64_t elapsed;

			/* cap is in percent-of-a-cpu units, hence / 100 */
			elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
			elapsed *= zone->zsz_cpu_cap;
			elapsed = elapsed / 100;
			TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
		}
	}
	/* Compute system-wide totals the same way as per-pset above */
	sys = ctl->zsctl_system;
	ts.tv_sec = 0;
	ts.tv_nsec = 0;
	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);

	/* kernel time in pset is total time minus zone time */
	TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
	    sys->zss_cpu_usage_zones);
	if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
	    sys->zss_cpu_usage_kern.tv_nsec < 0) {
		sys->zss_cpu_usage_kern.tv_sec = 0;
		sys->zss_cpu_usage_kern.tv_nsec = 0;
	}
	/* Total pset elapsed time is used time plus idle time */
	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
	sys->zss_cpu_total_time = ts;
}
3679
3680 /*
3681 * Saves current usage data to a cache that is read by libzonestat when
3682 * calling zs_usage_read().
3683 *
3684 * All pointers in the cached data structure are set to NULL. When
3685 * libzonestat reads the cached data, it will set the pointers relative to
3686 * its address space.
3687 */
3688 static void
zsd_usage_cache_update(zsd_ctl_t * ctl)3689 zsd_usage_cache_update(zsd_ctl_t *ctl)
3690 {
3691 zs_usage_cache_t *cache;
3692 zs_usage_cache_t *old;
3693 zs_usage_t *usage;
3694
3695 zs_system_t *sys;
3696 zsd_system_t *dsys;
3697 zs_zone_t *zone = NULL;
3698 zsd_zone_t *dzone;
3699 zs_pset_t *pset = NULL;
3700 zsd_pset_t *dpset;
3701 zs_pset_zone_t *pusage;
3702 zsd_pset_usage_t *dpusage;
3703
3704 char *next;
3705 uint_t size, i, j;
3706
3707 size =
3708 sizeof (zs_usage_cache_t) +
3709 sizeof (zs_usage_t) +
3710 sizeof (zs_system_t) +
3711 sizeof (zs_zone_t) * ctl->zsctl_nzones +
3712 sizeof (zs_pset_t) * ctl->zsctl_npsets +
3713 sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3714
3715 cache = (zs_usage_cache_t *)malloc(size);
3716 if (cache == NULL) {
3717 zsd_warn(gettext("Unable to allocate usage cache\n"));
3718 return;
3719 }
3720
3721 next = (char *)cache;
3722 cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3723 next += sizeof (zs_usage_cache_t);
3724
3725 /* LINTED */
3726 usage = cache->zsuc_usage = (zs_usage_t *)next;
3727 next += sizeof (zs_usage_t);
3728 usage->zsu_start = g_start;
3729 usage->zsu_hrstart = g_hrstart;
3730 usage->zsu_time = g_now;
3731 usage->zsu_hrtime = g_hrnow;
3732 usage->zsu_nzones = ctl->zsctl_nzones;
3733 usage->zsu_npsets = ctl->zsctl_npsets;
3734 usage->zsu_system = NULL;
3735
3736 /* LINTED */
3737 sys = (zs_system_t *)next;
3738 next += sizeof (zs_system_t);
3739 dsys = ctl->zsctl_system;
3740 sys->zss_ram_total = dsys->zss_ram_total;
3741 sys->zss_ram_kern = dsys->zss_ram_kern;
3742 sys->zss_ram_zones = dsys->zss_ram_zones;
3743 sys->zss_locked_kern = dsys->zss_locked_kern;
3744 sys->zss_locked_zones = dsys->zss_locked_zones;
3745 sys->zss_vm_total = dsys->zss_vm_total;
3746 sys->zss_vm_kern = dsys->zss_vm_kern;
3747 sys->zss_vm_zones = dsys->zss_vm_zones;
3748 sys->zss_swap_total = dsys->zss_swap_total;
3749 sys->zss_swap_used = dsys->zss_swap_used;
3750 sys->zss_ncpus = dsys->zss_ncpus;
3751 sys->zss_ncpus_online = dsys->zss_ncpus_online;
3752
3753 sys->zss_processes_max = dsys->zss_maxpid;
3754 sys->zss_lwps_max = dsys->zss_lwps_max;
3755 sys->zss_shm_max = dsys->zss_shm_max;
3756 sys->zss_shmids_max = dsys->zss_shmids_max;
3757 sys->zss_semids_max = dsys->zss_semids_max;
3758 sys->zss_msgids_max = dsys->zss_msgids_max;
3759 sys->zss_lofi_max = dsys->zss_lofi_max;
3760
3761 sys->zss_processes = dsys->zss_processes;
3762 sys->zss_lwps = dsys->zss_lwps;
3763 sys->zss_shm = dsys->zss_shm;
3764 sys->zss_shmids = dsys->zss_shmids;
3765 sys->zss_semids = dsys->zss_semids;
3766 sys->zss_msgids = dsys->zss_msgids;
3767 sys->zss_lofi = dsys->zss_lofi;
3768
3769 sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3770 sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3771 sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3772
3773 for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3774 i < ctl->zsctl_nzones;
3775 i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3776 /* LINTED */
3777 zone = (zs_zone_t *)next;
3778 next += sizeof (zs_zone_t);
3779 list_link_init(&zone->zsz_next);
3780 zone->zsz_system = NULL;
3781
3782 (void) strlcpy(zone->zsz_name, dzone->zsz_name,
3783 sizeof (zone->zsz_name));
3784 (void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3785 sizeof (zone->zsz_pool));
3786 (void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3787 sizeof (zone->zsz_pset));
3788 zone->zsz_id = dzone->zsz_id;
3789 zone->zsz_cputype = dzone->zsz_cputype;
3790 zone->zsz_iptype = dzone->zsz_iptype;
3791 zone->zsz_start = dzone->zsz_start;
3792 zone->zsz_hrstart = dzone->zsz_hrstart;
3793 zone->zsz_scheds = dzone->zsz_scheds;
3794 zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3795 zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3796 zone->zsz_ram_cap = dzone->zsz_ram_cap;
3797 zone->zsz_vm_cap = dzone->zsz_vm_cap;
3798 zone->zsz_locked_cap = dzone->zsz_locked_cap;
3799 zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3800 zone->zsz_cpus_online = dzone->zsz_cpus_online;
3801 zone->zsz_pset_time = dzone->zsz_pset_time;
3802 zone->zsz_cap_time = dzone->zsz_cap_time;
3803 zone->zsz_share_time = dzone->zsz_share_time;
3804 zone->zsz_usage_ram = dzone->zsz_usage_ram;
3805 zone->zsz_usage_locked = dzone->zsz_usage_locked;
3806 zone->zsz_usage_vm = dzone->zsz_usage_vm;
3807
3808 zone->zsz_processes_cap = dzone->zsz_processes_cap;
3809 zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3810 zone->zsz_shm_cap = dzone->zsz_shm_cap;
3811 zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3812 zone->zsz_semids_cap = dzone->zsz_semids_cap;
3813 zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3814 zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3815
3816 zone->zsz_processes = dzone->zsz_processes;
3817 zone->zsz_lwps = dzone->zsz_lwps;
3818 zone->zsz_shm = dzone->zsz_shm;
3819 zone->zsz_shmids = dzone->zsz_shmids;
3820 zone->zsz_semids = dzone->zsz_semids;
3821 zone->zsz_msgids = dzone->zsz_msgids;
3822 zone->zsz_lofi = dzone->zsz_lofi;
3823 }
3824
3825 for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3826 i < ctl->zsctl_npsets;
3827 i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3828 /* LINTED */
3829 pset = (zs_pset_t *)next;
3830 next += sizeof (zs_pset_t);
3831 list_link_init(&pset->zsp_next);
3832 (void) strlcpy(pset->zsp_name, dpset->zsp_name,
3833 sizeof (pset->zsp_name));
3834 pset->zsp_id = dpset->zsp_id;
3835 pset->zsp_cputype = dpset->zsp_cputype;
3836 pset->zsp_start = dpset->zsp_start;
3837 pset->zsp_hrstart = dpset->zsp_hrstart;
3838 pset->zsp_online = dpset->zsp_online;
3839 pset->zsp_size = dpset->zsp_size;
3840 pset->zsp_min = dpset->zsp_min;
3841 pset->zsp_max = dpset->zsp_max;
3842 pset->zsp_importance = dpset->zsp_importance;
3843 pset->zsp_scheds = dpset->zsp_scheds;
3844 pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3845 pset->zsp_total_time = dpset->zsp_total_time;
3846 pset->zsp_usage_kern = dpset->zsp_usage_kern;
3847 pset->zsp_usage_zones = dpset->zsp_usage_zones;
3848 pset->zsp_nusage = dpset->zsp_nusage;
3849 /* Add pset usages for pset */
3850 for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3851 j < dpset->zsp_nusage;
3852 j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3853 /* LINTED */
3854 pusage = (zs_pset_zone_t *)next;
3855 next += sizeof (zs_pset_zone_t);
3856 /* pointers are computed by client */
3857 pusage->zspz_pset = NULL;
3858 pusage->zspz_zone = NULL;
3859 list_link_init(&pusage->zspz_next);
3860 pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3861 pusage->zspz_start = dpusage->zsu_start;
3862 pusage->zspz_hrstart = dpusage->zsu_hrstart;
3863 pusage->zspz_hrstart = dpusage->zsu_hrstart;
3864 pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3865 pusage->zspz_scheds = dpusage->zsu_scheds;
3866 pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3867 }
3868 }
3869
3870 /* Update the current cache pointer */
3871 (void) mutex_lock(&g_usage_cache_lock);
3872 old = g_usage_cache;
3873 cache->zsuc_ref = 1;
3874 cache->zsuc_gen = g_gen_next;
3875 usage->zsu_gen = g_gen_next;
3876 usage->zsu_size = size;
3877 g_usage_cache = cache;
3878 if (old != NULL) {
3879 old->zsuc_ref--;
3880 if (old->zsuc_ref == 0)
3881 free(old);
3882 }
3883 g_gen_next++;
3884 /* Wake up any clients that are waiting for this calculation */
3885 if (g_usage_cache_kickers > 0) {
3886 (void) cond_broadcast(&g_usage_cache_wait);
3887 }
3888 (void) mutex_unlock(&g_usage_cache_lock);
3889 }
3890
3891 static zs_usage_cache_t *
zsd_usage_cache_hold_locked()3892 zsd_usage_cache_hold_locked()
3893 {
3894 zs_usage_cache_t *ret;
3895
3896 ret = g_usage_cache;
3897 ret->zsuc_ref++;
3898 return (ret);
3899 }
3900
3901 void
zsd_usage_cache_rele(zs_usage_cache_t * cache)3902 zsd_usage_cache_rele(zs_usage_cache_t *cache)
3903 {
3904 (void) mutex_lock(&g_usage_cache_lock);
3905 cache->zsuc_ref--;
3906 if (cache->zsuc_ref == 0)
3907 free(cache);
3908 (void) mutex_unlock(&g_usage_cache_lock);
3909 }
3910
3911 /* Close the handles held by zsd_open() */
3912 void
zsd_close(zsd_ctl_t * ctl)3913 zsd_close(zsd_ctl_t *ctl)
3914 {
3915 zsd_zone_t *zone;
3916 zsd_pset_t *pset;
3917 zsd_pset_usage_t *usage;
3918 zsd_cpu_t *cpu;
3919 int id;
3920
3921 if (ctl->zsctl_kstat_ctl) {
3922 (void) kstat_close(ctl->zsctl_kstat_ctl);
3923 ctl->zsctl_kstat_ctl = NULL;
3924 }
3925 if (ctl->zsctl_proc_open) {
3926 (void) ea_close(&ctl->zsctl_proc_eaf);
3927 ctl->zsctl_proc_open = 0;
3928 ctl->zsctl_proc_fd = -1;
3929 }
3930 if (ctl->zsctl_pool_conf) {
3931 if (ctl->zsctl_pool_status == POOL_ENABLED)
3932 (void) pool_conf_close(ctl->zsctl_pool_conf);
3933 ctl->zsctl_pool_status = POOL_DISABLED;
3934 }
3935
3936 while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3937 list_remove(&ctl->zsctl_zones, zone);
3938 free(zone);
3939 ctl->zsctl_nzones--;
3940 }
3941
3942 while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3943 while ((usage = list_head(&pset->zsp_usage_list))
3944 != NULL) {
3945 list_remove(&pset->zsp_usage_list, usage);
3946 ctl->zsctl_npset_usages--;
3947 free(usage);
3948 }
3949 list_remove(&ctl->zsctl_psets, pset);
3950 free(pset);
3951 ctl->zsctl_npsets--;
3952 }
3953
3954 /* Release all cpus being tracked */
3955 while (cpu = list_head(&ctl->zsctl_cpus)) {
3956 list_remove(&ctl->zsctl_cpus, cpu);
3957 id = cpu->zsc_id;
3958 bzero(cpu, sizeof (zsd_cpu_t));
3959 cpu->zsc_id = id;
3960 cpu->zsc_allocated = B_FALSE;
3961 cpu->zsc_psetid = ZS_PSET_ERROR;
3962 cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3963 }
3964
3965 assert(ctl->zsctl_npset_usages == 0);
3966 assert(ctl->zsctl_npsets == 0);
3967 assert(ctl->zsctl_nzones == 0);
3968 (void) zsd_disable_cpu_stats();
3969 }
3970
3971
/*
 * Update the utilization data for all zones and processor sets.
 *
 * The refresh order matters: system, (optionally) memory, zones, psets,
 * processes, then cpu stats.  Afterward, stale objects are deleted and
 * the results are published to the client-visible usage cache.
 * Always returns 0.
 */
static int
zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
{
	/* Pick up kstat chain changes and timestamp this sample */
	(void) kstat_chain_update(ctl->zsctl_kstat_ctl);
	(void) gettimeofday(&(ctl->zsctl_timeofday), NULL);

	zsd_refresh_system(ctl);

	/*
	 * Memory calculation is expensive. Only update it on sample
	 * intervals.
	 */
	if (do_memory == B_TRUE)
		zsd_refresh_memory(ctl, init);
	zsd_refresh_zones(ctl);
	zsd_refresh_psets(ctl);
	zsd_refresh_procs(ctl, init);
	zsd_refresh_cpu_stats(ctl, init);

	/*
	 * Delete objects that no longer exist.
	 * Pset usages must be deleted first as they point to zone and
	 * pset objects.
	 */
	zsd_mark_pset_usages_end(ctl);
	zsd_mark_psets_end(ctl);
	zsd_mark_cpus_end(ctl);
	zsd_mark_zones_end(ctl);

	/*
	 * Save results for clients.
	 */
	zsd_usage_cache_update(ctl);

	/*
	 * Roll process accounting file.
	 */
	(void) zsd_roll_exacct();
	return (0);
}
4015
4016 /*
4017 * Get the system rctl, which is the upper most limit
4018 */
4019 static uint64_t
zsd_get_system_rctl(char * name)4020 zsd_get_system_rctl(char *name)
4021 {
4022 rctlblk_t *rblk, *rblk_last;
4023
4024 rblk = (rctlblk_t *)alloca(rctlblk_size());
4025 rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4026
4027 if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4028 return (ZS_LIMIT_NONE);
4029
4030 while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4031 (void) bcopy(rblk, rblk_last, rctlblk_size());
4032
4033 return (rctlblk_get_value(rblk_last));
4034 }
4035
4036 /*
4037 * Open any necessary subsystems for collecting utilization data,
4038 * allocate and initialize data structures, and get initial utilization.
4039 *
4040 * Errors:
4041 * ENOMEM out of memory
4042 * EINVAL other error
4043 */
4044 static zsd_ctl_t *
zsd_open(zsd_ctl_t * ctl)4045 zsd_open(zsd_ctl_t *ctl)
4046 {
4047 zsd_system_t *system;
4048
4049 char path[MAXPATHLEN];
4050 struct statvfs svfs;
4051 int ret;
4052 int i;
4053 size_t size;
4054 int err;
4055
4056 if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4057 sizeof (zsd_ctl_t))) == NULL) {
4058 zsd_warn(gettext("Out of Memory"));
4059 errno = ENOMEM;
4060 goto err;
4061 }
4062 ctl->zsctl_proc_fd = -1;
4063
4064 /* open kstats */
4065 if (ctl->zsctl_kstat_ctl == NULL &&
4066 (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4067 err = errno;
4068 zsd_warn(gettext("Unable to open kstats"));
4069 errno = err;
4070 if (errno != ENOMEM)
4071 errno = EAGAIN;
4072 goto err;
4073 }
4074
4075 /*
4076 * These are set when the accounting file is opened by
4077 * zsd_update_procs()
4078 */
4079 ctl->zsctl_proc_fd = -1;
4080 ctl->zsctl_proc_fd_next = -1;
4081 ctl->zsctl_proc_open = 0;
4082 ctl->zsctl_proc_open_next = 0;
4083
4084 check_exacct:
4085 (void) zsd_enable_cpu_stats();
4086
4087 /* Create structures to track usage */
4088 if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4089 calloc(1, sizeof (zsd_system_t))) == NULL) {
4090 ret = -1;
4091 zsd_warn(gettext("Out of Memory"));
4092 errno = ENOMEM;
4093 goto err;
4094 }
4095 system = ctl->zsctl_system;
4096 /* get the kernel bitness to know structure layout for getvmusage */
4097 ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4098 if (ret < 0)
4099 ctl->zsctl_kern_bits = 32;
4100 else
4101 ctl->zsctl_kern_bits = 64;
4102 ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4103
4104 size = sysconf(_SC_CPUID_MAX);
4105 ctl->zsctl_maxcpuid = size;
4106 if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4107 (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4108 zsd_warn(gettext("Out of Memory"));
4109 errno = ENOMEM;
4110 goto err;
4111 }
4112 for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4113 ctl->zsctl_cpu_array[i].zsc_id = i;
4114 ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4115 ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4116 ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4117 }
4118 if (statvfs("/proc", &svfs) != 0 ||
4119 strcmp("/proc", svfs.f_fstr) != 0) {
4120 zsd_warn(gettext("/proc not a procfs filesystem"));
4121 errno = EINVAL;
4122 goto err;
4123 }
4124
4125 size = sysconf(_SC_MAXPID) + 1;
4126 ctl->zsctl_maxproc = size;
4127 if (ctl->zsctl_proc_array == NULL &&
4128 (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4129 sizeof (zsd_proc_t))) == NULL) {
4130 zsd_warn(gettext("Out of Memory"));
4131 errno = ENOMEM;
4132 goto err;
4133 }
4134 for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4135 list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4136 ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4137 ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4138 ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4139 ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4140 ctl->zsctl_proc_array[i].zspr_ppid = -1;
4141 }
4142
4143 list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4144 offsetof(zsd_zone_t, zsz_next));
4145
4146 list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4147 offsetof(zsd_pset_t, zsp_next));
4148
4149 list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4150 offsetof(zsd_cpu_t, zsc_next));
4151
4152 if (ctl->zsctl_pool_conf == NULL &&
4153 (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4154 zsd_warn(gettext("Out of Memory"));
4155 errno = ENOMEM;
4156 goto err;
4157 }
4158 ctl->zsctl_pool_status = POOL_DISABLED;
4159 ctl->zsctl_pool_changed = 0;
4160
4161 if (ctl->zsctl_pool_vals[0] == NULL &&
4162 (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4163 zsd_warn(gettext("Out of Memory"));
4164 errno = ENOMEM;
4165 goto err;
4166 }
4167 if (ctl->zsctl_pool_vals[1] == NULL &&
4168 (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4169 zsd_warn(gettext("Out of Memory"));
4170 errno = ENOMEM;
4171 goto err;
4172 }
4173 ctl->zsctl_pool_vals[2] = NULL;
4174
4175 /*
4176 * get system limits
4177 */
4178 system->zss_maxpid = size = sysconf(_SC_MAXPID);
4179 system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4180 system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4181 system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4182 system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4183 system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4184 system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4185 system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4186
4187 g_gen_next = 1;
4188
4189 if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4190 zsd_warn(gettext("Reading zone statistics failed"));
4191
4192 return (ctl);
4193 err:
4194 if (ctl)
4195 zsd_close(ctl);
4196
4197 return (NULL);
4198 }
4199
/*
 * Copy utilization data to buffer, filtering data if non-global zone.
 *
 * The cached snapshot (cache->zsuc_usage) is a flat serialized buffer:
 * a zs_usage_t header, then one zs_system_t, then zsu_nzones zs_zone_t
 * entries, then zsu_npsets pset groups, each a zs_pset_t immediately
 * followed by its zsp_nusage zs_pset_zone_t entries.  "csize" walks the
 * cached buffer and "size" walks the output buffer in lock step with
 * that layout; the output only ever receives the caller's own zone.
 */
static void
zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
    boolean_t is_gz)
{
	zs_usage_t *cusage;
	zs_system_t *sys, *csys;
	zs_zone_t *zone, *czone;
	zs_pset_t *pset, *cpset;
	zs_pset_zone_t *pz, *cpz, *foundpz;
	size_t size = 0, csize = 0;
	char *start, *cstart;
	int i, j;
	timestruc_t delta;

	/* Privileged users in the global zone get everything */
	if (is_gz) {
		cusage = cache->zsuc_usage;
		(void) bcopy(cusage, usage, cusage->zsu_size);
		return;
	}

	/* Zones just get their own usage */
	cusage = cache->zsuc_usage;

	start = (char *)usage;
	cstart = (char *)cusage;
	size += sizeof (zs_usage_t);
	csize += sizeof (zs_usage_t);

	/* Copy snapshot header; exactly one zone will be reported */
	usage->zsu_start = cusage->zsu_start;
	usage->zsu_hrstart = cusage->zsu_hrstart;
	usage->zsu_time = cusage->zsu_time;
	usage->zsu_hrtime = cusage->zsu_hrtime;
	usage->zsu_gen = cusage->zsu_gen;
	usage->zsu_nzones = 1;
	usage->zsu_npsets = 0;

	/* LINTED */
	sys = (zs_system_t *)(start + size);
	/* LINTED */
	csys = (zs_system_t *)(cstart + csize);
	size += sizeof (zs_system_t);
	csize += sizeof (zs_system_t);

	/* Save system limits but not usage */
	*sys = *csys;
	sys->zss_ncpus = 0;
	sys->zss_ncpus_online = 0;

	/* LINTED */
	zone = (zs_zone_t *)(start + size);
	/* LINTED */
	czone = (zs_zone_t *)(cstart + csize);
	/* Find the matching zone */
	for (i = 0; i < cusage->zsu_nzones; i++) {
		if (czone->zsz_id == zid) {
			*zone = *czone;
			size += sizeof (zs_zone_t);
		}
		csize += sizeof (zs_zone_t);
		/* LINTED */
		czone = (zs_zone_t *)(cstart + csize);
	}
	/*
	 * Fold all other zones' resource usage into the "kernel" buckets
	 * so the caller only sees its own zone's share broken out.
	 *
	 * NOTE(review): if no cached zone matched zid, *zone was never
	 * written and the reads below see uninitialized output-buffer
	 * memory — presumably callers always pass a zoneid present in
	 * the cache; confirm.
	 */
	sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
	sys->zss_ram_zones = zone->zsz_usage_ram;

	sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
	sys->zss_vm_zones = zone->zsz_usage_vm;

	sys->zss_locked_kern += (sys->zss_locked_zones -
	    zone->zsz_usage_locked);
	sys->zss_locked_zones = zone->zsz_usage_locked;

	TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
	sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;

	/* LINTED */
	pset = (zs_pset_t *)(start + size);
	/* LINTED */
	cpset = (zs_pset_t *)(cstart + csize);
	/*
	 * Walk each cached pset group; copy out only psets in which this
	 * zone has a usage record, and within them only this zone's entry.
	 */
	for (i = 0; i < cusage->zsu_npsets; i++) {
		csize += sizeof (zs_pset_t);
		/* LINTED */
		cpz = (zs_pset_zone_t *)(csize + cstart);
		foundpz = NULL;
		for (j = 0; j < cpset->zsp_nusage; j++) {
			if (cpz->zspz_zoneid == zid)
				foundpz = cpz;

			csize += sizeof (zs_pset_zone_t);
			/* LINTED */
			cpz = (zs_pset_zone_t *)(csize + cstart);
		}
		if (foundpz != NULL) {
			size += sizeof (zs_pset_t);
			/* LINTED */
			pz = (zs_pset_zone_t *)(start + size);
			size += sizeof (zs_pset_zone_t);

			*pset = *cpset;
			*pz = *foundpz;

			/* Attribute other zones' pset cpu time to kernel */
			TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
			    pz->zspz_cpu_usage);
			TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
			pset->zsp_usage_zones = pz->zspz_cpu_usage;
			pset->zsp_nusage = 1;
			usage->zsu_npsets++;
			sys->zss_ncpus += pset->zsp_size;
			sys->zss_ncpus_online += pset->zsp_online;
		}
		/* LINTED */
		cpset = (zs_pset_t *)(cstart + csize);
	}
	/* Total serialized size of the filtered result */
	usage->zsu_size = size;
}
4318
4319 /*
4320 * Respond to new connections from libzonestat.so. Also respond to zoneadmd,
4321 * which reports new zones.
4322 */
4323 /* ARGSUSED */
4324 static void
zsd_server(void * cookie,char * argp,size_t arg_size,door_desc_t * dp,uint_t n_desc)4325 zsd_server(void *cookie, char *argp, size_t arg_size,
4326 door_desc_t *dp, uint_t n_desc)
4327 {
4328 int *args, cmd;
4329 door_desc_t door;
4330 ucred_t *ucred;
4331 const priv_set_t *eset;
4332
4333 if (argp == DOOR_UNREF_DATA) {
4334 (void) door_return(NULL, 0, NULL, 0);
4335 thr_exit(NULL);
4336 }
4337
4338 if (arg_size != sizeof (cmd) * 2) {
4339 (void) door_return(NULL, 0, NULL, 0);
4340 thr_exit(NULL);
4341 }
4342
4343 /* LINTED */
4344 args = (int *)argp;
4345 cmd = args[0];
4346
4347 /* If connection, return door to stat server */
4348 if (cmd == ZSD_CMD_CONNECT) {
4349
4350 /* Verify client compilation version */
4351 if (args[1] != ZS_VERSION) {
4352 args[1] = ZSD_STATUS_VERSION_MISMATCH;
4353 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4354 thr_exit(NULL);
4355 }
4356 ucred = alloca(ucred_size());
4357 /* Verify client permission */
4358 if (door_ucred(&ucred) != 0) {
4359 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4360 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4361 thr_exit(NULL);
4362 }
4363
4364 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4365 if (eset == NULL) {
4366 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4367 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4368 thr_exit(NULL);
4369 }
4370 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4371 args[1] = ZSD_STATUS_PERMISSION;
4372 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4373 thr_exit(NULL);
4374 }
4375
4376 /* Return stat server door */
4377 args[1] = ZSD_STATUS_OK;
4378 door.d_attributes = DOOR_DESCRIPTOR;
4379 door.d_data.d_desc.d_descriptor = g_stat_door;
4380 (void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4381 thr_exit(NULL);
4382 }
4383
4384 /* Respond to zoneadmd informing zonestatd of a new zone */
4385 if (cmd == ZSD_CMD_NEW_ZONE) {
4386 zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4387 (void) door_return(NULL, 0, NULL, 0);
4388 thr_exit(NULL);
4389 }
4390
4391 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4392 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4393 thr_exit(NULL);
4394 }
4395
4396 /*
4397 * Respond to libzonestat.so clients with the current utlilzation data.
4398 */
4399 /* ARGSUSED */
4400 static void
zsd_stat_server(void * cookie,char * argp,size_t arg_size,door_desc_t * dp,uint_t n_desc)4401 zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4402 door_desc_t *dp, uint_t n_desc)
4403 {
4404 uint64_t *args, cmd;
4405 zs_usage_cache_t *cache;
4406 int ret;
4407 char *rvalp;
4408 size_t rvals;
4409 zs_usage_t *usage;
4410 ucred_t *ucred;
4411 zoneid_t zoneid;
4412 const priv_set_t *eset;
4413 boolean_t is_gz = B_FALSE;
4414
4415 /* Tell stat thread there are no more clients */
4416 if (argp == DOOR_UNREF_DATA) {
4417 (void) mutex_lock(&g_usage_cache_lock);
4418 g_hasclient = B_FALSE;
4419 (void) cond_signal(&g_usage_cache_kick);
4420 (void) mutex_unlock(&g_usage_cache_lock);
4421 (void) door_return(NULL, 0, NULL, 0);
4422 thr_exit(NULL);
4423 }
4424 if (arg_size != sizeof (cmd) * 2) {
4425 (void) door_return(NULL, 0, NULL, 0);
4426 thr_exit(NULL);
4427 }
4428 /* LINTED */
4429 args = (uint64_t *)argp;
4430 cmd = args[0];
4431 if (cmd != ZSD_CMD_READ) {
4432 (void) door_return(NULL, 0, NULL, 0);
4433 thr_exit(NULL);
4434 }
4435 ucred = alloca(ucred_size());
4436 if (door_ucred(&ucred) != 0) {
4437 (void) door_return(NULL, 0, NULL, 0);
4438 thr_exit(NULL);
4439 }
4440 zoneid = ucred_getzoneid(ucred);
4441
4442 if (zoneid == GLOBAL_ZONEID)
4443 is_gz = B_TRUE;
4444
4445 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4446 if (eset == NULL) {
4447 (void) door_return(NULL, 0, NULL, 0);
4448 thr_exit(NULL);
4449 }
4450 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4451 (void) door_return(NULL, 0, NULL, 0);
4452 thr_exit(NULL);
4453 }
4454 (void) mutex_lock(&g_usage_cache_lock);
4455 g_hasclient = B_TRUE;
4456
4457 /*
4458 * Force a new cpu calculation for client. This will force a
4459 * new memory calculation if the memory data is older than the
4460 * sample period.
4461 */
4462 g_usage_cache_kickers++;
4463 (void) cond_signal(&g_usage_cache_kick);
4464 ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4465 g_usage_cache_kickers--;
4466 if (ret != 0 && errno == EINTR) {
4467 (void) mutex_unlock(&g_usage_cache_lock);
4468 zsd_warn(gettext(
4469 "Interrupted before writing usage size to client\n"));
4470 (void) door_return(NULL, 0, NULL, 0);
4471 thr_exit(NULL);
4472 }
4473 cache = zsd_usage_cache_hold_locked();
4474 if (cache == NULL) {
4475 zsd_warn(gettext("Usage cache empty.\n"));
4476 (void) door_return(NULL, 0, NULL, 0);
4477 thr_exit(NULL);
4478 }
4479 (void) mutex_unlock(&g_usage_cache_lock);
4480
4481 /* Copy current usage data to stack to send to client */
4482 usage = (zs_usage_t *)alloca(cache->zsuc_size);
4483
4484 /* Filter out results if caller is non-global zone */
4485 zsd_usage_filter(zoneid, cache, usage, is_gz);
4486
4487 rvalp = (void *)usage;
4488 rvals = usage->zsu_size;
4489 zsd_usage_cache_rele(cache);
4490
4491 (void) door_return(rvalp, rvals, NULL, 0);
4492 thr_exit(NULL);
4493 }
4494
/* Shutdown flag: set by signal handlers, polled by main and stat_thread */
static volatile boolean_t g_quit;

/* ARGSUSED */
static void
zonestat_quithandler(int sig)
{
	/* Signal context: only set the flag; no locking or allocation here */
	g_quit = B_TRUE;
}
4503
4504 /*
4505 * The stat thread generates new utilization data when clients request
4506 * it. It also manages opening and closing the subsystems used to gather
4507 * data depending on if clients exist.
4508 */
4509 /* ARGSUSED */
4510 void *
stat_thread(void * arg)4511 stat_thread(void *arg)
4512 {
4513 time_t start;
4514 time_t now;
4515 time_t next_memory;
4516 boolean_t do_memory;
4517 boolean_t do_read;
4518 boolean_t do_close;
4519
4520 start = time(NULL);
4521 if (start < 0) {
4522 if (g_quit == B_TRUE)
4523 goto quit;
4524 zsd_warn(gettext("Unable to fetch current time"));
4525 g_quit = B_TRUE;
4526 goto quit;
4527 }
4528
4529 next_memory = start;
4530 while (g_quit == B_FALSE) {
4531 for (;;) {
4532 /*
4533 * These are used to decide if the most recent memory
4534 * calculation was within a sample interval,
4535 * and weather or not the usage collection needs to
4536 * be opened or closed.
4537 */
4538 do_memory = B_FALSE;
4539 do_read = B_FALSE;
4540 do_close = B_FALSE;
4541
4542 /*
4543 * If all clients have gone, close usage collecting
4544 */
4545 (void) mutex_lock(&g_usage_cache_lock);
4546 if (!g_hasclient && g_open == B_TRUE) {
4547 do_close = B_TRUE;
4548 (void) mutex_unlock(&g_usage_cache_lock);
4549 break;
4550 }
4551 if (g_quit == B_TRUE) {
4552 (void) mutex_unlock(
4553 &g_usage_cache_lock);
4554 break;
4555 }
4556 /*
4557 * Wait for a usage data request
4558 */
4559 if (g_usage_cache_kickers == 0) {
4560 (void) cond_wait(&g_usage_cache_kick,
4561 &g_usage_cache_lock);
4562 }
4563 now = time(NULL);
4564 if (now < 0) {
4565 if (g_quit == B_TRUE) {
4566 (void) mutex_unlock(
4567 &g_usage_cache_lock);
4568 goto quit;
4569 }
4570 g_quit = B_TRUE;
4571 (void) mutex_unlock(&g_usage_cache_lock);
4572 zsd_warn(gettext(
4573 "Unable to fetch current time"));
4574 goto quit;
4575 }
4576 if (g_hasclient) {
4577 do_read = B_TRUE;
4578 if (now >= next_memory) {
4579 do_memory = B_TRUE;
4580 next_memory = now + g_interval;
4581 }
4582 } else {
4583 do_close = B_TRUE;
4584 }
4585 (void) mutex_unlock(&g_usage_cache_lock);
4586 if (do_read || do_close)
4587 break;
4588 }
4589 g_now = now;
4590 g_hrnow = gethrtime();
4591 if (g_hasclient && g_open == B_FALSE) {
4592 g_start = g_now;
4593 g_hrstart = g_hrnow;
4594 g_ctl = zsd_open(g_ctl);
4595 if (g_ctl == NULL)
4596 zsd_warn(gettext(
4597 "Unable to open zone statistics"));
4598 else
4599 g_open = B_TRUE;
4600 }
4601 if (do_read && g_ctl) {
4602 if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4603 zsd_warn(gettext(
4604 "Unable to read zone statistics"));
4605 g_quit = B_TRUE;
4606 return (NULL);
4607 }
4608 }
4609 (void) mutex_lock(&g_usage_cache_lock);
4610 if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4611 (void) mutex_unlock(&g_usage_cache_lock);
4612 zsd_close(g_ctl);
4613 g_open = B_FALSE;
4614 } else {
4615 (void) mutex_unlock(&g_usage_cache_lock);
4616 }
4617 }
4618 quit:
4619 if (g_open)
4620 zsd_close(g_ctl);
4621
4622 (void) thr_kill(g_main, SIGINT);
4623 thr_exit(NULL);
4624 return (NULL);
4625 }
4626
4627 void
zsd_set_fx()4628 zsd_set_fx()
4629 {
4630 pcinfo_t pcinfo;
4631 pcparms_t pcparms;
4632
4633 (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4634 if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4635 zsd_warn(gettext("cannot get FX class parameters"));
4636 return;
4637 }
4638 pcparms.pc_cid = pcinfo.pc_cid;
4639 ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4640 ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4641 ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4642 ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4643 if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4644 zsd_warn(gettext("cannot enter the FX class"));
4645 }
4646
/* Write end of the pipe used to report readiness to the waiting parent */
static int pipe_fd;

/*
 * Report daemon readiness (or failure) to the parent created in
 * daemonize_start().  The single status byte becomes the parent's
 * exit code; closing the pipe lets the parent stop waiting.
 */
static void
daemonize_ready(char status)
{
	/*
	 * wake the parent with a clue
	 */
	(void) write(pipe_fd, &status, 1);
	(void) close(pipe_fd);
}
4658
4659 static int
daemonize_start(void)4660 daemonize_start(void)
4661 {
4662 char data;
4663 int status;
4664
4665 int filedes[2];
4666 pid_t pid;
4667
4668 (void) close(0);
4669 (void) dup2(2, 1);
4670
4671 if (pipe(filedes) < 0)
4672 return (-1);
4673
4674 (void) fflush(NULL);
4675
4676 if ((pid = fork1()) < 0)
4677 return (-1);
4678
4679 if (pid != 0) {
4680 /*
4681 * parent
4682 */
4683 struct sigaction act;
4684
4685 act.sa_handler = SIG_DFL;
4686 (void) sigemptyset(&act.sa_mask);
4687 act.sa_flags = 0;
4688
4689 (void) sigaction(SIGPIPE, &act, NULL); /* ignore SIGPIPE */
4690
4691 (void) close(filedes[1]);
4692 if (read(filedes[0], &data, 1) == 1) {
4693 /* forward ready code via exit status */
4694 exit(data);
4695 }
4696 status = -1;
4697 (void) wait4(pid, &status, 0, NULL);
4698 /* daemon process exited before becoming ready */
4699 if (WIFEXITED(status)) {
4700 /* assume daemon process printed useful message */
4701 exit(WEXITSTATUS(status));
4702 } else {
4703 zsd_warn(gettext("daemon process killed or died"));
4704 exit(1);
4705 }
4706 }
4707
4708 /*
4709 * child
4710 */
4711 pipe_fd = filedes[1];
4712 (void) close(filedes[0]);
4713
4714 /*
4715 * generic Unix setup
4716 */
4717 (void) setsid();
4718 (void) umask(0000);
4719
4720 return (0);
4721 }
4722
4723 static void
fattach_all_zones(boolean_t detach_only)4724 fattach_all_zones(boolean_t detach_only)
4725 {
4726 zoneid_t *zids;
4727 uint_t nzids, nzids_last;
4728 int i;
4729
4730 again:
4731 (void) zone_list(NULL, &nzids);
4732 nzids_last = nzids;
4733 zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4734 if (zids == NULL)
4735 zsd_error(gettext("Out of memory"));
4736
4737 (void) zone_list(zids, &nzids);
4738 if (nzids > nzids_last) {
4739 free(zids);
4740 goto again;
4741 }
4742 for (i = 0; i < nzids; i++)
4743 zsd_fattach_zone(zids[i], g_server_door, detach_only);
4744
4745 free(zids);
4746 }
4747
4748 int
main(int argc,char * argv[])4749 main(int argc, char *argv[])
4750 {
4751
4752 int arg;
4753 thread_t tid;
4754 scf_simple_prop_t *prop;
4755 uint64_t *intervalp;
4756 boolean_t opt_cleanup = B_FALSE;
4757
4758 g_main = thr_self();
4759 g_quit = B_FALSE;
4760 (void) signal(SIGINT, zonestat_quithandler);
4761 (void) signal(SIGTERM, zonestat_quithandler);
4762 (void) signal(SIGHUP, zonestat_quithandler);
4763 /* (void) sigignore(SIGCHLD); */
4764 (void) sigignore(SIGPIPE);
4765
4766 if (getzoneid() != GLOBAL_ZONEID)
4767 zsd_error(gettext("Must be run from global zone only"));
4768
4769 while ((arg = getopt(argc, argv, "c"))
4770 != EOF) {
4771 switch (arg) {
4772 case 'c':
4773 opt_cleanup = B_TRUE;
4774 break;
4775 default:
4776 zsd_error(gettext("Invalid option"));
4777 }
4778 }
4779
4780 if (opt_cleanup) {
4781 if (zsd_disable_cpu_stats() != 0)
4782 exit(1);
4783 else
4784 exit(0);
4785 }
4786
4787 /* Get the configured sample interval */
4788 prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4789 "config", "sample_interval");
4790 if (prop == NULL)
4791 zsd_error(gettext("Unable to fetch SMF property "
4792 "\"config/sample_interval\""));
4793
4794 if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4795 zsd_error(gettext("Malformed SMF property "
4796 "\"config/sample_interval\". Must be of type \"count\""));
4797
4798 intervalp = scf_simple_prop_next_count(prop);
4799 g_interval = *intervalp;
4800 if (g_interval == 0)
4801 zsd_error(gettext("Malformed SMF property "
4802 "\"config/sample_interval\". Must be greater than zero"));
4803
4804 scf_simple_prop_free(prop);
4805
4806 if (daemonize_start() < 0)
4807 zsd_error(gettext("Unable to start daemon\n"));
4808
4809 /* Run at high priority */
4810 zsd_set_fx();
4811
4812 (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4813 (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4814 (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4815
4816 g_server_door = door_create(zsd_server, NULL,
4817 DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4818 if (g_server_door < 0)
4819 zsd_error(gettext("Unable to create server door\n"));
4820
4821
4822 g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4823 DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4824 if (g_stat_door < 0)
4825 zsd_error(gettext("Unable to create statistics door\n"));
4826
4827 fattach_all_zones(B_FALSE);
4828
4829 if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4830 zsd_error(gettext("Unable to create statistics thread\n"));
4831
4832 daemonize_ready(0);
4833
4834 /* Wait for signal to quit */
4835 while (g_quit == B_FALSE)
4836 (void) pause();
4837
4838 /* detach doors */
4839 fattach_all_zones(B_TRUE);
4840
4841 (void) door_revoke(g_server_door);
4842 (void) door_revoke(g_stat_door);
4843
4844 /* kick stat thread and wait for it to close the statistics */
4845 (void) mutex_lock(&g_usage_cache_lock);
4846 g_quit = B_TRUE;
4847 (void) cond_signal(&g_usage_cache_kick);
4848 (void) mutex_unlock(&g_usage_cache_lock);
4849 end:
4850 (void) thr_join(tid, NULL, NULL);
4851 return (0);
4852 }
4853