1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 #include <alloca.h> 26 #include <assert.h> 27 #include <dirent.h> 28 #include <dlfcn.h> 29 #include <door.h> 30 #include <errno.h> 31 #include <exacct.h> 32 #include <ctype.h> 33 #include <fcntl.h> 34 #include <kstat.h> 35 #include <libcontract.h> 36 #include <libintl.h> 37 #include <libscf.h> 38 #include <zonestat.h> 39 #include <zonestat_impl.h> 40 #include <limits.h> 41 #include <pool.h> 42 #include <procfs.h> 43 #include <rctl.h> 44 #include <thread.h> 45 #include <signal.h> 46 #include <stdarg.h> 47 #include <stddef.h> 48 #include <stdio.h> 49 #include <stdlib.h> 50 #include <strings.h> 51 #include <synch.h> 52 #include <sys/acctctl.h> 53 #include <sys/contract/process.h> 54 #include <sys/ctfs.h> 55 #include <sys/fork.h> 56 #include <sys/param.h> 57 #include <sys/priocntl.h> 58 #include <sys/fxpriocntl.h> 59 #include <sys/processor.h> 60 #include <sys/pset.h> 61 #include <sys/socket.h> 62 #include <sys/stat.h> 63 #include <sys/statvfs.h> 64 #include <sys/swap.h> 65 #include <sys/systeminfo.h> 66 #include <thread.h> 67 #include 
<sys/list.h> 68 #include <sys/time.h> 69 #include <sys/types.h> 70 #include <sys/vm_usage.h> 71 #include <sys/wait.h> 72 #include <sys/zone.h> 73 #include <time.h> 74 #include <ucred.h> 75 #include <unistd.h> 76 #include <vm/anon.h> 77 #include <zone.h> 78 #include <zonestat.h> 79 80 #define MAX_PSET_NAME 1024 /* Taken from PV_NAME_MAX_LEN */ 81 #define ZSD_PSET_UNLIMITED UINT16_MAX 82 #define ZONESTAT_EXACCT_FILE "/var/adm/exacct/zonestat-process" 83 84 /* 85 * zonestatd implements gathering cpu and memory utilization data for 86 * running zones. It has these components: 87 * 88 * zsd_server: 89 * Door server to respond to client connections. Each client 90 * will connect using libzonestat.so, which will open and 91 * call /var/tmp/.zonestat_door. Each connecting client is given 92 * a file descriptor to the stat server. 93 * 94 * The zsd_server also responds to zoneadmd, which reports when a 95 * new zone is booted. This is used to fattach the zsd_server door 96 * into the new zone. 97 * 98 * zsd_stat_server: 99 * Receives client requests for the current utilization data. Each 100 * client request will cause zonestatd to update the current utilization 101 * data by kicking the stat_thread. 102 * 103 * If the client is in a non-global zone, the utilization data will 104 * be filtered to only show the given zone. The usage by all other zones 105 * will be added to the system utilization. 106 * 107 * stat_thread: 108 * The stat thread implements querying the system to determine the 109 * current utilization data for each running zone. This includes 110 * inspecting the system's processor set configuration, as well as details 111 * of each zone, such as their configured limits, and which processor 112 * sets they are running in. 113 * 114 * The stat_thread will only update memory utilization data as often as 115 * the configured config/sample_interval on the zones-monitoring service. 
116 */ 117 118 /* 119 * The private vmusage structure unfortunately uses size_t types, and assumes 120 * the caller's bitness matches the kernel's bitness. Since the getvmusage() 121 * system call is contracted, and zonestatd is 32 bit, the following structures 122 * are used to interact with a 32bit or 64 bit kernel. 123 */ 124 typedef struct zsd_vmusage32 { 125 id_t vmu_zoneid; 126 uint_t vmu_type; 127 id_t vmu_id; 128 129 uint32_t vmu_rss_all; 130 uint32_t vmu_rss_private; 131 uint32_t vmu_rss_shared; 132 uint32_t vmu_swap_all; 133 uint32_t vmu_swap_private; 134 uint32_t vmu_swap_shared; 135 } zsd_vmusage32_t; 136 137 typedef struct zsd_vmusage64 { 138 id_t vmu_zoneid; 139 uint_t vmu_type; 140 id_t vmu_id; 141 /* 142 * An amd64 kernel will align the following uint64_t members, but a 143 * 32bit i386 process will not without help. 144 */ 145 int vmu_align_next_members_on_8_bytes; 146 uint64_t vmu_rss_all; 147 uint64_t vmu_rss_private; 148 uint64_t vmu_rss_shared; 149 uint64_t vmu_swap_all; 150 uint64_t vmu_swap_private; 151 uint64_t vmu_swap_shared; 152 } zsd_vmusage64_t; 153 154 struct zsd_zone; 155 156 /* Used to store a zone's usage of a pset */ 157 typedef struct zsd_pset_usage { 158 struct zsd_zone *zsu_zone; 159 struct zsd_pset *zsu_pset; 160 161 list_node_t zsu_next; 162 163 zoneid_t zsu_zoneid; 164 boolean_t zsu_found; /* zone bound at end of interval */ 165 boolean_t zsu_active; /* zone was bound during interval */ 166 boolean_t zsu_new; /* zone newly bound in this interval */ 167 boolean_t zsu_deleted; /* zone was unbound in this interval */ 168 boolean_t zsu_empty; /* no procs in pset in this interval */ 169 time_t zsu_start; /* time when zone was found in pset */ 170 hrtime_t zsu_hrstart; /* time when zone was found in pset */ 171 uint64_t zsu_cpu_shares; 172 uint_t zsu_scheds; /* schedulers found in this pass */ 173 timestruc_t zsu_cpu_usage; /* cpu time used */ 174 } zsd_pset_usage_t; 175 176 /* Used to store a pset's utilization */ 177 typedef 
struct zsd_pset { 178 psetid_t zsp_id; 179 list_node_t zsp_next; 180 char zsp_name[ZS_PSETNAME_MAX]; 181 182 uint_t zsp_cputype; /* default, dedicated or shared */ 183 boolean_t zsp_found; /* pset found at end of interval */ 184 boolean_t zsp_new; /* pset new in this interval */ 185 boolean_t zsp_deleted; /* pset deleted in this interval */ 186 boolean_t zsp_active; /* pset existed during interval */ 187 boolean_t zsp_empty; /* no processes in pset */ 188 time_t zsp_start; 189 hrtime_t zsp_hrstart; 190 191 uint64_t zsp_online; /* online cpus in interval */ 192 uint64_t zsp_size; /* size in this interval */ 193 uint64_t zsp_min; /* configured min in this interval */ 194 uint64_t zsp_max; /* configured max in this interval */ 195 int64_t zsp_importance; /* configured max in this interval */ 196 197 uint_t zsp_scheds; /* scheds of processes found in pset */ 198 uint64_t zsp_cpu_shares; /* total shares in this interval */ 199 200 timestruc_t zsp_total_time; 201 timestruc_t zsp_usage_kern; 202 timestruc_t zsp_usage_zones; 203 204 /* Individual zone usages of pset */ 205 list_t zsp_usage_list; 206 int zsp_nusage; 207 208 /* Summed kstat values from individual cpus in pset */ 209 timestruc_t zsp_idle; 210 timestruc_t zsp_intr; 211 timestruc_t zsp_kern; 212 timestruc_t zsp_user; 213 214 } zsd_pset_t; 215 216 /* Used to track an individual cpu's utilization as reported by kstats */ 217 typedef struct zsd_cpu { 218 processorid_t zsc_id; 219 list_node_t zsc_next; 220 psetid_t zsc_psetid; 221 psetid_t zsc_psetid_prev; 222 zsd_pset_t *zsc_pset; 223 224 boolean_t zsc_found; /* cpu online in this interval */ 225 boolean_t zsc_onlined; /* cpu onlined during this interval */ 226 boolean_t zsc_offlined; /* cpu offlined during this interval */ 227 boolean_t zsc_active; /* cpu online during this interval */ 228 boolean_t zsc_allocated; /* True if cpu has ever been found */ 229 230 /* kstats this interval */ 231 uint64_t zsc_nsec_idle; 232 uint64_t zsc_nsec_intr; 233 uint64_t 
zsc_nsec_kern; 234 uint64_t zsc_nsec_user; 235 236 /* kstats in most recent interval */ 237 uint64_t zsc_nsec_idle_prev; 238 uint64_t zsc_nsec_intr_prev; 239 uint64_t zsc_nsec_kern_prev; 240 uint64_t zsc_nsec_user_prev; 241 242 /* Total kstat increases since zonestatd started reading kstats */ 243 timestruc_t zsc_idle; 244 timestruc_t zsc_intr; 245 timestruc_t zsc_kern; 246 timestruc_t zsc_user; 247 248 } zsd_cpu_t; 249 250 /* Used to describe an individual zone and its utilization */ 251 typedef struct zsd_zone { 252 zoneid_t zsz_id; 253 list_node_t zsz_next; 254 char zsz_name[ZS_ZONENAME_MAX]; 255 uint_t zsz_cputype; 256 uint_t zsz_iptype; 257 time_t zsz_start; 258 hrtime_t zsz_hrstart; 259 260 char zsz_pool[ZS_POOLNAME_MAX]; 261 char zsz_pset[ZS_PSETNAME_MAX]; 262 int zsz_default_sched; 263 /* These are deduced by inspecting processes */ 264 psetid_t zsz_psetid; 265 uint_t zsz_scheds; 266 267 boolean_t zsz_new; /* zone booted during this interval */ 268 boolean_t zsz_deleted; /* halted during this interval */ 269 boolean_t zsz_active; /* running in this interval */ 270 boolean_t zsz_empty; /* no processes in this interval */ 271 boolean_t zsz_gone; /* not installed in this interval */ 272 boolean_t zsz_found; /* Running at end of this interval */ 273 274 uint64_t zsz_cpu_shares; 275 uint64_t zsz_cpu_cap; 276 uint64_t zsz_ram_cap; 277 uint64_t zsz_locked_cap; 278 uint64_t zsz_vm_cap; 279 280 uint64_t zsz_cpus_online; 281 timestruc_t zsz_cpu_usage; /* cpu time of cpu cap */ 282 timestruc_t zsz_cap_time; /* cpu time of cpu cap */ 283 timestruc_t zsz_share_time; /* cpu time of share of cpu */ 284 timestruc_t zsz_pset_time; /* time of all psets zone is bound to */ 285 286 uint64_t zsz_usage_ram; 287 uint64_t zsz_usage_locked; 288 uint64_t zsz_usage_vm; 289 290 uint64_t zsz_processes_cap; 291 uint64_t zsz_lwps_cap; 292 uint64_t zsz_shm_cap; 293 uint64_t zsz_shmids_cap; 294 uint64_t zsz_semids_cap; 295 uint64_t zsz_msgids_cap; 296 uint64_t zsz_lofi_cap; 297 298 
uint64_t zsz_processes; 299 uint64_t zsz_lwps; 300 uint64_t zsz_shm; 301 uint64_t zsz_shmids; 302 uint64_t zsz_semids; 303 uint64_t zsz_msgids; 304 uint64_t zsz_lofi; 305 306 } zsd_zone_t; 307 308 /* 309 * Used to track the cpu usage of an individual processes. 310 * 311 * zonestatd sweeps /proc each interval and charges the cpu usage of processes. 312 * to their zone. As processes exit, their extended accounting records are 313 * read and the difference of their total and known usage is charged to their 314 * zone. 315 * 316 * If a process is never seen in /proc, the total usage on its extended 317 * accounting record will be charged to its zone. 318 */ 319 typedef struct zsd_proc { 320 list_node_t zspr_next; 321 pid_t zspr_ppid; 322 psetid_t zspr_psetid; 323 zoneid_t zspr_zoneid; 324 int zspr_sched; 325 timestruc_t zspr_usage; 326 } zsd_proc_t; 327 328 /* Used to track the overall resource usage of the system */ 329 typedef struct zsd_system { 330 331 uint64_t zss_ram_total; 332 uint64_t zss_ram_kern; 333 uint64_t zss_ram_zones; 334 335 uint64_t zss_locked_kern; 336 uint64_t zss_locked_zones; 337 338 uint64_t zss_vm_total; 339 uint64_t zss_vm_kern; 340 uint64_t zss_vm_zones; 341 342 uint64_t zss_swap_total; 343 uint64_t zss_swap_used; 344 345 timestruc_t zss_idle; 346 timestruc_t zss_intr; 347 timestruc_t zss_kern; 348 timestruc_t zss_user; 349 350 timestruc_t zss_cpu_total_time; 351 timestruc_t zss_cpu_usage_kern; 352 timestruc_t zss_cpu_usage_zones; 353 354 uint64_t zss_maxpid; 355 uint64_t zss_processes_max; 356 uint64_t zss_lwps_max; 357 uint64_t zss_shm_max; 358 uint64_t zss_shmids_max; 359 uint64_t zss_semids_max; 360 uint64_t zss_msgids_max; 361 uint64_t zss_lofi_max; 362 363 uint64_t zss_processes; 364 uint64_t zss_lwps; 365 uint64_t zss_shm; 366 uint64_t zss_shmids; 367 uint64_t zss_semids; 368 uint64_t zss_msgids; 369 uint64_t zss_lofi; 370 371 uint64_t zss_ncpus; 372 uint64_t zss_ncpus_online; 373 374 } zsd_system_t; 375 376 /* 377 * A dumping ground 
for various information and structures used to compute 378 * utilization. 379 * 380 * This structure is used to track the system while clients are connected. 381 * When The first client connects, a zsd_ctl is allocated and configured by 382 * zsd_open(). When all clients disconnect, the zsd_ctl is closed. 383 */ 384 typedef struct zsd_ctl { 385 kstat_ctl_t *zsctl_kstat_ctl; 386 387 /* To track extended accounting */ 388 int zsctl_proc_fd; /* Log currently being used */ 389 ea_file_t zsctl_proc_eaf; 390 struct stat64 zsctl_proc_stat; 391 int zsctl_proc_open; 392 int zsctl_proc_fd_next; /* Log file to use next */ 393 ea_file_t zsctl_proc_eaf_next; 394 struct stat64 zsctl_proc_stat_next; 395 int zsctl_proc_open_next; 396 397 /* pool configuration handle */ 398 pool_conf_t *zsctl_pool_conf; 399 int zsctl_pool_status; 400 int zsctl_pool_changed; 401 402 /* The above usage tacking structures */ 403 zsd_system_t *zsctl_system; 404 list_t zsctl_zones; 405 list_t zsctl_psets; 406 list_t zsctl_cpus; 407 zsd_cpu_t *zsctl_cpu_array; 408 zsd_proc_t *zsctl_proc_array; 409 410 /* Various system info */ 411 uint64_t zsctl_maxcpuid; 412 uint64_t zsctl_maxproc; 413 uint64_t zsctl_kern_bits; 414 uint64_t zsctl_pagesize; 415 416 /* Used to track time available under a cpu cap. 
*/ 417 uint64_t zsctl_hrtime; 418 uint64_t zsctl_hrtime_prev; 419 timestruc_t zsctl_hrtime_total; 420 421 struct timeval zsctl_timeofday; 422 423 /* Caches for arrays allocated for use by various system calls */ 424 psetid_t *zsctl_pset_cache; 425 uint_t zsctl_pset_ncache; 426 processorid_t *zsctl_cpu_cache; 427 uint_t zsctl_cpu_ncache; 428 zoneid_t *zsctl_zone_cache; 429 uint_t zsctl_zone_ncache; 430 struct swaptable *zsctl_swap_cache; 431 uint64_t zsctl_swap_cache_size; 432 uint64_t zsctl_swap_cache_num; 433 zsd_vmusage64_t *zsctl_vmusage_cache; 434 uint64_t zsctl_vmusage_cache_num; 435 436 /* Info about procfs for scanning /proc */ 437 pool_value_t *zsctl_pool_vals[3]; 438 439 /* Counts on tracked entities */ 440 uint_t zsctl_nzones; 441 uint_t zsctl_npsets; 442 uint_t zsctl_npset_usages; 443 } zsd_ctl_t; 444 445 zsd_ctl_t *g_ctl; 446 boolean_t g_open; /* True if g_ctl is open */ 447 int g_hasclient; /* True if any clients are connected */ 448 449 /* 450 * The usage cache is updated by the stat_thread, and copied to clients by 451 * the zsd_stat_server. Mutex and cond are to synchronize between the 452 * stat_thread and the stat_server. 453 */ 454 zs_usage_cache_t *g_usage_cache; 455 mutex_t g_usage_cache_lock; 456 cond_t g_usage_cache_kick; 457 uint_t g_usage_cache_kickers; 458 cond_t g_usage_cache_wait; 459 char *g_usage_cache_buf; 460 uint_t g_usage_cache_bufsz; 461 uint64_t g_gen_next; 462 463 /* fds of door servers */ 464 int g_server_door; 465 int g_stat_door; 466 467 /* 468 * Starting and current time. Used to throttle memory calculation, and to 469 * mark new zones and psets with their boot and creation time. 470 */ 471 time_t g_now; 472 time_t g_start; 473 hrtime_t g_hrnow; 474 hrtime_t g_hrstart; 475 uint64_t g_interval; 476 477 /* 478 * main() thread. 479 */ 480 thread_t g_main; 481 482 /* PRINTFLIKE1 */ 483 static void 484 zsd_warn(const char *fmt, ...) 
485 { 486 va_list alist; 487 488 va_start(alist, fmt); 489 490 (void) fprintf(stderr, gettext("zonestat: Warning: ")); 491 (void) vfprintf(stderr, fmt, alist); 492 (void) fprintf(stderr, "\n"); 493 va_end(alist); 494 } 495 496 /* PRINTFLIKE1 */ 497 static void 498 zsd_error(const char *fmt, ...) 499 { 500 va_list alist; 501 502 va_start(alist, fmt); 503 504 (void) fprintf(stderr, gettext("zonestat: Error: ")); 505 (void) vfprintf(stderr, fmt, alist); 506 (void) fprintf(stderr, "\n"); 507 va_end(alist); 508 exit(1); 509 } 510 511 /* Turns on extended accounting if not configured externally */ 512 int 513 zsd_enable_cpu_stats() 514 { 515 char *path = ZONESTAT_EXACCT_FILE; 516 char oldfile[MAXPATHLEN]; 517 int ret, state = AC_ON; 518 ac_res_t res[6]; 519 520 /* 521 * Start a new accounting file if accounting not configured 522 * externally. 523 */ 524 525 res[0].ar_id = AC_PROC_PID; 526 res[0].ar_state = AC_ON; 527 res[1].ar_id = AC_PROC_ANCPID; 528 res[1].ar_state = AC_ON; 529 res[2].ar_id = AC_PROC_CPU; 530 res[2].ar_state = AC_ON; 531 res[3].ar_id = AC_PROC_TIME; 532 res[3].ar_state = AC_ON; 533 res[4].ar_id = AC_PROC_ZONENAME; 534 res[4].ar_state = AC_ON; 535 res[5].ar_id = AC_NONE; 536 res[5].ar_state = AC_ON; 537 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) { 538 zsd_warn(gettext("Unable to set accounting resources")); 539 return (-1); 540 } 541 /* Only set accounting file if none is configured */ 542 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); 543 if (ret < 0) { 544 545 (void) unlink(path); 546 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) 547 == -1) { 548 zsd_warn(gettext("Unable to set accounting file")); 549 return (-1); 550 } 551 } 552 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) { 553 zsd_warn(gettext("Unable to enable accounting")); 554 return (-1); 555 } 556 return (0); 557 } 558 559 /* Turns off extended accounting if not configured externally */ 560 int 561 zsd_disable_cpu_stats() 
562 { 563 char *path = ZONESTAT_EXACCT_FILE; 564 int ret, state = AC_OFF; 565 ac_res_t res[6]; 566 char oldfile[MAXPATHLEN]; 567 568 /* If accounting file is externally configured, leave it alone */ 569 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); 570 if (ret == 0 && strcmp(oldfile, path) != 0) 571 return (0); 572 573 res[0].ar_id = AC_PROC_PID; 574 res[0].ar_state = AC_OFF; 575 res[1].ar_id = AC_PROC_ANCPID; 576 res[1].ar_state = AC_OFF; 577 res[2].ar_id = AC_PROC_CPU; 578 res[2].ar_state = AC_OFF; 579 res[3].ar_id = AC_PROC_TIME; 580 res[3].ar_state = AC_OFF; 581 res[4].ar_id = AC_PROC_ZONENAME; 582 res[4].ar_state = AC_OFF; 583 res[5].ar_id = AC_NONE; 584 res[5].ar_state = AC_OFF; 585 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) { 586 zsd_warn(gettext("Unable to clear accounting resources")); 587 return (-1); 588 } 589 if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) { 590 zsd_warn(gettext("Unable to clear accounting file")); 591 return (-1); 592 } 593 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) { 594 zsd_warn(gettext("Unable to diable accounting")); 595 return (-1); 596 } 597 598 (void) unlink(path); 599 return (0); 600 } 601 602 /* 603 * If not configured externally, deletes the current extended accounting file 604 * and starts a new one. 605 * 606 * Since the stat_thread holds an open handle to the accounting file, it will 607 * read all remaining entries from the old file before switching to 608 * read the new one. 
609 */ 610 int 611 zsd_roll_exacct(void) 612 { 613 int ret; 614 char *path = ZONESTAT_EXACCT_FILE; 615 char oldfile[MAXPATHLEN]; 616 617 /* If accounting file is externally configured, leave it alone */ 618 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile)); 619 if (ret == 0 && strcmp(oldfile, path) != 0) 620 return (0); 621 622 if (unlink(path) != 0) 623 /* Roll it next time */ 624 return (0); 625 626 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) { 627 zsd_warn(gettext("Unable to set accounting file")); 628 return (-1); 629 } 630 return (0); 631 } 632 633 /* Contract stuff for zone_enter() */ 634 int 635 init_template(void) 636 { 637 int fd; 638 int err = 0; 639 640 fd = open64(CTFS_ROOT "/process/template", O_RDWR); 641 if (fd == -1) 642 return (-1); 643 644 /* 645 * For now, zoneadmd doesn't do anything with the contract. 646 * Deliver no events, don't inherit, and allow it to be orphaned. 647 */ 648 err |= ct_tmpl_set_critical(fd, 0); 649 err |= ct_tmpl_set_informative(fd, 0); 650 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR); 651 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT); 652 if (err || ct_tmpl_activate(fd)) { 653 (void) close(fd); 654 return (-1); 655 } 656 657 return (fd); 658 } 659 660 /* 661 * Contract stuff for zone_enter() 662 */ 663 int 664 contract_latest(ctid_t *id) 665 { 666 int cfd, r; 667 ct_stathdl_t st; 668 ctid_t result; 669 670 if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1) 671 return (errno); 672 673 if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) { 674 (void) close(cfd); 675 return (r); 676 } 677 678 result = ct_status_get_id(st); 679 ct_status_free(st); 680 (void) close(cfd); 681 682 *id = result; 683 return (0); 684 } 685 686 static int 687 close_on_exec(int fd) 688 { 689 int flags = fcntl(fd, F_GETFD, 0); 690 if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1)) 691 return (0); 692 return (-1); 693 } 694 695 int 696 contract_open(ctid_t ctid, 
const char *type, const char *file, int oflag) 697 { 698 char path[PATH_MAX]; 699 int n, fd; 700 701 if (type == NULL) 702 type = "all"; 703 704 n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file); 705 if (n >= sizeof (path)) { 706 errno = ENAMETOOLONG; 707 return (-1); 708 } 709 710 fd = open64(path, oflag); 711 if (fd != -1) { 712 if (close_on_exec(fd) == -1) { 713 int err = errno; 714 (void) close(fd); 715 errno = err; 716 return (-1); 717 } 718 } 719 return (fd); 720 } 721 722 int 723 contract_abandon_id(ctid_t ctid) 724 { 725 int fd, err; 726 727 fd = contract_open(ctid, "all", "ctl", O_WRONLY); 728 if (fd == -1) 729 return (errno); 730 731 err = ct_ctl_abandon(fd); 732 (void) close(fd); 733 734 return (err); 735 } 736 /* 737 * Attach the zsd_server to a zone. Called for each zone when zonestatd 738 * starts, and for each newly booted zone when zoneadmd contacts the zsd_server 739 * 740 * Zone_enter is used to avoid reaching into zone to fattach door. 741 */ 742 static void 743 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only) 744 { 745 char *path = ZS_DOOR_PATH; 746 int fd, pid, stat, tmpl_fd; 747 ctid_t ct; 748 749 if ((tmpl_fd = init_template()) == -1) { 750 zsd_warn("Unable to init template"); 751 return; 752 } 753 754 pid = forkx(0); 755 if (pid < 0) { 756 (void) ct_tmpl_clear(tmpl_fd); 757 zsd_warn(gettext( 758 "Unable to fork to add zonestat to zoneid %d\n"), zid); 759 return; 760 } 761 762 if (pid == 0) { 763 (void) ct_tmpl_clear(tmpl_fd); 764 (void) close(tmpl_fd); 765 if (zid != 0 && zone_enter(zid) != 0) { 766 if (errno == EINVAL) { 767 _exit(0); 768 } 769 _exit(1); 770 } 771 (void) fdetach(path); 772 (void) unlink(path); 773 if (detach_only) 774 _exit(0); 775 fd = open(path, O_CREAT|O_RDWR, 0644); 776 if (fd < 0) 777 _exit(2); 778 if (fattach(door, path) != 0) 779 _exit(3); 780 _exit(0); 781 } 782 if (contract_latest(&ct) == -1) 783 ct = -1; 784 (void) ct_tmpl_clear(tmpl_fd); 785 (void) close(tmpl_fd); 786 (void) 
contract_abandon_id(ct); 787 while (waitpid(pid, &stat, 0) != pid) 788 ; 789 if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0) 790 return; 791 792 zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid); 793 794 if (WEXITSTATUS(stat) == 1) 795 zsd_warn(gettext("Cannot entering zone")); 796 else if (WEXITSTATUS(stat) == 2) 797 zsd_warn(gettext("Unable to create door file: %s"), path); 798 else if (WEXITSTATUS(stat) == 3) 799 zsd_warn(gettext("Unable to fattach file: %s"), path); 800 801 zsd_warn(gettext("Internal error entering zone: %d"), zid); 802 } 803 804 /* 805 * Zone lookup and allocation functions to manage list of currently running 806 * zones. 807 */ 808 static zsd_zone_t * 809 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) 810 { 811 zsd_zone_t *zone; 812 813 for (zone = list_head(&ctl->zsctl_zones); zone != NULL; 814 zone = list_next(&ctl->zsctl_zones, zone)) { 815 if (strcmp(zone->zsz_name, zonename) == 0) { 816 if (zoneid != -1) 817 zone->zsz_id = zoneid; 818 return (zone); 819 } 820 } 821 return (NULL); 822 } 823 824 static zsd_zone_t * 825 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid) 826 { 827 zsd_zone_t *zone; 828 829 for (zone = list_head(&ctl->zsctl_zones); zone != NULL; 830 zone = list_next(&ctl->zsctl_zones, zone)) { 831 if (zone->zsz_id == zoneid) 832 return (zone); 833 } 834 return (NULL); 835 } 836 837 static zsd_zone_t * 838 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) 839 { 840 zsd_zone_t *zone; 841 842 if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL) 843 return (NULL); 844 845 (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name)); 846 zone->zsz_id = zoneid; 847 zone->zsz_found = B_FALSE; 848 849 /* 850 * Allocate as deleted so if not found in first pass, zone is deleted 851 * from list. This can happen if zone is returned by zone_list, but 852 * exits before first attempt to fetch zone details. 
853 */ 854 zone->zsz_start = g_now; 855 zone->zsz_hrstart = g_hrnow; 856 zone->zsz_deleted = B_TRUE; 857 858 zone->zsz_cpu_shares = ZS_LIMIT_NONE; 859 zone->zsz_cpu_cap = ZS_LIMIT_NONE; 860 zone->zsz_ram_cap = ZS_LIMIT_NONE; 861 zone->zsz_locked_cap = ZS_LIMIT_NONE; 862 zone->zsz_vm_cap = ZS_LIMIT_NONE; 863 864 zone->zsz_processes_cap = ZS_LIMIT_NONE; 865 zone->zsz_lwps_cap = ZS_LIMIT_NONE; 866 zone->zsz_shm_cap = ZS_LIMIT_NONE; 867 zone->zsz_shmids_cap = ZS_LIMIT_NONE; 868 zone->zsz_semids_cap = ZS_LIMIT_NONE; 869 zone->zsz_msgids_cap = ZS_LIMIT_NONE; 870 zone->zsz_lofi_cap = ZS_LIMIT_NONE; 871 872 ctl->zsctl_nzones++; 873 874 return (zone); 875 } 876 877 static zsd_zone_t * 878 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid) 879 { 880 zsd_zone_t *zone, *tmp; 881 882 if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL) 883 return (zone); 884 885 if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL) 886 return (NULL); 887 888 /* Insert sorted by zonename */ 889 tmp = list_head(&ctl->zsctl_zones); 890 while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0) 891 tmp = list_next(&ctl->zsctl_zones, tmp); 892 893 list_insert_before(&ctl->zsctl_zones, tmp, zone); 894 return (zone); 895 } 896 897 /* 898 * Mark all zones as not existing. As zones are found, they will 899 * be marked as existing. If a zone is not found, then it must have 900 * halted. 901 */ 902 static void 903 zsd_mark_zones_start(zsd_ctl_t *ctl) 904 { 905 906 zsd_zone_t *zone; 907 908 for (zone = list_head(&ctl->zsctl_zones); zone != NULL; 909 zone = list_next(&ctl->zsctl_zones, zone)) { 910 zone->zsz_found = B_FALSE; 911 } 912 } 913 914 /* 915 * Mark each zone as not using pset. If processes are found using the 916 * pset, the zone will remain bound to the pset. If none of a zones 917 * processes are bound to the pset, the zone's usage of the pset will 918 * be deleted. 
919 * 920 */ 921 static void 922 zsd_mark_pset_usage_start(zsd_pset_t *pset) 923 { 924 zsd_pset_usage_t *usage; 925 926 for (usage = list_head(&pset->zsp_usage_list); 927 usage != NULL; 928 usage = list_next(&pset->zsp_usage_list, usage)) { 929 usage->zsu_found = B_FALSE; 930 usage->zsu_empty = B_TRUE; 931 } 932 } 933 934 /* 935 * Mark each pset as not existing. If a pset is found, it will be marked 936 * as existing. If a pset is not found, it wil be deleted. 937 */ 938 static void 939 zsd_mark_psets_start(zsd_ctl_t *ctl) 940 { 941 zsd_pset_t *pset; 942 943 for (pset = list_head(&ctl->zsctl_psets); pset != NULL; 944 pset = list_next(&ctl->zsctl_psets, pset)) { 945 pset->zsp_found = B_FALSE; 946 zsd_mark_pset_usage_start(pset); 947 } 948 } 949 950 /* 951 * A pset was found. Update its information 952 */ 953 static void 954 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online, 955 uint64_t size, uint64_t min, uint64_t max, int64_t importance) 956 { 957 pset->zsp_empty = B_TRUE; 958 pset->zsp_deleted = B_FALSE; 959 960 assert(pset->zsp_found == B_FALSE); 961 962 /* update pset flags */ 963 if (pset->zsp_active == B_FALSE) 964 /* pset not seen on previous interval. It is new. */ 965 pset->zsp_new = B_TRUE; 966 else 967 pset->zsp_new = B_FALSE; 968 969 pset->zsp_found = B_TRUE; 970 pset->zsp_cputype = type; 971 pset->zsp_online = online; 972 pset->zsp_size = size; 973 pset->zsp_min = min; 974 pset->zsp_max = max; 975 pset->zsp_importance = importance; 976 pset->zsp_cpu_shares = 0; 977 pset->zsp_scheds = 0; 978 pset->zsp_active = B_TRUE; 979 } 980 981 /* 982 * A zone's process was found using a pset. Charge the process to the pset and 983 * the per-zone data for the pset. 
984 */ 985 static void 986 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched) 987 { 988 zsd_zone_t *zone = usage->zsu_zone; 989 zsd_pset_t *pset = usage->zsu_pset; 990 991 /* Nothing to do if already found */ 992 if (usage->zsu_found == B_TRUE) 993 goto add_stats; 994 995 usage->zsu_found = B_TRUE; 996 usage->zsu_empty = B_FALSE; 997 998 usage->zsu_deleted = B_FALSE; 999 /* update usage flags */ 1000 if (usage->zsu_active == B_FALSE) 1001 usage->zsu_new = B_TRUE; 1002 else 1003 usage->zsu_new = B_FALSE; 1004 1005 usage->zsu_scheds = 0; 1006 usage->zsu_cpu_shares = ZS_LIMIT_NONE; 1007 usage->zsu_active = B_TRUE; 1008 pset->zsp_empty = B_FALSE; 1009 zone->zsz_empty = B_FALSE; 1010 1011 add_stats: 1012 /* Detect zone's pset id, and if it is bound to multiple psets */ 1013 if (zone->zsz_psetid == ZS_PSET_ERROR) 1014 zone->zsz_psetid = pset->zsp_id; 1015 else if (zone->zsz_psetid != pset->zsp_id) 1016 zone->zsz_psetid = ZS_PSET_MULTI; 1017 1018 usage->zsu_scheds |= sched; 1019 pset->zsp_scheds |= sched; 1020 zone->zsz_scheds |= sched; 1021 1022 /* Record if FSS is co-habitating with conflicting scheduler */ 1023 if ((pset->zsp_scheds & ZS_SCHED_FSS) && 1024 usage->zsu_scheds & ( 1025 ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) { 1026 usage->zsu_scheds |= ZS_SCHED_CONFLICT; 1027 1028 pset->zsp_scheds |= ZS_SCHED_CONFLICT; 1029 } 1030 1031 } 1032 1033 /* Add cpu time for a process to a pset, zone, and system totals */ 1034 static void 1035 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta) 1036 { 1037 zsd_system_t *system = ctl->zsctl_system; 1038 zsd_zone_t *zone = usage->zsu_zone; 1039 zsd_pset_t *pset = usage->zsu_pset; 1040 1041 TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta); 1042 TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta); 1043 TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta); 1044 TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta); 1045 } 1046 1047 /* Determine which processor sets have been 
deleted */ 1048 static void 1049 zsd_mark_psets_end(zsd_ctl_t *ctl) 1050 { 1051 zsd_pset_t *pset, *tmp; 1052 1053 /* 1054 * Mark pset as not exists, and deleted if it existed 1055 * previous interval. 1056 */ 1057 pset = list_head(&ctl->zsctl_psets); 1058 while (pset != NULL) { 1059 if (pset->zsp_found == B_FALSE) { 1060 pset->zsp_empty = B_TRUE; 1061 if (pset->zsp_deleted == B_TRUE) { 1062 tmp = pset; 1063 pset = list_next(&ctl->zsctl_psets, pset); 1064 list_remove(&ctl->zsctl_psets, tmp); 1065 free(tmp); 1066 ctl->zsctl_npsets--; 1067 continue; 1068 } else { 1069 /* Pset vanished during this interval */ 1070 pset->zsp_new = B_FALSE; 1071 pset->zsp_deleted = B_TRUE; 1072 pset->zsp_active = B_TRUE; 1073 } 1074 } 1075 pset = list_next(&ctl->zsctl_psets, pset); 1076 } 1077 } 1078 1079 /* Determine which zones are no longer bound to processor sets */ 1080 static void 1081 zsd_mark_pset_usages_end(zsd_ctl_t *ctl) 1082 { 1083 zsd_pset_t *pset; 1084 zsd_zone_t *zone; 1085 zsd_pset_usage_t *usage, *tmp; 1086 1087 /* 1088 * Mark pset as not exists, and deleted if it existed previous 1089 * interval. 1090 */ 1091 for (pset = list_head(&ctl->zsctl_psets); pset != NULL; 1092 pset = list_next(&ctl->zsctl_psets, pset)) { 1093 usage = list_head(&pset->zsp_usage_list); 1094 while (usage != NULL) { 1095 /* 1096 * Mark pset as not exists, and deleted if it existed 1097 * previous interval. 
1098 */ 1099 if (usage->zsu_found == B_FALSE || 1100 usage->zsu_zone->zsz_deleted == B_TRUE || 1101 usage->zsu_pset->zsp_deleted == B_TRUE) { 1102 tmp = usage; 1103 usage = list_next(&pset->zsp_usage_list, 1104 usage); 1105 list_remove(&pset->zsp_usage_list, tmp); 1106 free(tmp); 1107 pset->zsp_nusage--; 1108 ctl->zsctl_npset_usages--; 1109 continue; 1110 } else { 1111 usage->zsu_new = B_FALSE; 1112 usage->zsu_deleted = B_TRUE; 1113 usage->zsu_active = B_TRUE; 1114 } 1115 /* Add cpu shares for usages that are in FSS */ 1116 zone = usage->zsu_zone; 1117 if (usage->zsu_scheds & ZS_SCHED_FSS && 1118 zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED && 1119 zone->zsz_cpu_shares != 0) { 1120 zone = usage->zsu_zone; 1121 usage->zsu_cpu_shares = zone->zsz_cpu_shares; 1122 pset->zsp_cpu_shares += zone->zsz_cpu_shares; 1123 } 1124 usage = list_next(&pset->zsp_usage_list, 1125 usage); 1126 } 1127 } 1128 } 1129 1130 /* A zone has been found. Update its information */ 1131 static void 1132 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares, 1133 uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap, 1134 uint64_t vm_cap, uint64_t processes_cap, uint64_t processes, 1135 uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm, 1136 uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap, 1137 uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap, 1138 uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype, 1139 uint_t iptype) 1140 { 1141 zsd_system_t *sys = ctl->zsctl_system; 1142 1143 assert(zone->zsz_found == B_FALSE); 1144 1145 /* 1146 * Mark zone as exists, and new if it did not exist in previous 1147 * interval. 1148 */ 1149 zone->zsz_found = B_TRUE; 1150 zone->zsz_empty = B_TRUE; 1151 zone->zsz_deleted = B_FALSE; 1152 1153 /* 1154 * Zone is new. Assume zone's properties are the same over entire 1155 * interval. 
1156 */ 1157 if (zone->zsz_active == B_FALSE) 1158 zone->zsz_new = B_TRUE; 1159 else 1160 zone->zsz_new = B_FALSE; 1161 1162 (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool)); 1163 (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset)); 1164 zone->zsz_default_sched = sched; 1165 1166 /* Schedulers updated later as processes are found */ 1167 zone->zsz_scheds = 0; 1168 1169 /* Cpus updated later as psets bound are identified */ 1170 zone->zsz_cpus_online = 0; 1171 1172 zone->zsz_cputype = cputype; 1173 zone->zsz_iptype = iptype; 1174 zone->zsz_psetid = ZS_PSET_ERROR; 1175 zone->zsz_cpu_cap = cpu_cap; 1176 zone->zsz_cpu_shares = cpu_shares; 1177 zone->zsz_ram_cap = ram_cap; 1178 zone->zsz_locked_cap = locked_cap; 1179 zone->zsz_vm_cap = vm_cap; 1180 zone->zsz_processes_cap = processes_cap; 1181 zone->zsz_processes = processes; 1182 zone->zsz_lwps_cap = lwps_cap; 1183 zone->zsz_lwps = lwps; 1184 zone->zsz_shm_cap = shm_cap; 1185 zone->zsz_shm = shm; 1186 zone->zsz_shmids_cap = shmids_cap; 1187 zone->zsz_shmids = shmids; 1188 zone->zsz_semids_cap = semids_cap; 1189 zone->zsz_semids = semids; 1190 zone->zsz_msgids_cap = msgids_cap; 1191 zone->zsz_msgids = msgids; 1192 zone->zsz_lofi_cap = lofi_cap; 1193 zone->zsz_lofi = lofi; 1194 1195 sys->zss_processes += processes; 1196 sys->zss_lwps += lwps; 1197 sys->zss_shm += shm; 1198 sys->zss_shmids += shmids; 1199 sys->zss_semids += semids; 1200 sys->zss_msgids += msgids; 1201 sys->zss_lofi += lofi; 1202 zone->zsz_active = B_TRUE; 1203 } 1204 1205 1206 /* Determine which zones have halted */ 1207 static void 1208 zsd_mark_zones_end(zsd_ctl_t *ctl) 1209 { 1210 zsd_zone_t *zone, *tmp; 1211 1212 /* 1213 * Mark zone as not existing, or delete if it did not exist in 1214 * previous interval. 
1215 */ 1216 zone = list_head(&ctl->zsctl_zones); 1217 while (zone != NULL) { 1218 if (zone->zsz_found == B_FALSE) { 1219 zone->zsz_empty = B_TRUE; 1220 if (zone->zsz_deleted == B_TRUE) { 1221 /* 1222 * Zone deleted in prior interval, 1223 * so it no longer exists. 1224 */ 1225 tmp = zone; 1226 zone = list_next(&ctl->zsctl_zones, zone); 1227 list_remove(&ctl->zsctl_zones, tmp); 1228 free(tmp); 1229 ctl->zsctl_nzones--; 1230 continue; 1231 } else { 1232 zone->zsz_new = B_FALSE; 1233 zone->zsz_deleted = B_TRUE; 1234 zone->zsz_active = B_TRUE; 1235 } 1236 } 1237 zone = list_next(&ctl->zsctl_zones, zone); 1238 } 1239 } 1240 1241 /* 1242 * Mark cpus as not existing. If a cpu is found, it will be updated. If 1243 * a cpu is not found, then it must have gone offline, so it will be 1244 * deleted. 1245 * 1246 * The kstat tracking data is rolled so that the usage since the previous 1247 * interval can be determined. 1248 */ 1249 static void 1250 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll) 1251 { 1252 zsd_cpu_t *cpu; 1253 1254 /* 1255 * Mark all cpus as not existing. As cpus are found, they will 1256 * be marked as existing. 1257 */ 1258 for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL; 1259 cpu = list_next(&ctl->zsctl_cpus, cpu)) { 1260 cpu->zsc_found = B_FALSE; 1261 if (cpu->zsc_active == B_TRUE && roll) { 1262 cpu->zsc_psetid_prev = cpu->zsc_psetid; 1263 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle; 1264 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr; 1265 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern; 1266 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user; 1267 } 1268 } 1269 } 1270 1271 /* 1272 * An array the size of the maximum number of cpus is kept. Within this array 1273 * a list of the online cpus is maintained. 
1274 */ 1275 zsd_cpu_t * 1276 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid) 1277 { 1278 zsd_cpu_t *cpu; 1279 1280 assert(cpuid < ctl->zsctl_maxcpuid); 1281 cpu = &(ctl->zsctl_cpu_array[cpuid]); 1282 assert(cpuid == cpu->zsc_id); 1283 1284 if (cpu->zsc_allocated == B_FALSE) { 1285 cpu->zsc_allocated = B_TRUE; 1286 list_insert_tail(&ctl->zsctl_cpus, cpu); 1287 } 1288 return (cpu); 1289 } 1290 1291 /* A cpu has been found. Update its information */ 1292 static void 1293 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid) 1294 { 1295 /* 1296 * legacy processor sets, the cpu may move while zonestatd is 1297 * inspecting, causing it to be found twice. In this case, just 1298 * leave cpu in the first processor set in which it was found. 1299 */ 1300 if (cpu->zsc_found == B_TRUE) 1301 return; 1302 1303 /* Mark cpu as online */ 1304 cpu->zsc_found = B_TRUE; 1305 cpu->zsc_offlined = B_FALSE; 1306 cpu->zsc_pset = pset; 1307 /* 1308 * cpu is newly online. 1309 */ 1310 if (cpu->zsc_active == B_FALSE) { 1311 /* 1312 * Cpu is newly online. 1313 */ 1314 cpu->zsc_onlined = B_TRUE; 1315 cpu->zsc_psetid = psetid; 1316 cpu->zsc_psetid_prev = psetid; 1317 } else { 1318 /* 1319 * cpu online during previous interval. Save properties at 1320 * start of interval 1321 */ 1322 cpu->zsc_onlined = B_FALSE; 1323 cpu->zsc_psetid = psetid; 1324 1325 } 1326 cpu->zsc_active = B_TRUE; 1327 } 1328 1329 /* Remove all offlined cpus from the list of tracked cpus */ 1330 static void 1331 zsd_mark_cpus_end(zsd_ctl_t *ctl) 1332 { 1333 zsd_cpu_t *cpu, *tmp; 1334 int id; 1335 1336 /* Mark cpu as online or offline */ 1337 cpu = list_head(&ctl->zsctl_cpus); 1338 while (cpu != NULL) { 1339 if (cpu->zsc_found == B_FALSE) { 1340 if (cpu->zsc_offlined == B_TRUE) { 1341 /* 1342 * cpu offlined in prior interval. It is gone. 
1343 */ 1344 tmp = cpu; 1345 cpu = list_next(&ctl->zsctl_cpus, cpu); 1346 list_remove(&ctl->zsctl_cpus, tmp); 1347 /* Clear structure for future use */ 1348 id = tmp->zsc_id; 1349 bzero(tmp, sizeof (zsd_cpu_t)); 1350 tmp->zsc_id = id; 1351 tmp->zsc_allocated = B_FALSE; 1352 tmp->zsc_psetid = ZS_PSET_ERROR; 1353 tmp->zsc_psetid_prev = ZS_PSET_ERROR; 1354 1355 } else { 1356 /* 1357 * cpu online at start of interval. Treat 1358 * as still online, since it was online for 1359 * some portion of the interval. 1360 */ 1361 cpu->zsc_offlined = B_TRUE; 1362 cpu->zsc_onlined = B_FALSE; 1363 cpu->zsc_active = B_TRUE; 1364 cpu->zsc_psetid = cpu->zsc_psetid_prev; 1365 cpu->zsc_pset = NULL; 1366 } 1367 } 1368 cpu = list_next(&ctl->zsctl_cpus, cpu); 1369 } 1370 } 1371 1372 /* Some utility functions for managing the list of processor sets */ 1373 static zsd_pset_t * 1374 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid) 1375 { 1376 zsd_pset_t *pset; 1377 1378 for (pset = list_head(&ctl->zsctl_psets); pset != NULL; 1379 pset = list_next(&ctl->zsctl_psets, pset)) { 1380 if (pset->zsp_id == psetid) 1381 return (pset); 1382 } 1383 return (NULL); 1384 } 1385 1386 static zsd_pset_t * 1387 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) 1388 { 1389 zsd_pset_t *pset; 1390 1391 for (pset = list_head(&ctl->zsctl_psets); pset != NULL; 1392 pset = list_next(&ctl->zsctl_psets, pset)) { 1393 if (strcmp(pset->zsp_name, psetname) == 0) { 1394 if (psetid != -1) 1395 pset->zsp_id = psetid; 1396 return (pset); 1397 } 1398 } 1399 return (NULL); 1400 } 1401 1402 static zsd_pset_t * 1403 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) 1404 { 1405 zsd_pset_t *pset; 1406 1407 if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL) 1408 return (NULL); 1409 1410 (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name)); 1411 pset->zsp_id = psetid; 1412 pset->zsp_found = B_FALSE; 1413 /* 1414 * Allocate as deleted so if not found in first pass, pset 
is deleted 1415 * from list. This can happen if pset is returned by pset_list, but 1416 * is destroyed before first attempt to fetch pset details. 1417 */ 1418 list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t), 1419 offsetof(zsd_pset_usage_t, zsu_next)); 1420 1421 pset->zsp_hrstart = g_hrnow; 1422 pset->zsp_deleted = B_TRUE; 1423 pset->zsp_empty = B_TRUE; 1424 ctl->zsctl_npsets++; 1425 1426 return (pset); 1427 } 1428 1429 static zsd_pset_t * 1430 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid) 1431 { 1432 zsd_pset_t *pset, *tmp; 1433 1434 if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL) 1435 return (pset); 1436 1437 if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL) 1438 return (NULL); 1439 1440 /* Insert sorted by psetname */ 1441 tmp = list_head(&ctl->zsctl_psets); 1442 while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0) 1443 tmp = list_next(&ctl->zsctl_psets, tmp); 1444 1445 list_insert_before(&ctl->zsctl_psets, tmp, pset); 1446 return (pset); 1447 } 1448 1449 /* Some utility functions for managing the list of zones using each pset */ 1450 static zsd_pset_usage_t * 1451 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone) 1452 { 1453 zsd_pset_usage_t *usage; 1454 1455 for (usage = list_head(&pset->zsp_usage_list); usage != NULL; 1456 usage = list_next(&pset->zsp_usage_list, usage)) 1457 if (usage->zsu_zone == zone) 1458 return (usage); 1459 1460 return (NULL); 1461 } 1462 1463 static zsd_pset_usage_t * 1464 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone) 1465 { 1466 zsd_pset_usage_t *usage; 1467 1468 if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t))) 1469 == NULL) 1470 return (NULL); 1471 1472 list_link_init(&usage->zsu_next); 1473 usage->zsu_zone = zone; 1474 usage->zsu_zoneid = zone->zsz_id; 1475 usage->zsu_pset = pset; 1476 usage->zsu_found = B_FALSE; 1477 usage->zsu_active = B_FALSE; 1478 usage->zsu_new = B_FALSE; 1479 /* 1480 * Allocate as not 
deleted. If a process is found in a pset for 1481 * a zone, the usage will not be deleted until at least the next 1482 * interval. 1483 */ 1484 usage->zsu_start = g_now; 1485 usage->zsu_hrstart = g_hrnow; 1486 usage->zsu_deleted = B_FALSE; 1487 usage->zsu_empty = B_TRUE; 1488 usage->zsu_scheds = 0; 1489 usage->zsu_cpu_shares = ZS_LIMIT_NONE; 1490 1491 ctl->zsctl_npset_usages++; 1492 pset->zsp_nusage++; 1493 1494 return (usage); 1495 } 1496 1497 static zsd_pset_usage_t * 1498 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone) 1499 { 1500 zsd_pset_usage_t *usage, *tmp; 1501 1502 if ((usage = zsd_lookup_usage(pset, zone)) 1503 != NULL) 1504 return (usage); 1505 1506 if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL) 1507 return (NULL); 1508 1509 tmp = list_head(&pset->zsp_usage_list); 1510 while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name) 1511 > 0) 1512 tmp = list_next(&pset->zsp_usage_list, tmp); 1513 1514 list_insert_before(&pset->zsp_usage_list, tmp, usage); 1515 return (usage); 1516 } 1517 1518 static void 1519 zsd_refresh_system(zsd_ctl_t *ctl) 1520 { 1521 zsd_system_t *system = ctl->zsctl_system; 1522 1523 /* Re-count these values each interval */ 1524 system->zss_processes = 0; 1525 system->zss_lwps = 0; 1526 system->zss_shm = 0; 1527 system->zss_shmids = 0; 1528 system->zss_semids = 0; 1529 system->zss_msgids = 0; 1530 system->zss_lofi = 0; 1531 } 1532 1533 1534 /* Reads each cpu's kstats, and adds the usage to the cpu's pset */ 1535 static void 1536 zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu) 1537 { 1538 zsd_system_t *sys; 1539 processorid_t cpuid; 1540 zsd_pset_t *pset_prev; 1541 zsd_pset_t *pset; 1542 kstat_t *kstat; 1543 kstat_named_t *knp; 1544 kid_t kid; 1545 uint64_t idle, intr, kern, user; 1546 1547 sys = ctl->zsctl_system; 1548 pset = cpu->zsc_pset; 1549 knp = NULL; 1550 kid = -1; 1551 cpuid = cpu->zsc_id; 1552 1553 /* Get the cpu time totals for this cpu */ 1554 kstat = 
kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys"); 1555 if (kstat == NULL) 1556 return; 1557 1558 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); 1559 if (kid == -1) 1560 return; 1561 1562 knp = kstat_data_lookup(kstat, "cpu_nsec_idle"); 1563 if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64) 1564 return; 1565 1566 idle = knp->value.ui64; 1567 1568 knp = kstat_data_lookup(kstat, "cpu_nsec_kernel"); 1569 if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64) 1570 return; 1571 1572 kern = knp->value.ui64; 1573 1574 knp = kstat_data_lookup(kstat, "cpu_nsec_user"); 1575 if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64) 1576 return; 1577 1578 user = knp->value.ui64; 1579 1580 /* 1581 * Tracking intr time per cpu just exists for future enhancements. 1582 * The value is presently always zero. 1583 */ 1584 intr = 0; 1585 cpu->zsc_nsec_idle = idle; 1586 cpu->zsc_nsec_intr = intr; 1587 cpu->zsc_nsec_kern = kern; 1588 cpu->zsc_nsec_user = user; 1589 1590 if (cpu->zsc_onlined == B_TRUE) { 1591 /* 1592 * cpu is newly online. There is no reference value, 1593 * so just record its current stats for comparison 1594 * on next stat read. 1595 */ 1596 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle; 1597 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr; 1598 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern; 1599 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user; 1600 return; 1601 } 1602 1603 /* 1604 * Calculate relative time since previous refresh. 1605 * Paranoia. Don't let time go backwards. 
1606 */ 1607 idle = intr = kern = user = 0; 1608 if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev) 1609 idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev; 1610 1611 if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev) 1612 intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev; 1613 1614 if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev) 1615 kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev; 1616 1617 if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev) 1618 user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev; 1619 1620 /* Update totals for cpu usage */ 1621 TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle); 1622 TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr); 1623 TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern); 1624 TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user); 1625 1626 /* 1627 * Add cpu's stats to its pset if it is known to be in 1628 * the pset since previous read. 1629 */ 1630 if (cpu->zsc_psetid == cpu->zsc_psetid_prev || 1631 cpu->zsc_psetid_prev == ZS_PSET_ERROR || 1632 (pset_prev = zsd_lookup_pset_byid(ctl, 1633 cpu->zsc_psetid_prev)) == NULL) { 1634 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle); 1635 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr); 1636 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern); 1637 TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user); 1638 } else { 1639 /* 1640 * Last pset was different than current pset. 1641 * Best guess is to split usage between the two. 
1642 */ 1643 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2); 1644 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2); 1645 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2); 1646 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2); 1647 1648 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, 1649 (idle / 2) + (idle % 2)); 1650 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, 1651 (intr / 2) + (intr % 2)); 1652 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, 1653 (kern / 2) + (kern % 2)); 1654 TIMESTRUC_ADD_NANOSEC(pset->zsp_user, 1655 (user / 2) + (user % 2)); 1656 } 1657 TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle); 1658 TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr); 1659 TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern); 1660 TIMESTRUC_ADD_NANOSEC(sys->zss_user, user); 1661 } 1662 1663 /* Determine the details of a processor set by pset_id */ 1664 static int 1665 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname, 1666 size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size, 1667 uint64_t *min, uint64_t *max, int64_t *importance) 1668 { 1669 uint_t old, num; 1670 1671 pool_conf_t *conf = ctl->zsctl_pool_conf; 1672 pool_value_t **vals = ctl->zsctl_pool_vals; 1673 pool_resource_t **res_list = NULL; 1674 pool_resource_t *pset; 1675 pool_component_t **cpus = NULL; 1676 processorid_t *cache; 1677 const char *string; 1678 uint64_t uint64; 1679 int64_t int64; 1680 int i, ret, type; 1681 1682 if (ctl->zsctl_pool_status == POOL_DISABLED) { 1683 1684 /* 1685 * Inspect legacy psets 1686 */ 1687 for (;;) { 1688 old = num = ctl->zsctl_cpu_ncache; 1689 ret = pset_info(psetid, &type, &num, 1690 ctl->zsctl_cpu_cache); 1691 if (ret < 0) { 1692 /* pset is gone. 
Tell caller to retry */ 1693 errno = EINTR; 1694 return (-1); 1695 } 1696 if (num <= old) { 1697 /* Success */ 1698 break; 1699 } 1700 if ((cache = (processorid_t *)realloc( 1701 ctl->zsctl_cpu_cache, num * 1702 sizeof (processorid_t))) != NULL) { 1703 ctl->zsctl_cpu_ncache = num; 1704 ctl->zsctl_cpu_cache = cache; 1705 } else { 1706 /* 1707 * Could not allocate to get new cpu list. 1708 */ 1709 zsd_warn(gettext( 1710 "Could not allocate for cpu list")); 1711 errno = ENOMEM; 1712 return (-1); 1713 } 1714 } 1715 /* 1716 * Old school pset. Just make min and max equal 1717 * to its size 1718 */ 1719 if (psetid == ZS_PSET_DEFAULT) { 1720 *cputype = ZS_CPUTYPE_DEFAULT_PSET; 1721 (void) strlcpy(psetname, "pset_default", namelen); 1722 } else { 1723 *cputype = ZS_CPUTYPE_PSRSET_PSET; 1724 (void) snprintf(psetname, namelen, 1725 "SUNWlegacy_pset_%d", psetid); 1726 } 1727 1728 /* 1729 * Just treat legacy pset as a simple pool pset 1730 */ 1731 *online = num; 1732 *size = num; 1733 *min = num; 1734 *max = num; 1735 *importance = 1; 1736 1737 return (0); 1738 } 1739 1740 /* Look up the pool pset using the pset id */ 1741 res_list = NULL; 1742 pool_value_set_int64(vals[1], psetid); 1743 if (pool_value_set_name(vals[1], "pset.sys_id") 1744 != PO_SUCCESS) 1745 goto err; 1746 1747 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS) 1748 goto err; 1749 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS) 1750 goto err; 1751 if ((res_list = pool_query_resources(conf, &num, vals)) == NULL) 1752 goto err; 1753 if (num != 1) 1754 goto err; 1755 pset = res_list[0]; 1756 free(res_list); 1757 res_list = NULL; 1758 if (pool_get_property(conf, pool_resource_to_elem(conf, pset), 1759 "pset.name", vals[0]) != POC_STRING || 1760 pool_value_get_string(vals[0], &string) != PO_SUCCESS) 1761 goto err; 1762 1763 (void) strlcpy(psetname, string, namelen); 1764 if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0) 1765 *cputype = ZS_CPUTYPE_DEDICATED; 1766 else if (psetid == 
ZS_PSET_DEFAULT) 1767 *cputype = ZS_CPUTYPE_DEFAULT_PSET; 1768 else 1769 *cputype = ZS_CPUTYPE_POOL_PSET; 1770 1771 /* Get size, min, max, and importance */ 1772 if (pool_get_property(conf, pool_resource_to_elem(conf, 1773 pset), "pset.size", vals[0]) == POC_UINT && 1774 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) 1775 *size = uint64; 1776 else 1777 *size = 0; 1778 1779 /* Get size, min, max, and importance */ 1780 if (pool_get_property(conf, pool_resource_to_elem(conf, 1781 pset), "pset.min", vals[0]) == POC_UINT && 1782 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) 1783 *min = uint64; 1784 else 1785 *min = 0; 1786 if (*min >= ZSD_PSET_UNLIMITED) 1787 *min = ZS_LIMIT_NONE; 1788 1789 if (pool_get_property(conf, pool_resource_to_elem(conf, 1790 pset), "pset.max", vals[0]) == POC_UINT && 1791 pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS) 1792 *max = uint64; 1793 else 1794 *max = ZS_LIMIT_NONE; 1795 1796 if (*max >= ZSD_PSET_UNLIMITED) 1797 *max = ZS_LIMIT_NONE; 1798 1799 if (pool_get_property(conf, pool_resource_to_elem(conf, 1800 pset), "pset.importance", vals[0]) == POC_INT && 1801 pool_value_get_int64(vals[0], &int64) == PO_SUCCESS) 1802 *importance = int64; 1803 else 1804 *importance = (uint64_t)1; 1805 1806 *online = 0; 1807 if (*size == 0) 1808 return (0); 1809 1810 /* get cpus */ 1811 cpus = pool_query_resource_components(conf, pset, &num, NULL); 1812 if (cpus == NULL) 1813 goto err; 1814 1815 /* Make sure there is space for cpu id list */ 1816 if (num > ctl->zsctl_cpu_ncache) { 1817 if ((cache = (processorid_t *)realloc( 1818 ctl->zsctl_cpu_cache, num * 1819 sizeof (processorid_t))) != NULL) { 1820 ctl->zsctl_cpu_ncache = num; 1821 ctl->zsctl_cpu_cache = cache; 1822 } else { 1823 /* 1824 * Could not allocate to get new cpu list. 
1825 */ 1826 zsd_warn(gettext( 1827 "Could not allocate for cpu list")); 1828 goto err; 1829 } 1830 } 1831 1832 /* count the online cpus */ 1833 for (i = 0; i < num; i++) { 1834 if (pool_get_property(conf, pool_component_to_elem( 1835 conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING || 1836 pool_value_get_string(vals[0], &string) != PO_SUCCESS) 1837 goto err; 1838 1839 if (strcmp(string, "on-line") != 0 && 1840 strcmp(string, "no-intr") != 0) 1841 continue; 1842 1843 if (pool_get_property(conf, pool_component_to_elem( 1844 conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT || 1845 pool_value_get_int64(vals[0], &int64) != PO_SUCCESS) 1846 goto err; 1847 1848 (*online)++; 1849 ctl->zsctl_cpu_cache[i] = (psetid_t)int64; 1850 } 1851 free(cpus); 1852 return (0); 1853 err: 1854 if (res_list != NULL) 1855 free(res_list); 1856 if (cpus != NULL) 1857 free(cpus); 1858 1859 /* 1860 * The pools operations should succeed since the conf is a consistent 1861 * snapshot. Tell caller there is no need to retry. 1862 */ 1863 errno = EINVAL; 1864 return (-1); 1865 } 1866 1867 /* 1868 * Update the current list of processor sets. 1869 * This also updates the list of online cpus, and each cpu's pset membership. 
1870 */ 1871 static void 1872 zsd_refresh_psets(zsd_ctl_t *ctl) 1873 { 1874 int i, j, ret, state; 1875 uint_t old, num; 1876 uint_t cputype; 1877 int64_t sys_id, importance; 1878 uint64_t online, size, min, max; 1879 zsd_system_t *system; 1880 zsd_pset_t *pset; 1881 zsd_cpu_t *cpu; 1882 psetid_t *cache; 1883 char psetname[ZS_PSETNAME_MAX]; 1884 processorid_t cpuid; 1885 pool_value_t *pv_save = NULL; 1886 pool_resource_t **res_list = NULL; 1887 pool_resource_t *res; 1888 pool_value_t **vals; 1889 pool_conf_t *conf; 1890 boolean_t roll_cpus = B_TRUE; 1891 1892 /* Zero cpu counters to recount them */ 1893 system = ctl->zsctl_system; 1894 system->zss_ncpus = 0; 1895 system->zss_ncpus_online = 0; 1896 retry: 1897 ret = pool_get_status(&state); 1898 if (ret == 0 && state == POOL_ENABLED) { 1899 1900 conf = ctl->zsctl_pool_conf; 1901 vals = ctl->zsctl_pool_vals; 1902 pv_save = vals[1]; 1903 vals[1] = NULL; 1904 1905 if (ctl->zsctl_pool_status == POOL_DISABLED) { 1906 if (pool_conf_open(ctl->zsctl_pool_conf, 1907 pool_dynamic_location(), PO_RDONLY) == 0) { 1908 ctl->zsctl_pool_status = POOL_ENABLED; 1909 ctl->zsctl_pool_changed = POU_PSET; 1910 } 1911 } else { 1912 ctl->zsctl_pool_changed = 0; 1913 ret = pool_conf_update(ctl->zsctl_pool_conf, 1914 &(ctl->zsctl_pool_changed)); 1915 if (ret < 0) { 1916 /* Pools must have become disabled */ 1917 (void) pool_conf_close(ctl->zsctl_pool_conf); 1918 ctl->zsctl_pool_status = POOL_DISABLED; 1919 if (pool_error() == POE_SYSTEM && errno == 1920 ENOTACTIVE) 1921 goto retry; 1922 1923 zsd_warn(gettext( 1924 "Unable to update pool configuration")); 1925 /* Not able to get pool info. Don't update. 
*/ 1926 goto err; 1927 } 1928 } 1929 /* Get the list of psets using libpool */ 1930 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS) 1931 goto err; 1932 1933 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS) 1934 goto err; 1935 if ((res_list = pool_query_resources(conf, &num, vals)) 1936 == NULL) 1937 goto err; 1938 1939 if (num > ctl->zsctl_pset_ncache) { 1940 if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache, 1941 (num) * sizeof (psetid_t))) == NULL) { 1942 goto err; 1943 } 1944 ctl->zsctl_pset_ncache = num; 1945 ctl->zsctl_pset_cache = cache; 1946 } 1947 /* Save the pset id of each pset */ 1948 for (i = 0; i < num; i++) { 1949 res = res_list[i]; 1950 if (pool_get_property(conf, pool_resource_to_elem(conf, 1951 res), "pset.sys_id", vals[0]) != POC_INT || 1952 pool_value_get_int64(vals[0], &sys_id) 1953 != PO_SUCCESS) 1954 goto err; 1955 ctl->zsctl_pset_cache[i] = (int)sys_id; 1956 } 1957 vals[1] = pv_save; 1958 pv_save = NULL; 1959 } else { 1960 if (ctl->zsctl_pool_status == POOL_ENABLED) { 1961 (void) pool_conf_close(ctl->zsctl_pool_conf); 1962 ctl->zsctl_pool_status = POOL_DISABLED; 1963 } 1964 /* Get the pset list using legacy psets */ 1965 for (;;) { 1966 old = num = ctl->zsctl_pset_ncache; 1967 (void) pset_list(ctl->zsctl_pset_cache, &num); 1968 if ((num + 1) <= old) { 1969 break; 1970 } 1971 if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache, 1972 (num + 1) * sizeof (psetid_t))) != NULL) { 1973 ctl->zsctl_pset_ncache = num + 1; 1974 ctl->zsctl_pset_cache = cache; 1975 } else { 1976 /* 1977 * Could not allocate to get new pset list. 
1978 * Give up 1979 */ 1980 return; 1981 } 1982 } 1983 /* Add the default pset to list */ 1984 ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0]; 1985 ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT; 1986 num++; 1987 } 1988 psets_changed: 1989 zsd_mark_cpus_start(ctl, roll_cpus); 1990 zsd_mark_psets_start(ctl); 1991 roll_cpus = B_FALSE; 1992 1993 /* Refresh cpu membership of all psets */ 1994 for (i = 0; i < num; i++) { 1995 1996 /* Get pool pset information */ 1997 sys_id = ctl->zsctl_pset_cache[i]; 1998 if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname), 1999 &cputype, &online, &size, &min, &max, &importance) 2000 != 0) { 2001 if (errno == EINTR) 2002 goto psets_changed; 2003 zsd_warn(gettext("Failed to get info for pset %d"), 2004 sys_id); 2005 continue; 2006 } 2007 2008 system->zss_ncpus += size; 2009 system->zss_ncpus_online += online; 2010 2011 pset = zsd_lookup_insert_pset(ctl, psetname, 2012 ctl->zsctl_pset_cache[i]); 2013 2014 /* update pset info */ 2015 zsd_mark_pset_found(pset, cputype, online, size, min, 2016 max, importance); 2017 2018 /* update each cpu in pset */ 2019 for (j = 0; j < pset->zsp_online; j++) { 2020 cpuid = ctl->zsctl_cpu_cache[j]; 2021 cpu = zsd_lookup_insert_cpu(ctl, cpuid); 2022 zsd_mark_cpu_found(cpu, pset, sys_id); 2023 } 2024 } 2025 err: 2026 if (res_list != NULL) 2027 free(res_list); 2028 if (pv_save != NULL) 2029 vals[1] = pv_save; 2030 } 2031 2032 2033 2034 /* 2035 * Fetch the current pool and pset name for the given zone. 
2036 */ 2037 static void 2038 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone, 2039 char *pool, int poollen, char *pset, int psetlen, uint_t *cputype) 2040 { 2041 poolid_t poolid; 2042 pool_t **pools = NULL; 2043 pool_resource_t **res_list = NULL; 2044 char poolname[ZS_POOLNAME_MAX]; 2045 char psetname[ZS_PSETNAME_MAX]; 2046 pool_conf_t *conf = ctl->zsctl_pool_conf; 2047 pool_value_t *pv_save = NULL; 2048 pool_value_t **vals = ctl->zsctl_pool_vals; 2049 const char *string; 2050 int ret; 2051 int64_t int64; 2052 uint_t num; 2053 2054 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID, 2055 &poolid, sizeof (poolid)); 2056 if (ret < 0) 2057 goto lookup_done; 2058 2059 pv_save = vals[1]; 2060 vals[1] = NULL; 2061 pools = NULL; 2062 res_list = NULL; 2063 2064 /* Default values if lookup fails */ 2065 (void) strlcpy(poolname, "pool_default", sizeof (poolname)); 2066 (void) strlcpy(psetname, "pset_default", sizeof (poolname)); 2067 *cputype = ZS_CPUTYPE_DEFAULT_PSET; 2068 2069 /* no dedicated cpu if pools are disabled */ 2070 if (ctl->zsctl_pool_status == POOL_DISABLED) 2071 goto lookup_done; 2072 2073 /* Get the pool name using the id */ 2074 pool_value_set_int64(vals[0], poolid); 2075 if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS) 2076 goto lookup_done; 2077 2078 if ((pools = pool_query_pools(conf, &num, vals)) == NULL) 2079 goto lookup_done; 2080 2081 if (num != 1) 2082 goto lookup_done; 2083 2084 if (pool_get_property(conf, pool_to_elem(conf, pools[0]), 2085 "pool.name", vals[0]) != POC_STRING || 2086 pool_value_get_string(vals[0], &string) != PO_SUCCESS) 2087 goto lookup_done; 2088 (void) strlcpy(poolname, (char *)string, sizeof (poolname)); 2089 2090 /* Get the name of the pset for the pool */ 2091 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS) 2092 goto lookup_done; 2093 2094 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS) 2095 goto lookup_done; 2096 2097 if ((res_list = pool_query_pool_resources(conf, pools[0], &num, 
vals)) 2098 == NULL) 2099 goto lookup_done; 2100 2101 if (num != 1) 2102 goto lookup_done; 2103 2104 if (pool_get_property(conf, pool_resource_to_elem(conf, 2105 res_list[0]), "pset.sys_id", vals[0]) != POC_INT || 2106 pool_value_get_int64(vals[0], &int64) != PO_SUCCESS) 2107 goto lookup_done; 2108 2109 if (int64 == ZS_PSET_DEFAULT) 2110 *cputype = ZS_CPUTYPE_DEFAULT_PSET; 2111 2112 if (pool_get_property(conf, pool_resource_to_elem(conf, 2113 res_list[0]), "pset.name", vals[0]) != POC_STRING || 2114 pool_value_get_string(vals[0], &string) != PO_SUCCESS) 2115 goto lookup_done; 2116 2117 (void) strlcpy(psetname, (char *)string, sizeof (psetname)); 2118 2119 if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0) 2120 *cputype = ZS_CPUTYPE_DEDICATED; 2121 if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0) 2122 *cputype = ZS_CPUTYPE_PSRSET_PSET; 2123 else 2124 *cputype = ZS_CPUTYPE_POOL_PSET; 2125 2126 lookup_done: 2127 2128 if (pv_save != NULL) 2129 vals[1] = pv_save; 2130 2131 if (res_list) 2132 free(res_list); 2133 if (pools) 2134 free(pools); 2135 2136 (void) strlcpy(pool, poolname, poollen); 2137 (void) strlcpy(pset, psetname, psetlen); 2138 } 2139 2140 /* Convert scheduler names to ZS_* scheduler flags */ 2141 static uint_t 2142 zsd_schedname2int(char *clname, int pri) 2143 { 2144 uint_t sched = 0; 2145 2146 if (strcmp(clname, "TS") == 0) { 2147 sched = ZS_SCHED_TS; 2148 } else if (strcmp(clname, "IA") == 0) { 2149 sched = ZS_SCHED_IA; 2150 } else if (strcmp(clname, "FX") == 0) { 2151 if (pri > 59) { 2152 sched = ZS_SCHED_FX_60; 2153 } else { 2154 sched = ZS_SCHED_FX; 2155 } 2156 } else if (strcmp(clname, "RT") == 0) { 2157 sched = ZS_SCHED_RT; 2158 2159 } else if (strcmp(clname, "FSS") == 0) { 2160 sched = ZS_SCHED_FSS; 2161 } 2162 return (sched); 2163 } 2164 2165 static uint64_t 2166 zsd_get_zone_rctl_limit(char *name) 2167 { 2168 rctlblk_t *rblk; 2169 2170 rblk = (rctlblk_t *)alloca(rctlblk_size()); 2171 if (getrctl(name, NULL, rblk, 
RCTL_FIRST) 2172 != 0) { 2173 return (ZS_LIMIT_NONE); 2174 } 2175 return (rctlblk_get_value(rblk)); 2176 } 2177 2178 static uint64_t 2179 zsd_get_zone_rctl_usage(char *name) 2180 { 2181 rctlblk_t *rblk; 2182 2183 rblk = (rctlblk_t *)alloca(rctlblk_size()); 2184 if (getrctl(name, NULL, rblk, RCTL_USAGE) 2185 != 0) { 2186 return (0); 2187 } 2188 return (rctlblk_get_value(rblk)); 2189 } 2190 2191 #define ZSD_NUM_RCTL_VALS 19 2192 2193 /* 2194 * Fetch the limit information for a zone. This uses zone_enter() as the 2195 * getrctl(2) system call only returns rctl information for the zone of 2196 * the caller. 2197 */ 2198 static int 2199 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares, 2200 uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap, 2201 uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes, 2202 uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm, 2203 uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap, 2204 uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids, 2205 uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched) 2206 { 2207 int p[2], pid, tmpl_fd, ret; 2208 ctid_t ct; 2209 char class[PC_CLNMSZ]; 2210 uint64_t vals[ZSD_NUM_RCTL_VALS]; 2211 zsd_system_t *sys = ctl->zsctl_system; 2212 int i = 0; 2213 int res = 0; 2214 2215 /* Treat all caps as no cap on error */ 2216 *cpu_shares = ZS_LIMIT_NONE; 2217 *cpu_cap = ZS_LIMIT_NONE; 2218 *ram_cap = ZS_LIMIT_NONE; 2219 *locked_cap = ZS_LIMIT_NONE; 2220 *vm_cap = ZS_LIMIT_NONE; 2221 2222 *processes_cap = ZS_LIMIT_NONE; 2223 *lwps_cap = ZS_LIMIT_NONE; 2224 *shm_cap = ZS_LIMIT_NONE; 2225 *shmids_cap = ZS_LIMIT_NONE; 2226 *semids_cap = ZS_LIMIT_NONE; 2227 *msgids_cap = ZS_LIMIT_NONE; 2228 *lofi_cap = ZS_LIMIT_NONE; 2229 2230 *processes = 0; 2231 *lwps = 0; 2232 *shm = 0; 2233 *shmids = 0; 2234 *semids = 0; 2235 *msgids = 0; 2236 *lofi = 0; 2237 2238 /* Get the ram cap first since it is a zone attr */ 2239 ret = zone_getattr(zone->zsz_id, 
ZONE_ATTR_PHYS_MCAP, 2240 ram_cap, sizeof (*ram_cap)); 2241 if (ret < 0 || *ram_cap == 0) 2242 *ram_cap = ZS_LIMIT_NONE; 2243 2244 /* Get the zone's default scheduling class */ 2245 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS, 2246 class, sizeof (class)); 2247 if (ret < 0) 2248 return (-1); 2249 2250 *sched = zsd_schedname2int(class, 0); 2251 2252 /* rctl caps must be fetched from within the zone */ 2253 if (pipe(p) != 0) 2254 return (-1); 2255 2256 if ((tmpl_fd = init_template()) == -1) { 2257 (void) close(p[0]); 2258 (void) close(p[1]); 2259 return (-1); 2260 } 2261 pid = forkx(0); 2262 if (pid < 0) { 2263 (void) ct_tmpl_clear(tmpl_fd); 2264 (void) close(p[0]); 2265 (void) close(p[1]); 2266 return (-1); 2267 } 2268 if (pid == 0) { 2269 2270 (void) ct_tmpl_clear(tmpl_fd); 2271 (void) close(tmpl_fd); 2272 (void) close(p[0]); 2273 if (zone->zsz_id != getzoneid()) { 2274 if (zone_enter(zone->zsz_id) < 0) { 2275 (void) close(p[1]); 2276 _exit(0); 2277 } 2278 } 2279 2280 /* Get caps for zone, and write them to zonestatd parent. 
*/ 2281 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares"); 2282 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap"); 2283 vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory"); 2284 vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap"); 2285 vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes"); 2286 vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes"); 2287 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps"); 2288 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps"); 2289 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory"); 2290 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory"); 2291 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids"); 2292 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids"); 2293 vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids"); 2294 vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids"); 2295 vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids"); 2296 vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids"); 2297 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi"); 2298 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi"); 2299 2300 if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) != 2301 ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) { 2302 (void) close(p[1]); 2303 _exit(1); 2304 } 2305 2306 (void) close(p[1]); 2307 _exit(0); 2308 } 2309 if (contract_latest(&ct) == -1) 2310 ct = -1; 2311 2312 (void) ct_tmpl_clear(tmpl_fd); 2313 (void) close(tmpl_fd); 2314 (void) close(p[1]); 2315 while (waitpid(pid, NULL, 0) != pid) 2316 ; 2317 2318 /* Read cap from child in zone */ 2319 if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) != 2320 ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) { 2321 res = -1; 2322 goto cleanup; 2323 } 2324 i = 0; 2325 *cpu_shares = vals[i++]; 2326 *cpu_cap = vals[i++]; 2327 *locked_cap = vals[i++]; 2328 *vm_cap = vals[i++]; 2329 *processes_cap = vals[i++]; 2330 *processes = vals[i++]; 2331 *lwps_cap = vals[i++]; 2332 *lwps = vals[i++]; 2333 *shm_cap = vals[i++]; 2334 
*shm = vals[i++]; 2335 *shmids_cap = vals[i++]; 2336 *shmids = vals[i++]; 2337 *semids_cap = vals[i++]; 2338 *semids = vals[i++]; 2339 *msgids_cap = vals[i++]; 2340 *msgids = vals[i++]; 2341 *lofi_cap = vals[i++]; 2342 *lofi = vals[i++]; 2343 2344 /* Interpret maximum values as no cap */ 2345 if (*cpu_cap == UINT32_MAX || *cpu_cap == 0) 2346 *cpu_cap = ZS_LIMIT_NONE; 2347 if (*processes_cap == sys->zss_processes_max) 2348 *processes_cap = ZS_LIMIT_NONE; 2349 if (*lwps_cap == sys->zss_lwps_max) 2350 *lwps_cap = ZS_LIMIT_NONE; 2351 if (*shm_cap == sys->zss_shm_max) 2352 *shm_cap = ZS_LIMIT_NONE; 2353 if (*shmids_cap == sys->zss_shmids_max) 2354 *shmids_cap = ZS_LIMIT_NONE; 2355 if (*semids_cap == sys->zss_semids_max) 2356 *semids_cap = ZS_LIMIT_NONE; 2357 if (*msgids_cap == sys->zss_msgids_max) 2358 *msgids_cap = ZS_LIMIT_NONE; 2359 if (*lofi_cap == sys->zss_lofi_max) 2360 *lofi_cap = ZS_LIMIT_NONE; 2361 2362 2363 cleanup: 2364 (void) close(p[0]); 2365 (void) ct_tmpl_clear(tmpl_fd); 2366 (void) close(tmpl_fd); 2367 (void) contract_abandon_id(ct); 2368 2369 return (res); 2370 } 2371 2372 /* Update the current list of running zones */ 2373 static void 2374 zsd_refresh_zones(zsd_ctl_t *ctl) 2375 { 2376 zsd_zone_t *zone; 2377 uint_t old, num; 2378 ushort_t flags; 2379 int i, ret; 2380 zoneid_t *cache; 2381 uint64_t cpu_shares; 2382 uint64_t cpu_cap; 2383 uint64_t ram_cap; 2384 uint64_t locked_cap; 2385 uint64_t vm_cap; 2386 uint64_t processes_cap; 2387 uint64_t processes; 2388 uint64_t lwps_cap; 2389 uint64_t lwps; 2390 uint64_t shm_cap; 2391 uint64_t shm; 2392 uint64_t shmids_cap; 2393 uint64_t shmids; 2394 uint64_t semids_cap; 2395 uint64_t semids; 2396 uint64_t msgids_cap; 2397 uint64_t msgids; 2398 uint64_t lofi_cap; 2399 uint64_t lofi; 2400 2401 char zonename[ZS_ZONENAME_MAX]; 2402 char poolname[ZS_POOLNAME_MAX]; 2403 char psetname[ZS_PSETNAME_MAX]; 2404 uint_t sched; 2405 uint_t cputype; 2406 uint_t iptype; 2407 2408 /* Get the current list of running zones */ 2409 
for (;;) { 2410 old = num = ctl->zsctl_zone_ncache; 2411 (void) zone_list(ctl->zsctl_zone_cache, &num); 2412 if (num <= old) 2413 break; 2414 if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache, 2415 (num) * sizeof (zoneid_t))) != NULL) { 2416 ctl->zsctl_zone_ncache = num; 2417 ctl->zsctl_zone_cache = cache; 2418 } else { 2419 /* Could not allocate to get new zone list. Give up */ 2420 return; 2421 } 2422 } 2423 2424 zsd_mark_zones_start(ctl); 2425 2426 for (i = 0; i < num; i++) { 2427 2428 ret = getzonenamebyid(ctl->zsctl_zone_cache[i], 2429 zonename, sizeof (zonename)); 2430 if (ret < 0) 2431 continue; 2432 2433 zone = zsd_lookup_insert_zone(ctl, zonename, 2434 ctl->zsctl_zone_cache[i]); 2435 2436 ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS, 2437 &flags, sizeof (flags)); 2438 if (ret < 0) 2439 continue; 2440 2441 if (flags & ZF_NET_EXCL) 2442 iptype = ZS_IPTYPE_EXCLUSIVE; 2443 else 2444 iptype = ZS_IPTYPE_SHARED; 2445 2446 zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname), 2447 psetname, sizeof (psetname), &cputype); 2448 2449 if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap, 2450 &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes, 2451 &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids, 2452 &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap, 2453 &lofi, &sched) != 0) 2454 continue; 2455 2456 zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap, 2457 locked_cap, vm_cap, processes_cap, processes, lwps_cap, 2458 lwps, shm_cap, shm, shmids_cap, shmids, semids_cap, 2459 semids, msgids_cap, msgids, lofi_cap, lofi, poolname, 2460 psetname, sched, cputype, iptype); 2461 } 2462 } 2463 2464 /* Fetch the details of a process from its psinfo_t */ 2465 static void 2466 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid, 2467 psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid, 2468 timestruc_t *delta, uint_t *sched) 2469 { 2470 timestruc_t d; 2471 zsd_proc_t *proc; 2472 2473 /* Get 
cached data for proc */ 2474 proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]); 2475 *psetid = psinfo->pr_lwp.pr_bindpset; 2476 2477 if (proc->zspr_psetid == ZS_PSET_ERROR) 2478 *prev_psetid = *psetid; 2479 else 2480 *prev_psetid = proc->zspr_psetid; 2481 2482 *zoneid = psinfo->pr_zoneid; 2483 if (proc->zspr_zoneid == -1) 2484 *prev_zoneid = *zoneid; 2485 else 2486 *prev_zoneid = proc->zspr_zoneid; 2487 2488 TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage); 2489 *delta = d; 2490 2491 *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname, 2492 psinfo->pr_lwp.pr_pri); 2493 2494 /* Update cached data for proc */ 2495 proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset; 2496 proc->zspr_zoneid = psinfo->pr_zoneid; 2497 proc->zspr_sched = *sched; 2498 proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec; 2499 proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec; 2500 proc->zspr_ppid = psinfo->pr_ppid; 2501 } 2502 2503 /* 2504 * Reset the known cpu usage of a process. This is done after a process 2505 * exits so that if the pid is recycled, data from its previous life is 2506 * not reused 2507 */ 2508 static void 2509 zsd_flush_proc_info(zsd_proc_t *proc) 2510 { 2511 proc->zspr_usage.tv_sec = 0; 2512 proc->zspr_usage.tv_nsec = 0; 2513 } 2514 2515 /* 2516 * Open the current extended accounting file. On initialization, open the 2517 * file as the current file to be used. Otherwise, open the file as the 2518 * next file to use of the current file reaches EOF. 2519 */ 2520 static int 2521 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init) 2522 { 2523 int ret, oret, state, trys = 0, flags; 2524 int *fd, *open; 2525 ea_file_t *eaf; 2526 struct stat64 *stat; 2527 char path[MAXPATHLEN]; 2528 2529 /* 2530 * The accounting file is first opened at the tail. Following 2531 * opens to new accounting files are opened at the head. 
2532 */ 2533 if (init == B_TRUE) { 2534 flags = EO_NO_VALID_HDR | EO_TAIL; 2535 fd = &ctl->zsctl_proc_fd; 2536 eaf = &ctl->zsctl_proc_eaf; 2537 stat = &ctl->zsctl_proc_stat; 2538 open = &ctl->zsctl_proc_open; 2539 } else { 2540 flags = EO_NO_VALID_HDR | EO_HEAD; 2541 fd = &ctl->zsctl_proc_fd_next; 2542 eaf = &ctl->zsctl_proc_eaf_next; 2543 stat = &ctl->zsctl_proc_stat_next; 2544 open = &ctl->zsctl_proc_open_next; 2545 } 2546 2547 *fd = -1; 2548 *open = 0; 2549 retry: 2550 /* open accounting files for cpu consumption */ 2551 ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state)); 2552 if (ret != 0) { 2553 zsd_warn(gettext("Unable to get process accounting state")); 2554 goto err; 2555 } 2556 if (state != AC_ON) { 2557 if (trys > 0) { 2558 zsd_warn(gettext( 2559 "Unable to enable process accounting")); 2560 goto err; 2561 } 2562 (void) zsd_enable_cpu_stats(); 2563 trys++; 2564 goto retry; 2565 } 2566 2567 ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path)); 2568 if (ret != 0) { 2569 zsd_warn(gettext("Unable to get process accounting file")); 2570 goto err; 2571 } 2572 2573 if ((*fd = open64(path, O_RDONLY, 0)) >= 0 && 2574 (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0) 2575 ret = fstat64(*fd, stat); 2576 2577 if (*fd < 0 || oret < 0 || ret < 0) { 2578 struct timespec ts; 2579 2580 /* 2581 * It is possible the accounting file is momentarily unavailable 2582 * because it is being rolled. Try for up to half a second. 2583 * 2584 * If failure to open accounting file persists, give up. 
2585 */ 2586 if (oret == 0) 2587 (void) ea_close(eaf); 2588 else if (*fd >= 0) 2589 (void) close(*fd); 2590 if (trys > 500) { 2591 zsd_warn(gettext( 2592 "Unable to open process accounting file")); 2593 goto err; 2594 } 2595 /* wait one millisecond */ 2596 ts.tv_sec = 0; 2597 ts.tv_nsec = NANOSEC / 1000; 2598 (void) nanosleep(&ts, NULL); 2599 goto retry; 2600 } 2601 *open = 1; 2602 return (0); 2603 err: 2604 if (*fd >= 0) 2605 (void) close(*fd); 2606 *open = 0; 2607 *fd = -1; 2608 return (-1); 2609 } 2610 2611 /* 2612 * Walk /proc and charge each process to its zone and processor set. 2613 * Then read exacct data for exited processes, and charge them as well. 2614 */ 2615 static void 2616 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init) 2617 { 2618 DIR *dir; 2619 struct dirent *dent; 2620 psinfo_t psinfo; 2621 int fd, ret; 2622 zsd_proc_t *proc, *pproc, *tmp, *next; 2623 list_t pplist, plist; 2624 zsd_zone_t *zone, *prev_zone; 2625 zsd_pset_t *pset, *prev_pset; 2626 psetid_t psetid, prev_psetid; 2627 zoneid_t zoneid, prev_zoneid; 2628 zsd_pset_usage_t *usage, *prev_usage; 2629 char path[MAXPATHLEN]; 2630 2631 ea_object_t object; 2632 ea_object_t pobject; 2633 boolean_t hrtime_expired = B_FALSE; 2634 struct timeval interval_end; 2635 2636 timestruc_t delta, d1, d2; 2637 uint_t sched = 0; 2638 2639 /* 2640 * Get the current accounting file. The current accounting file 2641 * may be different than the file in use, as the accounting file 2642 * may have been rolled, or manually changed by an admin. 2643 */ 2644 ret = zsd_open_exacct(ctl, init); 2645 if (ret != 0) { 2646 zsd_warn(gettext("Unable to track process accounting")); 2647 return; 2648 } 2649 2650 /* 2651 * Mark the current time as the interval end time. Don't track 2652 * processes that exit after this time. 
2653 */ 2654 (void) gettimeofday(&interval_end, NULL); 2655 2656 dir = opendir("/proc"); 2657 if (dir == NULL) { 2658 zsd_warn(gettext("Unable to open /proc")); 2659 return; 2660 } 2661 2662 /* Walk all processes and compute each zone's usage on each pset. */ 2663 while ((dent = readdir(dir)) != NULL) { 2664 2665 if (strcmp(dent->d_name, ".") == 0 || 2666 strcmp(dent->d_name, "..") == 0) 2667 continue; 2668 2669 (void) snprintf(path, sizeof (path), "/proc/%s/psinfo", 2670 dent->d_name); 2671 2672 fd = open(path, O_RDONLY); 2673 if (fd < 0) 2674 continue; 2675 2676 if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) { 2677 (void) close(fd); 2678 continue; 2679 } 2680 (void) close(fd); 2681 2682 zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid, 2683 &zoneid, &prev_zoneid, &delta, &sched); 2684 2685 d1.tv_sec = delta.tv_sec / 2; 2686 d1.tv_nsec = delta.tv_nsec / 2; 2687 d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2); 2688 d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2); 2689 2690 /* Get the zone and pset this process is running in */ 2691 zone = zsd_lookup_zone_byid(ctl, zoneid); 2692 if (zone == NULL) 2693 continue; 2694 pset = zsd_lookup_pset_byid(ctl, psetid); 2695 if (pset == NULL) 2696 continue; 2697 usage = zsd_lookup_insert_usage(ctl, pset, zone); 2698 if (usage == NULL) 2699 continue; 2700 2701 /* 2702 * Get the usage of the previous zone and pset if they were 2703 * different. 
2704 */ 2705 if (zoneid != prev_zoneid) 2706 prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid); 2707 else 2708 prev_zone = NULL; 2709 2710 if (psetid != prev_psetid) 2711 prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid); 2712 else 2713 prev_pset = NULL; 2714 2715 prev_usage = NULL; 2716 if (prev_zone != NULL || prev_pset != NULL) { 2717 if (prev_zone == NULL) 2718 prev_zone = zone; 2719 if (prev_pset == NULL) 2720 prev_pset = pset; 2721 2722 prev_usage = zsd_lookup_insert_usage(ctl, prev_pset, 2723 prev_zone); 2724 } 2725 2726 /* Update the usage with the processes info */ 2727 if (prev_usage == NULL) { 2728 zsd_mark_pset_usage_found(usage, sched); 2729 } else { 2730 zsd_mark_pset_usage_found(usage, sched); 2731 zsd_mark_pset_usage_found(prev_usage, sched); 2732 } 2733 2734 /* 2735 * First time around is just to get a starting point. All 2736 * usages will be zero. 2737 */ 2738 if (init == B_TRUE) 2739 continue; 2740 2741 if (prev_usage == NULL) { 2742 zsd_add_usage(ctl, usage, &delta); 2743 } else { 2744 zsd_add_usage(ctl, usage, &d1); 2745 zsd_add_usage(ctl, prev_usage, &d2); 2746 } 2747 } 2748 (void) closedir(dir); 2749 2750 /* 2751 * No need to collect exited proc data on initialization. Just 2752 * caching the usage of the known processes to get a zero starting 2753 * point. 2754 */ 2755 if (init == B_TRUE) 2756 return; 2757 2758 /* 2759 * Add accounting records to account for processes which have 2760 * exited. 
2761 */ 2762 list_create(&plist, sizeof (zsd_proc_t), 2763 offsetof(zsd_proc_t, zspr_next)); 2764 list_create(&pplist, sizeof (zsd_proc_t), 2765 offsetof(zsd_proc_t, zspr_next)); 2766 2767 for (;;) { 2768 pid_t pid; 2769 pid_t ppid; 2770 timestruc_t user, sys, proc_usage; 2771 timestruc_t finish; 2772 int numfound = 0; 2773 2774 bzero(&object, sizeof (object)); 2775 proc = NULL; 2776 zone = NULL; 2777 pset = NULL; 2778 usage = NULL; 2779 ret = ea_get_object(&ctl->zsctl_proc_eaf, &object); 2780 if (ret == EO_ERROR) { 2781 if (ea_error() == EXR_EOF) { 2782 2783 struct stat64 *stat; 2784 struct stat64 *stat_next; 2785 2786 /* 2787 * See if the next accounting file is the 2788 * same as the current accounting file. 2789 */ 2790 stat = &(ctl->zsctl_proc_stat); 2791 stat_next = &(ctl->zsctl_proc_stat_next); 2792 if (stat->st_ino == stat_next->st_ino && 2793 stat->st_dev == stat_next->st_dev) { 2794 /* 2795 * End of current accounting file is 2796 * reached, so finished. Clear EOF 2797 * bit for next time around. 2798 */ 2799 ea_clear(&ctl->zsctl_proc_eaf); 2800 break; 2801 } else { 2802 /* 2803 * Accounting file has changed. Move 2804 * to current accounting file. 2805 */ 2806 (void) ea_close(&ctl->zsctl_proc_eaf); 2807 2808 ctl->zsctl_proc_fd = 2809 ctl->zsctl_proc_fd_next; 2810 ctl->zsctl_proc_eaf = 2811 ctl->zsctl_proc_eaf_next; 2812 ctl->zsctl_proc_stat = 2813 ctl->zsctl_proc_stat_next; 2814 2815 ctl->zsctl_proc_fd_next = -1; 2816 ctl->zsctl_proc_open_next = 0; 2817 continue; 2818 } 2819 } else { 2820 /* 2821 * Other accounting error. Give up on 2822 * accounting. 
2823 */ 2824 goto ea_err; 2825 } 2826 } 2827 /* Skip if not a process group */ 2828 if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP || 2829 (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) { 2830 (void) ea_free_item(&object, EUP_ALLOC); 2831 continue; 2832 } 2833 2834 /* The process group entry should be complete */ 2835 while (numfound < 9) { 2836 bzero(&pobject, sizeof (pobject)); 2837 ret = ea_get_object(&ctl->zsctl_proc_eaf, 2838 &pobject); 2839 if (ret < 0) { 2840 (void) ea_free_item(&object, EUP_ALLOC); 2841 zsd_warn( 2842 "unable to get process accounting data"); 2843 goto ea_err; 2844 } 2845 /* Next entries should be process data */ 2846 if ((pobject.eo_catalog & EXT_TYPE_MASK) == 2847 EXT_GROUP) { 2848 (void) ea_free_item(&object, EUP_ALLOC); 2849 (void) ea_free_item(&pobject, EUP_ALLOC); 2850 zsd_warn( 2851 "process data of wrong type"); 2852 goto ea_err; 2853 } 2854 switch (pobject.eo_catalog & EXD_DATA_MASK) { 2855 case EXD_PROC_PID: 2856 pid = pobject.eo_item.ei_uint32; 2857 proc = &(ctl->zsctl_proc_array[pid]); 2858 /* 2859 * This process should not be currently in 2860 * the list of processes to process. 
2861 */ 2862 assert(!list_link_active(&proc->zspr_next)); 2863 numfound++; 2864 break; 2865 case EXD_PROC_ANCPID: 2866 ppid = pobject.eo_item.ei_uint32; 2867 pproc = &(ctl->zsctl_proc_array[ppid]); 2868 numfound++; 2869 break; 2870 case EXD_PROC_ZONENAME: 2871 zone = zsd_lookup_zone(ctl, 2872 pobject.eo_item.ei_string, -1); 2873 numfound++; 2874 break; 2875 case EXD_PROC_CPU_USER_SEC: 2876 user.tv_sec = 2877 pobject.eo_item.ei_uint64; 2878 numfound++; 2879 break; 2880 case EXD_PROC_CPU_USER_NSEC: 2881 user.tv_nsec = 2882 pobject.eo_item.ei_uint64; 2883 numfound++; 2884 break; 2885 case EXD_PROC_CPU_SYS_SEC: 2886 sys.tv_sec = 2887 pobject.eo_item.ei_uint64; 2888 numfound++; 2889 break; 2890 case EXD_PROC_CPU_SYS_NSEC: 2891 sys.tv_nsec = 2892 pobject.eo_item.ei_uint64; 2893 numfound++; 2894 break; 2895 case EXD_PROC_FINISH_SEC: 2896 finish.tv_sec = 2897 pobject.eo_item.ei_uint64; 2898 numfound++; 2899 break; 2900 case EXD_PROC_FINISH_NSEC: 2901 finish.tv_nsec = 2902 pobject.eo_item.ei_uint64; 2903 numfound++; 2904 break; 2905 } 2906 (void) ea_free_item(&pobject, EUP_ALLOC); 2907 } 2908 (void) ea_free_item(&object, EUP_ALLOC); 2909 if (numfound != 9) { 2910 zsd_warn(gettext( 2911 "Malformed process accounting entry found")); 2912 goto proc_done; 2913 } 2914 2915 if (finish.tv_sec > interval_end.tv_sec || 2916 (finish.tv_sec == interval_end.tv_sec && 2917 finish.tv_nsec > (interval_end.tv_usec * 1000))) 2918 hrtime_expired = B_TRUE; 2919 2920 /* 2921 * Try to identify the zone and pset to which this 2922 * exited process belongs. 2923 */ 2924 if (zone == NULL) 2925 goto proc_done; 2926 2927 /* Save proc info */ 2928 proc->zspr_ppid = ppid; 2929 proc->zspr_zoneid = zone->zsz_id; 2930 2931 prev_psetid = ZS_PSET_ERROR; 2932 sched = 0; 2933 2934 /* 2935 * The following tries to deduce the processes pset. 2936 * 2937 * First choose pset and sched using cached value from the 2938 * most recent time the process has been seen. 
2939 * 2940 * pset and sched can change across zone_enter, so make sure 2941 * most recent sighting of this process was in the same 2942 * zone before using most recent known value. 2943 * 2944 * If there is no known value, use value of processes 2945 * parent. If parent is unknown, walk parents until a known 2946 * parent is found. 2947 * 2948 * If no parent in the zone is found, use the zone's default 2949 * pset and scheduling class. 2950 */ 2951 if (proc->zspr_psetid != ZS_PSET_ERROR) { 2952 prev_psetid = proc->zspr_psetid; 2953 pset = zsd_lookup_pset_byid(ctl, prev_psetid); 2954 sched = proc->zspr_sched; 2955 } else if (pproc->zspr_zoneid == zone->zsz_id && 2956 pproc->zspr_psetid != ZS_PSET_ERROR) { 2957 prev_psetid = pproc->zspr_psetid; 2958 pset = zsd_lookup_pset_byid(ctl, prev_psetid); 2959 sched = pproc->zspr_sched; 2960 } 2961 2962 if (pset == NULL) { 2963 /* 2964 * Process or processes parent has never been seen. 2965 * Save to deduce a known parent later. 2966 */ 2967 proc_usage = sys; 2968 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user); 2969 TIMESTRUC_DELTA(delta, proc_usage, 2970 proc->zspr_usage); 2971 proc->zspr_usage = delta; 2972 list_insert_tail(&plist, proc); 2973 continue; 2974 } 2975 2976 /* Add the zone's usage to the pset */ 2977 usage = zsd_lookup_insert_usage(ctl, pset, zone); 2978 if (usage == NULL) 2979 goto proc_done; 2980 2981 zsd_mark_pset_usage_found(usage, sched); 2982 2983 /* compute the usage to add for the exited proc */ 2984 proc_usage = sys; 2985 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user); 2986 TIMESTRUC_DELTA(delta, proc_usage, 2987 proc->zspr_usage); 2988 2989 zsd_add_usage(ctl, usage, &delta); 2990 proc_done: 2991 zsd_flush_proc_info(proc); 2992 2993 if (hrtime_expired == B_TRUE) 2994 break; 2995 } 2996 /* 2997 * close next accounting file. 
2998 */ 2999 if (ctl->zsctl_proc_open_next) { 3000 (void) ea_close( 3001 &ctl->zsctl_proc_eaf_next); 3002 ctl->zsctl_proc_open_next = 0; 3003 ctl->zsctl_proc_fd_next = -1; 3004 } 3005 3006 /* For the remaining processes, use pset and sched of a known parent */ 3007 proc = list_head(&plist); 3008 while (proc != NULL) { 3009 next = proc; 3010 for (;;) { 3011 if (next->zspr_ppid == 0 || next->zspr_ppid == -1) { 3012 /* 3013 * Kernel process, or parent is unknown, skip 3014 * process, remove from process list. 3015 */ 3016 tmp = proc; 3017 proc = list_next(&plist, proc); 3018 list_link_init(&tmp->zspr_next); 3019 break; 3020 } 3021 pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]); 3022 if (pproc->zspr_zoneid != proc->zspr_zoneid) { 3023 /* 3024 * Parent in different zone. Save process and 3025 * use zone's default pset and sched below 3026 */ 3027 tmp = proc; 3028 proc = list_next(&plist, proc); 3029 list_remove(&plist, tmp); 3030 list_insert_tail(&pplist, tmp); 3031 break; 3032 } 3033 /* Parent has unknown pset, Search parent's parent */ 3034 if (pproc->zspr_psetid == ZS_PSET_ERROR) { 3035 next = pproc; 3036 continue; 3037 } 3038 /* Found parent with known pset. 
Use its info */ 3039 proc->zspr_psetid = pproc->zspr_psetid; 3040 proc->zspr_sched = pproc->zspr_sched; 3041 next->zspr_psetid = pproc->zspr_psetid; 3042 next->zspr_sched = pproc->zspr_sched; 3043 zone = zsd_lookup_zone_byid(ctl, 3044 proc->zspr_zoneid); 3045 if (zone == NULL) { 3046 tmp = proc; 3047 proc = list_next(&plist, proc); 3048 list_remove(&plist, tmp); 3049 list_link_init(&tmp->zspr_next); 3050 break; 3051 } 3052 pset = zsd_lookup_pset_byid(ctl, 3053 proc->zspr_psetid); 3054 if (pset == NULL) { 3055 tmp = proc; 3056 proc = list_next(&plist, proc); 3057 list_remove(&plist, tmp); 3058 list_link_init(&tmp->zspr_next); 3059 break; 3060 } 3061 /* Add the zone's usage to the pset */ 3062 usage = zsd_lookup_insert_usage(ctl, pset, zone); 3063 if (usage == NULL) { 3064 tmp = proc; 3065 proc = list_next(&plist, proc); 3066 list_remove(&plist, tmp); 3067 list_link_init(&tmp->zspr_next); 3068 break; 3069 } 3070 zsd_mark_pset_usage_found(usage, proc->zspr_sched); 3071 zsd_add_usage(ctl, usage, &proc->zspr_usage); 3072 zsd_flush_proc_info(proc); 3073 tmp = proc; 3074 proc = list_next(&plist, proc); 3075 list_remove(&plist, tmp); 3076 list_link_init(&tmp->zspr_next); 3077 break; 3078 } 3079 } 3080 /* 3081 * Process has never been seen. Using zone info to 3082 * determine pset and scheduling class. 3083 */ 3084 proc = list_head(&pplist); 3085 while (proc != NULL) { 3086 3087 zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid); 3088 if (zone == NULL) 3089 goto next; 3090 if (zone->zsz_psetid != ZS_PSET_ERROR && 3091 zone->zsz_psetid != ZS_PSET_MULTI) { 3092 prev_psetid = zone->zsz_psetid; 3093 pset = zsd_lookup_pset_byid(ctl, prev_psetid); 3094 } else { 3095 pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1); 3096 if (pset != NULL) 3097 prev_psetid = pset->zsp_id; 3098 } 3099 if (pset == NULL) 3100 goto next; 3101 3102 sched = zone->zsz_scheds; 3103 /* 3104 * Ignore FX high scheduling class if it is not the 3105 * only scheduling class in the zone. 
3106 */ 3107 if (sched != ZS_SCHED_FX_60) 3108 sched &= (~ZS_SCHED_FX_60); 3109 /* 3110 * If more than one scheduling class has been found 3111 * in the zone, use zone's default scheduling class for 3112 * this process. 3113 */ 3114 if ((sched & (sched - 1)) != 0) 3115 sched = zone->zsz_default_sched; 3116 3117 /* Add the zone's usage to the pset */ 3118 usage = zsd_lookup_insert_usage(ctl, pset, zone); 3119 if (usage == NULL) 3120 goto next; 3121 3122 zsd_mark_pset_usage_found(usage, sched); 3123 zsd_add_usage(ctl, usage, &proc->zspr_usage); 3124 next: 3125 tmp = proc; 3126 proc = list_next(&pplist, proc); 3127 zsd_flush_proc_info(tmp); 3128 list_link_init(&tmp->zspr_next); 3129 } 3130 return; 3131 ea_err: 3132 /* 3133 * Close the next accounting file if we have not transitioned to it 3134 * yet. 3135 */ 3136 if (ctl->zsctl_proc_open_next) { 3137 (void) ea_close(&ctl->zsctl_proc_eaf_next); 3138 ctl->zsctl_proc_open_next = 0; 3139 ctl->zsctl_proc_fd_next = -1; 3140 } 3141 } 3142 3143 /* 3144 * getvmusage(2) uses size_t's in the passwd data structure, which differ 3145 * in size for 32bit and 64 bit kernels. Since this is a contracted interface, 3146 * and zonestatd does not necessarily match the kernel's bitness, marshal 3147 * results appropriately. 3148 */ 3149 static int 3150 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf, 3151 uint64_t *nres) 3152 { 3153 zsd_vmusage32_t *vmu32; 3154 zsd_vmusage64_t *vmu64; 3155 uint32_t nres32; 3156 int i; 3157 int ret; 3158 3159 if (ctl->zsctl_kern_bits == 32) { 3160 nres32 = *nres; 3161 ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, 3162 flags, age, (uintptr_t)buf, (uintptr_t)&nres32); 3163 *nres = nres32; 3164 if (ret == 0 && buf != NULL) { 3165 /* 3166 * An array of vmusage32_t's has been returned. 3167 * Convert it to an array of vmusage64_t's. 
3168 */ 3169 vmu32 = (zsd_vmusage32_t *)buf; 3170 vmu64 = (zsd_vmusage64_t *)buf; 3171 for (i = nres32 - 1; i >= 0; i--) { 3172 3173 vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid; 3174 vmu64[i].vmu_type = vmu32[i].vmu_type; 3175 vmu64[i].vmu_type = vmu32[i].vmu_type; 3176 vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all; 3177 vmu64[i].vmu_rss_private = 3178 vmu32[i].vmu_rss_private; 3179 vmu64[i].vmu_rss_shared = 3180 vmu32[i].vmu_rss_shared; 3181 vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all; 3182 vmu64[i].vmu_swap_private = 3183 vmu32[i].vmu_swap_private; 3184 vmu64[i].vmu_swap_shared = 3185 vmu32[i].vmu_swap_shared; 3186 } 3187 } 3188 return (ret); 3189 } else { 3190 /* 3191 * kernel is 64 bit, so use 64 bit structures as zonestat 3192 * expects. 3193 */ 3194 return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, 3195 flags, age, (uintptr_t)buf, (uintptr_t)nres)); 3196 3197 } 3198 } 3199 3200 /* 3201 * Update the current physical, virtual, and locked memory usage of the 3202 * running zones. 3203 */ 3204 static void 3205 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init) 3206 { 3207 3208 uint64_t phys_total; 3209 uint64_t phys_used; 3210 uint64_t phys_zones; 3211 uint64_t phys_zones_overcount; 3212 uint64_t phys_zones_extra; 3213 uint64_t phys_zones_credit; 3214 3215 uint64_t vm_free; 3216 uint64_t vm_used; 3217 3218 uint64_t disk_swap_total; 3219 uint64_t disk_swap_used; /* disk swap with contents */ 3220 3221 uint64_t physmem; 3222 uint64_t pp_kernel; 3223 uint64_t arc_size = 0; 3224 struct anoninfo ani; 3225 3226 int num_swap_devices; 3227 struct swaptable *swt; 3228 struct swapent *swent; 3229 size_t swt_size; 3230 char *path; 3231 3232 zsd_vmusage64_t *vmusage; 3233 uint64_t num_vmusage; 3234 3235 int i, ret; 3236 3237 zsd_system_t *sys; 3238 zsd_zone_t *zone; 3239 int vmu_nzones; 3240 3241 kstat_t *kstat; 3242 char kstat_name[KSTAT_STRLEN]; 3243 kstat_named_t *knp; 3244 kid_t kid; 3245 3246 if (init) 3247 return; 3248 3249 sys = ctl->zsctl_system; 3250 3251 /* 
interrogate swap devices to find the amount of disk swap */ 3252 disk_swap_again: 3253 num_swap_devices = swapctl(SC_GETNSWP, NULL); 3254 3255 if (num_swap_devices == 0) { 3256 sys->zss_swap_total = disk_swap_total = 0; 3257 sys->zss_swap_used = disk_swap_used = 0; 3258 /* No disk swap */ 3259 goto disk_swap_done; 3260 } 3261 /* see if swap table needs to be larger */ 3262 if (num_swap_devices > ctl->zsctl_swap_cache_num) { 3263 swt_size = sizeof (int) + 3264 (num_swap_devices * sizeof (struct swapent)) + 3265 (num_swap_devices * MAXPATHLEN); 3266 if (ctl->zsctl_swap_cache != NULL) 3267 free(ctl->zsctl_swap_cache); 3268 3269 swt = (struct swaptable *)malloc(swt_size); 3270 if (swt == NULL) { 3271 /* 3272 * Could not allocate to get list of swap devices. 3273 * Just use data from the most recent read, which will 3274 * be zero if this is the first read. 3275 */ 3276 zsd_warn(gettext("Unable to allocate to determine " 3277 "virtual memory")); 3278 disk_swap_total = sys->zss_swap_total; 3279 disk_swap_used = sys->zss_swap_used; 3280 goto disk_swap_done; 3281 } 3282 swent = swt->swt_ent; 3283 path = (char *)swt + (sizeof (int) + 3284 num_swap_devices * sizeof (swapent_t)); 3285 for (i = 0; i < num_swap_devices; i++, swent++) { 3286 swent->ste_path = path; 3287 path += MAXPATHLEN; 3288 } 3289 swt->swt_n = num_swap_devices; 3290 ctl->zsctl_swap_cache = swt; 3291 ctl->zsctl_swap_cache_size = swt_size; 3292 ctl->zsctl_swap_cache_num = num_swap_devices; 3293 } 3294 num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache); 3295 if (num_swap_devices < 0) { 3296 /* More swap devices have arrived */ 3297 if (errno == ENOMEM) 3298 goto disk_swap_again; 3299 3300 zsd_warn(gettext("Unable to determine disk swap devices")); 3301 /* Unexpected error. 
Use existing data */ 3302 disk_swap_total = sys->zss_swap_total; 3303 disk_swap_used = sys->zss_swap_used; 3304 goto disk_swap_done; 3305 } 3306 3307 /* add up the disk swap */ 3308 disk_swap_total = 0; 3309 disk_swap_used = 0; 3310 swent = ctl->zsctl_swap_cache->swt_ent; 3311 for (i = 0; i < num_swap_devices; i++, swent++) { 3312 disk_swap_total += swent->ste_pages; 3313 disk_swap_used += (swent->ste_pages - swent->ste_free); 3314 } 3315 disk_swap_total *= ctl->zsctl_pagesize; 3316 disk_swap_used *= ctl->zsctl_pagesize; 3317 3318 sys->zss_swap_total = disk_swap_total; 3319 sys->zss_swap_used = disk_swap_used; 3320 3321 disk_swap_done: 3322 3323 /* get system pages kstat */ 3324 kid = -1; 3325 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages"); 3326 if (kstat == NULL) 3327 zsd_warn(gettext("Unable to lookup system pages kstat")); 3328 else 3329 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); 3330 3331 if (kid == -1) { 3332 zsd_warn(gettext("Unable to read system pages kstat")); 3333 return; 3334 } else { 3335 knp = kstat_data_lookup(kstat, "physmem"); 3336 if (knp == NULL) { 3337 zsd_warn(gettext("Unable to read physmem")); 3338 } else { 3339 if (knp->data_type == KSTAT_DATA_UINT64) 3340 physmem = knp->value.ui64; 3341 else if (knp->data_type == KSTAT_DATA_UINT32) 3342 physmem = knp->value.ui32; 3343 else 3344 return; 3345 } 3346 knp = kstat_data_lookup(kstat, "pp_kernel"); 3347 if (knp == NULL) { 3348 zsd_warn(gettext("Unable to read pp_kernel")); 3349 } else { 3350 if (knp->data_type == KSTAT_DATA_UINT64) 3351 pp_kernel = knp->value.ui64; 3352 else if (knp->data_type == KSTAT_DATA_UINT32) 3353 pp_kernel = knp->value.ui32; 3354 else 3355 return; 3356 } 3357 } 3358 physmem *= ctl->zsctl_pagesize; 3359 pp_kernel *= ctl->zsctl_pagesize; 3360 3361 /* get the zfs arc size if available */ 3362 arc_size = 0; 3363 kid = -1; 3364 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats"); 3365 if (kstat != NULL) 3366 kid = 
kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); 3367 if (kid != -1) { 3368 knp = kstat_data_lookup(kstat, "size"); 3369 if (knp != NULL) 3370 if (knp->data_type == KSTAT_DATA_UINT64) 3371 arc_size = knp->value.ui64; 3372 } 3373 3374 /* Try to get swap information */ 3375 if (swapctl(SC_AINFO, &ani) < 0) { 3376 zsd_warn(gettext("Unable to get swap info")); 3377 return; 3378 } 3379 3380 vmusage_again: 3381 /* getvmusage to get physical memory usage */ 3382 vmusage = ctl->zsctl_vmusage_cache; 3383 num_vmusage = ctl->zsctl_vmusage_cache_num; 3384 3385 ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0, 3386 vmusage, &num_vmusage); 3387 3388 if (ret != 0) { 3389 /* Unexpected error. Use existing data */ 3390 if (errno != EOVERFLOW) { 3391 zsd_warn(gettext( 3392 "Unable to read physical memory usage")); 3393 phys_zones = sys->zss_ram_zones; 3394 goto vmusage_done; 3395 } 3396 } 3397 /* vmusage results cache too small */ 3398 if (num_vmusage > ctl->zsctl_vmusage_cache_num) { 3399 3400 size_t size = sizeof (zsd_vmusage64_t) * num_vmusage; 3401 3402 if (ctl->zsctl_vmusage_cache != NULL) 3403 free(ctl->zsctl_vmusage_cache); 3404 vmusage = (zsd_vmusage64_t *)malloc(size); 3405 if (vmusage == NULL) { 3406 zsd_warn(gettext("Unable to alloc to determine " 3407 "physical memory usage")); 3408 phys_zones = sys->zss_ram_zones; 3409 goto vmusage_done; 3410 } 3411 ctl->zsctl_vmusage_cache = vmusage; 3412 ctl->zsctl_vmusage_cache_num = num_vmusage; 3413 goto vmusage_again; 3414 } 3415 3416 phys_zones_overcount = 0; 3417 vmu_nzones = 0; 3418 for (i = 0; i < num_vmusage; i++) { 3419 switch (vmusage[i].vmu_type) { 3420 case VMUSAGE_SYSTEM: 3421 /* total pages backing user process mappings */ 3422 phys_zones = sys->zss_ram_zones = 3423 vmusage[i].vmu_rss_all; 3424 break; 3425 case VMUSAGE_ZONE: 3426 vmu_nzones++; 3427 phys_zones_overcount += vmusage[i].vmu_rss_all; 3428 zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id); 3429 if (zone != NULL) 3430 zone->zsz_usage_ram = 
vmusage[i].vmu_rss_all; 3431 break; 3432 default: 3433 break; 3434 } 3435 } 3436 /* 3437 * Figure how much memory was double counted due to text sharing 3438 * between zones. Credit this back so that the sum of the zones 3439 * equals the total zone ram usage; 3440 */ 3441 phys_zones_extra = phys_zones_overcount - phys_zones; 3442 phys_zones_credit = phys_zones_extra / vmu_nzones; 3443 3444 vmusage_done: 3445 3446 /* walk the zones to get swap and locked kstats. Fetch ram cap. */ 3447 sys->zss_locked_zones = 0; 3448 sys->zss_vm_zones = 0; 3449 for (zone = list_head(&ctl->zsctl_zones); zone != NULL; 3450 zone = list_next(&ctl->zsctl_zones, zone)) { 3451 3452 /* If zone halted during interval, show memory usage as none */ 3453 if (zone->zsz_active == B_FALSE || 3454 zone->zsz_deleted == B_TRUE) { 3455 zone->zsz_usage_ram = 0; 3456 zone->zsz_usage_vm = 0; 3457 zone->zsz_usage_locked = 0; 3458 continue; 3459 } 3460 3461 if (phys_zones_credit > 0) { 3462 if (zone->zsz_usage_ram > phys_zones_credit) { 3463 zone->zsz_usage_ram -= phys_zones_credit; 3464 } 3465 } 3466 /* 3467 * Get zone's swap usage. Since zone could have halted, 3468 * treats as zero if cannot read 3469 */ 3470 zone->zsz_usage_vm = 0; 3471 (void) snprintf(kstat_name, sizeof (kstat_name), 3472 "swapresv_zone_%d", zone->zsz_id); 3473 kid = -1; 3474 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps", 3475 zone->zsz_id, kstat_name); 3476 if (kstat != NULL) 3477 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); 3478 if (kid != -1) { 3479 knp = kstat_data_lookup(kstat, "usage"); 3480 if (knp != NULL && 3481 knp->data_type == KSTAT_DATA_UINT64) { 3482 zone->zsz_usage_vm = knp->value.ui64; 3483 sys->zss_vm_zones += knp->value.ui64; 3484 } 3485 } 3486 /* 3487 * Get zone's locked usage. 
Since zone could have halted, 3488 * treats as zero if cannot read 3489 */ 3490 zone->zsz_usage_locked = 0; 3491 (void) snprintf(kstat_name, sizeof (kstat_name), 3492 "lockedmem_zone_%d", zone->zsz_id); 3493 kid = -1; 3494 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps", 3495 zone->zsz_id, kstat_name); 3496 if (kstat != NULL) 3497 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL); 3498 if (kid != -1) { 3499 knp = kstat_data_lookup(kstat, "usage"); 3500 if (knp != NULL && 3501 knp->data_type == KSTAT_DATA_UINT64) { 3502 zone->zsz_usage_locked = knp->value.ui64; 3503 /* 3504 * Since locked memory accounting for zones 3505 * can double count ddi locked memory, cap each 3506 * zone's locked usage at its ram usage. 3507 */ 3508 if (zone->zsz_usage_locked > 3509 zone->zsz_usage_ram) 3510 zone->zsz_usage_locked = 3511 zone->zsz_usage_ram; 3512 sys->zss_locked_zones += 3513 zone->zsz_usage_locked; 3514 } 3515 } 3516 } 3517 3518 phys_total = 3519 sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize; 3520 3521 phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES)) 3522 * ctl->zsctl_pagesize; 3523 3524 /* Compute remaining statistics */ 3525 sys->zss_ram_total = phys_total; 3526 sys->zss_ram_zones = phys_zones; 3527 sys->zss_ram_kern = phys_used - phys_zones - arc_size; 3528 3529 /* 3530 * The total for kernel locked memory should include 3531 * segkp locked pages, but oh well. The arc size is subtracted, 3532 * as that physical memory is reclaimable. 3533 */ 3534 sys->zss_locked_kern = pp_kernel - arc_size; 3535 /* Add memory used by kernel startup and obp to kernel locked */ 3536 if ((phys_total - physmem) > 0) 3537 sys->zss_locked_kern += phys_total - physmem; 3538 3539 /* 3540 * Add in the portion of (RAM+DISK) that is not available as swap, 3541 * and consider it swap used by the kernel. 
3542 */ 3543 sys->zss_vm_total = phys_total + disk_swap_total; 3544 vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize; 3545 vm_used = sys->zss_vm_total - vm_free; 3546 sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size; 3547 } 3548 3549 /* 3550 * Charge each cpu's usage to its processor sets. Also add the cpu's total 3551 * time to each zone using the processor set. This tracks the maximum 3552 * amount of cpu time that a zone could have used. 3553 */ 3554 static void 3555 zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init) 3556 { 3557 zsd_system_t *sys; 3558 zsd_zone_t *zone; 3559 zsd_pset_usage_t *usage; 3560 zsd_cpu_t *cpu; 3561 zsd_cpu_t *cpu_next; 3562 zsd_pset_t *pset; 3563 timestruc_t ts; 3564 uint64_t hrtime; 3565 timestruc_t delta; 3566 3567 /* Update the per-cpu kstat data */ 3568 cpu_next = list_head(&ctl->zsctl_cpus); 3569 while (cpu_next != NULL) { 3570 cpu = cpu_next; 3571 cpu_next = list_next(&ctl->zsctl_cpus, cpu); 3572 zsd_update_cpu_stats(ctl, cpu); 3573 } 3574 /* Update the elapsed real time */ 3575 hrtime = gethrtime(); 3576 if (init) { 3577 /* first time around, store hrtime for future comparision */ 3578 ctl->zsctl_hrtime = hrtime; 3579 ctl->zsctl_hrtime_prev = hrtime; 3580 3581 } else { 3582 /* Compute increase in hrtime since the most recent read */ 3583 ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime; 3584 ctl->zsctl_hrtime = hrtime; 3585 if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0) 3586 TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime); 3587 } 3588 3589 /* On initialization, all psets have zero time */ 3590 if (init) 3591 return; 3592 3593 for (pset = list_head(&ctl->zsctl_psets); pset != NULL; 3594 pset = list_next(&ctl->zsctl_psets, pset)) { 3595 3596 if (pset->zsp_active == B_FALSE) { 3597 zsd_warn(gettext("Internal error,inactive pset found")); 3598 continue; 3599 } 3600 3601 /* sum total used time for pset */ 3602 ts.tv_sec = 0; 3603 ts.tv_nsec = 0; 3604 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr); 3605 
TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern); 3606 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user); 3607 /* kernel time in pset is total time minus zone time */ 3608 TIMESTRUC_DELTA(pset->zsp_usage_kern, ts, 3609 pset->zsp_usage_zones); 3610 if (pset->zsp_usage_kern.tv_sec < 0 || 3611 pset->zsp_usage_kern.tv_nsec < 0) { 3612 pset->zsp_usage_kern.tv_sec = 0; 3613 pset->zsp_usage_kern.tv_nsec = 0; 3614 } 3615 /* Total pset elapsed time is used time plus idle time */ 3616 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle); 3617 3618 TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time); 3619 3620 for (usage = list_head(&pset->zsp_usage_list); usage != NULL; 3621 usage = list_next(&pset->zsp_usage_list, usage)) { 3622 3623 zone = usage->zsu_zone; 3624 if (usage->zsu_cpu_shares != ZS_LIMIT_NONE && 3625 usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED && 3626 usage->zsu_cpu_shares != 0) { 3627 /* 3628 * Figure out how many nanoseconds of share time 3629 * to give to the zone 3630 */ 3631 hrtime = delta.tv_sec; 3632 hrtime *= NANOSEC; 3633 hrtime += delta.tv_nsec; 3634 hrtime *= usage->zsu_cpu_shares; 3635 hrtime /= pset->zsp_cpu_shares; 3636 TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time, 3637 hrtime); 3638 } 3639 /* Add pset time to each zone using pset */ 3640 TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta); 3641 3642 zone->zsz_cpus_online += pset->zsp_online; 3643 } 3644 pset->zsp_total_time = ts; 3645 } 3646 3647 for (zone = list_head(&ctl->zsctl_zones); zone != NULL; 3648 zone = list_next(&ctl->zsctl_zones, zone)) { 3649 3650 /* update cpu cap tracking if the zone has a cpu cap */ 3651 if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) { 3652 uint64_t elapsed; 3653 3654 elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev; 3655 elapsed *= zone->zsz_cpu_cap; 3656 elapsed = elapsed / 100; 3657 TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed); 3658 } 3659 } 3660 sys = ctl->zsctl_system; 3661 ts.tv_sec = 0; 3662 ts.tv_nsec = 0; 3663 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr); 3664 
TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern); 3665 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user); 3666 3667 /* kernel time in pset is total time minus zone time */ 3668 TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts, 3669 sys->zss_cpu_usage_zones); 3670 if (sys->zss_cpu_usage_kern.tv_sec < 0 || 3671 sys->zss_cpu_usage_kern.tv_nsec < 0) { 3672 sys->zss_cpu_usage_kern.tv_sec = 0; 3673 sys->zss_cpu_usage_kern.tv_nsec = 0; 3674 } 3675 /* Total pset elapsed time is used time plus idle time */ 3676 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle); 3677 sys->zss_cpu_total_time = ts; 3678 } 3679 3680 /* 3681 * Saves current usage data to a cache that is read by libzonestat when 3682 * calling zs_usage_read(). 3683 * 3684 * All pointers in the cached data structure are set to NULL. When 3685 * libzonestat reads the cached data, it will set the pointers relative to 3686 * its address space. 3687 */ 3688 static void 3689 zsd_usage_cache_update(zsd_ctl_t *ctl) 3690 { 3691 zs_usage_cache_t *cache; 3692 zs_usage_cache_t *old; 3693 zs_usage_t *usage; 3694 3695 zs_system_t *sys; 3696 zsd_system_t *dsys; 3697 zs_zone_t *zone = NULL; 3698 zsd_zone_t *dzone; 3699 zs_pset_t *pset = NULL; 3700 zsd_pset_t *dpset; 3701 zs_pset_zone_t *pusage; 3702 zsd_pset_usage_t *dpusage; 3703 3704 char *next; 3705 uint_t size, i, j; 3706 3707 size = 3708 sizeof (zs_usage_cache_t) + 3709 sizeof (zs_usage_t) + 3710 sizeof (zs_system_t) + 3711 sizeof (zs_zone_t) * ctl->zsctl_nzones + 3712 sizeof (zs_pset_t) * ctl->zsctl_npsets + 3713 sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages; 3714 3715 cache = (zs_usage_cache_t *)malloc(size); 3716 if (cache == NULL) { 3717 zsd_warn(gettext("Unable to allocate usage cache\n")); 3718 return; 3719 } 3720 3721 next = (char *)cache; 3722 cache->zsuc_size = size - sizeof (zs_usage_cache_t); 3723 next += sizeof (zs_usage_cache_t); 3724 3725 /* LINTED */ 3726 usage = cache->zsuc_usage = (zs_usage_t *)next; 3727 next += sizeof (zs_usage_t); 3728 usage->zsu_start = g_start; 3729 
usage->zsu_hrstart = g_hrstart; 3730 usage->zsu_time = g_now; 3731 usage->zsu_hrtime = g_hrnow; 3732 usage->zsu_nzones = ctl->zsctl_nzones; 3733 usage->zsu_npsets = ctl->zsctl_npsets; 3734 usage->zsu_system = NULL; 3735 3736 /* LINTED */ 3737 sys = (zs_system_t *)next; 3738 next += sizeof (zs_system_t); 3739 dsys = ctl->zsctl_system; 3740 sys->zss_ram_total = dsys->zss_ram_total; 3741 sys->zss_ram_kern = dsys->zss_ram_kern; 3742 sys->zss_ram_zones = dsys->zss_ram_zones; 3743 sys->zss_locked_kern = dsys->zss_locked_kern; 3744 sys->zss_locked_zones = dsys->zss_locked_zones; 3745 sys->zss_vm_total = dsys->zss_vm_total; 3746 sys->zss_vm_kern = dsys->zss_vm_kern; 3747 sys->zss_vm_zones = dsys->zss_vm_zones; 3748 sys->zss_swap_total = dsys->zss_swap_total; 3749 sys->zss_swap_used = dsys->zss_swap_used; 3750 sys->zss_ncpus = dsys->zss_ncpus; 3751 sys->zss_ncpus_online = dsys->zss_ncpus_online; 3752 3753 sys->zss_processes_max = dsys->zss_maxpid; 3754 sys->zss_lwps_max = dsys->zss_lwps_max; 3755 sys->zss_shm_max = dsys->zss_shm_max; 3756 sys->zss_shmids_max = dsys->zss_shmids_max; 3757 sys->zss_semids_max = dsys->zss_semids_max; 3758 sys->zss_msgids_max = dsys->zss_msgids_max; 3759 sys->zss_lofi_max = dsys->zss_lofi_max; 3760 3761 sys->zss_processes = dsys->zss_processes; 3762 sys->zss_lwps = dsys->zss_lwps; 3763 sys->zss_shm = dsys->zss_shm; 3764 sys->zss_shmids = dsys->zss_shmids; 3765 sys->zss_semids = dsys->zss_semids; 3766 sys->zss_msgids = dsys->zss_msgids; 3767 sys->zss_lofi = dsys->zss_lofi; 3768 3769 sys->zss_cpu_total_time = dsys->zss_cpu_total_time; 3770 sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones; 3771 sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern; 3772 3773 for (i = 0, dzone = list_head(&ctl->zsctl_zones); 3774 i < ctl->zsctl_nzones; 3775 i++, dzone = list_next(&ctl->zsctl_zones, dzone)) { 3776 /* LINTED */ 3777 zone = (zs_zone_t *)next; 3778 next += sizeof (zs_zone_t); 3779 list_link_init(&zone->zsz_next); 3780 zone->zsz_system = NULL; 3781 3782 
(void) strlcpy(zone->zsz_name, dzone->zsz_name, 3783 sizeof (zone->zsz_name)); 3784 (void) strlcpy(zone->zsz_pool, dzone->zsz_pool, 3785 sizeof (zone->zsz_pool)); 3786 (void) strlcpy(zone->zsz_pset, dzone->zsz_pset, 3787 sizeof (zone->zsz_pset)); 3788 zone->zsz_id = dzone->zsz_id; 3789 zone->zsz_cputype = dzone->zsz_cputype; 3790 zone->zsz_iptype = dzone->zsz_iptype; 3791 zone->zsz_start = dzone->zsz_start; 3792 zone->zsz_hrstart = dzone->zsz_hrstart; 3793 zone->zsz_scheds = dzone->zsz_scheds; 3794 zone->zsz_cpu_shares = dzone->zsz_cpu_shares; 3795 zone->zsz_cpu_cap = dzone->zsz_cpu_cap; 3796 zone->zsz_ram_cap = dzone->zsz_ram_cap; 3797 zone->zsz_vm_cap = dzone->zsz_vm_cap; 3798 zone->zsz_locked_cap = dzone->zsz_locked_cap; 3799 zone->zsz_cpu_usage = dzone->zsz_cpu_usage; 3800 zone->zsz_cpus_online = dzone->zsz_cpus_online; 3801 zone->zsz_pset_time = dzone->zsz_pset_time; 3802 zone->zsz_cap_time = dzone->zsz_cap_time; 3803 zone->zsz_share_time = dzone->zsz_share_time; 3804 zone->zsz_usage_ram = dzone->zsz_usage_ram; 3805 zone->zsz_usage_locked = dzone->zsz_usage_locked; 3806 zone->zsz_usage_vm = dzone->zsz_usage_vm; 3807 3808 zone->zsz_processes_cap = dzone->zsz_processes_cap; 3809 zone->zsz_lwps_cap = dzone->zsz_lwps_cap; 3810 zone->zsz_shm_cap = dzone->zsz_shm_cap; 3811 zone->zsz_shmids_cap = dzone->zsz_shmids_cap; 3812 zone->zsz_semids_cap = dzone->zsz_semids_cap; 3813 zone->zsz_msgids_cap = dzone->zsz_msgids_cap; 3814 zone->zsz_lofi_cap = dzone->zsz_lofi_cap; 3815 3816 zone->zsz_processes = dzone->zsz_processes; 3817 zone->zsz_lwps = dzone->zsz_lwps; 3818 zone->zsz_shm = dzone->zsz_shm; 3819 zone->zsz_shmids = dzone->zsz_shmids; 3820 zone->zsz_semids = dzone->zsz_semids; 3821 zone->zsz_msgids = dzone->zsz_msgids; 3822 zone->zsz_lofi = dzone->zsz_lofi; 3823 } 3824 3825 for (i = 0, dpset = list_head(&ctl->zsctl_psets); 3826 i < ctl->zsctl_npsets; 3827 i++, dpset = list_next(&ctl->zsctl_psets, dpset)) { 3828 /* LINTED */ 3829 pset = (zs_pset_t *)next; 3830 next += 
sizeof (zs_pset_t); 3831 list_link_init(&pset->zsp_next); 3832 (void) strlcpy(pset->zsp_name, dpset->zsp_name, 3833 sizeof (pset->zsp_name)); 3834 pset->zsp_id = dpset->zsp_id; 3835 pset->zsp_cputype = dpset->zsp_cputype; 3836 pset->zsp_start = dpset->zsp_start; 3837 pset->zsp_hrstart = dpset->zsp_hrstart; 3838 pset->zsp_online = dpset->zsp_online; 3839 pset->zsp_size = dpset->zsp_size; 3840 pset->zsp_min = dpset->zsp_min; 3841 pset->zsp_max = dpset->zsp_max; 3842 pset->zsp_importance = dpset->zsp_importance; 3843 pset->zsp_scheds = dpset->zsp_scheds; 3844 pset->zsp_cpu_shares = dpset->zsp_cpu_shares; 3845 pset->zsp_total_time = dpset->zsp_total_time; 3846 pset->zsp_usage_kern = dpset->zsp_usage_kern; 3847 pset->zsp_usage_zones = dpset->zsp_usage_zones; 3848 pset->zsp_nusage = dpset->zsp_nusage; 3849 /* Add pset usages for pset */ 3850 for (j = 0, dpusage = list_head(&dpset->zsp_usage_list); 3851 j < dpset->zsp_nusage; 3852 j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) { 3853 /* LINTED */ 3854 pusage = (zs_pset_zone_t *)next; 3855 next += sizeof (zs_pset_zone_t); 3856 /* pointers are computed by client */ 3857 pusage->zspz_pset = NULL; 3858 pusage->zspz_zone = NULL; 3859 list_link_init(&pusage->zspz_next); 3860 pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id; 3861 pusage->zspz_start = dpusage->zsu_start; 3862 pusage->zspz_hrstart = dpusage->zsu_hrstart; 3863 pusage->zspz_hrstart = dpusage->zsu_hrstart; 3864 pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares; 3865 pusage->zspz_scheds = dpusage->zsu_scheds; 3866 pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage; 3867 } 3868 } 3869 3870 /* Update the current cache pointer */ 3871 (void) mutex_lock(&g_usage_cache_lock); 3872 old = g_usage_cache; 3873 cache->zsuc_ref = 1; 3874 cache->zsuc_gen = g_gen_next; 3875 usage->zsu_gen = g_gen_next; 3876 usage->zsu_size = size; 3877 g_usage_cache = cache; 3878 if (old != NULL) { 3879 old->zsuc_ref--; 3880 if (old->zsuc_ref == 0) 3881 free(old); 3882 } 3883 g_gen_next++; 
3884 /* Wake up any clients that are waiting for this calculation */ 3885 if (g_usage_cache_kickers > 0) { 3886 (void) cond_broadcast(&g_usage_cache_wait); 3887 } 3888 (void) mutex_unlock(&g_usage_cache_lock); 3889 } 3890 3891 static zs_usage_cache_t * 3892 zsd_usage_cache_hold_locked() 3893 { 3894 zs_usage_cache_t *ret; 3895 3896 ret = g_usage_cache; 3897 ret->zsuc_ref++; 3898 return (ret); 3899 } 3900 3901 void 3902 zsd_usage_cache_rele(zs_usage_cache_t *cache) 3903 { 3904 (void) mutex_lock(&g_usage_cache_lock); 3905 cache->zsuc_ref--; 3906 if (cache->zsuc_ref == 0) 3907 free(cache); 3908 (void) mutex_unlock(&g_usage_cache_lock); 3909 } 3910 3911 /* Close the handles held by zsd_open() */ 3912 void 3913 zsd_close(zsd_ctl_t *ctl) 3914 { 3915 zsd_zone_t *zone; 3916 zsd_pset_t *pset; 3917 zsd_pset_usage_t *usage; 3918 zsd_cpu_t *cpu; 3919 int id; 3920 3921 if (ctl->zsctl_kstat_ctl) { 3922 (void) kstat_close(ctl->zsctl_kstat_ctl); 3923 ctl->zsctl_kstat_ctl = NULL; 3924 } 3925 if (ctl->zsctl_proc_open) { 3926 (void) ea_close(&ctl->zsctl_proc_eaf); 3927 ctl->zsctl_proc_open = 0; 3928 ctl->zsctl_proc_fd = -1; 3929 } 3930 if (ctl->zsctl_pool_conf) { 3931 if (ctl->zsctl_pool_status == POOL_ENABLED) 3932 (void) pool_conf_close(ctl->zsctl_pool_conf); 3933 ctl->zsctl_pool_status = POOL_DISABLED; 3934 } 3935 3936 while ((zone = list_head(&ctl->zsctl_zones)) != NULL) { 3937 list_remove(&ctl->zsctl_zones, zone); 3938 free(zone); 3939 ctl->zsctl_nzones--; 3940 } 3941 3942 while ((pset = list_head(&ctl->zsctl_psets)) != NULL) { 3943 while ((usage = list_head(&pset->zsp_usage_list)) 3944 != NULL) { 3945 list_remove(&pset->zsp_usage_list, usage); 3946 ctl->zsctl_npset_usages--; 3947 free(usage); 3948 } 3949 list_remove(&ctl->zsctl_psets, pset); 3950 free(pset); 3951 ctl->zsctl_npsets--; 3952 } 3953 3954 /* Release all cpus being tracked */ 3955 while (cpu = list_head(&ctl->zsctl_cpus)) { 3956 list_remove(&ctl->zsctl_cpus, cpu); 3957 id = cpu->zsc_id; 3958 bzero(cpu, sizeof 
(zsd_cpu_t)); 3959 cpu->zsc_id = id; 3960 cpu->zsc_allocated = B_FALSE; 3961 cpu->zsc_psetid = ZS_PSET_ERROR; 3962 cpu->zsc_psetid_prev = ZS_PSET_ERROR; 3963 } 3964 3965 assert(ctl->zsctl_npset_usages == 0); 3966 assert(ctl->zsctl_npsets == 0); 3967 assert(ctl->zsctl_nzones == 0); 3968 (void) zsd_disable_cpu_stats(); 3969 } 3970 3971 3972 /* 3973 * Update the utilization data for all zones and processor sets. 3974 */ 3975 static int 3976 zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory) 3977 { 3978 (void) kstat_chain_update(ctl->zsctl_kstat_ctl); 3979 (void) gettimeofday(&(ctl->zsctl_timeofday), NULL); 3980 3981 zsd_refresh_system(ctl); 3982 3983 /* 3984 * Memory calculation is expensive. Only update it on sample 3985 * intervals. 3986 */ 3987 if (do_memory == B_TRUE) 3988 zsd_refresh_memory(ctl, init); 3989 zsd_refresh_zones(ctl); 3990 zsd_refresh_psets(ctl); 3991 zsd_refresh_procs(ctl, init); 3992 zsd_refresh_cpu_stats(ctl, init); 3993 3994 /* 3995 * Delete objects that no longer exist. 3996 * Pset usages must be deleted first as they point to zone and 3997 * pset objects. 3998 */ 3999 zsd_mark_pset_usages_end(ctl); 4000 zsd_mark_psets_end(ctl); 4001 zsd_mark_cpus_end(ctl); 4002 zsd_mark_zones_end(ctl); 4003 4004 /* 4005 * Save results for clients. 4006 */ 4007 zsd_usage_cache_update(ctl); 4008 4009 /* 4010 * Roll process accounting file. 
4011 */ 4012 (void) zsd_roll_exacct(); 4013 return (0); 4014 } 4015 4016 /* 4017 * Get the system rctl, which is the upper most limit 4018 */ 4019 static uint64_t 4020 zsd_get_system_rctl(char *name) 4021 { 4022 rctlblk_t *rblk, *rblk_last; 4023 4024 rblk = (rctlblk_t *)alloca(rctlblk_size()); 4025 rblk_last = (rctlblk_t *)alloca(rctlblk_size()); 4026 4027 if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0) 4028 return (ZS_LIMIT_NONE); 4029 4030 while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0) 4031 (void) bcopy(rblk, rblk_last, rctlblk_size()); 4032 4033 return (rctlblk_get_value(rblk_last)); 4034 } 4035 4036 /* 4037 * Open any necessary subsystems for collecting utilization data, 4038 * allocate and initialize data structures, and get initial utilization. 4039 * 4040 * Errors: 4041 * ENOMEM out of memory 4042 * EINVAL other error 4043 */ 4044 static zsd_ctl_t * 4045 zsd_open(zsd_ctl_t *ctl) 4046 { 4047 zsd_system_t *system; 4048 4049 char path[MAXPATHLEN]; 4050 struct statvfs svfs; 4051 int ret; 4052 int i; 4053 size_t size; 4054 int err; 4055 4056 if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1, 4057 sizeof (zsd_ctl_t))) == NULL) { 4058 zsd_warn(gettext("Out of Memory")); 4059 errno = ENOMEM; 4060 goto err; 4061 } 4062 ctl->zsctl_proc_fd = -1; 4063 4064 /* open kstats */ 4065 if (ctl->zsctl_kstat_ctl == NULL && 4066 (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) { 4067 err = errno; 4068 zsd_warn(gettext("Unable to open kstats")); 4069 errno = err; 4070 if (errno != ENOMEM) 4071 errno = EAGAIN; 4072 goto err; 4073 } 4074 4075 /* 4076 * These are set when the accounting file is opened by 4077 * zsd_update_procs() 4078 */ 4079 ctl->zsctl_proc_fd = -1; 4080 ctl->zsctl_proc_fd_next = -1; 4081 ctl->zsctl_proc_open = 0; 4082 ctl->zsctl_proc_open_next = 0; 4083 4084 check_exacct: 4085 (void) zsd_enable_cpu_stats(); 4086 4087 /* Create structures to track usage */ 4088 if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *) 4089 calloc(1, sizeof 
(zsd_system_t))) == NULL) { 4090 ret = -1; 4091 zsd_warn(gettext("Out of Memory")); 4092 errno = ENOMEM; 4093 goto err; 4094 } 4095 system = ctl->zsctl_system; 4096 /* get the kernel bitness to know structure layout for getvmusage */ 4097 ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path)); 4098 if (ret < 0) 4099 ctl->zsctl_kern_bits = 32; 4100 else 4101 ctl->zsctl_kern_bits = 64; 4102 ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE); 4103 4104 size = sysconf(_SC_CPUID_MAX); 4105 ctl->zsctl_maxcpuid = size; 4106 if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array = 4107 (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) { 4108 zsd_warn(gettext("Out of Memory")); 4109 errno = ENOMEM; 4110 goto err; 4111 } 4112 for (i = 0; i <= ctl->zsctl_maxcpuid; i++) { 4113 ctl->zsctl_cpu_array[i].zsc_id = i; 4114 ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE; 4115 ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR; 4116 ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR; 4117 } 4118 if (statvfs("/proc", &svfs) != 0 || 4119 strcmp("/proc", svfs.f_fstr) != 0) { 4120 zsd_warn(gettext("/proc not a procfs filesystem")); 4121 errno = EINVAL; 4122 goto err; 4123 } 4124 4125 size = sysconf(_SC_MAXPID) + 1; 4126 ctl->zsctl_maxproc = size; 4127 if (ctl->zsctl_proc_array == NULL && 4128 (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size, 4129 sizeof (zsd_proc_t))) == NULL) { 4130 zsd_warn(gettext("Out of Memory")); 4131 errno = ENOMEM; 4132 goto err; 4133 } 4134 for (i = 0; i <= ctl->zsctl_maxproc; i++) { 4135 list_link_init(&(ctl->zsctl_proc_array[i].zspr_next)); 4136 ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR; 4137 ctl->zsctl_proc_array[i].zspr_zoneid = -1; 4138 ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0; 4139 ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0; 4140 ctl->zsctl_proc_array[i].zspr_ppid = -1; 4141 } 4142 4143 list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t), 4144 offsetof(zsd_zone_t, zsz_next)); 4145 4146 list_create(&ctl->zsctl_psets, sizeof 
(zsd_pset_t), 4147 offsetof(zsd_pset_t, zsp_next)); 4148 4149 list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t), 4150 offsetof(zsd_cpu_t, zsc_next)); 4151 4152 if (ctl->zsctl_pool_conf == NULL && 4153 (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) { 4154 zsd_warn(gettext("Out of Memory")); 4155 errno = ENOMEM; 4156 goto err; 4157 } 4158 ctl->zsctl_pool_status = POOL_DISABLED; 4159 ctl->zsctl_pool_changed = 0; 4160 4161 if (ctl->zsctl_pool_vals[0] == NULL && 4162 (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) { 4163 zsd_warn(gettext("Out of Memory")); 4164 errno = ENOMEM; 4165 goto err; 4166 } 4167 if (ctl->zsctl_pool_vals[1] == NULL && 4168 (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) { 4169 zsd_warn(gettext("Out of Memory")); 4170 errno = ENOMEM; 4171 goto err; 4172 } 4173 ctl->zsctl_pool_vals[2] = NULL; 4174 4175 /* 4176 * get system limits 4177 */ 4178 system->zss_maxpid = size = sysconf(_SC_MAXPID); 4179 system->zss_processes_max = zsd_get_system_rctl("zone.max-processes"); 4180 system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps"); 4181 system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory"); 4182 system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids"); 4183 system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids"); 4184 system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids"); 4185 system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi"); 4186 4187 g_gen_next = 1; 4188 4189 if (zsd_read(ctl, B_TRUE, B_FALSE) != 0) 4190 zsd_warn(gettext("Reading zone statistics failed")); 4191 4192 return (ctl); 4193 err: 4194 if (ctl) 4195 zsd_close(ctl); 4196 4197 return (NULL); 4198 } 4199 4200 /* Copy utilization data to buffer, filtering data if non-global zone. 
*/
/*
 * zid:   zone id of the calling client.
 * cache: held usage cache to copy from.
 * usage: output buffer; the caller sizes it as cache->zsuc_size bytes.
 * is_gz: B_TRUE if the caller is in the global zone.
 *
 * Global zone callers receive a verbatim copy of the cached data.  Other
 * callers receive the system record, their own zone (if it is present in
 * the cache) and the psets their zone has usage in.  Usage by all other
 * zones is folded into the kernel totals.  On return usage->zsu_size is
 * the number of valid bytes in the output buffer.
 */
static void
zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
    boolean_t is_gz)
{
	zs_usage_t *cusage;
	zs_system_t *sys, *csys;
	zs_zone_t *zone, *czone;
	zs_zone_t nozone;
	zs_pset_t *pset, *cpset;
	zs_pset_zone_t *pz, *cpz, *foundpz;
	size_t size = 0, csize = 0;
	char *start, *cstart;
	int i, j;
	timestruc_t delta;
	boolean_t found_zone = B_FALSE;

	cusage = cache->zsuc_usage;

	/* Privileged users in the global zone get everything */
	if (is_gz) {
		size_t copy = cusage->zsu_size;

		/*
		 * Never copy more than the cached payload; that is both
		 * the amount of data behind cusage and the size of the
		 * caller's buffer.
		 */
		if (copy > cache->zsuc_size)
			copy = cache->zsuc_size;
		(void) bcopy(cusage, usage, copy);
		usage->zsu_size = copy;
		return;
	}

	/* Zones just get their own usage */
	start = (char *)usage;
	cstart = (char *)cusage;
	size += sizeof (zs_usage_t);
	csize += sizeof (zs_usage_t);

	/*
	 * Clear the header first so that fields which are not explicitly
	 * set below are not returned to the client uninitialized.
	 */
	bzero(usage, sizeof (zs_usage_t));
	usage->zsu_start = cusage->zsu_start;
	usage->zsu_hrstart = cusage->zsu_hrstart;
	usage->zsu_time = cusage->zsu_time;
	usage->zsu_hrtime = cusage->zsu_hrtime;
	usage->zsu_gen = cusage->zsu_gen;
	usage->zsu_nzones = 0;
	usage->zsu_npsets = 0;

	/* LINTED */
	sys = (zs_system_t *)(start + size);
	/* LINTED */
	csys = (zs_system_t *)(cstart + csize);
	size += sizeof (zs_system_t);
	csize += sizeof (zs_system_t);

	/* Save system limits but not usage */
	*sys = *csys;
	sys->zss_ncpus = 0;
	sys->zss_ncpus_online = 0;

	/* LINTED */
	zone = (zs_zone_t *)(start + size);
	/* LINTED */
	czone = (zs_zone_t *)(cstart + csize);
	/* Find the matching zone */
	for (i = 0; i < cusage->zsu_nzones; i++) {
		if (!found_zone && czone->zsz_id == zid) {
			*zone = *czone;
			size += sizeof (zs_zone_t);
			found_zone = B_TRUE;
		}
		csize += sizeof (zs_zone_t);
		/* LINTED */
		czone = (zs_zone_t *)(cstart + csize);
	}
	if (found_zone) {
		usage->zsu_nzones = 1;
	} else {
		/*
		 * The caller's zone is not in this snapshot.  Account for
		 * it as having no usage instead of reading the unwritten
		 * output slot, and report zero zones so that the record
		 * count matches the bytes actually returned.
		 */
		bzero(&nozone, sizeof (nozone));
		zone = &nozone;
	}

	/* Everything not used by this zone is reported as kernel usage */
	sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
	sys->zss_ram_zones = zone->zsz_usage_ram;

	sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
	sys->zss_vm_zones = zone->zsz_usage_vm;

	sys->zss_locked_kern += (sys->zss_locked_zones -
	    zone->zsz_usage_locked);
	sys->zss_locked_zones = zone->zsz_usage_locked;

	TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
	sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;

	/* LINTED */
	pset = (zs_pset_t *)(start + size);
	/* LINTED */
	cpset = (zs_pset_t *)(cstart + csize);
	for (i = 0; i < cusage->zsu_npsets; i++) {
		csize += sizeof (zs_pset_t);
		/* LINTED */
		cpz = (zs_pset_zone_t *)(csize + cstart);
		foundpz = NULL;
		/* Scan this pset's usage records for the caller's zone */
		for (j = 0; j < cpset->zsp_nusage; j++) {
			if (cpz->zspz_zoneid == zid)
				foundpz = cpz;

			csize += sizeof (zs_pset_zone_t);
			/* LINTED */
			cpz = (zs_pset_zone_t *)(csize + cstart);
		}
		if (foundpz != NULL) {
			size += sizeof (zs_pset_t);
			/* LINTED */
			pz = (zs_pset_zone_t *)(start + size);
			size += sizeof (zs_pset_zone_t);

			*pset = *cpset;
			*pz = *foundpz;

			TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
			    pz->zspz_cpu_usage);
			TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
			pset->zsp_usage_zones = pz->zspz_cpu_usage;
			pset->zsp_nusage = 1;
			usage->zsu_npsets++;
			sys->zss_ncpus += pset->zsp_size;
			sys->zss_ncpus_online += pset->zsp_online;

			/*
			 * Advance the output cursor past the pset and usage
			 * record just written, so that a further matching
			 * pset does not overwrite this one.
			 */
			/* LINTED */
			pset = (zs_pset_t *)(start + size);
		}
		/* LINTED */
		cpset = (zs_pset_t *)(cstart + csize);
	}
	usage->zsu_size = size;
}

/*
 * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
 * which reports new zones.
4322 */ 4323 /* ARGSUSED */ 4324 static void 4325 zsd_server(void *cookie, char *argp, size_t arg_size, 4326 door_desc_t *dp, uint_t n_desc) 4327 { 4328 int *args, cmd; 4329 door_desc_t door; 4330 ucred_t *ucred; 4331 const priv_set_t *eset; 4332 4333 if (argp == DOOR_UNREF_DATA) { 4334 (void) door_return(NULL, 0, NULL, 0); 4335 thr_exit(NULL); 4336 } 4337 4338 if (arg_size != sizeof (cmd) * 2) { 4339 (void) door_return(NULL, 0, NULL, 0); 4340 thr_exit(NULL); 4341 } 4342 4343 /* LINTED */ 4344 args = (int *)argp; 4345 cmd = args[0]; 4346 4347 /* If connection, return door to stat server */ 4348 if (cmd == ZSD_CMD_CONNECT) { 4349 4350 /* Verify client compilation version */ 4351 if (args[1] != ZS_VERSION) { 4352 args[1] = ZSD_STATUS_VERSION_MISMATCH; 4353 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0); 4354 thr_exit(NULL); 4355 } 4356 ucred = alloca(ucred_size()); 4357 /* Verify client permission */ 4358 if (door_ucred(&ucred) != 0) { 4359 args[1] = ZSD_STATUS_INTERNAL_ERROR; 4360 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0); 4361 thr_exit(NULL); 4362 } 4363 4364 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE); 4365 if (eset == NULL) { 4366 args[1] = ZSD_STATUS_INTERNAL_ERROR; 4367 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0); 4368 thr_exit(NULL); 4369 } 4370 if (!priv_ismember(eset, PRIV_PROC_INFO)) { 4371 args[1] = ZSD_STATUS_PERMISSION; 4372 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0); 4373 thr_exit(NULL); 4374 } 4375 4376 /* Return stat server door */ 4377 args[1] = ZSD_STATUS_OK; 4378 door.d_attributes = DOOR_DESCRIPTOR; 4379 door.d_data.d_desc.d_descriptor = g_stat_door; 4380 (void) door_return(argp, sizeof (cmd) * 2, &door, 1); 4381 thr_exit(NULL); 4382 } 4383 4384 /* Respond to zoneadmd informing zonestatd of a new zone */ 4385 if (cmd == ZSD_CMD_NEW_ZONE) { 4386 zsd_fattach_zone(args[1], g_server_door, B_FALSE); 4387 (void) door_return(NULL, 0, NULL, 0); 4388 thr_exit(NULL); 4389 } 4390 4391 args[1] = ZSD_STATUS_INTERNAL_ERROR; 
4392 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0); 4393 thr_exit(NULL); 4394 } 4395 4396 /* 4397 * Respond to libzonestat.so clients with the current utlilzation data. 4398 */ 4399 /* ARGSUSED */ 4400 static void 4401 zsd_stat_server(void *cookie, char *argp, size_t arg_size, 4402 door_desc_t *dp, uint_t n_desc) 4403 { 4404 uint64_t *args, cmd; 4405 zs_usage_cache_t *cache; 4406 int ret; 4407 char *rvalp; 4408 size_t rvals; 4409 zs_usage_t *usage; 4410 ucred_t *ucred; 4411 zoneid_t zoneid; 4412 const priv_set_t *eset; 4413 boolean_t is_gz = B_FALSE; 4414 4415 /* Tell stat thread there are no more clients */ 4416 if (argp == DOOR_UNREF_DATA) { 4417 (void) mutex_lock(&g_usage_cache_lock); 4418 g_hasclient = B_FALSE; 4419 (void) cond_signal(&g_usage_cache_kick); 4420 (void) mutex_unlock(&g_usage_cache_lock); 4421 (void) door_return(NULL, 0, NULL, 0); 4422 thr_exit(NULL); 4423 } 4424 if (arg_size != sizeof (cmd) * 2) { 4425 (void) door_return(NULL, 0, NULL, 0); 4426 thr_exit(NULL); 4427 } 4428 /* LINTED */ 4429 args = (uint64_t *)argp; 4430 cmd = args[0]; 4431 if (cmd != ZSD_CMD_READ) { 4432 (void) door_return(NULL, 0, NULL, 0); 4433 thr_exit(NULL); 4434 } 4435 ucred = alloca(ucred_size()); 4436 if (door_ucred(&ucred) != 0) { 4437 (void) door_return(NULL, 0, NULL, 0); 4438 thr_exit(NULL); 4439 } 4440 zoneid = ucred_getzoneid(ucred); 4441 4442 if (zoneid == GLOBAL_ZONEID) 4443 is_gz = B_TRUE; 4444 4445 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE); 4446 if (eset == NULL) { 4447 (void) door_return(NULL, 0, NULL, 0); 4448 thr_exit(NULL); 4449 } 4450 if (!priv_ismember(eset, PRIV_PROC_INFO)) { 4451 (void) door_return(NULL, 0, NULL, 0); 4452 thr_exit(NULL); 4453 } 4454 (void) mutex_lock(&g_usage_cache_lock); 4455 g_hasclient = B_TRUE; 4456 4457 /* 4458 * Force a new cpu calculation for client. This will force a 4459 * new memory calculation if the memory data is older than the 4460 * sample period. 
4461 */ 4462 g_usage_cache_kickers++; 4463 (void) cond_signal(&g_usage_cache_kick); 4464 ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock); 4465 g_usage_cache_kickers--; 4466 if (ret != 0 && errno == EINTR) { 4467 (void) mutex_unlock(&g_usage_cache_lock); 4468 zsd_warn(gettext( 4469 "Interrupted before writing usage size to client\n")); 4470 (void) door_return(NULL, 0, NULL, 0); 4471 thr_exit(NULL); 4472 } 4473 cache = zsd_usage_cache_hold_locked(); 4474 if (cache == NULL) { 4475 zsd_warn(gettext("Usage cache empty.\n")); 4476 (void) door_return(NULL, 0, NULL, 0); 4477 thr_exit(NULL); 4478 } 4479 (void) mutex_unlock(&g_usage_cache_lock); 4480 4481 /* Copy current usage data to stack to send to client */ 4482 usage = (zs_usage_t *)alloca(cache->zsuc_size); 4483 4484 /* Filter out results if caller is non-global zone */ 4485 zsd_usage_filter(zoneid, cache, usage, is_gz); 4486 4487 rvalp = (void *)usage; 4488 rvals = usage->zsu_size; 4489 zsd_usage_cache_rele(cache); 4490 4491 (void) door_return(rvalp, rvals, NULL, 0); 4492 thr_exit(NULL); 4493 } 4494 4495 static volatile boolean_t g_quit; 4496 4497 /* ARGSUSED */ 4498 static void 4499 zonestat_quithandler(int sig) 4500 { 4501 g_quit = B_TRUE; 4502 } 4503 4504 /* 4505 * The stat thread generates new utilization data when clients request 4506 * it. It also manages opening and closing the subsystems used to gather 4507 * data depending on if clients exist. 
4508 */ 4509 /* ARGSUSED */ 4510 void * 4511 stat_thread(void *arg) 4512 { 4513 time_t start; 4514 time_t now; 4515 time_t next_memory; 4516 boolean_t do_memory; 4517 boolean_t do_read; 4518 boolean_t do_close; 4519 4520 start = time(NULL); 4521 if (start < 0) { 4522 if (g_quit == B_TRUE) 4523 goto quit; 4524 zsd_warn(gettext("Unable to fetch current time")); 4525 g_quit = B_TRUE; 4526 goto quit; 4527 } 4528 4529 next_memory = start; 4530 while (g_quit == B_FALSE) { 4531 for (;;) { 4532 /* 4533 * These are used to decide if the most recent memory 4534 * calculation was within a sample interval, 4535 * and weather or not the usage collection needs to 4536 * be opened or closed. 4537 */ 4538 do_memory = B_FALSE; 4539 do_read = B_FALSE; 4540 do_close = B_FALSE; 4541 4542 /* 4543 * If all clients have gone, close usage collecting 4544 */ 4545 (void) mutex_lock(&g_usage_cache_lock); 4546 if (!g_hasclient && g_open == B_TRUE) { 4547 do_close = B_TRUE; 4548 (void) mutex_unlock(&g_usage_cache_lock); 4549 break; 4550 } 4551 if (g_quit == B_TRUE) { 4552 (void) mutex_unlock( 4553 &g_usage_cache_lock); 4554 break; 4555 } 4556 /* 4557 * Wait for a usage data request 4558 */ 4559 if (g_usage_cache_kickers == 0) { 4560 (void) cond_wait(&g_usage_cache_kick, 4561 &g_usage_cache_lock); 4562 } 4563 now = time(NULL); 4564 if (now < 0) { 4565 if (g_quit == B_TRUE) { 4566 (void) mutex_unlock( 4567 &g_usage_cache_lock); 4568 goto quit; 4569 } 4570 g_quit = B_TRUE; 4571 (void) mutex_unlock(&g_usage_cache_lock); 4572 zsd_warn(gettext( 4573 "Unable to fetch current time")); 4574 goto quit; 4575 } 4576 if (g_hasclient) { 4577 do_read = B_TRUE; 4578 if (now >= next_memory) { 4579 do_memory = B_TRUE; 4580 next_memory = now + g_interval; 4581 } 4582 } else { 4583 do_close = B_TRUE; 4584 } 4585 (void) mutex_unlock(&g_usage_cache_lock); 4586 if (do_read || do_close) 4587 break; 4588 } 4589 g_now = now; 4590 g_hrnow = gethrtime(); 4591 if (g_hasclient && g_open == B_FALSE) { 4592 g_start = g_now; 
4593 g_hrstart = g_hrnow; 4594 g_ctl = zsd_open(g_ctl); 4595 if (g_ctl == NULL) 4596 zsd_warn(gettext( 4597 "Unable to open zone statistics")); 4598 else 4599 g_open = B_TRUE; 4600 } 4601 if (do_read && g_ctl) { 4602 if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) { 4603 zsd_warn(gettext( 4604 "Unable to read zone statistics")); 4605 g_quit = B_TRUE; 4606 return (NULL); 4607 } 4608 } 4609 (void) mutex_lock(&g_usage_cache_lock); 4610 if (!g_hasclient && g_open == B_TRUE && g_ctl) { 4611 (void) mutex_unlock(&g_usage_cache_lock); 4612 zsd_close(g_ctl); 4613 g_open = B_FALSE; 4614 } else { 4615 (void) mutex_unlock(&g_usage_cache_lock); 4616 } 4617 } 4618 quit: 4619 if (g_open) 4620 zsd_close(g_ctl); 4621 4622 (void) thr_kill(g_main, SIGINT); 4623 thr_exit(NULL); 4624 return (NULL); 4625 } 4626 4627 void 4628 zsd_set_fx() 4629 { 4630 pcinfo_t pcinfo; 4631 pcparms_t pcparms; 4632 4633 (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname)); 4634 if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) { 4635 zsd_warn(gettext("cannot get FX class parameters")); 4636 return; 4637 } 4638 pcparms.pc_cid = pcinfo.pc_cid; 4639 ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60; 4640 ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60; 4641 ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0; 4642 ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE; 4643 if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1) 4644 zsd_warn(gettext("cannot enter the FX class")); 4645 } 4646 4647 static int pipe_fd; 4648 4649 static void 4650 daemonize_ready(char status) 4651 { 4652 /* 4653 * wake the parent with a clue 4654 */ 4655 (void) write(pipe_fd, &status, 1); 4656 (void) close(pipe_fd); 4657 } 4658 4659 static int 4660 daemonize_start(void) 4661 { 4662 char data; 4663 int status; 4664 4665 int filedes[2]; 4666 pid_t pid; 4667 4668 (void) close(0); 4669 (void) dup2(2, 1); 4670 4671 if (pipe(filedes) < 0) 4672 return (-1); 4673 4674 (void) fflush(NULL); 4675 4676 if 
((pid = fork1()) < 0) 4677 return (-1); 4678 4679 if (pid != 0) { 4680 /* 4681 * parent 4682 */ 4683 struct sigaction act; 4684 4685 act.sa_handler = SIG_DFL; 4686 (void) sigemptyset(&act.sa_mask); 4687 act.sa_flags = 0; 4688 4689 (void) sigaction(SIGPIPE, &act, NULL); /* ignore SIGPIPE */ 4690 4691 (void) close(filedes[1]); 4692 if (read(filedes[0], &data, 1) == 1) { 4693 /* forward ready code via exit status */ 4694 exit(data); 4695 } 4696 status = -1; 4697 (void) wait4(pid, &status, 0, NULL); 4698 /* daemon process exited before becoming ready */ 4699 if (WIFEXITED(status)) { 4700 /* assume daemon process printed useful message */ 4701 exit(WEXITSTATUS(status)); 4702 } else { 4703 zsd_warn(gettext("daemon process killed or died")); 4704 exit(1); 4705 } 4706 } 4707 4708 /* 4709 * child 4710 */ 4711 pipe_fd = filedes[1]; 4712 (void) close(filedes[0]); 4713 4714 /* 4715 * generic Unix setup 4716 */ 4717 (void) setsid(); 4718 (void) umask(0000); 4719 4720 return (0); 4721 } 4722 4723 static void 4724 fattach_all_zones(boolean_t detach_only) 4725 { 4726 zoneid_t *zids; 4727 uint_t nzids, nzids_last; 4728 int i; 4729 4730 again: 4731 (void) zone_list(NULL, &nzids); 4732 nzids_last = nzids; 4733 zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last); 4734 if (zids == NULL) 4735 zsd_error(gettext("Out of memory")); 4736 4737 (void) zone_list(zids, &nzids); 4738 if (nzids > nzids_last) { 4739 free(zids); 4740 goto again; 4741 } 4742 for (i = 0; i < nzids; i++) 4743 zsd_fattach_zone(zids[i], g_server_door, detach_only); 4744 4745 free(zids); 4746 } 4747 4748 int 4749 main(int argc, char *argv[]) 4750 { 4751 4752 int arg; 4753 thread_t tid; 4754 scf_simple_prop_t *prop; 4755 uint64_t *intervalp; 4756 boolean_t opt_cleanup = B_FALSE; 4757 4758 g_main = thr_self(); 4759 g_quit = B_FALSE; 4760 (void) signal(SIGINT, zonestat_quithandler); 4761 (void) signal(SIGTERM, zonestat_quithandler); 4762 (void) signal(SIGHUP, zonestat_quithandler); 4763 /* (void) sigignore(SIGCHLD); 
*/ 4764 (void) sigignore(SIGPIPE); 4765 4766 if (getzoneid() != GLOBAL_ZONEID) 4767 zsd_error(gettext("Must be run from global zone only")); 4768 4769 while ((arg = getopt(argc, argv, "c")) 4770 != EOF) { 4771 switch (arg) { 4772 case 'c': 4773 opt_cleanup = B_TRUE; 4774 break; 4775 default: 4776 zsd_error(gettext("Invalid option")); 4777 } 4778 } 4779 4780 if (opt_cleanup) { 4781 if (zsd_disable_cpu_stats() != 0) 4782 exit(1); 4783 else 4784 exit(0); 4785 } 4786 4787 /* Get the configured sample interval */ 4788 prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default", 4789 "config", "sample_interval"); 4790 if (prop == NULL) 4791 zsd_error(gettext("Unable to fetch SMF property " 4792 "\"config/sample_interval\"")); 4793 4794 if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT) 4795 zsd_error(gettext("Malformed SMF property " 4796 "\"config/sample_interval\". Must be of type \"count\"")); 4797 4798 intervalp = scf_simple_prop_next_count(prop); 4799 g_interval = *intervalp; 4800 if (g_interval == 0) 4801 zsd_error(gettext("Malformed SMF property " 4802 "\"config/sample_interval\". 
Must be greater than zero")); 4803 4804 scf_simple_prop_free(prop); 4805 4806 if (daemonize_start() < 0) 4807 zsd_error(gettext("Unable to start daemon\n")); 4808 4809 /* Run at high priority */ 4810 zsd_set_fx(); 4811 4812 (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL); 4813 (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL); 4814 (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL); 4815 4816 g_server_door = door_create(zsd_server, NULL, 4817 DOOR_REFUSE_DESC | DOOR_NO_CANCEL); 4818 if (g_server_door < 0) 4819 zsd_error(gettext("Unable to create server door\n")); 4820 4821 4822 g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI | 4823 DOOR_REFUSE_DESC | DOOR_NO_CANCEL); 4824 if (g_stat_door < 0) 4825 zsd_error(gettext("Unable to create statistics door\n")); 4826 4827 fattach_all_zones(B_FALSE); 4828 4829 if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0) 4830 zsd_error(gettext("Unable to create statistics thread\n")); 4831 4832 daemonize_ready(0); 4833 4834 /* Wait for signal to quit */ 4835 while (g_quit == B_FALSE) 4836 (void) pause(); 4837 4838 /* detach doors */ 4839 fattach_all_zones(B_TRUE); 4840 4841 (void) door_revoke(g_server_door); 4842 (void) door_revoke(g_stat_door); 4843 4844 /* kick stat thread and wait for it to close the statistics */ 4845 (void) mutex_lock(&g_usage_cache_lock); 4846 g_quit = B_TRUE; 4847 (void) cond_signal(&g_usage_cache_kick); 4848 (void) mutex_unlock(&g_usage_cache_lock); 4849 end: 4850 (void) thr_join(tid, NULL, NULL); 4851 return (0); 4852 } 4853