1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * rcapd is a long-running daemon enforcing project-based resource caps (see 30 * rcapd(1M)). Each instance of a process aggregate (project or, generically, 31 * "collection") may have a memory cap. A single thread monitors the resource 32 * utilization of capped collections, enforces caps when they are exceeded (and 33 * other conditions are met), and incorporates changes in configuration or 34 * caps. Each of these actions occurs not more frequently than the rate 35 * specified with rcapadm(1M). 
36 */ 37 38 #include <sys/priocntl.h> 39 #include <sys/proc.h> 40 #include <sys/resource.h> 41 #include <sys/sysinfo.h> 42 #include <sys/stat.h> 43 #include <sys/sysmacros.h> 44 #include <sys/time.h> 45 #include <sys/types.h> 46 #include <dirent.h> 47 #include <errno.h> 48 #include <fcntl.h> 49 #include <kstat.h> 50 #include <libintl.h> 51 #include <limits.h> 52 #include <locale.h> 53 #include <priv.h> 54 #include <signal.h> 55 #include <stdarg.h> 56 #include <stdio.h> 57 #include <stdio_ext.h> 58 #include <stdlib.h> 59 #include <libscf.h> 60 #include <strings.h> 61 #include <time.h> 62 #include <unistd.h> 63 #include <zone.h> 64 #include <assert.h> 65 #include <sys/vm_usage.h> 66 #include "rcapd.h" 67 #include "rcapd_mapping.h" 68 #include "rcapd_rfd.h" 69 #include "rcapd_stat.h" 70 #include "utils.h" 71 72 #define POSITIVE_MIN(x, y) \ 73 (((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y)) 74 #define NEXT_EVENT_TIME(base, seconds) \ 75 (((int)seconds > 0) ? (base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \ 76 : (hrtime_t)0) 77 #define NEXT_REPORT_EVENT_TIME(base, seconds) \ 78 ((rcfg.rcfg_stat_file[0] != 0) ? 
\ 79 NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0) 80 #define EVENT_TIME(time, eventtime) \ 81 (((time) > (eventtime)) && (eventtime) != 0) 82 #define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */ 83 #define DAEMON_UID 1 /* uid to use */ 84 85 #define CAPPED_PROJECT 0x01 86 #define CAPPED_ZONE 0x02 87 88 typedef struct soft_scan_arg { 89 uint64_t ssa_sum_excess; 90 int64_t ssa_scan_goal; 91 boolean_t ssa_project_over_cap; 92 } soft_scan_arg_t; 93 94 typedef struct sample_col_arg { 95 boolean_t sca_any_over_cap; 96 boolean_t sca_project_over_cap; 97 } sample_col_arg_t; 98 99 100 static int debug_mode = 0; /* debug mode flag */ 101 static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */ 102 /* scanned */ 103 static kstat_ctl_t *kctl; /* kstat chain */ 104 static int memory_pressure = 0; /* physical memory utilization (%) */ 105 static int memory_pressure_sample = 0; /* count of samples */ 106 static long page_size_kb = 0; /* system page size in KB */ 107 static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */ 108 static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */ 109 static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */ 110 static hrtime_t next_report; /* time of next report */ 111 static int termination_signal = 0; /* terminating signal */ 112 static zoneid_t my_zoneid = (zoneid_t)-1; 113 static lcollection_t *gz_col; /* global zone collection */ 114 115 rcfg_t rcfg; 116 /* 117 * Updated when we re-read the collection configurations if this rcapd instance 118 * is running in the global zone and the global zone is capped. 119 */ 120 boolean_t gz_capped = B_FALSE; 121 122 /* 123 * Flags. 124 */ 125 static int ever_ran; 126 int should_run; 127 static int should_reconfigure; 128 129 static int verify_statistics(void); 130 static int update_statistics(void); 131 132 /* 133 * Checks if a process is marked 'system'. Returns FALSE only when it is not. 
134 */ 135 static boolean_t 136 proc_issystem(pid_t pid) 137 { 138 char pc_clname[PC_CLNMSZ]; 139 140 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, 141 PC_KY_NULL) != -1) { 142 return (strcmp(pc_clname, "SYS") == 0); 143 } else { 144 debug("cannot get class-specific scheduling parameters; " 145 "assuming system process\n"); 146 return (B_TRUE); 147 } 148 } 149 150 static void 151 lprocess_insert_mark(psinfo_t *psinfop) 152 { 153 pid_t pid = psinfop->pr_pid; 154 /* flag indicating whether the process should be scanned. */ 155 int unscannable = psinfop->pr_nlwp == 0; 156 rcid_t colid; 157 lcollection_t *lcol; 158 lprocess_t *lproc; 159 160 /* 161 * Determine which collection to put this process into. We only have 162 * to worry about tracking both zone and project capped processes if 163 * this rcapd instance is running in the global zone, since we'll only 164 * see processes in our own projects in a non-global zone. In the 165 * global zone, if the process belongs to a non-global zone, we only 166 * need to track it for the capped non-global zone collection. For 167 * global zone processes, we first attempt to put the process into a 168 * capped project collection. On the second pass into this function 169 * the projid will be cleared so we will just track the process for the 170 * global zone collection as a whole. 171 */ 172 if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) { 173 colid.rcid_type = RCIDT_PROJECT; 174 colid.rcid_val = psinfop->pr_projid; 175 } else { 176 /* try to add to zone collection */ 177 colid.rcid_type = RCIDT_ZONE; 178 colid.rcid_val = psinfop->pr_zoneid; 179 } 180 181 if ((lcol = lcollection_find(&colid)) == NULL) 182 return; 183 184 /* 185 * If the process is already being tracked, update the unscannable flag, 186 * as determined by the caller, from the process's psinfo. 
187 */ 188 lproc = lcol->lcol_lprocess; 189 while (lproc != NULL) { 190 if (lproc->lpc_pid == pid) { 191 lproc->lpc_mark = 1; 192 if (unscannable != 0 && lproc->lpc_unscannable == 0) { 193 debug("process %d: became unscannable\n", 194 (int)lproc->lpc_pid); 195 lproc->lpc_unscannable = 1; 196 } 197 return; 198 } 199 lproc = lproc->lpc_next; 200 } 201 202 /* 203 * We've fallen off the list without finding our current process; 204 * insert it at the list head. 205 */ 206 if ((lproc = malloc(sizeof (*lproc))) == NULL) 207 debug("insufficient memory to track new process %d", (int)pid); 208 else { 209 (void) bzero(lproc, sizeof (*lproc)); 210 lproc->lpc_pid = pid; 211 lproc->lpc_mark = 1; 212 lproc->lpc_collection = lcol; 213 lproc->lpc_psinfo_fd = -1; 214 lproc->lpc_pgdata_fd = -1; 215 lproc->lpc_xmap_fd = -1; 216 217 /* 218 * If the caller didn't flag this process as unscannable 219 * already, do some more checking. 220 */ 221 lproc->lpc_unscannable = unscannable || proc_issystem(pid); 222 223 #ifdef DEBUG 224 /* 225 * Verify the sanity of lprocess. It should not contain the 226 * process we are about to prepend. 227 */ 228 if (lcollection_member(lcol, lproc)) { 229 lprocess_t *cur = lcol->lcol_lprocess; 230 debug("The collection %lld already has these members, " 231 "including me, %d!\n", 232 (long long)lcol->lcol_id.rcid_val, 233 (int)lproc->lpc_pid); 234 while (cur != NULL) { 235 debug("\t%d\n", (int)cur->lpc_pid); 236 cur = cur->lpc_next; 237 } 238 info(gettext("process already on lprocess\n")); 239 abort(); 240 } 241 #endif /* DEBUG */ 242 lproc->lpc_next = lcol->lcol_lprocess; 243 if (lproc->lpc_next != NULL) 244 lproc->lpc_next->lpc_prev = lproc; 245 lproc->lpc_prev = NULL; 246 lcol->lcol_lprocess = lproc; 247 248 debug("tracking %s %ld %d %s%s\n", 249 (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 250 (long)colid.rcid_val, 251 (int)pid, psinfop->pr_psargs, 252 (lproc->lpc_unscannable != 0) ? 
" (not scannable)" : ""); 253 lcol->lcol_stat.lcols_proc_in++; 254 } 255 } 256 257 static int 258 list_walk_process_cb(lcollection_t *lcol, void *arg) 259 { 260 int (*cb)(lcollection_t *, lprocess_t *) = 261 (int(*)(lcollection_t *, lprocess_t *))arg; 262 lprocess_t *member; 263 lprocess_t *next; 264 265 member = lcol->lcol_lprocess; 266 while (member != NULL) { 267 pid_t pid = member->lpc_pid; 268 next = member->lpc_next; 269 270 debug_high("list_walk_all lpc %d\n", (int)pid); 271 if (cb(lcol, member) != 0) { 272 debug_high("list_walk_all aborted at lpc %d\n", 273 (int)pid); 274 return (1); 275 } 276 member = next; 277 } 278 279 return (0); 280 } 281 282 /* 283 * Invoke the given callback for each process in each collection. Callbacks 284 * are allowed to change the linkage of the process on which they act. 285 */ 286 static void 287 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *)) 288 { 289 list_walk_collection(list_walk_process_cb, (void *)cb); 290 } 291 292 static void 293 revoke_psinfo(rfd_t *rfd) 294 { 295 lprocess_t *lpc = (lprocess_t *)rfd->rfd_data; 296 297 if (lpc != NULL) { 298 debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid); 299 ASSERT(lpc->lpc_psinfo_fd != -1); 300 lpc->lpc_psinfo_fd = -1; 301 } else 302 debug("revoking psinfo fd for unknown process\n"); 303 } 304 305 /* 306 * Retrieve a process's psinfo via an already-opened or new file descriptor. 307 * The supplied descriptor will be closed on failure. An optional callback 308 * will be invoked with the last descriptor tried, and a supplied callback 309 * argument, as its arguments, such that the new descriptor may be cached, or 310 * an old one may be invalidated. If the result of the callback is zero, the 311 * the caller is to assume responsibility for the file descriptor, to close it 312 * with rfd_close(). 313 * 314 * On failure, a nonzero value is returned. 
315 */ 316 int 317 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd, 318 int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc) 319 { 320 int fd; 321 int can_try_uncached; 322 323 ASSERT(!(cached_fd > 0 && fd_update_cb == NULL)); 324 325 do { 326 if (cached_fd >= 0) { 327 fd = cached_fd; 328 can_try_uncached = 1; 329 debug_high("%d/psinfo, trying cached fd %d\n", 330 (int)pid, fd); 331 } else { 332 char pathbuf[PROC_PATH_MAX]; 333 334 can_try_uncached = 0; 335 (void) snprintf(pathbuf, sizeof (pathbuf), 336 "/proc/%d/psinfo", (int)pid); 337 if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO, 338 revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) { 339 debug("cannot open %s", pathbuf); 340 break; 341 } else 342 debug_high("opened %s, fd %d\n", pathbuf, fd); 343 } 344 345 if (pread(fd, psinfo, sizeof (*psinfo), 0) == 346 sizeof (*psinfo) && psinfo->pr_pid == pid) 347 break; 348 else { 349 debug_high("closed fd %d\n", fd); 350 if (rfd_close(fd) != 0) 351 debug("could not close fd %d", fd); 352 fd = cached_fd = -1; 353 } 354 } while (can_try_uncached == 1); 355 356 if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0) 357 if (fd >= 0) { 358 debug_high("closed %s fd %d\n", fd_update_cb == NULL ? 359 "uncached" : "cached", fd); 360 if (rfd_close(fd) != 0) 361 debug("could not close fd %d", fd); 362 } 363 364 debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd, 365 fd_update_cb != NULL ? "cached" : "uncached"); 366 return ((fd >= 0) ? 0 : -1); 367 } 368 369 /* 370 * Retrieve the collection membership of all processes and update the psinfo of 371 * those non-system, non-zombie ones in collections. For global zone processes, 372 * we first attempt to put the process into a capped project collection. We 373 * also want to track the process for the global zone collection as a whole. 
374 */ 375 static void 376 proc_cb(const pid_t pid) 377 { 378 psinfo_t psinfo; 379 380 if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) { 381 lprocess_insert_mark(&psinfo); 382 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) { 383 /* 384 * We also want to track this process for the global 385 * zone as a whole so add it to the global zone 386 * collection as well. 387 */ 388 psinfo.pr_projid = -1; 389 lprocess_insert_mark(&psinfo); 390 } 391 } 392 } 393 394 /* 395 * Cache the process' psinfo fd, taking responsibility for freeing it. 396 */ 397 int 398 lprocess_update_psinfo_fd_cb(void *arg, int fd) 399 { 400 lprocess_t *lpc = arg; 401 402 lpc->lpc_psinfo_fd = fd; 403 return (0); 404 } 405 406 /* 407 * Get the system pagesize. 408 */ 409 static void 410 get_page_size(void) 411 { 412 page_size_kb = sysconf(_SC_PAGESIZE) / 1024; 413 debug("physical page size: %luKB\n", page_size_kb); 414 } 415 416 static void 417 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2) 418 { 419 hrtime_t diff = t2 - t1; 420 421 if (diff < MILLISEC) 422 debug("%s: %lld nanoseconds\n", msg, diff); 423 else if (diff < MICROSEC) 424 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC); 425 else if (diff < NANOSEC) 426 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC); 427 else 428 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC); 429 } 430 431 /* 432 * Get the zone's & project's RSS from the kernel. 
433 */ 434 static void 435 rss_sample(boolean_t my_zone_only, uint_t col_types) 436 { 437 size_t nres; 438 size_t i; 439 uint_t flags; 440 hrtime_t t1, t2; 441 442 if (my_zone_only) { 443 flags = VMUSAGE_ZONE; 444 } else { 445 flags = 0; 446 if (col_types & CAPPED_PROJECT) 447 flags |= VMUSAGE_PROJECTS; 448 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID) 449 flags |= VMUSAGE_ALL_ZONES; 450 } 451 452 debug("vmusage sample flags 0x%x\n", flags); 453 if (flags == 0) 454 return; 455 456 again: 457 /* try the current buffer to see if the list will fit */ 458 nres = vmu_vals_len; 459 t1 = gethrtime(); 460 if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval, 461 vmu_vals, &nres) != 0) { 462 if (errno != EOVERFLOW) { 463 warn(gettext("can't read RSS from kernel\n")); 464 return; 465 } 466 } 467 t2 = gethrtime(); 468 tm_fmt("getvmusage time", t1, t2); 469 470 debug("kernel nres %lu\n", (ulong_t)nres); 471 472 if (nres > vmu_vals_len) { 473 /* array size is now too small, increase it and try again */ 474 free(vmu_vals); 475 476 if ((vmu_vals = (vmusage_t *)calloc(nres, 477 sizeof (vmusage_t))) == NULL) { 478 warn(gettext("out of memory: could not read RSS from " 479 "kernel\n")); 480 vmu_vals_len = nvmu_vals = 0; 481 return; 482 } 483 vmu_vals_len = nres; 484 goto again; 485 } 486 487 nvmu_vals = nres; 488 489 debug("vmusage_sample\n"); 490 for (i = 0; i < nvmu_vals; i++) { 491 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), " 492 "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id, 493 vmu_vals[i].vmu_type, 494 (unsigned long long)vmu_vals[i].vmu_rss_all, 495 (unsigned long long)vmu_vals[i].vmu_rss_all / 1024, 496 (unsigned long long)vmu_vals[i].vmu_swap_all); 497 } 498 } 499 500 static void 501 update_col_rss(lcollection_t *lcol) 502 { 503 int i; 504 505 lcol->lcol_rss = 0; 506 lcol->lcol_image_size = 0; 507 508 for (i = 0; i < nvmu_vals; i++) { 509 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val) 510 continue; 511 512 if 
(vmu_vals[i].vmu_type == VMUSAGE_ZONE && 513 lcol->lcol_id.rcid_type != RCIDT_ZONE) 514 continue; 515 516 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS && 517 lcol->lcol_id.rcid_type != RCIDT_PROJECT) 518 continue; 519 520 /* we found the right RSS entry, update the collection vals */ 521 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024; 522 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024; 523 break; 524 } 525 } 526 527 /* 528 * Sample the collection RSS, updating the collection's statistics with the 529 * results. Also, sum the rss of all capped projects & return true if 530 * the collection is over cap. 531 */ 532 static int 533 rss_sample_col_cb(lcollection_t *lcol, void *arg) 534 { 535 int64_t excess; 536 uint64_t rss; 537 sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; 538 539 update_col_rss(lcol); 540 541 lcol->lcol_stat.lcols_rss_sample++; 542 rss = lcol->lcol_rss; 543 excess = rss - lcol->lcol_rss_cap; 544 if (excess > 0) { 545 lcol->lcol_stat.lcols_rss_act_sum += rss; 546 col_argp->sca_any_over_cap = B_TRUE; 547 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) 548 col_argp->sca_project_over_cap = B_TRUE; 549 } 550 lcol->lcol_stat.lcols_rss_sum += rss; 551 552 if (lcol->lcol_stat.lcols_min_rss > rss) 553 lcol->lcol_stat.lcols_min_rss = rss; 554 if (lcol->lcol_stat.lcols_max_rss < rss) 555 lcol->lcol_stat.lcols_max_rss = rss; 556 557 return (0); 558 } 559 560 /* 561 * Determine if we have capped projects, capped zones or both. 
562 */ 563 static int 564 col_type_cb(lcollection_t *lcol, void *arg) 565 { 566 uint_t *col_type = (uint_t *)arg; 567 568 /* skip uncapped collections */ 569 if (lcol->lcol_rss_cap == 0) 570 return (1); 571 572 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) 573 *col_type |= CAPPED_PROJECT; 574 else 575 *col_type |= CAPPED_ZONE; 576 577 /* once we know everything is capped, we can stop looking */ 578 if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT)) 579 return (1); 580 581 return (0); 582 } 583 584 /* 585 * Open /proc and walk entries. 586 */ 587 static void 588 proc_walk_all(void (*cb)(const pid_t)) 589 { 590 DIR *pdir; 591 struct dirent *dirent; 592 pid_t pid; 593 594 (void) rfd_reserve(1); 595 if ((pdir = opendir("/proc")) == NULL) 596 die(gettext("couldn't open /proc!")); 597 598 while ((dirent = readdir(pdir)) != NULL) { 599 if (strcmp(".", dirent->d_name) == 0 || 600 strcmp("..", dirent->d_name) == 0) 601 continue; 602 pid = atoi(dirent->d_name); 603 ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0); 604 if (pid == rcapd_pid) 605 continue; 606 else 607 cb(pid); 608 } 609 (void) closedir(pdir); 610 } 611 612 /* 613 * Clear unmarked callback. 614 */ 615 /*ARGSUSED*/ 616 static int 617 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc) 618 { 619 if (lpc->lpc_mark) { 620 lpc->lpc_mark = 0; 621 } else { 622 debug("process %d finished\n", (int)lpc->lpc_pid); 623 lprocess_free(lpc); 624 } 625 626 return (0); 627 } 628 629 /* 630 * Print, for debugging purposes, a collection's recently-sampled RSS and 631 * excess. 632 */ 633 /*ARGSUSED*/ 634 static int 635 excess_print_cb(lcollection_t *lcol, void *arg) 636 { 637 int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap; 638 639 debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n", 640 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
"project" : "zone"), 641 lcol->lcol_name, 642 (unsigned long long)lcol->lcol_rss, 643 (unsigned long long)lcol->lcol_rss_cap, 644 (long long)excess); 645 646 return (0); 647 } 648 649 /* 650 * Scan those collections which have exceeded their caps. 651 * 652 * If we're running in the global zone it might have a cap. We don't want to 653 * do any capping for the global zone yet since we might get under the cap by 654 * just capping the projects in the global zone. 655 */ 656 /*ARGSUSED*/ 657 static int 658 scan_cb(lcollection_t *lcol, void *arg) 659 { 660 int64_t excess; 661 662 /* skip over global zone collection for now but keep track for later */ 663 if (lcol->lcol_id.rcid_type == RCIDT_ZONE && 664 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { 665 gz_col = lcol; 666 return (0); 667 } 668 669 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { 670 scan(lcol, excess); 671 lcol->lcol_stat.lcols_scan++; 672 } 673 674 return (0); 675 } 676 677 /* 678 * Scan the global zone collection and see if it still exceeds its cap. 679 * We take into account the effects of capping any global zone projects here. 680 */ 681 static void 682 scan_gz(lcollection_t *lcol, boolean_t project_over_cap) 683 { 684 int64_t excess; 685 686 /* 687 * If we had projects over their cap and the global zone was also over 688 * its cap then we need to get the up-to-date global zone rss to 689 * determine if we are still over the global zone cap. We might have 690 * gone under while we scanned the capped projects. If there were no 691 * projects over cap then we can use the rss value we already have for 692 * the global zone. 
693 */ 694 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 695 if (project_over_cap && excess > 0) { 696 rss_sample(B_TRUE, CAPPED_ZONE); 697 update_col_rss(lcol); 698 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 699 } 700 701 if (excess > 0) { 702 debug("global zone excess %lldKB\n", (long long)excess); 703 scan(lcol, excess); 704 lcol->lcol_stat.lcols_scan++; 705 } 706 } 707 708 /* 709 * Do a soft scan of those collections which have excesses. A soft scan is one 710 * in which the cap enforcement pressure is taken into account. The difference 711 * between the utilized physical memory and the cap enforcement pressure will 712 * be scanned-for, and each collection will be scanned proportionally by their 713 * present excesses. 714 */ 715 static int 716 soft_scan_cb(lcollection_t *lcol, void *a) 717 { 718 int64_t excess; 719 soft_scan_arg_t *arg = a; 720 721 /* skip over global zone collection for now but keep track for later */ 722 if (lcol->lcol_id.rcid_type == RCIDT_ZONE && 723 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { 724 gz_col = lcol; 725 return (0); 726 } 727 728 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { 729 int64_t adjusted_excess = 730 excess * arg->ssa_scan_goal / arg->ssa_sum_excess; 731 732 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " 733 "scanning %lld\n", 734 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 735 "project" : "zone"), 736 (long)lcol->lcol_id.rcid_val, 737 (long long)excess, (long long)arg->ssa_scan_goal, 738 (unsigned long long)arg->ssa_sum_excess, 739 (long long)adjusted_excess); 740 741 scan(lcol, adjusted_excess); 742 lcol->lcol_stat.lcols_scan++; 743 } 744 745 return (0); 746 } 747 748 static void 749 soft_scan_gz(lcollection_t *lcol, void *a) 750 { 751 int64_t excess; 752 soft_scan_arg_t *arg = a; 753 754 /* 755 * If we had projects over their cap and the global zone was also over 756 * its cap then we need to get the up-to-date global zone rss to 757 * determine if we are still over the global zone cap. 
We might have 758 * gone under while we scanned the capped projects. If there were no 759 * projects over cap then we can use the rss value we already have for 760 * the global zone. 761 */ 762 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 763 if (arg->ssa_project_over_cap && excess > 0) { 764 rss_sample(B_TRUE, CAPPED_ZONE); 765 update_col_rss(lcol); 766 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 767 } 768 769 if (excess > 0) { 770 int64_t adjusted_excess = 771 excess * arg->ssa_scan_goal / arg->ssa_sum_excess; 772 773 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " 774 "scanning %lld\n", 775 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 776 "project" : "zone"), 777 (long)lcol->lcol_id.rcid_val, 778 (long long)excess, (long long)arg->ssa_scan_goal, 779 (unsigned long long)arg->ssa_sum_excess, 780 (long long)adjusted_excess); 781 782 scan(lcol, adjusted_excess); 783 lcol->lcol_stat.lcols_scan++; 784 } 785 } 786 787 /* 788 * When a scan could happen, but caps aren't enforced tick the 789 * lcols_unenforced_cap counter. 790 */ 791 /*ARGSUSED*/ 792 static int 793 unenforced_cap_cb(lcollection_t *lcol, void *arg) 794 { 795 lcol->lcol_stat.lcols_unenforced_cap++; 796 797 return (0); 798 } 799 800 /* 801 * Update the count of physically installed memory. 802 */ 803 static void 804 update_phys_total(void) 805 { 806 uint64_t old_phys_total; 807 808 old_phys_total = phys_total; 809 phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb; 810 if (phys_total != old_phys_total) 811 debug("physical memory%s: %lluM\n", (old_phys_total == 0 ? 812 "" : " adjusted"), (unsigned long long)(phys_total / 1024)); 813 } 814 815 /* 816 * Unlink a process from its collection, updating relevant statistics, and 817 * freeing its associated memory. 
818 */ 819 void 820 lprocess_free(lprocess_t *lpc) 821 { 822 pid_t pid; 823 824 lpc->lpc_collection->lcol_stat.lcols_proc_out++; 825 826 if (lpc->lpc_prev != NULL) 827 lpc->lpc_prev->lpc_next = lpc->lpc_next; 828 if (lpc->lpc_next != NULL) 829 lpc->lpc_next->lpc_prev = lpc->lpc_prev; 830 if (lpc->lpc_collection->lcol_lprocess == lpc) 831 lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next != 832 lpc ? lpc->lpc_next : NULL); 833 lpc->lpc_next = lpc->lpc_prev = NULL; 834 835 if (lpc->lpc_prpageheader != NULL) 836 free(lpc->lpc_prpageheader); 837 if (lpc->lpc_xmap != NULL) 838 free(lpc->lpc_xmap); 839 if (lpc->lpc_psinfo_fd >= 0) { 840 if (rfd_close(lpc->lpc_psinfo_fd) != 0) 841 debug("could not close %d lpc_psinfo_fd %d", 842 (int)lpc->lpc_pid, lpc->lpc_psinfo_fd); 843 lpc->lpc_psinfo_fd = -1; 844 } 845 if (lpc->lpc_pgdata_fd >= 0) { 846 if (rfd_close(lpc->lpc_pgdata_fd) != 0) 847 debug("could not close %d lpc_pgdata_fd %d", 848 (int)lpc->lpc_pid, lpc->lpc_pgdata_fd); 849 lpc->lpc_pgdata_fd = -1; 850 } 851 if (lpc->lpc_xmap_fd >= 0) { 852 if (rfd_close(lpc->lpc_xmap_fd) != 0) 853 debug("could not close %d lpc_xmap_fd %d", 854 (int)lpc->lpc_pid, lpc->lpc_xmap_fd); 855 lpc->lpc_xmap_fd = -1; 856 } 857 if (lpc->lpc_ignore != NULL) 858 lmapping_free(&lpc->lpc_ignore); 859 pid = lpc->lpc_pid; 860 free(lpc); 861 debug_high("process %d freed\n", (int)pid); 862 } 863 864 /* 865 * Collection clear callback. 866 */ 867 /*ARGSUSED*/ 868 static int 869 collection_clear_cb(lcollection_t *lcol, void *arg) 870 { 871 lcol->lcol_mark = 0; 872 873 return (0); 874 } 875 876 /* 877 * Respond to a terminating signal by setting a termination flag. 878 */ 879 /*ARGSUSED*/ 880 static void 881 terminate_signal(int signal) 882 { 883 if (termination_signal == 0) 884 termination_signal = signal; 885 should_run = 0; 886 } 887 888 /* 889 * Handle any synchronous or asynchronous signals that would ordinarily cause a 890 * process to abort. 
891 */ 892 /*ARGSUSED*/ 893 static void 894 abort_signal(int signal) 895 { 896 /* 897 * Allow the scanner to make a last-ditch effort to resume any stopped 898 * processes. 899 */ 900 scan_abort(); 901 abort(); 902 } 903 904 /* 905 * Clean up collections which have been removed due to configuration. Unlink 906 * the collection from lcollection and free it. 907 */ 908 /*ARGSUSED*/ 909 static int 910 collection_sweep_cb(lcollection_t *lcol, void *arg) 911 { 912 if (lcol->lcol_mark == 0) { 913 debug("freeing %s %s\n", 914 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 915 "project" : "zone"), lcol->lcol_name); 916 lcollection_free(lcol); 917 } 918 919 return (0); 920 } 921 922 /* 923 * Set those variables which depend on the global configuration. 924 */ 925 static void 926 finish_configuration(void) 927 { 928 /* 929 * Warn that any lnode (or non-project) mode specification (by an SRM 930 * 1.3 configuration file, for example) is ignored. 931 */ 932 if (strcmp(rcfg.rcfg_mode_name, "project") != 0) { 933 warn(gettext("%s mode specification ignored -- using project" 934 " mode\n"), rcfg.rcfg_mode_name); 935 rcfg.rcfg_mode_name = "project"; 936 rcfg.rcfg_mode = rctype_project; 937 } 938 } 939 940 /* 941 * Cause the configuration to be reread and applied. 942 */ 943 static void 944 reread_configuration(void) 945 { 946 rcfg_t rcfg_new; 947 948 if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) { 949 warn(gettext("can't reread configuration \n")); 950 exit(SMF_EXIT_ERR_CONFIG); 951 } else { 952 /* 953 * Done reading configuration. Remove existing 954 * collections in case there is a change in collection type. 955 */ 956 if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) { 957 list_walk_collection(collection_clear_cb, NULL); 958 list_walk_collection(collection_sweep_cb, NULL); 959 } 960 961 /* 962 * Make the newly-read configuration the global one, and update 963 * any variables that depend on it. 
964 */ 965 rcfg = rcfg_new; 966 finish_configuration(); 967 } 968 } 969 970 /* 971 * First, examine changes, additions, and deletions to cap definitions. 972 * Then, set the next event time. 973 */ 974 static void 975 reconfigure(hrtime_t now, hrtime_t *next_configuration, 976 hrtime_t *next_proc_walk, hrtime_t *next_rss_sample) 977 { 978 debug("reconfigure...\n"); 979 980 /* 981 * Walk the lcollection, marking active collections so inactive ones 982 * can be freed. 983 */ 984 list_walk_collection(collection_clear_cb, NULL); 985 lcollection_update(LCU_ACTIVE_ONLY); /* mark */ 986 list_walk_collection(collection_sweep_cb, NULL); 987 988 *next_configuration = NEXT_EVENT_TIME(now, 989 rcfg.rcfg_reconfiguration_interval); 990 991 /* 992 * Reset each event time to the shorter of the previous and new 993 * intervals. 994 */ 995 if (next_report == 0 && rcfg.rcfg_report_interval > 0) 996 next_report = now; 997 else 998 next_report = POSITIVE_MIN(next_report, 999 NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval)); 1000 1001 if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0) 1002 *next_proc_walk = now; 1003 else 1004 *next_proc_walk = POSITIVE_MIN(*next_proc_walk, 1005 NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval)); 1006 1007 if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0) 1008 *next_rss_sample = now; 1009 else 1010 *next_rss_sample = POSITIVE_MIN(*next_rss_sample, 1011 NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval)); 1012 } 1013 1014 /* 1015 * Respond to SIGHUP by triggering the rereading the configuration and cap 1016 * definitions. 1017 */ 1018 /*ARGSUSED*/ 1019 static void 1020 sighup(int signal) 1021 { 1022 should_reconfigure = 1; 1023 } 1024 1025 /* 1026 * Print, for debugging purposes, each collection's interval statistics. 
1027 */ 1028 /*ARGSUSED*/ 1029 static int 1030 simple_report_collection_cb(lcollection_t *lcol, void *arg) 1031 { 1032 #define DELTA(field) \ 1033 (unsigned long long)( \ 1034 (lcol->lcol_stat.field - lcol->lcol_stat_old.field)) 1035 1036 debug("%s %s status: succeeded/attempted (k): %llu/%llu, " 1037 "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS " 1038 "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, " 1039 "%llu scans over %llu ms\n", 1040 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 1041 lcol->lcol_name, 1042 DELTA(lcols_pg_eff), DELTA(lcols_pg_att), 1043 DELTA(lcols_scan_ineffective), DELTA(lcols_scan), 1044 DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample), 1045 (unsigned long long)lcol->lcol_stat.lcols_min_rss, 1046 (unsigned long long)lcol->lcol_stat.lcols_max_rss, 1047 (unsigned long long)lcol->lcol_rss_cap, 1048 (unsigned long long)(lcol->lcol_stat.lcols_proc_in - 1049 lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out), 1050 DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC 1051 / MILLISEC)); 1052 1053 #undef DELTA 1054 1055 return (0); 1056 } 1057 1058 /* 1059 * Record each collection's interval statistics in the statistics file. 1060 */ 1061 static int 1062 report_collection_cb(lcollection_t *lcol, void *arg) 1063 { 1064 lcollection_report_t dc; 1065 int fd = (intptr_t)arg; 1066 1067 /* 1068 * Copy the relevant fields to the collection's record. 1069 */ 1070 bzero(&dc, sizeof (dc)); 1071 dc.lcol_id = lcol->lcol_id; 1072 (void) strcpy(dc.lcol_name, lcol->lcol_name); 1073 dc.lcol_rss = lcol->lcol_rss; 1074 dc.lcol_image_size = lcol->lcol_image_size; 1075 dc.lcol_rss_cap = lcol->lcol_rss_cap; 1076 dc.lcol_stat = lcol->lcol_stat; 1077 1078 if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { 1079 lcol->lcol_stat_old = lcol->lcol_stat; 1080 } else { 1081 debug("can't write %s %s statistics", 1082 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
1083 "project" : "zone"), 1084 lcol->lcol_name); 1085 } 1086 1087 return (0); 1088 } 1089 1090 /* 1091 * Determine the count of pages scanned by the global page scanner, obtained 1092 * from the cpu_stat:*::scan kstats. Return zero on success. 1093 */ 1094 static int 1095 get_globally_scanned_pages(uint64_t *scannedp) 1096 { 1097 kstat_t *ksp; 1098 uint64_t scanned = 0; 1099 1100 if (kstat_chain_update(kctl) == -1) { 1101 warn(gettext("can't update kstat chain")); 1102 return (0); 1103 } 1104 1105 for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) { 1106 if (strcmp(ksp->ks_module, "cpu_stat") == 0) { 1107 if (kstat_read(kctl, ksp, NULL) != -1) { 1108 scanned += ((cpu_stat_t *) 1109 ksp->ks_data)->cpu_vminfo.scan; 1110 } else { 1111 return (-1); 1112 } 1113 } 1114 } 1115 1116 *scannedp = scanned; 1117 return (0); 1118 } 1119 1120 /* 1121 * Determine if the global page scanner is running, during which no memory 1122 * caps should be enforced, to prevent interference with the global page 1123 * scanner. 1124 */ 1125 static boolean_t 1126 is_global_scanner_running() 1127 { 1128 /* measure delta in page scan count */ 1129 static uint64_t new_sp = 0; 1130 static uint64_t old_sp = 0; 1131 boolean_t res = B_FALSE; 1132 1133 if (get_globally_scanned_pages(&new_sp) == 0) { 1134 if (old_sp != 0 && (new_sp - old_sp) > 0) { 1135 debug("global memory pressure detected (%llu " 1136 "pages scanned since last interval)\n", 1137 (unsigned long long)(new_sp - old_sp)); 1138 res = B_TRUE; 1139 } 1140 old_sp = new_sp; 1141 } else { 1142 warn(gettext("unable to read cpu statistics")); 1143 new_sp = old_sp; 1144 } 1145 1146 return (res); 1147 } 1148 1149 /* 1150 * If soft caps are in use, determine if global memory pressure exceeds the 1151 * configured maximum above which soft caps are enforced. 
1152 */ 1153 static boolean_t 1154 must_enforce_soft_caps() 1155 { 1156 /* 1157 * Check for changes to the amount of installed physical memory, to 1158 * compute the current memory pressure. 1159 */ 1160 update_phys_total(); 1161 1162 memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb) 1163 * 100.0 / phys_total); 1164 memory_pressure_sample++; 1165 if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 && 1166 memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) { 1167 return (B_TRUE); 1168 } 1169 1170 return (B_FALSE); 1171 } 1172 1173 /* 1174 * Update the shared statistics file with each collection's current statistics. 1175 * Return zero on success. 1176 */ 1177 static int 1178 update_statistics(void) 1179 { 1180 int fd, res; 1181 static char template[LINELEN]; 1182 1183 /* 1184 * Try to create a directory irrespective of whether it is existing 1185 * or not. If it is not there then it will create. Otherwise any way 1186 * it will fail at mkstemp call below. 1187 */ 1188 (void) mkdir(STAT_FILE_DIR, 0755); 1189 1190 /* 1191 * Create a temporary file. 1192 */ 1193 if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) + 1194 strlen(STAT_TEMPLATE_SUFFIX) + 1)) { 1195 debug("temporary file template size too small\n"); 1196 return (-1); 1197 } 1198 (void) strcpy(template, rcfg.rcfg_stat_file); 1199 (void) strcat(template, STAT_TEMPLATE_SUFFIX); 1200 (void) rfd_reserve(1); 1201 fd = mkstemp(template); 1202 1203 /* 1204 * Write the header and per-collection statistics. 
1205 */ 1206 if (fd >= 0) { 1207 rcapd_stat_hdr_t rs; 1208 1209 rs.rs_pid = rcapd_pid; 1210 rs.rs_time = gethrtime(); 1211 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name)); 1212 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name); 1213 rs.rs_pressure_cur = memory_pressure; 1214 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure; 1215 rs.rs_pressure_sample = memory_pressure_sample; 1216 1217 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) == 1218 sizeof (rs)) { 1219 list_walk_collection(report_collection_cb, 1220 (void *)(intptr_t)fd); 1221 /* 1222 * Replace the existing statistics file with this new 1223 * one. 1224 */ 1225 res = rename(template, rcfg.rcfg_stat_file); 1226 } else 1227 res = -1; 1228 (void) close(fd); 1229 } else 1230 res = -1; 1231 1232 return (res); 1233 } 1234 1235 /* 1236 * Verify the statistics file can be created and written to, and die if an 1237 * existing file may be in use by another rcapd. 1238 */ 1239 static int 1240 verify_statistics(void) 1241 { 1242 pid_t pid; 1243 1244 /* 1245 * Warn if another instance of rcapd might be active. 1246 */ 1247 (void) rfd_reserve(1); 1248 pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file); 1249 if (pid != rcapd_pid && pid != -1) 1250 die(gettext("%s exists; rcapd may already be active\n"), 1251 rcfg.rcfg_stat_file); 1252 1253 return (update_statistics()); 1254 } 1255 1256 static int 1257 sum_excess_cb(lcollection_t *lcol, void *arg) 1258 { 1259 uint64_t *sum_excess = arg; 1260 1261 *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss - 1262 lcol->lcol_rss_cap)); 1263 return (0); 1264 } 1265 1266 /* 1267 * Compute the quantity of memory (in kilobytes) above the cap enforcement 1268 * pressure. Set the scan goal to that quantity (or at most the excess). 1269 */ 1270 static void 1271 compute_soft_scan_goal(soft_scan_arg_t *argp) 1272 { 1273 /* 1274 * Compute the sum of the collections' excesses, which will be the 1275 * denominator. 
1276 */ 1277 argp->ssa_sum_excess = 0; 1278 list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess)); 1279 1280 argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * 1281 (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 - 1282 sysconf(_SC_AVPHYS_PAGES)) * page_size_kb, 1283 argp->ssa_sum_excess); 1284 } 1285 1286 static void 1287 rcapd_usage(void) 1288 { 1289 info(gettext("usage: rcapd [-d]\n")); 1290 } 1291 1292 void 1293 check_update_statistics(void) 1294 { 1295 hrtime_t now = gethrtime(); 1296 1297 if (EVENT_TIME(now, next_report)) { 1298 debug("updating statistics...\n"); 1299 list_walk_collection(simple_report_collection_cb, NULL); 1300 if (update_statistics() != 0) 1301 debug("couldn't update statistics"); 1302 next_report = NEXT_REPORT_EVENT_TIME(now, 1303 rcfg.rcfg_report_interval); 1304 } 1305 } 1306 1307 static void 1308 verify_and_set_privileges(void) 1309 { 1310 priv_set_t *required = 1311 priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL); 1312 1313 /* 1314 * Ensure the required privileges, suitable for controlling processes, 1315 * are possessed. 1316 */ 1317 if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv( 1318 PRIV_SET, PRIV_EFFECTIVE, required) != 0) 1319 die(gettext("can't set requisite privileges")); 1320 1321 /* 1322 * Ensure access to /var/run/daemon. 1323 */ 1324 if (setreuid(DAEMON_UID, DAEMON_UID) != 0) 1325 die(gettext("cannot become user daemon")); 1326 1327 priv_freeset(required); 1328 } 1329 1330 /* 1331 * This function does the top-level work to determine if we should do any 1332 * memory capping, and if so, it invokes the right call-backs to do the work. 
 */
static void
do_capping(hrtime_t now, hrtime_t *next_proc_walk)
{
	boolean_t enforce_caps;
	/* soft cap enforcement flag, depending on memory pressure */
	boolean_t enforce_soft_caps;
	/* avoid interference with kernel's page scanner */
	boolean_t global_scanner_running;
	sample_col_arg_t col_arg;
	soft_scan_arg_t arg;
	uint_t col_types = 0;

	/* check what kind of collections (project/zone) are capped */
	list_walk_collection(col_type_cb, &col_types);
	debug("collection types: 0x%x\n", col_types);

	/* no capped collections, skip checking rss */
	if (col_types == 0)
		return;

	/* Determine if soft caps are enforced. */
	enforce_soft_caps = must_enforce_soft_caps();

	/* Determine if the global page scanner is running. */
	global_scanner_running = is_global_scanner_running();

	/*
	 * Sample collections' member processes RSSes and recompute
	 * collections' excess.
	 */
	rss_sample(B_FALSE, col_types);

	col_arg.sca_any_over_cap = B_FALSE;
	col_arg.sca_project_over_cap = B_FALSE;
	list_walk_collection(rss_sample_col_cb, &col_arg);
	list_walk_collection(excess_print_cb, NULL);
	debug("any collection/project over cap = %d, %d\n",
	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);

	if (enforce_soft_caps)
		debug("memory pressure %d%%\n", memory_pressure);

	/*
	 * Cap enforcement is determined by the previous conditions: never
	 * while the kernel's own scanner is active, only if something is
	 * over cap, and (for soft caps) only under sufficient pressure.
	 */
	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
	    enforce_soft_caps);

	debug("%senforcing caps\n", enforce_caps ? "" : "not ");

	/*
	 * If soft caps are in use, determine the size of the portion from each
	 * collection to scan for.
	 */
	if (enforce_caps && enforce_soft_caps)
		compute_soft_scan_goal(&arg);

	/*
	 * Victimize offending collections.  Note that arg is only
	 * initialized (by compute_soft_scan_goal() above) when
	 * enforce_soft_caps is true, which the condition below guarantees
	 * before arg is read.
	 */
	if (enforce_caps && (!enforce_soft_caps ||
	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {

		/*
		 * Since at least one collection is over its cap & needs
		 * enforcing, check if it is at least time for a process walk
		 * (we could be well past time since we only walk /proc when
		 * we need to) and if so, update each collections process list
		 * in a single pass through /proc.
		 */
		if (EVENT_TIME(now, *next_proc_walk)) {
			debug("scanning process list...\n");
			proc_walk_all(proc_cb);		/* insert & mark */
			list_walk_all(sweep_process_cb); /* free dead procs */
			*next_proc_walk = NEXT_EVENT_TIME(now,
			    rcfg.rcfg_proc_walk_interval);
		}

		/* reset; the walk callbacks below may set it */
		gz_col = NULL;
		if (enforce_soft_caps) {
			debug("scan goal is %lldKB\n",
			    (long long)arg.ssa_scan_goal);
			list_walk_collection(soft_scan_cb, &arg);
			if (gz_capped && gz_col != NULL) {
				/* process global zone */
				arg.ssa_project_over_cap =
				    col_arg.sca_project_over_cap;
				soft_scan_gz(gz_col, &arg);
			}
		} else {
			list_walk_collection(scan_cb, NULL);
			if (gz_capped && gz_col != NULL) {
				/* process global zone */
				scan_gz(gz_col, col_arg.sca_project_over_cap);
			}
		}
	} else if (col_arg.sca_any_over_cap) {
		/* over cap but not enforcing: just account for it */
		list_walk_collection(unenforced_cap_cb, NULL);
	}
}

/*
 * Daemon entry point: parse options, read configuration, optionally
 * daemonize, drop privileges, install signal dispositions, then loop
 * sampling RSS, enforcing caps, and updating statistics until told to stop.
 */
int
main(int argc, char *argv[])
{
	int res;
	int should_fork = 1;	/* fork flag */
	hrtime_t now;		/* current time */
	hrtime_t next;		/* time of next event */
	int sig;		/* signal iteration */
	struct rlimit rl;
	hrtime_t next_proc_walk;	/* time of next /proc scan */
	hrtime_t next_configuration;	/* time of next configuration */
	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */

	(void) set_message_priority(RCM_INFO);
	(void) setprogname("rcapd");
	rcapd_pid = getpid();
	(void) chdir("/");
	should_run = 1;
	ever_ran = 0;

	(void) setlocale(LC_ALL, "");
	(void) textdomain(TEXT_DOMAIN);

	/*
	 * Parse command-line options.  -d enables debug output (twice for
	 * high verbosity); both -d and the undocumented -F suppress forking.
	 */
	while ((res = getopt(argc, argv, "dF")) > 0)
		switch (res) {
		case 'd':
			should_fork = 0;
			if (debug_mode == 0) {
				debug_mode = 1;
				(void) set_message_priority(RCM_DEBUG);
			} else
				(void) set_message_priority(RCM_DEBUG_HIGH);
			break;
		case 'F':
			should_fork = 0;
			break;
		default:
			rcapd_usage();
			return (E_USAGE);
			/*NOTREACHED*/
		}

	/*
	 * Read the configuration.
	 */
	if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
		warn(gettext("resource caps not configured\n"));
		return (SMF_EXIT_ERR_CONFIG);
	}

	/*
	 * If not debugging, fork and continue operating, changing the
	 * destination of messages to syslog().
	 */
	if (should_fork == 1) {
		pid_t child;
		debug("forking\n");
		child = fork();
		if (child == -1)
			die(gettext("cannot fork"));
		if (child > 0)
			/* parent: exit successfully */
			return (0);
		else {
			rcapd_pid = getpid();
			(void) set_message_destination(RCD_SYSLOG);
			(void) fclose(stdin);
			(void) fclose(stdout);
			(void) fclose(stderr);
		}
		/*
		 * Start a new session and detach from the controlling tty.
		 */
		if (setsid() == (pid_t)-1)
			debug(gettext("setsid() failed; cannot detach from "
			    "terminal"));
	}

	finish_configuration();
	should_reconfigure = 0;

	/*
	 * Check that required privileges are possessed.
	 */
	verify_and_set_privileges();

	/* schedule the first occurrence of each periodic event */
	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
	next_configuration = NEXT_EVENT_TIME(gethrtime(),
	    rcfg.rcfg_reconfiguration_interval);

	/*
	 * Open the kstat chain.
	 */
	kctl = kstat_open();
	if (kctl == NULL)
		die(gettext("can't open kstats"));

	/*
	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
	 * be effectively managed without revoking descriptors (at 3 per
	 * process).
	 */
	rl.rlim_cur = 32 * 1024;
	rl.rlim_max = 32 * 1024;
	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
		/* fall back to the maximum the system allows us */
		rl.rlim_cur = rl.rlim_max;
		(void) setrlimit(RLIMIT_NOFILE, &rl);
	}
	(void) enable_extended_FILE_stdio(-1, -1);

	/*
	 * NOTE(review): %lu assumes rlim_t fits unsigned long — confirm for
	 * all compilation models.
	 */
	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		debug("fd limit: %lu\n", rl.rlim_cur);
	else
		debug("fd limit: unknown\n");

	get_page_size();
	my_zoneid = getzoneid();

	/*
	 * Handle those signals whose (default) exit disposition
	 * prevents rcapd from finishing scanning before terminating.
	 */
	(void) sigset(SIGINT, terminate_signal);
	(void) sigset(SIGQUIT, abort_signal);
	(void) sigset(SIGILL, abort_signal);
	(void) sigset(SIGEMT, abort_signal);
	(void) sigset(SIGFPE, abort_signal);
	(void) sigset(SIGBUS, abort_signal);
	(void) sigset(SIGSEGV, abort_signal);
	(void) sigset(SIGSYS, abort_signal);
	(void) sigset(SIGPIPE, terminate_signal);
	(void) sigset(SIGALRM, terminate_signal);
	(void) sigset(SIGTERM, terminate_signal);
	(void) sigset(SIGUSR1, terminate_signal);
	(void) sigset(SIGUSR2, terminate_signal);
	(void) sigset(SIGPOLL, terminate_signal);
	(void) sigset(SIGVTALRM, terminate_signal);
	(void) sigset(SIGXCPU, abort_signal);
	(void) sigset(SIGXFSZ, abort_signal);
	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
		(void) sigset(sig, terminate_signal);

	/*
	 * Install a signal handler for reconfiguration processing.
	 */
	(void) sigset(SIGHUP, sighup);

	/*
	 * Determine which process collections to cap.
	 */
	lcollection_update(LCU_COMPLETE);

	/*
	 * Loop forever, monitoring collections' resident set sizes and
	 * enforcing their caps.  Look for changes in caps as well as
	 * responding to requests to reread the configuration.  Update
	 * per-collection statistics periodically.
	 */
	while (should_run != 0) {
		struct timespec ts;

		/*
		 * Announce that rcapd is starting.
		 */
		if (ever_ran == 0) {
			info(gettext("starting\n"));
			ever_ran = 1;
		}

		/*
		 * Check the configuration at every next_configuration interval.
		 * Update the rss data once every next_rss_sample interval.
		 * The condition of global memory pressure is also checked at
		 * the same frequency, if strict caps are in use.
		 */
		now = gethrtime();

		/*
		 * Detect configuration and cap changes only when SIGHUP
		 * is received.  Call reconfigure to apply new configuration
		 * parameters.
		 */
		if (should_reconfigure == 1) {
			reread_configuration();
			should_reconfigure = 0;
			reconfigure(now, &next_configuration, &next_proc_walk,
			    &next_rss_sample);
		}

		if (EVENT_TIME(now, next_configuration)) {
			reconfigure(now, &next_configuration, &next_proc_walk,
			    &next_rss_sample);
		}

		/*
		 * Do the main work for enforcing caps.
		 */
		if (EVENT_TIME(now, next_rss_sample)) {
			do_capping(now, &next_proc_walk);

			next_rss_sample = NEXT_EVENT_TIME(now,
			    rcfg.rcfg_rss_sample_interval);
		}

		/*
		 * Update the statistics file, if it's time.
		 */
		check_update_statistics();

		/*
		 * Sleep for some time before repeating.  Sleep until the
		 * earliest pending event; a zero ("disabled") event time is
		 * skipped by POSITIVE_MIN.
		 */
		now = gethrtime();
		next = next_configuration;
		next = POSITIVE_MIN(next, next_report);
		next = POSITIVE_MIN(next, next_rss_sample);
		if (next > now && should_run != 0) {
			debug("sleeping %-4.2f seconds\n", (float)(next -
			    now) / (float)NANOSEC);
			hrt2ts(next - now, &ts);
			(void) nanosleep(&ts, NULL);
		}
	}
	if (termination_signal != 0)
		debug("exiting due to signal %d\n", termination_signal);
	if (ever_ran != 0)
		info(gettext("exiting\n"));

	/*
	 * Unlink the statistics file before exiting.
	 */
	if (rcfg.rcfg_stat_file[0] != 0)
		(void) unlink(rcfg.rcfg_stat_file);

	return (E_SUCCESS);
}