1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * rcapd is a long-running daemon enforcing project-based resource caps (see 30 * rcapd(1M)). Each instance of a process aggregate (project or, generically, 31 * "collection") may have a memory cap. A single thread monitors the resource 32 * utilization of capped collections, enforces caps when they are exceeded (and 33 * other conditions are met), and incorporates changes in configuration or 34 * caps. Each of these actions occurs not more frequently than the rate 35 * specified with rcapadm(1M). 36 */ 37 38 #include <sys/priocntl.h> 39 #include <sys/proc.h> 40 #include <sys/resource.h> 41 #include <sys/sysinfo.h> 42 #include <sys/stat.h> 43 #include <sys/sysmacros.h> 44 #include <sys/time.h> 45 #include <sys/types.h> 46 #include <dirent.h> 47 #include <errno.h> 48 #include <fcntl.h> 49 #include <kstat.h> 50 #include <libintl.h> 51 #include <limits.h> 52 #include <locale.h> 53 #include <priv.h> 54 #include <signal.h> 55 #include <stdarg.h> 56 #include <stdio.h> 57 #include <stdio_ext.h> 58 #include <stdlib.h> 59 #include <strings.h> 60 #include <time.h> 61 #include <unistd.h> 62 #include <zone.h> 63 #include <assert.h> 64 #include <sys/vm_usage.h> 65 #include "rcapd.h" 66 #include "rcapd_mapping.h" 67 #include "rcapd_rfd.h" 68 #include "rcapd_stat.h" 69 #include "utils.h" 70 71 #define POSITIVE_MIN(x, y) \ 72 (((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y)) 73 #define NEXT_EVENT_TIME(base, seconds) \ 74 (((int)seconds > 0) ? (base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \ 75 : (hrtime_t)0) 76 #define NEXT_REPORT_EVENT_TIME(base, seconds) \ 77 ((rcfg.rcfg_stat_file[0] != 0) ? \ 78 NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0) 79 #define EVENT_TIME(time, eventtime) \ 80 (((time) > (eventtime)) && (eventtime) != 0) 81 #define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */ 82 #define DAEMON_UID 1 /* uid to use */ 83 84 #define CAPPED_PROJECT 0x01 85 #define CAPPED_ZONE 0x02 86 87 typedef struct soft_scan_arg { 88 uint64_t ssa_sum_excess; 89 int64_t ssa_scan_goal; 90 boolean_t ssa_project_over_cap; 91 } soft_scan_arg_t; 92 93 typedef struct sample_col_arg { 94 boolean_t sca_any_over_cap; 95 boolean_t sca_project_over_cap; 96 } sample_col_arg_t; 97 98 99 static int debug_mode = 0; /* debug mode flag */ 100 static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */ 101 /* scanned */ 102 static kstat_ctl_t *kctl; /* kstat chain */ 103 static int memory_pressure = 0; /* physical memory utilization (%) */ 104 static int memory_pressure_sample = 0; /* count of samples */ 105 static long page_size_kb = 0; /* system page size in KB */ 106 static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */ 107 static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */ 108 static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */ 109 static hrtime_t next_report; /* time of next report */ 110 static int termination_signal = 0; /* terminating signal */ 111 static zoneid_t my_zoneid = (zoneid_t)-1; 112 static lcollection_t *gz_col; /* global zone collection */ 113 114 rcfg_t rcfg; 115 /* 116 * Updated when we re-read the collection configurations if this rcapd instance 117 * is running in the global zone and the global zone is capped. 118 */ 119 boolean_t gz_capped = B_FALSE; 120 121 /* 122 * Flags. 123 */ 124 static int ever_ran; 125 int should_run; 126 static int should_reconfigure; 127 128 static int verify_statistics(void); 129 static int update_statistics(void); 130 131 /* 132 * Checks if a process is marked 'system'. Returns FALSE only when it is not. 133 */ 134 static boolean_t 135 proc_issystem(pid_t pid) 136 { 137 char pc_clname[PC_CLNMSZ]; 138 139 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, 140 PC_KY_NULL) != -1) { 141 return (strcmp(pc_clname, "SYS") == 0); 142 } else { 143 debug("cannot get class-specific scheduling parameters; " 144 "assuming system process\n"); 145 return (B_TRUE); 146 } 147 } 148 149 static void 150 lprocess_insert_mark(psinfo_t *psinfop) 151 { 152 pid_t pid = psinfop->pr_pid; 153 /* flag indicating whether the process should be scanned. */ 154 int unscannable = psinfop->pr_nlwp == 0; 155 rcid_t colid; 156 lcollection_t *lcol; 157 lprocess_t *lproc; 158 159 /* 160 * Determine which collection to put this process into. We only have 161 * to worry about tracking both zone and project capped processes if 162 * this rcapd instance is running in the global zone, since we'll only 163 * see processes in our own projects in a non-global zone. In the 164 * global zone, if the process belongs to a non-global zone, we only 165 * need to track it for the capped non-global zone collection. For 166 * global zone processes, we first attempt to put the process into a 167 * capped project collection. On the second pass into this function 168 * the projid will be cleared so we will just track the process for the 169 * global zone collection as a whole. 170 */ 171 if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) { 172 colid.rcid_type = RCIDT_PROJECT; 173 colid.rcid_val = psinfop->pr_projid; 174 } else { 175 /* try to add to zone collection */ 176 colid.rcid_type = RCIDT_ZONE; 177 colid.rcid_val = psinfop->pr_zoneid; 178 } 179 180 if ((lcol = lcollection_find(&colid)) == NULL) 181 return; 182 183 /* 184 * If the process is already being tracked, update the unscannable flag, 185 * as determined by the caller, from the process's psinfo. 186 */ 187 lproc = lcol->lcol_lprocess; 188 while (lproc != NULL) { 189 if (lproc->lpc_pid == pid) { 190 lproc->lpc_mark = 1; 191 if (unscannable != 0 && lproc->lpc_unscannable == 0) { 192 debug("process %d: became unscannable\n", 193 (int)lproc->lpc_pid); 194 lproc->lpc_unscannable = 1; 195 } 196 return; 197 } 198 lproc = lproc->lpc_next; 199 } 200 201 /* 202 * We've fallen off the list without finding our current process; 203 * insert it at the list head. 204 */ 205 if ((lproc = malloc(sizeof (*lproc))) == NULL) 206 debug("insufficient memory to track new process %d", (int)pid); 207 else { 208 (void) bzero(lproc, sizeof (*lproc)); 209 lproc->lpc_pid = pid; 210 lproc->lpc_mark = 1; 211 lproc->lpc_collection = lcol; 212 lproc->lpc_psinfo_fd = -1; 213 lproc->lpc_pgdata_fd = -1; 214 lproc->lpc_xmap_fd = -1; 215 216 /* 217 * If the caller didn't flag this process as unscannable 218 * already, do some more checking. 219 */ 220 lproc->lpc_unscannable = unscannable || proc_issystem(pid); 221 222 #ifdef DEBUG 223 /* 224 * Verify the sanity of lprocess. It should not contain the 225 * process we are about to prepend. 226 */ 227 if (lcollection_member(lcol, lproc)) { 228 lprocess_t *cur = lcol->lcol_lprocess; 229 debug("The collection %lld already has these members, " 230 "including me, %d!\n", 231 (long long)lcol->lcol_id.rcid_val, 232 (int)lproc->lpc_pid); 233 while (cur != NULL) { 234 debug("\t%d\n", (int)cur->lpc_pid); 235 cur = cur->lpc_next; 236 } 237 info(gettext("process already on lprocess\n")); 238 abort(); 239 } 240 #endif /* DEBUG */ 241 lproc->lpc_next = lcol->lcol_lprocess; 242 if (lproc->lpc_next != NULL) 243 lproc->lpc_next->lpc_prev = lproc; 244 lproc->lpc_prev = NULL; 245 lcol->lcol_lprocess = lproc; 246 247 debug("tracking %s %ld %d %s%s\n", 248 (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 249 (long)colid.rcid_val, 250 (int)pid, psinfop->pr_psargs, 251 (lproc->lpc_unscannable != 0) ? " (not scannable)" : ""); 252 lcol->lcol_stat.lcols_proc_in++; 253 } 254 } 255 256 static int 257 list_walk_process_cb(lcollection_t *lcol, void *arg) 258 { 259 int (*cb)(lcollection_t *, lprocess_t *) = 260 (int(*)(lcollection_t *, lprocess_t *))arg; 261 lprocess_t *member; 262 lprocess_t *next; 263 264 member = lcol->lcol_lprocess; 265 while (member != NULL) { 266 pid_t pid = member->lpc_pid; 267 next = member->lpc_next; 268 269 debug_high("list_walk_all lpc %d\n", (int)pid); 270 if (cb(lcol, member) != 0) { 271 debug_high("list_walk_all aborted at lpc %d\n", 272 (int)pid); 273 return (1); 274 } 275 member = next; 276 } 277 278 return (0); 279 } 280 281 /* 282 * Invoke the given callback for each process in each collection. Callbacks 283 * are allowed to change the linkage of the process on which they act. 284 */ 285 static void 286 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *)) 287 { 288 list_walk_collection(list_walk_process_cb, (void *)cb); 289 } 290 291 static void 292 revoke_psinfo(rfd_t *rfd) 293 { 294 lprocess_t *lpc = (lprocess_t *)rfd->rfd_data; 295 296 if (lpc != NULL) { 297 debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid); 298 ASSERT(lpc->lpc_psinfo_fd != -1); 299 lpc->lpc_psinfo_fd = -1; 300 } else 301 debug("revoking psinfo fd for unknown process\n"); 302 } 303 304 /* 305 * Retrieve a process's psinfo via an already-opened or new file descriptor. 306 * The supplied descriptor will be closed on failure. An optional callback 307 * will be invoked with the last descriptor tried, and a supplied callback 308 * argument, as its arguments, such that the new descriptor may be cached, or 309 * an old one may be invalidated. If the result of the callback is zero, the 310 * the caller is to assume responsibility for the file descriptor, to close it 311 * with rfd_close(). 312 * 313 * On failure, a nonzero value is returned. 314 */ 315 int 316 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd, 317 int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc) 318 { 319 int fd; 320 int can_try_uncached; 321 322 ASSERT(!(cached_fd > 0 && fd_update_cb == NULL)); 323 324 do { 325 if (cached_fd >= 0) { 326 fd = cached_fd; 327 can_try_uncached = 1; 328 debug_high("%d/psinfo, trying cached fd %d\n", 329 (int)pid, fd); 330 } else { 331 char pathbuf[PROC_PATH_MAX]; 332 333 can_try_uncached = 0; 334 (void) snprintf(pathbuf, sizeof (pathbuf), 335 "/proc/%d/psinfo", (int)pid); 336 if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO, 337 revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) { 338 debug("cannot open %s", pathbuf); 339 break; 340 } else 341 debug_high("opened %s, fd %d\n", pathbuf, fd); 342 } 343 344 if (pread(fd, psinfo, sizeof (*psinfo), 0) == 345 sizeof (*psinfo) && psinfo->pr_pid == pid) 346 break; 347 else { 348 debug_high("closed fd %d\n", fd); 349 if (rfd_close(fd) != 0) 350 debug("could not close fd %d", fd); 351 fd = cached_fd = -1; 352 } 353 } while (can_try_uncached == 1); 354 355 if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0) 356 if (fd >= 0) { 357 debug_high("closed %s fd %d\n", fd_update_cb == NULL ? 358 "uncached" : "cached", fd); 359 if (rfd_close(fd) != 0) 360 debug("could not close fd %d", fd); 361 } 362 363 debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd, 364 fd_update_cb != NULL ? "cached" : "uncached"); 365 return ((fd >= 0) ? 0 : -1); 366 } 367 368 /* 369 * Retrieve the collection membership of all processes and update the psinfo of 370 * those non-system, non-zombie ones in collections. For global zone processes, 371 * we first attempt to put the process into a capped project collection. We 372 * also want to track the process for the global zone collection as a whole. 373 */ 374 static void 375 proc_cb(const pid_t pid) 376 { 377 psinfo_t psinfo; 378 379 if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) { 380 lprocess_insert_mark(&psinfo); 381 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) { 382 /* 383 * We also want to track this process for the global 384 * zone as a whole so add it to the global zone 385 * collection as well. 386 */ 387 psinfo.pr_projid = -1; 388 lprocess_insert_mark(&psinfo); 389 } 390 } 391 } 392 393 /* 394 * Cache the process' psinfo fd, taking responsibility for freeing it. 395 */ 396 int 397 lprocess_update_psinfo_fd_cb(void *arg, int fd) 398 { 399 lprocess_t *lpc = arg; 400 401 lpc->lpc_psinfo_fd = fd; 402 return (0); 403 } 404 405 /* 406 * Get the system pagesize. 407 */ 408 static void 409 get_page_size(void) 410 { 411 page_size_kb = sysconf(_SC_PAGESIZE) / 1024; 412 debug("physical page size: %luKB\n", page_size_kb); 413 } 414 415 static void 416 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2) 417 { 418 hrtime_t diff = t2 - t1; 419 420 if (diff < MILLISEC) 421 debug("%s: %lld nanoseconds\n", msg, diff); 422 else if (diff < MICROSEC) 423 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC); 424 else if (diff < NANOSEC) 425 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC); 426 else 427 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC); 428 } 429 430 /* 431 * Get the zone's & project's RSS from the kernel. 432 */ 433 static void 434 rss_sample(boolean_t my_zone_only, uint_t col_types) 435 { 436 size_t nres; 437 size_t i; 438 uint_t flags; 439 hrtime_t t1, t2; 440 441 if (my_zone_only) { 442 flags = VMUSAGE_ZONE; 443 } else { 444 flags = 0; 445 if (col_types & CAPPED_PROJECT) 446 flags |= VMUSAGE_PROJECTS; 447 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID) 448 flags |= VMUSAGE_ALL_ZONES; 449 } 450 451 debug("vmusage sample flags 0x%x\n", flags); 452 if (flags == 0) 453 return; 454 455 again: 456 /* try the current buffer to see if the list will fit */ 457 nres = vmu_vals_len; 458 t1 = gethrtime(); 459 if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval, 460 vmu_vals, &nres) != 0) { 461 if (errno != EOVERFLOW) { 462 warn(gettext("can't read RSS from kernel\n")); 463 return; 464 } 465 } 466 t2 = gethrtime(); 467 tm_fmt("getvmusage time", t1, t2); 468 469 debug("kernel nres %lu\n", (ulong_t)nres); 470 471 if (nres > vmu_vals_len) { 472 /* array size is now too small, increase it and try again */ 473 free(vmu_vals); 474 475 if ((vmu_vals = (vmusage_t *)calloc(nres, 476 sizeof (vmusage_t))) == NULL) { 477 warn(gettext("out of memory: could not read RSS from " 478 "kernel\n")); 479 vmu_vals_len = nvmu_vals = 0; 480 return; 481 } 482 vmu_vals_len = nres; 483 goto again; 484 } 485 486 nvmu_vals = nres; 487 488 debug("vmusage_sample\n"); 489 for (i = 0; i < nvmu_vals; i++) { 490 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), " 491 "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id, 492 vmu_vals[i].vmu_type, 493 (unsigned long long)vmu_vals[i].vmu_rss_all, 494 (unsigned long long)vmu_vals[i].vmu_rss_all / 1024, 495 (unsigned long long)vmu_vals[i].vmu_swap_all); 496 } 497 } 498 499 static void 500 update_col_rss(lcollection_t *lcol) 501 { 502 int i; 503 504 lcol->lcol_rss = 0; 505 lcol->lcol_image_size = 0; 506 507 for (i = 0; i < nvmu_vals; i++) { 508 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val) 509 continue; 510 511 if (vmu_vals[i].vmu_type == VMUSAGE_ZONE && 512 lcol->lcol_id.rcid_type != RCIDT_ZONE) 513 continue; 514 515 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS && 516 lcol->lcol_id.rcid_type != RCIDT_PROJECT) 517 continue; 518 519 /* we found the right RSS entry, update the collection vals */ 520 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024; 521 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024; 522 break; 523 } 524 } 525 526 /* 527 * Sample the collection RSS, updating the collection's statistics with the 528 * results. Also, sum the rss of all capped projects & return true if 529 * the collection is over cap. 530 */ 531 static int 532 rss_sample_col_cb(lcollection_t *lcol, void *arg) 533 { 534 int64_t excess; 535 uint64_t rss; 536 sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; 537 538 update_col_rss(lcol); 539 540 lcol->lcol_stat.lcols_rss_sample++; 541 rss = lcol->lcol_rss; 542 excess = rss - lcol->lcol_rss_cap; 543 if (excess > 0) { 544 lcol->lcol_stat.lcols_rss_act_sum += rss; 545 col_argp->sca_any_over_cap = B_TRUE; 546 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) 547 col_argp->sca_project_over_cap = B_TRUE; 548 } 549 lcol->lcol_stat.lcols_rss_sum += rss; 550 551 if (lcol->lcol_stat.lcols_min_rss > rss) 552 lcol->lcol_stat.lcols_min_rss = rss; 553 if (lcol->lcol_stat.lcols_max_rss < rss) 554 lcol->lcol_stat.lcols_max_rss = rss; 555 556 return (0); 557 } 558 559 /* 560 * Determine if we have capped projects, capped zones or both. 561 */ 562 static int 563 col_type_cb(lcollection_t *lcol, void *arg) 564 { 565 uint_t *col_type = (uint_t *)arg; 566 567 /* skip uncapped collections */ 568 if (lcol->lcol_rss_cap == 0) 569 return (1); 570 571 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) 572 *col_type |= CAPPED_PROJECT; 573 else 574 *col_type |= CAPPED_ZONE; 575 576 /* once we know everything is capped, we can stop looking */ 577 if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT)) 578 return (1); 579 580 return (0); 581 } 582 583 /* 584 * Open /proc and walk entries. 585 */ 586 static void 587 proc_walk_all(void (*cb)(const pid_t)) 588 { 589 DIR *pdir; 590 struct dirent *dirent; 591 pid_t pid; 592 593 (void) rfd_reserve(1); 594 if ((pdir = opendir("/proc")) == NULL) 595 die(gettext("couldn't open /proc!")); 596 597 while ((dirent = readdir(pdir)) != NULL) { 598 if (strcmp(".", dirent->d_name) == 0 || 599 strcmp("..", dirent->d_name) == 0) 600 continue; 601 pid = atoi(dirent->d_name); 602 ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0); 603 if (pid == rcapd_pid) 604 continue; 605 else 606 cb(pid); 607 } 608 (void) closedir(pdir); 609 } 610 611 /* 612 * Clear unmarked callback. 613 */ 614 /*ARGSUSED*/ 615 static int 616 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc) 617 { 618 if (lpc->lpc_mark) { 619 lpc->lpc_mark = 0; 620 } else { 621 debug("process %d finished\n", (int)lpc->lpc_pid); 622 lprocess_free(lpc); 623 } 624 625 return (0); 626 } 627 628 /* 629 * Print, for debugging purposes, a collection's recently-sampled RSS and 630 * excess. 631 */ 632 /*ARGSUSED*/ 633 static int 634 excess_print_cb(lcollection_t *lcol, void *arg) 635 { 636 int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap; 637 638 debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n", 639 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 640 lcol->lcol_name, 641 (unsigned long long)lcol->lcol_rss, 642 (unsigned long long)lcol->lcol_rss_cap, 643 (long long)excess); 644 645 return (0); 646 } 647 648 /* 649 * Scan those collections which have exceeded their caps. 650 * 651 * If we're running in the global zone it might have a cap. We don't want to 652 * do any capping for the global zone yet since we might get under the cap by 653 * just capping the projects in the global zone. 654 */ 655 /*ARGSUSED*/ 656 static int 657 scan_cb(lcollection_t *lcol, void *arg) 658 { 659 int64_t excess; 660 661 /* skip over global zone collection for now but keep track for later */ 662 if (lcol->lcol_id.rcid_type == RCIDT_ZONE && 663 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { 664 gz_col = lcol; 665 return (0); 666 } 667 668 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { 669 scan(lcol, excess); 670 lcol->lcol_stat.lcols_scan++; 671 } 672 673 return (0); 674 } 675 676 /* 677 * Scan the global zone collection and see if it still exceeds its cap. 678 * We take into account the effects of capping any global zone projects here. 679 */ 680 static void 681 scan_gz(lcollection_t *lcol, boolean_t project_over_cap) 682 { 683 int64_t excess; 684 685 /* 686 * If we had projects over their cap and the global zone was also over 687 * its cap then we need to get the up-to-date global zone rss to 688 * determine if we are still over the global zone cap. We might have 689 * gone under while we scanned the capped projects. If there were no 690 * projects over cap then we can use the rss value we already have for 691 * the global zone. 692 */ 693 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 694 if (project_over_cap && excess > 0) { 695 rss_sample(B_TRUE, CAPPED_ZONE); 696 update_col_rss(lcol); 697 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 698 } 699 700 if (excess > 0) { 701 debug("global zone excess %lldKB\n", (long long)excess); 702 scan(lcol, excess); 703 lcol->lcol_stat.lcols_scan++; 704 } 705 } 706 707 /* 708 * Do a soft scan of those collections which have excesses. A soft scan is one 709 * in which the cap enforcement pressure is taken into account. The difference 710 * between the utilized physical memory and the cap enforcement pressure will 711 * be scanned-for, and each collection will be scanned proportionally by their 712 * present excesses. 713 */ 714 static int 715 soft_scan_cb(lcollection_t *lcol, void *a) 716 { 717 int64_t excess; 718 soft_scan_arg_t *arg = a; 719 720 /* skip over global zone collection for now but keep track for later */ 721 if (lcol->lcol_id.rcid_type == RCIDT_ZONE && 722 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { 723 gz_col = lcol; 724 return (0); 725 } 726 727 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { 728 int64_t adjusted_excess = 729 excess * arg->ssa_scan_goal / arg->ssa_sum_excess; 730 731 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " 732 "scanning %lld\n", 733 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 734 "project" : "zone"), 735 (long)lcol->lcol_id.rcid_val, 736 (long long)excess, (long long)arg->ssa_scan_goal, 737 (unsigned long long)arg->ssa_sum_excess, 738 (long long)adjusted_excess); 739 740 scan(lcol, adjusted_excess); 741 lcol->lcol_stat.lcols_scan++; 742 } 743 744 return (0); 745 } 746 747 static void 748 soft_scan_gz(lcollection_t *lcol, void *a) 749 { 750 int64_t excess; 751 soft_scan_arg_t *arg = a; 752 753 /* 754 * If we had projects over their cap and the global zone was also over 755 * its cap then we need to get the up-to-date global zone rss to 756 * determine if we are still over the global zone cap. We might have 757 * gone under while we scanned the capped projects. If there were no 758 * projects over cap then we can use the rss value we already have for 759 * the global zone. 760 */ 761 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 762 if (arg->ssa_project_over_cap && excess > 0) { 763 rss_sample(B_TRUE, CAPPED_ZONE); 764 update_col_rss(lcol); 765 excess = lcol->lcol_rss - lcol->lcol_rss_cap; 766 } 767 768 if (excess > 0) { 769 int64_t adjusted_excess = 770 excess * arg->ssa_scan_goal / arg->ssa_sum_excess; 771 772 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " 773 "scanning %lld\n", 774 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 775 "project" : "zone"), 776 (long)lcol->lcol_id.rcid_val, 777 (long long)excess, (long long)arg->ssa_scan_goal, 778 (unsigned long long)arg->ssa_sum_excess, 779 (long long)adjusted_excess); 780 781 scan(lcol, adjusted_excess); 782 lcol->lcol_stat.lcols_scan++; 783 } 784 } 785 786 /* 787 * When a scan could happen, but caps aren't enforced tick the 788 * lcols_unenforced_cap counter. 789 */ 790 /*ARGSUSED*/ 791 static int 792 unenforced_cap_cb(lcollection_t *lcol, void *arg) 793 { 794 lcol->lcol_stat.lcols_unenforced_cap++; 795 796 return (0); 797 } 798 799 /* 800 * Update the count of physically installed memory. 801 */ 802 static void 803 update_phys_total(void) 804 { 805 uint64_t old_phys_total; 806 807 old_phys_total = phys_total; 808 phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb; 809 if (phys_total != old_phys_total) 810 debug("physical memory%s: %lluM\n", (old_phys_total == 0 ? 811 "" : " adjusted"), (unsigned long long)(phys_total / 1024)); 812 } 813 814 /* 815 * Unlink a process from its collection, updating relevant statistics, and 816 * freeing its associated memory. 817 */ 818 void 819 lprocess_free(lprocess_t *lpc) 820 { 821 pid_t pid; 822 823 lpc->lpc_collection->lcol_stat.lcols_proc_out++; 824 825 if (lpc->lpc_prev != NULL) 826 lpc->lpc_prev->lpc_next = lpc->lpc_next; 827 if (lpc->lpc_next != NULL) 828 lpc->lpc_next->lpc_prev = lpc->lpc_prev; 829 if (lpc->lpc_collection->lcol_lprocess == lpc) 830 lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next != 831 lpc ? lpc->lpc_next : NULL); 832 lpc->lpc_next = lpc->lpc_prev = NULL; 833 834 if (lpc->lpc_prpageheader != NULL) 835 free(lpc->lpc_prpageheader); 836 if (lpc->lpc_xmap != NULL) 837 free(lpc->lpc_xmap); 838 if (lpc->lpc_psinfo_fd >= 0) { 839 if (rfd_close(lpc->lpc_psinfo_fd) != 0) 840 debug("could not close %d lpc_psinfo_fd %d", 841 (int)lpc->lpc_pid, lpc->lpc_psinfo_fd); 842 lpc->lpc_psinfo_fd = -1; 843 } 844 if (lpc->lpc_pgdata_fd >= 0) { 845 if (rfd_close(lpc->lpc_pgdata_fd) != 0) 846 debug("could not close %d lpc_pgdata_fd %d", 847 (int)lpc->lpc_pid, lpc->lpc_pgdata_fd); 848 lpc->lpc_pgdata_fd = -1; 849 } 850 if (lpc->lpc_xmap_fd >= 0) { 851 if (rfd_close(lpc->lpc_xmap_fd) != 0) 852 debug("could not close %d lpc_xmap_fd %d", 853 (int)lpc->lpc_pid, lpc->lpc_xmap_fd); 854 lpc->lpc_xmap_fd = -1; 855 } 856 if (lpc->lpc_ignore != NULL) 857 lmapping_free(&lpc->lpc_ignore); 858 pid = lpc->lpc_pid; 859 free(lpc); 860 debug_high("process %d freed\n", (int)pid); 861 } 862 863 /* 864 * Collection clear callback. 865 */ 866 /*ARGSUSED*/ 867 static int 868 collection_clear_cb(lcollection_t *lcol, void *arg) 869 { 870 lcol->lcol_mark = 0; 871 872 return (0); 873 } 874 875 /* 876 * Respond to a terminating signal by setting a termination flag. 877 */ 878 /*ARGSUSED*/ 879 static void 880 terminate_signal(int signal) 881 { 882 if (termination_signal == 0) 883 termination_signal = signal; 884 should_run = 0; 885 } 886 887 /* 888 * Handle any synchronous or asynchronous signals that would ordinarily cause a 889 * process to abort. 890 */ 891 /*ARGSUSED*/ 892 static void 893 abort_signal(int signal) 894 { 895 /* 896 * Allow the scanner to make a last-ditch effort to resume any stopped 897 * processes. 898 */ 899 scan_abort(); 900 abort(); 901 } 902 903 /* 904 * Clean up collections which have been removed due to configuration. Unlink 905 * the collection from lcollection and free it. 906 */ 907 /*ARGSUSED*/ 908 static int 909 collection_sweep_cb(lcollection_t *lcol, void *arg) 910 { 911 if (lcol->lcol_mark == 0) { 912 debug("freeing %s %s\n", 913 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 914 "project" : "zone"), lcol->lcol_name); 915 lcollection_free(lcol); 916 } 917 918 return (0); 919 } 920 921 /* 922 * Set those variables which depend on the global configuration. 923 */ 924 static void 925 finish_configuration(void) 926 { 927 /* 928 * Warn that any lnode (or non-project) mode specification (by an SRM 929 * 1.3 configuration file, for example) is ignored. 930 */ 931 if (strcmp(rcfg.rcfg_mode_name, "project") != 0) { 932 warn(gettext("%s mode specification ignored -- using project" 933 " mode\n"), rcfg.rcfg_mode_name); 934 rcfg.rcfg_mode_name = "project"; 935 rcfg.rcfg_mode = rctype_project; 936 } 937 } 938 939 /* 940 * Cause the configuration file to be reread and applied. 941 */ 942 static void 943 reread_configuration_file(void) 944 { 945 rcfg_t rcfg_new; 946 struct stat st; 947 948 if (stat(rcfg.rcfg_filename, &st) == 0 && st.st_mtime == 949 rcfg.rcfg_last_modification) 950 return; 951 952 if (rcfg_read(rcfg.rcfg_filename, rcfg.rcfg_fd, &rcfg_new, 953 update_statistics) != 0) 954 warn(gettext("can't reread configuration")); 955 else { 956 /* 957 * The configuration file has been read. Remove existing 958 * collections in case there is a change in collection type. 959 */ 960 if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) { 961 list_walk_collection(collection_clear_cb, NULL); 962 list_walk_collection(collection_sweep_cb, NULL); 963 } 964 965 /* 966 * Make the newly-read configuration the global one, and update 967 * any variables that depend on it. 968 */ 969 rcfg = rcfg_new; 970 finish_configuration(); 971 } 972 } 973 974 /* 975 * Reread the configuration filex, then examine changes, additions, and 976 * deletions to cap definitions. 977 */ 978 static void 979 reconfigure(hrtime_t now, hrtime_t *next_configuration, 980 hrtime_t *next_proc_walk, hrtime_t *next_rss_sample) 981 { 982 debug("reconfigure...\n"); 983 984 /* 985 * Reread the configuration data. 986 */ 987 reread_configuration_file(); 988 989 /* 990 * Walk the lcollection, marking active collections so inactive ones 991 * can be freed. 992 */ 993 list_walk_collection(collection_clear_cb, NULL); 994 lcollection_update(LCU_ACTIVE_ONLY); /* mark */ 995 list_walk_collection(collection_sweep_cb, NULL); 996 997 *next_configuration = NEXT_EVENT_TIME(now, 998 rcfg.rcfg_reconfiguration_interval); 999 1000 /* 1001 * Reset each event time to the shorter of the previous and new 1002 * intervals. 1003 */ 1004 if (next_report == 0 && rcfg.rcfg_report_interval > 0) 1005 next_report = now; 1006 else 1007 next_report = POSITIVE_MIN(next_report, 1008 NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval)); 1009 1010 if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0) 1011 *next_proc_walk = now; 1012 else 1013 *next_proc_walk = POSITIVE_MIN(*next_proc_walk, 1014 NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval)); 1015 1016 if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0) 1017 *next_rss_sample = now; 1018 else 1019 *next_rss_sample = POSITIVE_MIN(*next_rss_sample, 1020 NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval)); 1021 } 1022 1023 /* 1024 * Respond to SIGHUP by triggering the rereading the configuration file and cap 1025 * definitions. 1026 */ 1027 /*ARGSUSED*/ 1028 static void 1029 sighup(int signal) 1030 { 1031 should_reconfigure = 1; 1032 } 1033 1034 /* 1035 * Print, for debugging purposes, each collection's interval statistics. 1036 */ 1037 /*ARGSUSED*/ 1038 static int 1039 simple_report_collection_cb(lcollection_t *lcol, void *arg) 1040 { 1041 #define DELTA(field) \ 1042 (unsigned long long)( \ 1043 (lcol->lcol_stat.field - lcol->lcol_stat_old.field)) 1044 1045 debug("%s %s status: succeeded/attempted (k): %llu/%llu, " 1046 "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS " 1047 "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, " 1048 "%llu scans over %llu ms\n", 1049 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 1050 lcol->lcol_name, 1051 DELTA(lcols_pg_eff), DELTA(lcols_pg_att), 1052 DELTA(lcols_scan_ineffective), DELTA(lcols_scan), 1053 DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample), 1054 (unsigned long long)lcol->lcol_stat.lcols_min_rss, 1055 (unsigned long long)lcol->lcol_stat.lcols_max_rss, 1056 (unsigned long long)lcol->lcol_rss_cap, 1057 (unsigned long long)(lcol->lcol_stat.lcols_proc_in - 1058 lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out), 1059 DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC 1060 / MILLISEC)); 1061 1062 #undef DELTA 1063 1064 return (0); 1065 } 1066 1067 /* 1068 * Record each collection's interval statistics in the statistics file. 1069 */ 1070 static int 1071 report_collection_cb(lcollection_t *lcol, void *arg) 1072 { 1073 lcollection_report_t dc; 1074 int fd = (intptr_t)arg; 1075 1076 /* 1077 * Copy the relevant fields to the collection's record. 1078 */ 1079 bzero(&dc, sizeof (dc)); 1080 dc.lcol_id = lcol->lcol_id; 1081 (void) strcpy(dc.lcol_name, lcol->lcol_name); 1082 dc.lcol_rss = lcol->lcol_rss; 1083 dc.lcol_image_size = lcol->lcol_image_size; 1084 dc.lcol_rss_cap = lcol->lcol_rss_cap; 1085 dc.lcol_stat = lcol->lcol_stat; 1086 1087 if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { 1088 lcol->lcol_stat_old = lcol->lcol_stat; 1089 } else { 1090 debug("can't write %s %s statistics", 1091 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 1092 "project" : "zone"), 1093 lcol->lcol_name); 1094 } 1095 1096 return (0); 1097 } 1098 1099 /* 1100 * Determine the count of pages scanned by the global page scanner, obtained 1101 * from the cpu_stat:*::scan kstats. Return zero on success. 1102 */ 1103 static int 1104 get_globally_scanned_pages(uint64_t *scannedp) 1105 { 1106 kstat_t *ksp; 1107 uint64_t scanned = 0; 1108 1109 if (kstat_chain_update(kctl) == -1) { 1110 warn(gettext("can't update kstat chain")); 1111 return (0); 1112 } 1113 1114 for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) { 1115 if (strcmp(ksp->ks_module, "cpu_stat") == 0) { 1116 if (kstat_read(kctl, ksp, NULL) != -1) { 1117 scanned += ((cpu_stat_t *) 1118 ksp->ks_data)->cpu_vminfo.scan; 1119 } else { 1120 return (-1); 1121 } 1122 } 1123 } 1124 1125 *scannedp = scanned; 1126 return (0); 1127 } 1128 1129 /* 1130 * Determine if the global page scanner is running, during which no memory 1131 * caps should be enforced, to prevent interference with the global page 1132 * scanner. 1133 */ 1134 static boolean_t 1135 is_global_scanner_running() 1136 { 1137 /* measure delta in page scan count */ 1138 static uint64_t new_sp = 0; 1139 static uint64_t old_sp = 0; 1140 boolean_t res = B_FALSE; 1141 1142 if (get_globally_scanned_pages(&new_sp) == 0) { 1143 if (old_sp != 0 && (new_sp - old_sp) > 0) { 1144 debug("global memory pressure detected (%llu " 1145 "pages scanned since last interval)\n", 1146 (unsigned long long)(new_sp - old_sp)); 1147 res = B_TRUE; 1148 } 1149 old_sp = new_sp; 1150 } else { 1151 warn(gettext("unable to read cpu statistics")); 1152 new_sp = old_sp; 1153 } 1154 1155 return (res); 1156 } 1157 1158 /* 1159 * If soft caps are in use, determine if global memory pressure exceeds the 1160 * configured maximum above which soft caps are enforced. 1161 */ 1162 static boolean_t 1163 must_enforce_soft_caps() 1164 { 1165 /* 1166 * Check for changes to the amount of installed physical memory, to 1167 * compute the current memory pressure. 1168 */ 1169 update_phys_total(); 1170 1171 memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb) 1172 * 100.0 / phys_total); 1173 memory_pressure_sample++; 1174 if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 && 1175 memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) { 1176 return (B_TRUE); 1177 } 1178 1179 return (B_FALSE); 1180 } 1181 1182 /* 1183 * Update the shared statistics file with each collection's current statistics. 1184 * Return zero on success. 1185 */ 1186 static int 1187 update_statistics(void) 1188 { 1189 int fd, res; 1190 static char template[LINELEN]; 1191 1192 /* 1193 * Try to create a directory irrespective of whether it is existing 1194 * or not. If it is not there then it will create. Otherwise any way 1195 * it will fail at mkstemp call below. 1196 */ 1197 (void) mkdir(STAT_FILE_DIR, 0755); 1198 1199 /* 1200 * Create a temporary file. 1201 */ 1202 if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) + 1203 strlen(STAT_TEMPLATE_SUFFIX) + 1)) { 1204 debug("temporary file template size too small\n"); 1205 return (-1); 1206 } 1207 (void) strcpy(template, rcfg.rcfg_stat_file); 1208 (void) strcat(template, STAT_TEMPLATE_SUFFIX); 1209 (void) rfd_reserve(1); 1210 fd = mkstemp(template); 1211 1212 /* 1213 * Write the header and per-collection statistics. 1214 */ 1215 if (fd >= 0) { 1216 rcapd_stat_hdr_t rs; 1217 1218 rs.rs_pid = rcapd_pid; 1219 rs.rs_time = gethrtime(); 1220 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name)); 1221 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name); 1222 rs.rs_pressure_cur = memory_pressure; 1223 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure; 1224 rs.rs_pressure_sample = memory_pressure_sample; 1225 1226 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) == 1227 sizeof (rs)) { 1228 list_walk_collection(report_collection_cb, 1229 (void *)(intptr_t)fd); 1230 /* 1231 * Replace the existing statistics file with this new 1232 * one. 1233 */ 1234 res = rename(template, rcfg.rcfg_stat_file); 1235 } else 1236 res = -1; 1237 (void) close(fd); 1238 } else 1239 res = -1; 1240 1241 return (res); 1242 } 1243 1244 /* 1245 * Verify the statistics file can be created and written to, and die if an 1246 * existing file may be in use by another rcapd. 1247 */ 1248 static int 1249 verify_statistics(void) 1250 { 1251 pid_t pid; 1252 1253 /* 1254 * Warn if another instance of rcapd might be active. 1255 */ 1256 (void) rfd_reserve(1); 1257 pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file); 1258 if (pid != rcapd_pid && pid != -1) 1259 die(gettext("%s exists; rcapd may already be active\n"), 1260 rcfg.rcfg_stat_file); 1261 1262 return (update_statistics()); 1263 } 1264 1265 static int 1266 sum_excess_cb(lcollection_t *lcol, void *arg) 1267 { 1268 uint64_t *sum_excess = arg; 1269 1270 *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss - 1271 lcol->lcol_rss_cap)); 1272 return (0); 1273 } 1274 1275 /* 1276 * Compute the quantity of memory (in kilobytes) above the cap enforcement 1277 * pressure. Set the scan goal to that quantity (or at most the excess). 1278 */ 1279 static void 1280 compute_soft_scan_goal(soft_scan_arg_t *argp) 1281 { 1282 /* 1283 * Compute the sum of the collections' excesses, which will be the 1284 * denominator. 1285 */ 1286 argp->ssa_sum_excess = 0; 1287 list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess)); 1288 1289 argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * 1290 (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 - 1291 sysconf(_SC_AVPHYS_PAGES)) * page_size_kb, 1292 argp->ssa_sum_excess); 1293 } 1294 1295 static void 1296 rcapd_usage(void) 1297 { 1298 info(gettext("usage: rcapd [-d]\n")); 1299 } 1300 1301 void 1302 check_update_statistics(void) 1303 { 1304 hrtime_t now = gethrtime(); 1305 1306 if (EVENT_TIME(now, next_report)) { 1307 debug("updating statistics...\n"); 1308 list_walk_collection(simple_report_collection_cb, NULL); 1309 if (update_statistics() != 0) 1310 debug("couldn't update statistics"); 1311 next_report = NEXT_REPORT_EVENT_TIME(now, 1312 rcfg.rcfg_report_interval); 1313 } 1314 } 1315 1316 static void 1317 verify_and_set_privileges(void) 1318 { 1319 priv_set_t *required = 1320 priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL); 1321 1322 /* 1323 * Ensure the required privileges, suitable for controlling processes, 1324 * are possessed. 1325 */ 1326 if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv( 1327 PRIV_SET, PRIV_EFFECTIVE, required) != 0) 1328 die(gettext("can't set requisite privileges")); 1329 1330 /* 1331 * Ensure access to /var/run/daemon. 1332 */ 1333 if (setreuid(DAEMON_UID, DAEMON_UID) != 0) 1334 die(gettext("cannot become user daemon")); 1335 1336 priv_freeset(required); 1337 } 1338 1339 /* 1340 * This function does the top-level work to determine if we should do any 1341 * memory capping, and if so, it invokes the right call-backs to do the work. 1342 */ 1343 static void 1344 do_capping(hrtime_t now, hrtime_t *next_proc_walk) 1345 { 1346 boolean_t enforce_caps; 1347 /* soft cap enforcement flag, depending on memory pressure */ 1348 boolean_t enforce_soft_caps; 1349 /* avoid interference with kernel's page scanner */ 1350 boolean_t global_scanner_running; 1351 sample_col_arg_t col_arg; 1352 soft_scan_arg_t arg; 1353 uint_t col_types = 0; 1354 1355 /* check what kind of collections (project/zone) are capped */ 1356 list_walk_collection(col_type_cb, &col_types); 1357 debug("collection types: 0x%x\n", col_types); 1358 1359 /* no capped collections, skip checking rss */ 1360 if (col_types == 0) 1361 return; 1362 1363 /* Determine if soft caps are enforced. */ 1364 enforce_soft_caps = must_enforce_soft_caps(); 1365 1366 /* Determine if the global page scanner is running. */ 1367 global_scanner_running = is_global_scanner_running(); 1368 1369 /* 1370 * Sample collections' member processes RSSes and recompute 1371 * collections' excess. 1372 */ 1373 rss_sample(B_FALSE, col_types); 1374 1375 col_arg.sca_any_over_cap = B_FALSE; 1376 col_arg.sca_project_over_cap = B_FALSE; 1377 list_walk_collection(rss_sample_col_cb, &col_arg); 1378 list_walk_collection(excess_print_cb, NULL); 1379 debug("any collection/project over cap = %d, %d\n", 1380 col_arg.sca_any_over_cap, col_arg.sca_project_over_cap); 1381 1382 if (enforce_soft_caps) 1383 debug("memory pressure %d%%\n", memory_pressure); 1384 1385 /* 1386 * Cap enforcement is determined by the previous conditions. 1387 */ 1388 enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap && 1389 (rcfg.rcfg_memory_cap_enforcement_pressure == 0 || 1390 enforce_soft_caps); 1391 1392 debug("%senforcing caps\n", enforce_caps ? "" : "not "); 1393 1394 /* 1395 * If soft caps are in use, determine the size of the portion from each 1396 * collection to scan for. 1397 */ 1398 if (enforce_caps && enforce_soft_caps) 1399 compute_soft_scan_goal(&arg); 1400 1401 /* 1402 * Victimize offending collections. 1403 */ 1404 if (enforce_caps && (!enforce_soft_caps || 1405 (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) { 1406 1407 /* 1408 * Since at least one collection is over its cap & needs 1409 * enforcing, check if it is at least time for a process walk 1410 * (we could be well past time since we only walk /proc when 1411 * we need to) and if so, update each collections process list 1412 * in a single pass through /proc. 1413 */ 1414 if (EVENT_TIME(now, *next_proc_walk)) { 1415 debug("scanning process list...\n"); 1416 proc_walk_all(proc_cb); /* insert & mark */ 1417 list_walk_all(sweep_process_cb); /* free dead procs */ 1418 *next_proc_walk = NEXT_EVENT_TIME(now, 1419 rcfg.rcfg_proc_walk_interval); 1420 } 1421 1422 gz_col = NULL; 1423 if (enforce_soft_caps) { 1424 debug("scan goal is %lldKB\n", 1425 (long long)arg.ssa_scan_goal); 1426 list_walk_collection(soft_scan_cb, &arg); 1427 if (gz_capped && gz_col != NULL) { 1428 /* process global zone */ 1429 arg.ssa_project_over_cap = 1430 col_arg.sca_project_over_cap; 1431 soft_scan_gz(gz_col, &arg); 1432 } 1433 } else { 1434 list_walk_collection(scan_cb, NULL); 1435 if (gz_capped && gz_col != NULL) { 1436 /* process global zone */ 1437 scan_gz(gz_col, col_arg.sca_project_over_cap); 1438 } 1439 } 1440 } else if (col_arg.sca_any_over_cap) { 1441 list_walk_collection(unenforced_cap_cb, NULL); 1442 } 1443 } 1444 1445 int 1446 main(int argc, char *argv[]) 1447 { 1448 int res; 1449 int should_fork = 1; /* fork flag */ 1450 hrtime_t now; /* current time */ 1451 hrtime_t next; /* time of next event */ 1452 int sig; /* signal iteration */ 1453 struct rlimit rl; 1454 hrtime_t next_proc_walk; /* time of next /proc scan */ 1455 hrtime_t next_configuration; /* time of next configuration */ 1456 hrtime_t next_rss_sample; /* (latest) time of next RSS sample */ 1457 1458 (void) set_message_priority(RCM_INFO); 1459 (void) setprogname("rcapd"); 1460 rcapd_pid = getpid(); 1461 (void) chdir("/"); 1462 should_run = 1; 1463 ever_ran = 0; 1464 1465 (void) setlocale(LC_ALL, ""); 1466 (void) textdomain(TEXT_DOMAIN); 1467 1468 /* 1469 * Parse command-line options. 1470 */ 1471 while ((res = getopt(argc, argv, "dF")) > 0) 1472 switch (res) { 1473 case 'd': 1474 should_fork = 0; 1475 if (debug_mode == 0) { 1476 debug_mode = 1; 1477 (void) set_message_priority(RCM_DEBUG); 1478 } else 1479 (void) set_message_priority(RCM_DEBUG_HIGH); 1480 break; 1481 case 'F': 1482 should_fork = 0; 1483 break; 1484 default: 1485 rcapd_usage(); 1486 return (E_USAGE); 1487 /*NOTREACHED*/ 1488 } 1489 1490 /* 1491 * If not debugging, fork and continue operating, changing the 1492 * destination of messages to syslog(). 1493 */ 1494 if (should_fork == 1) { 1495 pid_t child; 1496 debug("forking\n"); 1497 child = fork(); 1498 if (child == -1) 1499 die(gettext("cannot fork")); 1500 if (child > 0) 1501 return (0); 1502 else { 1503 rcapd_pid = getpid(); 1504 (void) set_message_destination(RCD_SYSLOG); 1505 (void) fclose(stdin); 1506 (void) fclose(stdout); 1507 (void) fclose(stderr); 1508 } 1509 /* 1510 * Start a new session and detatch from the controlling tty. 1511 */ 1512 if (setsid() == (pid_t)-1) 1513 debug(gettext("setsid() failed; cannot detach from " 1514 "terminal")); 1515 } 1516 1517 /* 1518 * Read the configuration file. 1519 */ 1520 if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, verify_statistics) 1521 != 0) { 1522 /* 1523 * A configuration file may not exist if rcapd is started 1524 * by enabling the smf rcap service, so attempt to create 1525 * a default file. 1526 */ 1527 create_config_file(NULL); 1528 1529 /* 1530 * A real failure if still can't read the 1531 * configuration file 1532 */ 1533 if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, 1534 verify_statistics) != 0) 1535 die(gettext("resource caps not configured %s"), 1536 RCAPD_DEFAULT_CONF_FILE); 1537 } 1538 finish_configuration(); 1539 should_reconfigure = 0; 1540 1541 /* 1542 * Check that required privileges are possessed. 1543 */ 1544 verify_and_set_privileges(); 1545 1546 now = next_report = next_proc_walk = next_rss_sample = gethrtime(); 1547 next_configuration = NEXT_EVENT_TIME(gethrtime(), 1548 rcfg.rcfg_reconfiguration_interval); 1549 1550 /* 1551 * Open the kstat chain. 1552 */ 1553 kctl = kstat_open(); 1554 if (kctl == NULL) 1555 die(gettext("can't open kstats")); 1556 1557 /* 1558 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can 1559 * be effectively managed without revoking descriptors (at 3 per 1560 * process). 1561 */ 1562 rl.rlim_cur = 32 * 1024; 1563 rl.rlim_max = 32 * 1024; 1564 if (setrlimit(RLIMIT_NOFILE, &rl) != 0 && 1565 getrlimit(RLIMIT_NOFILE, &rl) == 0) { 1566 rl.rlim_cur = rl.rlim_max; 1567 (void) setrlimit(RLIMIT_NOFILE, &rl); 1568 } 1569 (void) enable_extended_FILE_stdio(-1, -1); 1570 1571 if (getrlimit(RLIMIT_NOFILE, &rl) == 0) 1572 debug("fd limit: %lu\n", rl.rlim_cur); 1573 else 1574 debug("fd limit: unknown\n"); 1575 1576 get_page_size(); 1577 my_zoneid = getzoneid(); 1578 1579 /* 1580 * Handle those signals whose (default) exit disposition 1581 * prevents rcapd from finishing scanning before terminating. 1582 */ 1583 (void) sigset(SIGINT, terminate_signal); 1584 (void) sigset(SIGQUIT, abort_signal); 1585 (void) sigset(SIGILL, abort_signal); 1586 (void) sigset(SIGEMT, abort_signal); 1587 (void) sigset(SIGFPE, abort_signal); 1588 (void) sigset(SIGBUS, abort_signal); 1589 (void) sigset(SIGSEGV, abort_signal); 1590 (void) sigset(SIGSYS, abort_signal); 1591 (void) sigset(SIGPIPE, terminate_signal); 1592 (void) sigset(SIGALRM, terminate_signal); 1593 (void) sigset(SIGTERM, terminate_signal); 1594 (void) sigset(SIGUSR1, terminate_signal); 1595 (void) sigset(SIGUSR2, terminate_signal); 1596 (void) sigset(SIGPOLL, terminate_signal); 1597 (void) sigset(SIGVTALRM, terminate_signal); 1598 (void) sigset(SIGXCPU, abort_signal); 1599 (void) sigset(SIGXFSZ, abort_signal); 1600 for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++) 1601 (void) sigset(sig, terminate_signal); 1602 1603 /* 1604 * Install a signal handler for reconfiguration processing. 1605 */ 1606 (void) sigset(SIGHUP, sighup); 1607 1608 /* 1609 * Determine which process collections to cap. 1610 */ 1611 lcollection_update(LCU_COMPLETE); 1612 1613 /* 1614 * Loop forever, monitoring collections' resident set sizes and 1615 * enforcing their caps. Look for changes in caps as well as 1616 * responding to requests to reread the configuration. Update 1617 * per-collection statistics periodically. 1618 */ 1619 while (should_run != 0) { 1620 struct timespec ts; 1621 1622 /* 1623 * Announce that rcapd is starting. 1624 */ 1625 if (ever_ran == 0) { 1626 info(gettext("starting\n")); 1627 ever_ran = 1; 1628 } 1629 1630 /* 1631 * Check the configuration at every next_configuration interval. 1632 * Update the rss data once every next_rss_sample interval. 1633 * The condition of global memory pressure is also checked at 1634 * the same frequency, if strict caps are in use. 1635 */ 1636 now = gethrtime(); 1637 1638 /* 1639 * Detect configuration and cap changes at every 1640 * reconfiguration_interval, or when SIGHUP has been received. 1641 */ 1642 if (EVENT_TIME(now, next_configuration) || 1643 should_reconfigure == 1) { 1644 reconfigure(now, &next_configuration, &next_proc_walk, 1645 &next_rss_sample); 1646 should_reconfigure = 0; 1647 } 1648 1649 /* 1650 * Do the main work for enforcing caps. 1651 */ 1652 if (EVENT_TIME(now, next_rss_sample)) { 1653 do_capping(now, &next_proc_walk); 1654 1655 next_rss_sample = NEXT_EVENT_TIME(now, 1656 rcfg.rcfg_rss_sample_interval); 1657 } 1658 1659 /* 1660 * Update the statistics file, if it's time. 1661 */ 1662 check_update_statistics(); 1663 1664 /* 1665 * Sleep for some time before repeating. 1666 */ 1667 now = gethrtime(); 1668 next = next_configuration; 1669 next = POSITIVE_MIN(next, next_report); 1670 next = POSITIVE_MIN(next, next_rss_sample); 1671 if (next > now && should_run != 0) { 1672 debug("sleeping %-4.2f seconds\n", (float)(next - 1673 now) / (float)NANOSEC); 1674 hrt2ts(next - now, &ts); 1675 (void) nanosleep(&ts, NULL); 1676 } 1677 } 1678 if (termination_signal != 0) 1679 debug("exiting due to signal %d\n", termination_signal); 1680 if (ever_ran != 0) 1681 info(gettext("exiting\n")); 1682 1683 /* 1684 * Unlink the statistics file before exiting. 1685 */ 1686 if (rcfg.rcfg_stat_file[0] != 0) 1687 (void) unlink(rcfg.rcfg_stat_file); 1688 1689 return (E_SUCCESS); 1690 } 1691