1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * rcapd is a long-running daemon enforcing project-based resource caps (see 28 * rcapd(1M)). Each instance of a process aggregate (project or, generically, 29 * "collection") may have a memory cap. A single thread monitors the resource 30 * utilization of capped collections, enforces caps when they are exceeded (and 31 * other conditions are met), and incorporates changes in configuration or 32 * caps. Each of these actions occurs not more frequently than the rate 33 * specified with rcapadm(1M). 
 */

#include <sys/priocntl.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <kstat.h>
#include <libintl.h>
#include <limits.h>
#include <locale.h>
#include <priv.h>
#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <libscf.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include <zone.h>
#include <assert.h>
#include <sys/vm_usage.h>
#include "rcapd.h"
#include "rcapd_mapping.h"
#include "rcapd_rfd.h"
#include "rcapd_stat.h"
#include "utils.h"

/* Smaller of two positive values; if only one is positive, that one. */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))
/* Absolute time of the next event, or 0 ("never") if interval is disabled. */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)seconds > 0) ? (base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \
	: (hrtime_t)0)
/* As NEXT_EVENT_TIME, but 0 ("never") when no statistics file is configured */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ? \
	NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)
/* True when the (enabled, nonzero) event time has passed. */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)
#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bit flags describing which collection types currently carry caps. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02

/* Argument block threaded through the soft-scan callbacks. */
typedef struct soft_scan_arg {
	uint64_t ssa_sum_excess;	/* sum of collections' excesses (kB) */
	int64_t ssa_scan_goal;		/* total quantity to scan for (kB) */
	boolean_t ssa_project_over_cap;	/* a project exceeded its cap */
} soft_scan_arg_t;

/* Argument block for the RSS sampling walk (rss_sample_col_cb). */
typedef struct sample_col_arg {
	boolean_t sca_any_over_cap;	/* some collection exceeded its cap */
	boolean_t sca_project_over_cap;	/* some project exceeded its cap */
} sample_col_arg_t;


static int debug_mode = 0;		/* debug mode flag */
static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
					/* scanned */
static kstat_ctl_t *kctl;		/* kstat chain */
static int memory_pressure = 0;		/* physical memory utilization (%) */
static int memory_pressure_sample = 0;	/* count of samples */
static long page_size_kb = 0;		/* system page size in KB */
static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
static hrtime_t next_report;		/* time of next report */
static int termination_signal = 0;	/* terminating signal */
static zoneid_t my_zoneid = (zoneid_t)-1;	/* zone this rcapd runs in */
static lcollection_t *gz_col;		/* global zone collection */

rcfg_t rcfg;				/* current daemon configuration */
/*
 * Updated when we re-read the collection configurations if this rcapd instance
 * is running in the global zone and the global zone is capped.
 */
boolean_t gz_capped = B_FALSE;

/*
 * Flags.
 */
static int ever_ran;
int should_run;				/* cleared by a terminating signal */
static int should_reconfigure;		/* set by SIGHUP handler */

static int verify_statistics(void);
static int update_statistics(void);

/*
 * Checks if a process is marked 'system'. Returns FALSE only when it is not.
 */
static boolean_t
proc_issystem(pid_t pid)
{
	char pc_clname[PC_CLNMSZ];

	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
	    PC_KY_NULL) != -1) {
		/* processes in the SYS scheduling class are "system" */
		return (strcmp(pc_clname, "SYS") == 0);
	} else {
		/* be conservative: an unreadable process is left unscanned */
		debug("cannot get class-specific scheduling parameters; "
		    "assuming system process\n");
		return (B_TRUE);
	}
}

/*
 * Ensure the process described by the given psinfo is tracked (and marked
 * live) in the collection it belongs to, inserting a new entry at the head
 * of the collection's process list if necessary.
 */
static void
lprocess_insert_mark(psinfo_t *psinfop)
{
	pid_t pid = psinfop->pr_pid;
	/* flag indicating whether the process should be scanned. */
	int unscannable = psinfop->pr_nlwp == 0;
	rcid_t colid;
	lcollection_t *lcol;
	lprocess_t *lproc;

	/*
	 * Determine which collection to put this process into. We only have
	 * to worry about tracking both zone and project capped processes if
	 * this rcapd instance is running in the global zone, since we'll only
	 * see processes in our own projects in a non-global zone. In the
	 * global zone, if the process belongs to a non-global zone, we only
	 * need to track it for the capped non-global zone collection. For
	 * global zone processes, we first attempt to put the process into a
	 * capped project collection. On the second pass into this function
	 * the projid will be cleared so we will just track the process for the
	 * global zone collection as a whole.
	 */
	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
		colid.rcid_type = RCIDT_PROJECT;
		colid.rcid_val = psinfop->pr_projid;
	} else {
		/* try to add to zone collection */
		colid.rcid_type = RCIDT_ZONE;
		colid.rcid_val = psinfop->pr_zoneid;
	}

	/* the collection is only tracked while it is capped */
	if ((lcol = lcollection_find(&colid)) == NULL)
		return;

	/*
	 * If the process is already being tracked, update the unscannable flag,
	 * as determined by the caller, from the process's psinfo.
	 */
	lproc = lcol->lcol_lprocess;
	while (lproc != NULL) {
		if (lproc->lpc_pid == pid) {
			lproc->lpc_mark = 1;
			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
				debug("process %d: became unscannable\n",
				    (int)lproc->lpc_pid);
				lproc->lpc_unscannable = 1;
			}
			return;
		}
		lproc = lproc->lpc_next;
	}

	/*
	 * We've fallen off the list without finding our current process;
	 * insert it at the list head.
	 */
	if ((lproc = malloc(sizeof (*lproc))) == NULL)
		debug("insufficient memory to track new process %d", (int)pid);
	else {
		(void) bzero(lproc, sizeof (*lproc));
		lproc->lpc_pid = pid;
		lproc->lpc_mark = 1;
		lproc->lpc_collection = lcol;
		/* no /proc fds are cached yet */
		lproc->lpc_psinfo_fd = -1;
		lproc->lpc_pgdata_fd = -1;
		lproc->lpc_xmap_fd = -1;

		/*
		 * If the caller didn't flag this process as unscannable
		 * already, do some more checking.
		 */
		lproc->lpc_unscannable = unscannable || proc_issystem(pid);

#ifdef DEBUG
		/*
		 * Verify the sanity of lprocess. It should not contain the
		 * process we are about to prepend.
		 */
		if (lcollection_member(lcol, lproc)) {
			lprocess_t *cur = lcol->lcol_lprocess;
			debug("The collection %lld already has these members, "
			    "including me, %d!\n",
			    (long long)lcol->lcol_id.rcid_val,
			    (int)lproc->lpc_pid);
			while (cur != NULL) {
				debug("\t%d\n", (int)cur->lpc_pid);
				cur = cur->lpc_next;
			}
			info(gettext("process already on lprocess\n"));
			abort();
		}
#endif /* DEBUG */
		/* prepend to the doubly-linked process list */
		lproc->lpc_next = lcol->lcol_lprocess;
		if (lproc->lpc_next != NULL)
			lproc->lpc_next->lpc_prev = lproc;
		lproc->lpc_prev = NULL;
		lcol->lcol_lprocess = lproc;

		debug("tracking %s %ld %d %s%s\n",
		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
		    (long)colid.rcid_val,
		    (int)pid, psinfop->pr_psargs,
		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
		lcol->lcol_stat.lcols_proc_in++;
	}
}

/*
 * Walk one collection's process list, invoking the callback passed via arg
 * on each member; a nonzero callback return aborts the walk. The next
 * pointer is sampled before the callback runs so the callback may unlink
 * the member it is given.
 */
static int
list_walk_process_cb(lcollection_t *lcol, void *arg)
{
	int (*cb)(lcollection_t *, lprocess_t *) =
	    (int(*)(lcollection_t *, lprocess_t *))arg;
	lprocess_t *member;
	lprocess_t *next;

	member = lcol->lcol_lprocess;
	while (member != NULL) {
		pid_t pid = member->lpc_pid;
		next = member->lpc_next;

		debug_high("list_walk_all lpc %d\n", (int)pid);
		if (cb(lcol, member) != 0) {
			debug_high("list_walk_all aborted at lpc %d\n",
			    (int)pid);
			return (1);
		}
		member = next;
	}

	return (0);
}

/*
 * Invoke the given callback for each process in each collection. Callbacks
 * are allowed to change the linkage of the process on which they act.
 */
static void
list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
{
	list_walk_collection(list_walk_process_cb, (void *)cb);
}

/*
 * rfd revocation callback: invalidate a process's cached psinfo fd.
 */
static void
revoke_psinfo(rfd_t *rfd)
{
	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;

	if (lpc != NULL) {
		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
		ASSERT(lpc->lpc_psinfo_fd != -1);
		lpc->lpc_psinfo_fd = -1;
	} else
		debug("revoking psinfo fd for unknown process\n");
}

/*
 * Retrieve a process's psinfo via an already-opened or new file descriptor.
 * The supplied descriptor will be closed on failure. An optional callback
 * will be invoked with the last descriptor tried, and a supplied callback
 * argument, as its arguments, such that the new descriptor may be cached, or
 * an old one may be invalidated. If the result of the callback is zero, the
 * caller is to assume responsibility for the file descriptor, to close it
 * with rfd_close().
 *
 * On failure, a nonzero value is returned.
 */
int
get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
    int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
{
	int fd;
	int can_try_uncached;

	/* a cached fd makes no sense without a callback to manage it */
	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));

	do {
		if (cached_fd >= 0) {
			/* first attempt: reuse the caller's cached fd */
			fd = cached_fd;
			can_try_uncached = 1;
			debug_high("%d/psinfo, trying cached fd %d\n",
			    (int)pid, fd);
		} else {
			char pathbuf[PROC_PATH_MAX];

			can_try_uncached = 0;
			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", (int)pid);
			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
				debug("cannot open %s", pathbuf);
				break;
			} else
				debug_high("opened %s, fd %d\n", pathbuf, fd);
		}

		/*
		 * A full read whose pid matches means the fd still refers to
		 * the intended process; otherwise the fd is stale (e.g. the
		 * pid was reused), so drop it and retry with a fresh open.
		 */
		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
		    sizeof (*psinfo) && psinfo->pr_pid == pid)
			break;
		else {
			debug_high("closed fd %d\n", fd);
			if (rfd_close(fd) != 0)
				debug("could not close fd %d", fd);
			fd = cached_fd = -1;
		}
	} while (can_try_uncached == 1);

	/*
	 * Offer the final fd to the callback; if it declines (nonzero) or
	 * there is no callback, we retain responsibility and close it.
	 */
	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
		if (fd >= 0) {
			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
			    "uncached" : "cached", fd);
			if (rfd_close(fd) != 0)
				debug("could not close fd %d", fd);
		}

	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
	    fd_update_cb != NULL ? "cached" : "uncached");
	return ((fd >= 0) ? 0 : -1);
}

/*
 * Retrieve the collection membership of all processes and update the psinfo of
 * those non-system, non-zombie ones in collections. For global zone processes,
 * we first attempt to put the process into a capped project collection. We
 * also want to track the process for the global zone collection as a whole.
372 */ 373 static void 374 proc_cb(const pid_t pid) 375 { 376 psinfo_t psinfo; 377 378 if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) { 379 lprocess_insert_mark(&psinfo); 380 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) { 381 /* 382 * We also want to track this process for the global 383 * zone as a whole so add it to the global zone 384 * collection as well. 385 */ 386 psinfo.pr_projid = -1; 387 lprocess_insert_mark(&psinfo); 388 } 389 } 390 } 391 392 /* 393 * Cache the process' psinfo fd, taking responsibility for freeing it. 394 */ 395 int 396 lprocess_update_psinfo_fd_cb(void *arg, int fd) 397 { 398 lprocess_t *lpc = arg; 399 400 lpc->lpc_psinfo_fd = fd; 401 return (0); 402 } 403 404 /* 405 * Get the system pagesize. 406 */ 407 static void 408 get_page_size(void) 409 { 410 page_size_kb = sysconf(_SC_PAGESIZE) / 1024; 411 debug("physical page size: %luKB\n", page_size_kb); 412 } 413 414 static void 415 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2) 416 { 417 hrtime_t diff = t2 - t1; 418 419 if (diff < MILLISEC) 420 debug("%s: %lld nanoseconds\n", msg, diff); 421 else if (diff < MICROSEC) 422 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC); 423 else if (diff < NANOSEC) 424 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC); 425 else 426 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC); 427 } 428 429 /* 430 * Get the zone's & project's RSS from the kernel. 
431 */ 432 static void 433 rss_sample(boolean_t my_zone_only, uint_t col_types) 434 { 435 size_t nres; 436 size_t i; 437 uint_t flags; 438 hrtime_t t1, t2; 439 440 if (my_zone_only) { 441 flags = VMUSAGE_ZONE; 442 } else { 443 flags = 0; 444 if (col_types & CAPPED_PROJECT) 445 flags |= VMUSAGE_PROJECTS; 446 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID) 447 flags |= VMUSAGE_ALL_ZONES; 448 } 449 450 debug("vmusage sample flags 0x%x\n", flags); 451 if (flags == 0) 452 return; 453 454 again: 455 /* try the current buffer to see if the list will fit */ 456 nres = vmu_vals_len; 457 t1 = gethrtime(); 458 if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval, 459 vmu_vals, &nres) != 0) { 460 if (errno != EOVERFLOW) { 461 warn(gettext("can't read RSS from kernel\n")); 462 return; 463 } 464 } 465 t2 = gethrtime(); 466 tm_fmt("getvmusage time", t1, t2); 467 468 debug("kernel nres %lu\n", (ulong_t)nres); 469 470 if (nres > vmu_vals_len) { 471 /* array size is now too small, increase it and try again */ 472 free(vmu_vals); 473 474 if ((vmu_vals = (vmusage_t *)calloc(nres, 475 sizeof (vmusage_t))) == NULL) { 476 warn(gettext("out of memory: could not read RSS from " 477 "kernel\n")); 478 vmu_vals_len = nvmu_vals = 0; 479 return; 480 } 481 vmu_vals_len = nres; 482 goto again; 483 } 484 485 nvmu_vals = nres; 486 487 debug("vmusage_sample\n"); 488 for (i = 0; i < nvmu_vals; i++) { 489 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), " 490 "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id, 491 vmu_vals[i].vmu_type, 492 (unsigned long long)vmu_vals[i].vmu_rss_all, 493 (unsigned long long)vmu_vals[i].vmu_rss_all / 1024, 494 (unsigned long long)vmu_vals[i].vmu_swap_all); 495 } 496 } 497 498 static void 499 update_col_rss(lcollection_t *lcol) 500 { 501 int i; 502 503 lcol->lcol_rss = 0; 504 lcol->lcol_image_size = 0; 505 506 for (i = 0; i < nvmu_vals; i++) { 507 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val) 508 continue; 509 510 if 
(vmu_vals[i].vmu_type == VMUSAGE_ZONE && 511 lcol->lcol_id.rcid_type != RCIDT_ZONE) 512 continue; 513 514 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS && 515 lcol->lcol_id.rcid_type != RCIDT_PROJECT) 516 continue; 517 518 /* we found the right RSS entry, update the collection vals */ 519 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024; 520 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024; 521 break; 522 } 523 } 524 525 /* 526 * Sample the collection RSS, updating the collection's statistics with the 527 * results. Also, sum the rss of all capped projects & return true if 528 * the collection is over cap. 529 */ 530 static int 531 rss_sample_col_cb(lcollection_t *lcol, void *arg) 532 { 533 int64_t excess; 534 uint64_t rss; 535 sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; 536 537 update_col_rss(lcol); 538 539 lcol->lcol_stat.lcols_rss_sample++; 540 rss = lcol->lcol_rss; 541 excess = rss - lcol->lcol_rss_cap; 542 if (excess > 0) { 543 lcol->lcol_stat.lcols_rss_act_sum += rss; 544 col_argp->sca_any_over_cap = B_TRUE; 545 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) 546 col_argp->sca_project_over_cap = B_TRUE; 547 } 548 lcol->lcol_stat.lcols_rss_sum += rss; 549 550 if (lcol->lcol_stat.lcols_min_rss > rss) 551 lcol->lcol_stat.lcols_min_rss = rss; 552 if (lcol->lcol_stat.lcols_max_rss < rss) 553 lcol->lcol_stat.lcols_max_rss = rss; 554 555 return (0); 556 } 557 558 /* 559 * Determine if we have capped projects, capped zones or both. 
 */
static int
col_type_cb(lcollection_t *lcol, void *arg)
{
	uint_t *col_type = (uint_t *)arg;

	/*
	 * skip uncapped collections
	 * NOTE(review): this returns 1, the same value used below to stop
	 * the walk early -- confirm list_walk_collection()'s treatment of a
	 * nonzero return matches the "skip" intent here.
	 */
	if (lcol->lcol_rss_cap == 0)
		return (1);

	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
		*col_type |= CAPPED_PROJECT;
	else
		*col_type |= CAPPED_ZONE;

	/* once we know everything is capped, we can stop looking */
	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
		return (1);

	return (0);
}

/*
 * Open /proc and walk entries, invoking the callback on every pid except
 * rcapd's own.
 */
static void
proc_walk_all(void (*cb)(const pid_t))
{
	DIR *pdir;
	struct dirent *dirent;
	pid_t pid;

	/* reserve an fd so the opendir() below cannot be starved out */
	(void) rfd_reserve(1);
	if ((pdir = opendir("/proc")) == NULL)
		die(gettext("couldn't open /proc!"));

	while ((dirent = readdir(pdir)) != NULL) {
		if (strcmp(".", dirent->d_name) == 0 ||
		    strcmp("..", dirent->d_name) == 0)
			continue;
		pid = atoi(dirent->d_name);
		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
		if (pid == rcapd_pid)
			continue;
		else
			cb(pid);
	}
	(void) closedir(pdir);
}

/*
 * Clear unmarked callback.  Processes still marked are kept (and their mark
 * reset for the next walk); unmarked ones have exited and are freed.
 */
/*ARGSUSED*/
static int
sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
{
	if (lpc->lpc_mark) {
		lpc->lpc_mark = 0;
	} else {
		debug("process %d finished\n", (int)lpc->lpc_pid);
		lprocess_free(lpc);
	}

	return (0);
}

/*
 * Print, for debugging purposes, a collection's recently-sampled RSS and
 * excess.
 */
/*ARGSUSED*/
static int
excess_print_cb(lcollection_t *lcol, void *arg)
{
	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;

	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
	    "project" : "zone"),
	    lcol->lcol_name,
	    (unsigned long long)lcol->lcol_rss,
	    (unsigned long long)lcol->lcol_rss_cap,
	    (long long)excess);

	return (0);
}

/*
 * Scan those collections which have exceeded their caps.
 *
 * If we're running in the global zone it might have a cap. We don't want to
 * do any capping for the global zone yet since we might get under the cap by
 * just capping the projects in the global zone.
 */
/*ARGSUSED*/
static int
scan_cb(lcollection_t *lcol, void *arg)
{
	int64_t excess;

	/* skip over global zone collection for now but keep track for later */
	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
		gz_col = lcol;
		return (0);
	}

	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
		scan(lcol, excess);
		lcol->lcol_stat.lcols_scan++;
	}

	return (0);
}

/*
 * Scan the global zone collection and see if it still exceeds its cap.
 * We take into account the effects of capping any global zone projects here.
 */
static void
scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
{
	int64_t excess;

	/*
	 * If we had projects over their cap and the global zone was also over
	 * its cap then we need to get the up-to-date global zone rss to
	 * determine if we are still over the global zone cap. We might have
	 * gone under while we scanned the capped projects. If there were no
	 * projects over cap then we can use the rss value we already have for
	 * the global zone.
	 */
	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
	if (project_over_cap && excess > 0) {
		rss_sample(B_TRUE, CAPPED_ZONE);
		update_col_rss(lcol);
		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
	}

	if (excess > 0) {
		debug("global zone excess %lldKB\n", (long long)excess);
		scan(lcol, excess);
		lcol->lcol_stat.lcols_scan++;
	}
}

/*
 * Do a soft scan of those collections which have excesses. A soft scan is one
 * in which the cap enforcement pressure is taken into account. The difference
 * between the utilized physical memory and the cap enforcement pressure will
 * be scanned-for, and each collection will be scanned proportionally by their
 * present excesses.
 */
static int
soft_scan_cb(lcollection_t *lcol, void *a)
{
	int64_t excess;
	soft_scan_arg_t *arg = a;

	/* skip over global zone collection for now but keep track for later */
	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
		gz_col = lcol;
		return (0);
	}

	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
		/*
		 * Scan this collection's proportional share of the goal.
		 * NOTE(review): assumes ssa_sum_excess is nonzero whenever
		 * some collection has an excess -- confirm the caller
		 * computes the sum before walking.
		 */
		int64_t adjusted_excess =
		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;

		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
		    "scanning %lld\n",
		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
		    "project" : "zone"),
		    (long)lcol->lcol_id.rcid_val,
		    (long long)excess, (long long)arg->ssa_scan_goal,
		    (unsigned long long)arg->ssa_sum_excess,
		    (long long)adjusted_excess);

		scan(lcol, adjusted_excess);
		lcol->lcol_stat.lcols_scan++;
	}

	return (0);
}

/*
 * Soft-scan the global zone collection, deferred until after the project
 * scans (see scan_gz for the same refresh logic in the hard-cap case).
 */
static void
soft_scan_gz(lcollection_t *lcol, void *a)
{
	int64_t excess;
	soft_scan_arg_t *arg = a;

	/*
	 * If we had projects over their cap and the global zone was also over
	 * its cap then we need to get the up-to-date global zone rss to
	 * determine if we are still over the global zone cap. We might have
	 * gone under while we scanned the capped projects. If there were no
	 * projects over cap then we can use the rss value we already have for
	 * the global zone.
	 */
	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
	if (arg->ssa_project_over_cap && excess > 0) {
		rss_sample(B_TRUE, CAPPED_ZONE);
		update_col_rss(lcol);
		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
	}

	if (excess > 0) {
		int64_t adjusted_excess =
		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;

		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
		    "scanning %lld\n",
		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
		    "project" : "zone"),
		    (long)lcol->lcol_id.rcid_val,
		    (long long)excess, (long long)arg->ssa_scan_goal,
		    (unsigned long long)arg->ssa_sum_excess,
		    (long long)adjusted_excess);

		scan(lcol, adjusted_excess);
		lcol->lcol_stat.lcols_scan++;
	}
}

/*
 * When a scan could happen, but caps aren't enforced tick the
 * lcols_unenforced_cap counter.
 */
/*ARGSUSED*/
static int
unenforced_cap_cb(lcollection_t *lcol, void *arg)
{
	lcol->lcol_stat.lcols_unenforced_cap++;

	return (0);
}

/*
 * Update the count of physically installed memory.
 */
static void
update_phys_total(void)
{
	uint64_t old_phys_total;

	old_phys_total = phys_total;
	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
	if (phys_total != old_phys_total)
		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
}

/*
 * Unlink a process from its collection, updating relevant statistics, and
 * freeing its associated memory.
 */
void
lprocess_free(lprocess_t *lpc)
{
	pid_t pid;

	lpc->lpc_collection->lcol_stat.lcols_proc_out++;

	/* unlink from the collection's doubly-linked process list */
	if (lpc->lpc_prev != NULL)
		lpc->lpc_prev->lpc_next = lpc->lpc_next;
	if (lpc->lpc_next != NULL)
		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
	if (lpc->lpc_collection->lcol_lprocess == lpc)
		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
		    lpc ? lpc->lpc_next : NULL);
	lpc->lpc_next = lpc->lpc_prev = NULL;

	/* release cached /proc data, fds, and mappings */
	if (lpc->lpc_prpageheader != NULL)
		free(lpc->lpc_prpageheader);
	if (lpc->lpc_xmap != NULL)
		free(lpc->lpc_xmap);
	if (lpc->lpc_psinfo_fd >= 0) {
		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
			debug("could not close %d lpc_psinfo_fd %d",
			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
		lpc->lpc_psinfo_fd = -1;
	}
	if (lpc->lpc_pgdata_fd >= 0) {
		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
			debug("could not close %d lpc_pgdata_fd %d",
			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
		lpc->lpc_pgdata_fd = -1;
	}
	if (lpc->lpc_xmap_fd >= 0) {
		if (rfd_close(lpc->lpc_xmap_fd) != 0)
			debug("could not close %d lpc_xmap_fd %d",
			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
	}
	if (lpc->lpc_ignore != NULL)
		lmapping_free(&lpc->lpc_ignore);
	pid = lpc->lpc_pid;
	free(lpc);
	debug_high("process %d freed\n", (int)pid);
}

/*
 * Collection clear callback: reset the mark so a subsequent update/sweep
 * can detect collections that are no longer configured.
 */
/*ARGSUSED*/
static int
collection_clear_cb(lcollection_t *lcol, void *arg)
{
	lcol->lcol_mark = 0;

	return (0);
}

/*
 * Respond to a terminating signal by setting a termination flag.
 */
/*ARGSUSED*/
static void
terminate_signal(int signal)
{
	/* remember only the first terminating signal received */
	if (termination_signal == 0)
		termination_signal = signal;
	should_run = 0;
}

/*
 * Handle any synchronous or asynchronous signals that would ordinarily cause a
 * process to abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Allow the scanner to make a last-ditch effort to resume any stopped
	 * processes.
	 */
	scan_abort();
	abort();
}

/*
 * Clean up collections which have been removed due to configuration. Unlink
 * the collection from lcollection and free it.
 */
/*ARGSUSED*/
static int
collection_sweep_cb(lcollection_t *lcol, void *arg)
{
	if (lcol->lcol_mark == 0) {
		debug("freeing %s %s\n",
		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
		    "project" : "zone"), lcol->lcol_name);
		lcollection_free(lcol);
	}

	return (0);
}

/*
 * Set those variables which depend on the global configuration.
 */
static void
finish_configuration(void)
{
	/*
	 * Warn that any lnode (or non-project) mode specification (by an SRM
	 * 1.3 configuration file, for example) is ignored.
	 */
	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
		warn(gettext("%s mode specification ignored -- using project"
		    " mode\n"), rcfg.rcfg_mode_name);
		rcfg.rcfg_mode_name = "project";
		rcfg.rcfg_mode = rctype_project;
	}
}

/*
 * Cause the configuration to be reread and applied.
 */
static void
reread_configuration(void)
{
	rcfg_t rcfg_new;

	if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
		warn(gettext("can't reread configuration \n"));
		exit(SMF_EXIT_ERR_CONFIG);
	} else {
		/*
		 * Done reading configuration. Remove existing
		 * collections in case there is a change in collection type.
		 */
		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
			list_walk_collection(collection_clear_cb, NULL);
			list_walk_collection(collection_sweep_cb, NULL);
		}

		/*
		 * Make the newly-read configuration the global one, and update
		 * any variables that depend on it.
		 */
		rcfg = rcfg_new;
		finish_configuration();
	}
}

/*
 * First, examine changes, additions, and deletions to cap definitions.
 * Then, set the next event time.
 */
static void
reconfigure(hrtime_t now, hrtime_t *next_configuration,
    hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
{
	debug("reconfigure...\n");

	/*
	 * Walk the lcollection, marking active collections so inactive ones
	 * can be freed.
	 */
	list_walk_collection(collection_clear_cb, NULL);
	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
	list_walk_collection(collection_sweep_cb, NULL);

	*next_configuration = NEXT_EVENT_TIME(now,
	    rcfg.rcfg_reconfiguration_interval);

	/*
	 * Reset each event time to the shorter of the previous and new
	 * intervals.
	 */
	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
		next_report = now;
	else
		next_report = POSITIVE_MIN(next_report,
		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));

	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
		*next_proc_walk = now;
	else
		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));

	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
		*next_rss_sample = now;
	else
		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
}

/*
 * Respond to SIGHUP by triggering the rereading the configuration and cap
 * definitions.
 */
/*ARGSUSED*/
static void
sighup(int signal)
{
	should_reconfigure = 1;
}

/*
 * Print, for debugging purposes, each collection's interval statistics.
1025 */ 1026 /*ARGSUSED*/ 1027 static int 1028 simple_report_collection_cb(lcollection_t *lcol, void *arg) 1029 { 1030 #define DELTA(field) \ 1031 (unsigned long long)( \ 1032 (lcol->lcol_stat.field - lcol->lcol_stat_old.field)) 1033 1034 debug("%s %s status: succeeded/attempted (k): %llu/%llu, " 1035 "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS " 1036 "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, " 1037 "%llu scans over %llu ms\n", 1038 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 1039 lcol->lcol_name, 1040 DELTA(lcols_pg_eff), DELTA(lcols_pg_att), 1041 DELTA(lcols_scan_ineffective), DELTA(lcols_scan), 1042 DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample), 1043 (unsigned long long)lcol->lcol_stat.lcols_min_rss, 1044 (unsigned long long)lcol->lcol_stat.lcols_max_rss, 1045 (unsigned long long)lcol->lcol_rss_cap, 1046 (unsigned long long)(lcol->lcol_stat.lcols_proc_in - 1047 lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out), 1048 DELTA(lcols_scan_count), 1049 NSEC2MSEC(DELTA(lcols_scan_time_complete))); 1050 1051 #undef DELTA 1052 1053 return (0); 1054 } 1055 1056 /* 1057 * Record each collection's interval statistics in the statistics file. 1058 */ 1059 static int 1060 report_collection_cb(lcollection_t *lcol, void *arg) 1061 { 1062 lcollection_report_t dc; 1063 int fd = (intptr_t)arg; 1064 1065 /* 1066 * Copy the relevant fields to the collection's record. 1067 */ 1068 bzero(&dc, sizeof (dc)); 1069 dc.lcol_id = lcol->lcol_id; 1070 (void) strcpy(dc.lcol_name, lcol->lcol_name); 1071 dc.lcol_rss = lcol->lcol_rss; 1072 dc.lcol_image_size = lcol->lcol_image_size; 1073 dc.lcol_rss_cap = lcol->lcol_rss_cap; 1074 dc.lcol_stat = lcol->lcol_stat; 1075 1076 if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { 1077 lcol->lcol_stat_old = lcol->lcol_stat; 1078 } else { 1079 debug("can't write %s %s statistics", 1080 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
1081 "project" : "zone"), 1082 lcol->lcol_name); 1083 } 1084 1085 return (0); 1086 } 1087 1088 /* 1089 * Determine the count of pages scanned by the global page scanner, obtained 1090 * from the cpu_stat:*::scan kstats. Return zero on success. 1091 */ 1092 static int 1093 get_globally_scanned_pages(uint64_t *scannedp) 1094 { 1095 kstat_t *ksp; 1096 uint64_t scanned = 0; 1097 1098 if (kstat_chain_update(kctl) == -1) { 1099 warn(gettext("can't update kstat chain")); 1100 return (0); 1101 } 1102 1103 for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) { 1104 if (strcmp(ksp->ks_module, "cpu_stat") == 0) { 1105 if (kstat_read(kctl, ksp, NULL) != -1) { 1106 scanned += ((cpu_stat_t *) 1107 ksp->ks_data)->cpu_vminfo.scan; 1108 } else { 1109 return (-1); 1110 } 1111 } 1112 } 1113 1114 *scannedp = scanned; 1115 return (0); 1116 } 1117 1118 /* 1119 * Determine if the global page scanner is running, during which no memory 1120 * caps should be enforced, to prevent interference with the global page 1121 * scanner. 1122 */ 1123 static boolean_t 1124 is_global_scanner_running() 1125 { 1126 /* measure delta in page scan count */ 1127 static uint64_t new_sp = 0; 1128 static uint64_t old_sp = 0; 1129 boolean_t res = B_FALSE; 1130 1131 if (get_globally_scanned_pages(&new_sp) == 0) { 1132 if (old_sp != 0 && (new_sp - old_sp) > 0) { 1133 debug("global memory pressure detected (%llu " 1134 "pages scanned since last interval)\n", 1135 (unsigned long long)(new_sp - old_sp)); 1136 res = B_TRUE; 1137 } 1138 old_sp = new_sp; 1139 } else { 1140 warn(gettext("unable to read cpu statistics")); 1141 new_sp = old_sp; 1142 } 1143 1144 return (res); 1145 } 1146 1147 /* 1148 * If soft caps are in use, determine if global memory pressure exceeds the 1149 * configured maximum above which soft caps are enforced. 
1150 */ 1151 static boolean_t 1152 must_enforce_soft_caps() 1153 { 1154 /* 1155 * Check for changes to the amount of installed physical memory, to 1156 * compute the current memory pressure. 1157 */ 1158 update_phys_total(); 1159 1160 memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb) 1161 * 100.0 / phys_total); 1162 memory_pressure_sample++; 1163 if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 && 1164 memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) { 1165 return (B_TRUE); 1166 } 1167 1168 return (B_FALSE); 1169 } 1170 1171 /* 1172 * Update the shared statistics file with each collection's current statistics. 1173 * Return zero on success. 1174 */ 1175 static int 1176 update_statistics(void) 1177 { 1178 int fd, res; 1179 static char template[LINELEN]; 1180 1181 /* 1182 * Try to create a directory irrespective of whether it is existing 1183 * or not. If it is not there then it will create. Otherwise any way 1184 * it will fail at mkstemp call below. 1185 */ 1186 (void) mkdir(STAT_FILE_DIR, 0755); 1187 1188 /* 1189 * Create a temporary file. 1190 */ 1191 if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) + 1192 strlen(STAT_TEMPLATE_SUFFIX) + 1)) { 1193 debug("temporary file template size too small\n"); 1194 return (-1); 1195 } 1196 (void) strcpy(template, rcfg.rcfg_stat_file); 1197 (void) strcat(template, STAT_TEMPLATE_SUFFIX); 1198 (void) rfd_reserve(1); 1199 fd = mkstemp(template); 1200 1201 /* 1202 * Write the header and per-collection statistics. 
1203 */ 1204 if (fd >= 0) { 1205 rcapd_stat_hdr_t rs; 1206 1207 rs.rs_pid = rcapd_pid; 1208 rs.rs_time = gethrtime(); 1209 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name)); 1210 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name); 1211 rs.rs_pressure_cur = memory_pressure; 1212 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure; 1213 rs.rs_pressure_sample = memory_pressure_sample; 1214 1215 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) == 1216 sizeof (rs)) { 1217 list_walk_collection(report_collection_cb, 1218 (void *)(intptr_t)fd); 1219 /* 1220 * Replace the existing statistics file with this new 1221 * one. 1222 */ 1223 res = rename(template, rcfg.rcfg_stat_file); 1224 } else 1225 res = -1; 1226 (void) close(fd); 1227 } else 1228 res = -1; 1229 1230 return (res); 1231 } 1232 1233 /* 1234 * Verify the statistics file can be created and written to, and die if an 1235 * existing file may be in use by another rcapd. 1236 */ 1237 static int 1238 verify_statistics(void) 1239 { 1240 pid_t pid; 1241 1242 /* 1243 * Warn if another instance of rcapd might be active. 1244 */ 1245 (void) rfd_reserve(1); 1246 pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file); 1247 if (pid != rcapd_pid && pid != -1) 1248 die(gettext("%s exists; rcapd may already be active\n"), 1249 rcfg.rcfg_stat_file); 1250 1251 return (update_statistics()); 1252 } 1253 1254 static int 1255 sum_excess_cb(lcollection_t *lcol, void *arg) 1256 { 1257 uint64_t *sum_excess = arg; 1258 1259 *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss - 1260 lcol->lcol_rss_cap)); 1261 return (0); 1262 } 1263 1264 /* 1265 * Compute the quantity of memory (in kilobytes) above the cap enforcement 1266 * pressure. Set the scan goal to that quantity (or at most the excess). 1267 */ 1268 static void 1269 compute_soft_scan_goal(soft_scan_arg_t *argp) 1270 { 1271 /* 1272 * Compute the sum of the collections' excesses, which will be the 1273 * denominator. 
1274 */ 1275 argp->ssa_sum_excess = 0; 1276 list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess)); 1277 1278 argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * 1279 (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 - 1280 sysconf(_SC_AVPHYS_PAGES)) * page_size_kb, 1281 argp->ssa_sum_excess); 1282 } 1283 1284 static void 1285 rcapd_usage(void) 1286 { 1287 info(gettext("usage: rcapd [-d]\n")); 1288 } 1289 1290 void 1291 check_update_statistics(void) 1292 { 1293 hrtime_t now = gethrtime(); 1294 1295 if (EVENT_TIME(now, next_report)) { 1296 debug("updating statistics...\n"); 1297 list_walk_collection(simple_report_collection_cb, NULL); 1298 if (update_statistics() != 0) 1299 debug("couldn't update statistics"); 1300 next_report = NEXT_REPORT_EVENT_TIME(now, 1301 rcfg.rcfg_report_interval); 1302 } 1303 } 1304 1305 static void 1306 verify_and_set_privileges(void) 1307 { 1308 priv_set_t *required = 1309 priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL); 1310 1311 /* 1312 * Ensure the required privileges, suitable for controlling processes, 1313 * are possessed. 1314 */ 1315 if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv( 1316 PRIV_SET, PRIV_EFFECTIVE, required) != 0) 1317 die(gettext("can't set requisite privileges")); 1318 1319 /* 1320 * Ensure access to /var/run/daemon. 1321 */ 1322 if (setreuid(DAEMON_UID, DAEMON_UID) != 0) 1323 die(gettext("cannot become user daemon")); 1324 1325 priv_freeset(required); 1326 } 1327 1328 /* 1329 * This function does the top-level work to determine if we should do any 1330 * memory capping, and if so, it invokes the right call-backs to do the work. 
 */
static void
do_capping(hrtime_t now, hrtime_t *next_proc_walk)
{
	boolean_t enforce_caps;
	/* soft cap enforcement flag, depending on memory pressure */
	boolean_t enforce_soft_caps;
	/* avoid interference with kernel's page scanner */
	boolean_t global_scanner_running;
	sample_col_arg_t col_arg;
	soft_scan_arg_t arg;
	uint_t col_types = 0;

	/* check what kind of collections (project/zone) are capped */
	list_walk_collection(col_type_cb, &col_types);
	debug("collection types: 0x%x\n", col_types);

	/* no capped collections, skip checking rss */
	if (col_types == 0)
		return;

	/* Determine if soft caps are enforced. */
	enforce_soft_caps = must_enforce_soft_caps();

	/* Determine if the global page scanner is running. */
	global_scanner_running = is_global_scanner_running();

	/*
	 * Sample collections' member processes RSSes and recompute
	 * collections' excess.
	 */
	rss_sample(B_FALSE, col_types);

	col_arg.sca_any_over_cap = B_FALSE;
	col_arg.sca_project_over_cap = B_FALSE;
	list_walk_collection(rss_sample_col_cb, &col_arg);
	list_walk_collection(excess_print_cb, NULL);
	debug("any collection/project over cap = %d, %d\n",
	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);

	if (enforce_soft_caps)
		debug("memory pressure %d%%\n", memory_pressure);

	/*
	 * Cap enforcement is determined by the previous conditions: never
	 * while the kernel's page scanner is active, only if something is
	 * over cap, and (for soft caps) only above the pressure threshold.
	 */
	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
	    enforce_soft_caps);

	debug("%senforcing caps\n", enforce_caps ? "" : "not ");

	/*
	 * If soft caps are in use, determine the size of the portion from each
	 * collection to scan for.
	 */
	if (enforce_caps && enforce_soft_caps)
		compute_soft_scan_goal(&arg);

	/*
	 * Victimize offending collections.
	 */
	if (enforce_caps && (!enforce_soft_caps ||
	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {

		/*
		 * Since at least one collection is over its cap & needs
		 * enforcing, check if it is at least time for a process walk
		 * (we could be well past time since we only walk /proc when
		 * we need to) and if so, update each collections process list
		 * in a single pass through /proc.
		 */
		if (EVENT_TIME(now, *next_proc_walk)) {
			debug("scanning process list...\n");
			proc_walk_all(proc_cb);		 /* insert & mark */
			list_walk_all(sweep_process_cb); /* free dead procs */
			*next_proc_walk = NEXT_EVENT_TIME(now,
			    rcfg.rcfg_proc_walk_interval);
		}

		/* gz_col is set as a side effect of the scan callbacks */
		gz_col = NULL;
		if (enforce_soft_caps) {
			debug("scan goal is %lldKB\n",
			    (long long)arg.ssa_scan_goal);
			list_walk_collection(soft_scan_cb, &arg);
			if (gz_capped && gz_col != NULL) {
				/* process global zone */
				arg.ssa_project_over_cap =
				    col_arg.sca_project_over_cap;
				soft_scan_gz(gz_col, &arg);
			}
		} else {
			list_walk_collection(scan_cb, NULL);
			if (gz_capped && gz_col != NULL) {
				/* process global zone */
				scan_gz(gz_col, col_arg.sca_project_over_cap);
			}
		}
	} else if (col_arg.sca_any_over_cap) {
		/* over cap but not enforcing: just account for it */
		list_walk_collection(unenforced_cap_cb, NULL);
	}
}

int
main(int argc, char *argv[])
{
	int res;
	int should_fork = 1;	/* fork flag */
	hrtime_t now;		/* current time */
	hrtime_t next;		/* time of next event */
	int sig;		/* signal iteration */
	struct rlimit rl;
	hrtime_t next_proc_walk;	/* time of next /proc scan */
	hrtime_t next_configuration;	/* time of next configuration */
	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */

	(void) set_message_priority(RCM_INFO);
	(void) setpname("rcapd");
	rcapd_pid = getpid();
	(void) chdir("/");
	should_run = 1;
	ever_ran = 0;

	(void) setlocale(LC_ALL, "");
	(void) textdomain(TEXT_DOMAIN);

	/*
	 * Parse command-line options.  -d enables debug output (twice for
	 * high verbosity); both -d and -F suppress forking.
	 */
	while ((res = getopt(argc, argv, "dF")) > 0)
		switch (res) {
		case 'd':
			should_fork = 0;
			if (debug_mode == 0) {
				debug_mode = 1;
				(void) set_message_priority(RCM_DEBUG);
			} else
				(void) set_message_priority(RCM_DEBUG_HIGH);
			break;
		case 'F':
			should_fork = 0;
			break;
		default:
			rcapd_usage();
			return (E_USAGE);
			/*NOTREACHED*/
		}

	/*
	 * Read the configuration.
	 */
	if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
		warn(gettext("resource caps not configured\n"));
		return (SMF_EXIT_ERR_CONFIG);
	}

	/*
	 * If not debugging, fork and continue operating, changing the
	 * destination of messages to syslog().
	 */
	if (should_fork == 1) {
		pid_t child;
		debug("forking\n");
		child = fork();
		if (child == -1)
			die(gettext("cannot fork"));
		if (child > 0)
			return (0);
		else {
			rcapd_pid = getpid();
			(void) set_message_destination(RCD_SYSLOG);
			(void) fclose(stdin);
			(void) fclose(stdout);
			(void) fclose(stderr);
		}
		/*
		 * Start a new session and detach from the controlling tty.
		 */
		if (setsid() == (pid_t)-1)
			debug(gettext("setsid() failed; cannot detach from "
			    "terminal"));
	}

	finish_configuration();
	should_reconfigure = 0;

	/*
	 * Check that required privileges are possessed.
	 */
	verify_and_set_privileges();

	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
	next_configuration = NEXT_EVENT_TIME(gethrtime(),
	    rcfg.rcfg_reconfiguration_interval);

	/*
	 * Open the kstat chain.
	 */
	kctl = kstat_open();
	if (kctl == NULL)
		die(gettext("can't open kstats"));

	/*
	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
	 * be effectively managed without revoking descriptors (at 3 per
	 * process).
	 */
	rl.rlim_cur = 32 * 1024;
	rl.rlim_max = 32 * 1024;
	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
		/* couldn't raise both; settle for the current hard limit */
		rl.rlim_cur = rl.rlim_max;
		(void) setrlimit(RLIMIT_NOFILE, &rl);
	}
	(void) enable_extended_FILE_stdio(-1, -1);

	/* NOTE(review): %lu assumes rlim_t fits unsigned long — confirm */
	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		debug("fd limit: %lu\n", rl.rlim_cur);
	else
		debug("fd limit: unknown\n");

	get_page_size();
	my_zoneid = getzoneid();

	/*
	 * Handle those signals whose (default) exit disposition
	 * prevents rcapd from finishing scanning before terminating.
	 */
	(void) sigset(SIGINT, terminate_signal);
	(void) sigset(SIGQUIT, abort_signal);
	(void) sigset(SIGILL, abort_signal);
	(void) sigset(SIGEMT, abort_signal);
	(void) sigset(SIGFPE, abort_signal);
	(void) sigset(SIGBUS, abort_signal);
	(void) sigset(SIGSEGV, abort_signal);
	(void) sigset(SIGSYS, abort_signal);
	(void) sigset(SIGPIPE, terminate_signal);
	(void) sigset(SIGALRM, terminate_signal);
	(void) sigset(SIGTERM, terminate_signal);
	(void) sigset(SIGUSR1, terminate_signal);
	(void) sigset(SIGUSR2, terminate_signal);
	(void) sigset(SIGPOLL, terminate_signal);
	(void) sigset(SIGVTALRM, terminate_signal);
	(void) sigset(SIGXCPU, abort_signal);
	(void) sigset(SIGXFSZ, abort_signal);
	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
		(void) sigset(sig, terminate_signal);

	/*
	 * Install a signal handler for reconfiguration processing.
	 */
	(void) sigset(SIGHUP, sighup);

	/*
	 * Determine which process collections to cap.
	 */
	lcollection_update(LCU_COMPLETE);

	/*
	 * Loop forever, monitoring collections' resident set sizes and
	 * enforcing their caps. Look for changes in caps as well as
	 * responding to requests to reread the configuration. Update
	 * per-collection statistics periodically.
	 */
	while (should_run != 0) {
		struct timespec ts;

		/*
		 * Announce that rcapd is starting.
		 */
		if (ever_ran == 0) {
			info(gettext("starting\n"));
			ever_ran = 1;
		}

		/*
		 * Check the configuration at every next_configuration interval.
		 * Update the rss data once every next_rss_sample interval.
		 * The condition of global memory pressure is also checked at
		 * the same frequency, if strict caps are in use.
		 */
		now = gethrtime();

		/*
		 * Detect configuration and cap changes only when SIGHUP
		 * is received. Call reconfigure to apply new configuration
		 * parameters.
		 */
		if (should_reconfigure == 1) {
			reread_configuration();
			should_reconfigure = 0;
			reconfigure(now, &next_configuration, &next_proc_walk,
			    &next_rss_sample);
		}

		if (EVENT_TIME(now, next_configuration)) {
			reconfigure(now, &next_configuration, &next_proc_walk,
			    &next_rss_sample);
		}

		/*
		 * Do the main work for enforcing caps.
		 */
		if (EVENT_TIME(now, next_rss_sample)) {
			do_capping(now, &next_proc_walk);

			next_rss_sample = NEXT_EVENT_TIME(now,
			    rcfg.rcfg_rss_sample_interval);
		}

		/*
		 * Update the statistics file, if it's time.
		 */
		check_update_statistics();

		/*
		 * Sleep for some time before repeating: until the earliest
		 * of the next configuration check, report, or RSS sample.
		 */
		now = gethrtime();
		next = next_configuration;
		next = POSITIVE_MIN(next, next_report);
		next = POSITIVE_MIN(next, next_rss_sample);
		if (next > now && should_run != 0) {
			debug("sleeping %-4.2f seconds\n", (float)(next -
			    now) / (float)NANOSEC);
			hrt2ts(next - now, &ts);
			(void) nanosleep(&ts, NULL);
		}
	}
	if (termination_signal != 0)
		debug("exiting due to signal %d\n", termination_signal);
	if (ever_ran != 0)
		info(gettext("exiting\n"));

	/*
	 * Unlink the statistics file before exiting.
	 */
	if (rcfg.rcfg_stat_file[0] != 0)
		(void) unlink(rcfg.rcfg_stat_file);

	return (E_SUCCESS);
}