/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * rcapd is a long-running daemon enforcing project-based resource caps (see * rcapd(1M)). Each instance of a process aggregate (project or, generically, * "collection") may have a memory cap. A single thread monitors the resource * utilization of capped collections, enforces caps when they are exceeded (and * other conditions are met), and incorporates changes in configuration or * caps. Each of these actions occurs not more frequently than the rate * specified with rcapadm(1M). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rcapd.h" #include "rcapd_mapping.h" #include "rcapd_rfd.h" #include "rcapd_stat.h" #include "utils.h" #define POSITIVE_MIN(x, y) \ (((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y)) #define NEXT_EVENT_TIME(base, seconds) \ (((int)seconds > 0) ? 
(base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \ : (hrtime_t)0) #define NEXT_REPORT_EVENT_TIME(base, seconds) \ ((rcfg.rcfg_stat_file[0] != 0) ? \ NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0) #define EVENT_TIME(time, eventtime) \ (((time) > (eventtime)) && (eventtime) != 0) #define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */ #define DAEMON_UID 1 /* uid to use */ #define CAPPED_PROJECT 0x01 #define CAPPED_ZONE 0x02 typedef struct soft_scan_arg { uint64_t ssa_sum_excess; int64_t ssa_scan_goal; boolean_t ssa_project_over_cap; } soft_scan_arg_t; typedef struct sample_col_arg { boolean_t sca_any_over_cap; boolean_t sca_project_over_cap; } sample_col_arg_t; static int debug_mode = 0; /* debug mode flag */ static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */ /* scanned */ static kstat_ctl_t *kctl; /* kstat chain */ static int memory_pressure = 0; /* physical memory utilization (%) */ static int memory_pressure_sample = 0; /* count of samples */ static long page_size_kb = 0; /* system page size in KB */ static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */ static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */ static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */ static hrtime_t next_report; /* time of next report */ static int termination_signal = 0; /* terminating signal */ static zoneid_t my_zoneid = (zoneid_t)-1; static lcollection_t *gz_col; /* global zone collection */ rcfg_t rcfg; /* * Updated when we re-read the collection configurations if this rcapd instance * is running in the global zone and the global zone is capped. */ boolean_t gz_capped = B_FALSE; /* * Flags. */ static int ever_ran; int should_run; static int should_reconfigure; static int verify_statistics(void); static int update_statistics(void); /* * Checks if a process is marked 'system'. Returns FALSE only when it is not. 
*/
static boolean_t
proc_issystem(pid_t pid)
{
    char pc_clname[PC_CLNMSZ];

    if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
        PC_KY_NULL) != -1) {
        /* the process is "system" iff it is in the SYS scheduling class */
        return (strcmp(pc_clname, "SYS") == 0);
    } else {
        /*
         * priocntl() failed (e.g. the process exited); conservatively
         * treat the process as system so it is never scanned.
         */
        debug("cannot get class-specific scheduling parameters; "
            "assuming system process\n");
        return (B_TRUE);
    }
}

/*
 * Place the process described by *psinfop into the appropriate capped
 * collection (if any), marking it as still alive, or begin tracking it if it
 * is new.  Called once per process per /proc walk.
 */
static void
lprocess_insert_mark(psinfo_t *psinfop)
{
    pid_t pid = psinfop->pr_pid;
    /* flag indicating whether the process should be scanned. */
    int unscannable = psinfop->pr_nlwp == 0;
    rcid_t colid;
    lcollection_t *lcol;
    lprocess_t *lproc;

    /*
     * Determine which collection to put this process into.  We only have
     * to worry about tracking both zone and project capped processes if
     * this rcapd instance is running in the global zone, since we'll only
     * see processes in our own projects in a non-global zone.  In the
     * global zone, if the process belongs to a non-global zone, we only
     * need to track it for the capped non-global zone collection.  For
     * global zone processes, we first attempt to put the process into a
     * capped project collection.  On the second pass into this function
     * the projid will be cleared so we will just track the process for the
     * global zone collection as a whole.
     */
    if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
        colid.rcid_type = RCIDT_PROJECT;
        colid.rcid_val = psinfop->pr_projid;
    } else {
        /* try to add to zone collection */
        colid.rcid_type = RCIDT_ZONE;
        colid.rcid_val = psinfop->pr_zoneid;
    }

    /* the collection may be uncapped, in which case it is not tracked */
    if ((lcol = lcollection_find(&colid)) == NULL)
        return;

    /*
     * If the process is already being tracked, update the unscannable flag,
     * as determined by the caller, from the process's psinfo.
     */
    lproc = lcol->lcol_lprocess;
    while (lproc != NULL) {
        if (lproc->lpc_pid == pid) {
            lproc->lpc_mark = 1;
            if (unscannable != 0 && lproc->lpc_unscannable == 0) {
                debug("process %d: became unscannable\n",
                    (int)lproc->lpc_pid);
                lproc->lpc_unscannable = 1;
            }
            return;
        }
        lproc = lproc->lpc_next;
    }

    /*
     * We've fallen off the list without finding our current process;
     * insert it at the list head.
     */
    if ((lproc = malloc(sizeof (*lproc))) == NULL)
        debug("insufficient memory to track new process %d", (int)pid);
    else {
        (void) bzero(lproc, sizeof (*lproc));
        lproc->lpc_pid = pid;
        lproc->lpc_mark = 1;
        lproc->lpc_collection = lcol;
        /* descriptors are opened lazily; -1 means "not yet cached" */
        lproc->lpc_psinfo_fd = -1;
        lproc->lpc_pgdata_fd = -1;
        lproc->lpc_xmap_fd = -1;

        /*
         * If the caller didn't flag this process as unscannable
         * already, do some more checking.
         */
        lproc->lpc_unscannable = unscannable || proc_issystem(pid);

#ifdef DEBUG
        /*
         * Verify the sanity of lprocess.  It should not contain the
         * process we are about to prepend.
         */
        if (lcollection_member(lcol, lproc)) {
            lprocess_t *cur = lcol->lcol_lprocess;
            debug("The collection %lld already has these members, "
                "including me, %d!\n",
                (long long)lcol->lcol_id.rcid_val,
                (int)lproc->lpc_pid);
            while (cur != NULL) {
                debug("\t%d\n", (int)cur->lpc_pid);
                cur = cur->lpc_next;
            }
            info(gettext("process already on lprocess\n"));
            abort();
        }
#endif /* DEBUG */

        /* prepend to the collection's doubly-linked process list */
        lproc->lpc_next = lcol->lcol_lprocess;
        if (lproc->lpc_next != NULL)
            lproc->lpc_next->lpc_prev = lproc;
        lproc->lpc_prev = NULL;
        lcol->lcol_lprocess = lproc;

        debug("tracking %s %ld %d %s%s\n",
            (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
            (long)colid.rcid_val,
            (int)pid, psinfop->pr_psargs,
            (lproc->lpc_unscannable != 0) ?
            " (not scannable)" : "");
        lcol->lcol_stat.lcols_proc_in++;
    }
}

/*
 * Walk one collection's process list, invoking the callback (smuggled in
 * through the void * argument) on each member.  A nonzero callback return
 * aborts the walk.  The next pointer is saved before the callback runs, so
 * the callback may safely unlink/free the member it is given.
 */
static int
list_walk_process_cb(lcollection_t *lcol, void *arg)
{
    int (*cb)(lcollection_t *, lprocess_t *) =
        (int(*)(lcollection_t *, lprocess_t *))arg;
    lprocess_t *member;
    lprocess_t *next;

    member = lcol->lcol_lprocess;
    while (member != NULL) {
        pid_t pid = member->lpc_pid;

        next = member->lpc_next;
        debug_high("list_walk_all lpc %d\n", (int)pid);
        if (cb(lcol, member) != 0) {
            debug_high("list_walk_all aborted at lpc %d\n",
                (int)pid);
            return (1);
        }
        member = next;
    }

    return (0);
}

/*
 * Invoke the given callback for each process in each collection.  Callbacks
 * are allowed to change the linkage of the process on which they act.
 */
static void
list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
{
    list_walk_collection(list_walk_process_cb, (void *)cb);
}

/*
 * rfd revocation callback: invalidate the process's cached psinfo fd when
 * the descriptor is reclaimed by the rfd subsystem.
 */
static void
revoke_psinfo(rfd_t *rfd)
{
    lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;

    if (lpc != NULL) {
        debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
        ASSERT(lpc->lpc_psinfo_fd != -1);
        lpc->lpc_psinfo_fd = -1;
    } else
        debug("revoking psinfo fd for unknown process\n");
}

/*
 * Retrieve a process's psinfo via an already-opened or new file descriptor.
 * The supplied descriptor will be closed on failure.  An optional callback
 * will be invoked with the last descriptor tried, and a supplied callback
 * argument, as its arguments, such that the new descriptor may be cached, or
 * an old one may be invalidated.  If the result of the callback is zero, the
 * caller is to assume responsibility for the file descriptor, to close it
 * with rfd_close().
 *
 * On failure, a nonzero value is returned.
*/
int
get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
    int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
{
    int fd;
    int can_try_uncached;

    /* a cached fd without an update callback would leak on revalidation */
    ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
    do {
        if (cached_fd >= 0) {
            /* first try the cached descriptor; fall back on failure */
            fd = cached_fd;
            can_try_uncached = 1;
            debug_high("%d/psinfo, trying cached fd %d\n",
                (int)pid, fd);
        } else {
            char pathbuf[PROC_PATH_MAX];

            can_try_uncached = 0;
            (void) snprintf(pathbuf, sizeof (pathbuf),
                "/proc/%d/psinfo", (int)pid);
            if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
                revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
                debug("cannot open %s", pathbuf);
                break;
            } else
                debug_high("opened %s, fd %d\n", pathbuf, fd);
        }

        /*
         * Validate the read: a full psinfo whose pr_pid matches.  A
         * mismatched pid means the cached fd refers to a since-reused
         * /proc entry; close it and retry uncached once.
         */
        if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
            sizeof (*psinfo) && psinfo->pr_pid == pid)
            break;
        else {
            debug_high("closed fd %d\n", fd);
            if (rfd_close(fd) != 0)
                debug("could not close fd %d", fd);
            fd = cached_fd = -1;
        }
    } while (can_try_uncached == 1);

    /*
     * Offer the descriptor to the caller's cache; if declined (or no
     * callback), we retain responsibility and close it here.
     */
    if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
        if (fd >= 0) {
            debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
                "uncached" : "cached", fd);
            if (rfd_close(fd) != 0)
                debug("could not close fd %d", fd);
        }

    debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
        fd_update_cb != NULL ? "cached" : "uncached");
    return ((fd >= 0) ? 0 : -1);
}

/*
 * Retrieve the collection membership of all processes and update the psinfo of
 * those non-system, non-zombie ones in collections.  For global zone processes,
 * we first attempt to put the process into a capped project collection.  We
 * also want to track the process for the global zone collection as a whole.
 */
static void
proc_cb(const pid_t pid)
{
    psinfo_t psinfo;

    if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
        lprocess_insert_mark(&psinfo);
        if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
            /*
             * We also want to track this process for the global
             * zone as a whole so add it to the global zone
             * collection as well.
             */
            psinfo.pr_projid = -1;
            lprocess_insert_mark(&psinfo);
        }
    }
}

/*
 * Cache the process' psinfo fd, taking responsibility for freeing it.
 */
int
lprocess_update_psinfo_fd_cb(void *arg, int fd)
{
    lprocess_t *lpc = arg;

    lpc->lpc_psinfo_fd = fd;
    return (0);
}

/*
 * Get the system pagesize.
 */
static void
get_page_size(void)
{
    page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
    debug("physical page size: %luKB\n", page_size_kb);
}

/*
 * Log the elapsed time between two gethrtime() samples in the most readable
 * unit (ns, us, ms, or s).
 */
static void
tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
{
    hrtime_t diff = t2 - t1;

    if (diff < MILLISEC)
        debug("%s: %lld nanoseconds\n", msg, diff);
    else if (diff < MICROSEC)
        debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
    else if (diff < NANOSEC)
        debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
    else
        debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
}

/*
 * Get the zone's & project's RSS from the kernel.
 */
static void
rss_sample(boolean_t my_zone_only, uint_t col_types)
{
    size_t nres;
    size_t i;
    uint_t flags;
    hrtime_t t1, t2;

    if (my_zone_only) {
        flags = VMUSAGE_ZONE;
    } else {
        flags = 0;
        if (col_types & CAPPED_PROJECT)
            flags |= VMUSAGE_PROJECTS;
        /* only the global zone may sample all zones' usage */
        if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
            flags |= VMUSAGE_ALL_ZONES;
    }

    debug("vmusage sample flags 0x%x\n", flags);
    if (flags == 0)
        return;

again:
    /* try the current buffer to see if the list will fit */
    nres = vmu_vals_len;
    t1 = gethrtime();
    if (getvmusage(flags, my_zone_only ?
        0 : rcfg.rcfg_rss_sample_interval, vmu_vals, &nres) != 0) {
        /* EOVERFLOW means the buffer was too small; nres has the size */
        if (errno != EOVERFLOW) {
            warn(gettext("can't read RSS from kernel\n"));
            return;
        }
    }
    t2 = gethrtime();
    tm_fmt("getvmusage time", t1, t2);

    debug("kernel nres %lu\n", (ulong_t)nres);

    if (nres > vmu_vals_len) {
        /* array size is now too small, increase it and try again */
        free(vmu_vals);

        if ((vmu_vals = (vmusage_t *)calloc(nres,
            sizeof (vmusage_t))) == NULL) {
            warn(gettext("out of memory: could not read RSS from "
                "kernel\n"));
            vmu_vals_len = nvmu_vals = 0;
            return;
        }
        vmu_vals_len = nres;
        goto again;
    }

    nvmu_vals = nres;

    debug("vmusage_sample\n");
    for (i = 0; i < nvmu_vals; i++) {
        debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
            "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
            vmu_vals[i].vmu_type,
            (unsigned long long)vmu_vals[i].vmu_rss_all,
            (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
            (unsigned long long)vmu_vals[i].vmu_swap_all);
    }
}

/*
 * Refresh one collection's RSS and swap figures (in KB) from the most recent
 * kernel vmusage snapshot taken by rss_sample().  Resets both to zero if the
 * collection has no matching entry.
 */
static void
update_col_rss(lcollection_t *lcol)
{
    int i;

    lcol->lcol_rss = 0;
    lcol->lcol_image_size = 0;
    for (i = 0; i < nvmu_vals; i++) {
        if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
            continue;

        /* the id namespaces overlap, so the entry type must match too */
        if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
            lcol->lcol_id.rcid_type != RCIDT_ZONE)
            continue;

        if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
            lcol->lcol_id.rcid_type != RCIDT_PROJECT)
            continue;

        /* we found the right RSS entry, update the collection vals */
        lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
        lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
        break;
    }
}

/*
 * Sample the collection RSS, updating the collection's statistics with the
 * results.  Also, sum the rss of all capped projects & return true if
 * the collection is over cap.
*/ static int rss_sample_col_cb(lcollection_t *lcol, void *arg) { int64_t excess; uint64_t rss; sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; update_col_rss(lcol); lcol->lcol_stat.lcols_rss_sample++; rss = lcol->lcol_rss; excess = rss - lcol->lcol_rss_cap; if (excess > 0) { lcol->lcol_stat.lcols_rss_act_sum += rss; col_argp->sca_any_over_cap = B_TRUE; if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) col_argp->sca_project_over_cap = B_TRUE; } lcol->lcol_stat.lcols_rss_sum += rss; if (lcol->lcol_stat.lcols_min_rss > rss) lcol->lcol_stat.lcols_min_rss = rss; if (lcol->lcol_stat.lcols_max_rss < rss) lcol->lcol_stat.lcols_max_rss = rss; return (0); } /* * Determine if we have capped projects, capped zones or both. */ static int col_type_cb(lcollection_t *lcol, void *arg) { uint_t *col_type = (uint_t *)arg; /* skip uncapped collections */ if (lcol->lcol_rss_cap == 0) return (1); if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) *col_type |= CAPPED_PROJECT; else *col_type |= CAPPED_ZONE; /* once we know everything is capped, we can stop looking */ if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT)) return (1); return (0); } /* * Open /proc and walk entries. */ static void proc_walk_all(void (*cb)(const pid_t)) { DIR *pdir; struct dirent *dirent; pid_t pid; (void) rfd_reserve(1); if ((pdir = opendir("/proc")) == NULL) die(gettext("couldn't open /proc!")); while ((dirent = readdir(pdir)) != NULL) { if (strcmp(".", dirent->d_name) == 0 || strcmp("..", dirent->d_name) == 0) continue; pid = atoi(dirent->d_name); ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0); if (pid == rcapd_pid) continue; else cb(pid); } (void) closedir(pdir); } /* * Clear unmarked callback. 
*/
/*ARGSUSED*/
static int
sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
{
    if (lpc->lpc_mark) {
        /* seen during the last /proc walk; clear for the next one */
        lpc->lpc_mark = 0;
    } else {
        /* not seen in /proc any more; the process has exited */
        debug("process %d finished\n", (int)lpc->lpc_pid);
        lprocess_free(lpc);
    }

    return (0);
}

/*
 * Print, for debugging purposes, a collection's recently-sampled RSS and
 * excess.
 */
/*ARGSUSED*/
static int
excess_print_cb(lcollection_t *lcol, void *arg)
{
    int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;

    debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
        (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
        lcol->lcol_name,
        (unsigned long long)lcol->lcol_rss,
        (unsigned long long)lcol->lcol_rss_cap,
        (long long)excess);

    return (0);
}

/*
 * Scan those collections which have exceeded their caps.
 *
 * If we're running in the global zone it might have a cap.  We don't want to
 * do any capping for the global zone yet since we might get under the cap by
 * just capping the projects in the global zone.
 */
/*ARGSUSED*/
static int
scan_cb(lcollection_t *lcol, void *arg)
{
    int64_t excess;

    /* skip over global zone collection for now but keep track for later */
    if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
        lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
        gz_col = lcol;
        return (0);
    }

    if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
        scan(lcol, excess);
        lcol->lcol_stat.lcols_scan++;
    }

    return (0);
}

/*
 * Scan the global zone collection and see if it still exceeds its cap.
 * We take into account the effects of capping any global zone projects here.
 */
static void
scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
{
    int64_t excess;

    /*
     * If we had projects over their cap and the global zone was also over
     * its cap then we need to get the up-to-date global zone rss to
     * determine if we are still over the global zone cap.  We might have
     * gone under while we scanned the capped projects.  If there were no
     * projects over cap then we can use the rss value we already have for
     * the global zone.
     */
    excess = lcol->lcol_rss - lcol->lcol_rss_cap;
    if (project_over_cap && excess > 0) {
        rss_sample(B_TRUE, CAPPED_ZONE);
        update_col_rss(lcol);
        excess = lcol->lcol_rss - lcol->lcol_rss_cap;
    }

    if (excess > 0) {
        debug("global zone excess %lldKB\n", (long long)excess);
        scan(lcol, excess);
        lcol->lcol_stat.lcols_scan++;
    }
}

/*
 * Do a soft scan of those collections which have excesses.  A soft scan is one
 * in which the cap enforcement pressure is taken into account.  The difference
 * between the utilized physical memory and the cap enforcement pressure will
 * be scanned-for, and each collection will be scanned proportionally by their
 * present excesses.
 */
static int
soft_scan_cb(lcollection_t *lcol, void *a)
{
    int64_t excess;
    soft_scan_arg_t *arg = a;

    /* skip over global zone collection for now but keep track for later */
    if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
        lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
        gz_col = lcol;
        return (0);
    }

    if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
        /*
         * Scan proportionally to this collection's share of the
         * total excess.  Caller guarantees ssa_sum_excess > 0.
         */
        int64_t adjusted_excess =
            excess * arg->ssa_scan_goal / arg->ssa_sum_excess;

        debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
            "scanning %lld\n",
            (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
            "project" : "zone"),
            (long)lcol->lcol_id.rcid_val,
            (long long)excess, (long long)arg->ssa_scan_goal,
            (unsigned long long)arg->ssa_sum_excess,
            (long long)adjusted_excess);

        scan(lcol, adjusted_excess);
        lcol->lcol_stat.lcols_scan++;
    }

    return (0);
}

/*
 * Soft-scan the global zone collection, deferred until after all project
 * collections have been scanned (see soft_scan_cb()).
 */
static void
soft_scan_gz(lcollection_t *lcol, void *a)
{
    int64_t excess;
    soft_scan_arg_t *arg = a;

    /*
     * If we had projects over their cap and the global zone was also over
     * its cap then we need to get the up-to-date global zone rss to
     * determine if we are still over the global zone cap.  We might have
     * gone under while we scanned the capped projects.  If there were no
     * projects over cap then we can use the rss value we already have for
     * the global zone.
     */
    excess = lcol->lcol_rss - lcol->lcol_rss_cap;
    if (arg->ssa_project_over_cap && excess > 0) {
        rss_sample(B_TRUE, CAPPED_ZONE);
        update_col_rss(lcol);
        excess = lcol->lcol_rss - lcol->lcol_rss_cap;
    }

    if (excess > 0) {
        int64_t adjusted_excess =
            excess * arg->ssa_scan_goal / arg->ssa_sum_excess;

        debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
            "scanning %lld\n",
            (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
            "project" : "zone"),
            (long)lcol->lcol_id.rcid_val,
            (long long)excess, (long long)arg->ssa_scan_goal,
            (unsigned long long)arg->ssa_sum_excess,
            (long long)adjusted_excess);

        scan(lcol, adjusted_excess);
        lcol->lcol_stat.lcols_scan++;
    }
}

/*
 * When a scan could happen, but caps aren't enforced tick the
 * lcols_unenforced_cap counter.
 */
/*ARGSUSED*/
static int
unenforced_cap_cb(lcollection_t *lcol, void *arg)
{
    lcol->lcol_stat.lcols_unenforced_cap++;

    return (0);
}

/*
 * Update the count of physically installed memory.
 */
static void
update_phys_total(void)
{
    uint64_t old_phys_total;

    old_phys_total = phys_total;
    phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
    if (phys_total != old_phys_total)
        debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
            "" : " adjusted"), (unsigned long long)(phys_total / 1024));
}

/*
 * Unlink a process from its collection, updating relevant statistics, and
 * freeing its associated memory.
 */
void
lprocess_free(lprocess_t *lpc)
{
    pid_t pid;

    lpc->lpc_collection->lcol_stat.lcols_proc_out++;

    /* unlink from the collection's doubly-linked list */
    if (lpc->lpc_prev != NULL)
        lpc->lpc_prev->lpc_next = lpc->lpc_next;
    if (lpc->lpc_next != NULL)
        lpc->lpc_next->lpc_prev = lpc->lpc_prev;
    if (lpc->lpc_collection->lcol_lprocess == lpc)
        lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
            lpc ? lpc->lpc_next : NULL);
    lpc->lpc_next = lpc->lpc_prev = NULL;

    if (lpc->lpc_prpageheader != NULL)
        free(lpc->lpc_prpageheader);
    if (lpc->lpc_xmap != NULL)
        free(lpc->lpc_xmap);
    /* release any cached /proc descriptors */
    if (lpc->lpc_psinfo_fd >= 0) {
        if (rfd_close(lpc->lpc_psinfo_fd) != 0)
            debug("could not close %d lpc_psinfo_fd %d",
                (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
        lpc->lpc_psinfo_fd = -1;
    }
    if (lpc->lpc_pgdata_fd >= 0) {
        if (rfd_close(lpc->lpc_pgdata_fd) != 0)
            debug("could not close %d lpc_pgdata_fd %d",
                (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
        lpc->lpc_pgdata_fd = -1;
    }
    if (lpc->lpc_xmap_fd >= 0) {
        if (rfd_close(lpc->lpc_xmap_fd) != 0)
            debug("could not close %d lpc_xmap_fd %d",
                (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
        lpc->lpc_xmap_fd = -1;
    }
    if (lpc->lpc_ignore != NULL)
        lmapping_free(&lpc->lpc_ignore);
    pid = lpc->lpc_pid;
    free(lpc);
    debug_high("process %d freed\n", (int)pid);
}

/*
 * Collection clear callback.
 */
/*ARGSUSED*/
static int
collection_clear_cb(lcollection_t *lcol, void *arg)
{
    lcol->lcol_mark = 0;

    return (0);
}

/*
 * Respond to a terminating signal by setting a termination flag.
 */
/*ARGSUSED*/
static void
terminate_signal(int signal)
{
    /* remember only the first terminating signal received */
    if (termination_signal == 0)
        termination_signal = signal;
    should_run = 0;
}

/*
 * Handle any synchronous or asynchronous signals that would ordinarily cause a
 * process to abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
    /*
     * Allow the scanner to make a last-ditch effort to resume any stopped
     * processes.
     */
    scan_abort();
    abort();
}

/*
 * Clean up collections which have been removed due to configuration.  Unlink
 * the collection from lcollection and free it.
 */
/*ARGSUSED*/
static int
collection_sweep_cb(lcollection_t *lcol, void *arg)
{
    if (lcol->lcol_mark == 0) {
        debug("freeing %s %s\n",
            (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
            "project" : "zone"), lcol->lcol_name);
        lcollection_free(lcol);
    }

    return (0);
}

/*
 * Set those variables which depend on the global configuration.
*/
static void
finish_configuration(void)
{
    /*
     * Warn that any lnode (or non-project) mode specification (by an SRM
     * 1.3 configuration file, for example) is ignored.
     */
    if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
        warn(gettext("%s mode specification ignored -- using project"
            " mode\n"), rcfg.rcfg_mode_name);
        rcfg.rcfg_mode_name = "project";
        rcfg.rcfg_mode = rctype_project;
    }
}

/*
 * Cause the configuration to be reread and applied.
 */
static void
reread_configuration(void)
{
    rcfg_t rcfg_new;

    if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
        warn(gettext("can't reread configuration \n"));
        exit(SMF_EXIT_ERR_CONFIG);
    } else {
        /*
         * Done reading configuration.  Remove existing
         * collections in case there is a change in collection type.
         */
        if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
            list_walk_collection(collection_clear_cb, NULL);
            list_walk_collection(collection_sweep_cb, NULL);
        }

        /*
         * Make the newly-read configuration the global one, and update
         * any variables that depend on it.
         */
        rcfg = rcfg_new;
        finish_configuration();
    }
}

/*
 * First, examine changes, additions, and deletions to cap definitions.
 * Then, set the next event time.
 */
static void
reconfigure(hrtime_t now, hrtime_t *next_configuration,
    hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
{
    debug("reconfigure...\n");

    /*
     * Walk the lcollection, marking active collections so inactive ones
     * can be freed.
     */
    list_walk_collection(collection_clear_cb, NULL);
    lcollection_update(LCU_ACTIVE_ONLY); /* mark */
    list_walk_collection(collection_sweep_cb, NULL);

    *next_configuration = NEXT_EVENT_TIME(now,
        rcfg.rcfg_reconfiguration_interval);

    /*
     * Reset each event time to the shorter of the previous and new
     * intervals.
     */
    if (next_report == 0 && rcfg.rcfg_report_interval > 0)
        next_report = now;
    else
        next_report = POSITIVE_MIN(next_report,
            NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));

    if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
        *next_proc_walk = now;
    else
        *next_proc_walk = POSITIVE_MIN(*next_proc_walk,
            NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));

    if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
        *next_rss_sample = now;
    else
        *next_rss_sample = POSITIVE_MIN(*next_rss_sample,
            NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
}

/*
 * Respond to SIGHUP by triggering the rereading the configuration and cap
 * definitions.
 */
/*ARGSUSED*/
static void
sighup(int signal)
{
    should_reconfigure = 1;
}

/*
 * Print, for debugging purposes, each collection's interval statistics.
 */
/*ARGSUSED*/
static int
simple_report_collection_cb(lcollection_t *lcol, void *arg)
{
/* the change in a statistic since the last report */
#define DELTA(field) \
    (unsigned long long)( \
        (lcol->lcol_stat.field - lcol->lcol_stat_old.field))

    debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
        "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
        "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
        "%llu scans over %llu ms\n",
        (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
        lcol->lcol_name,
        DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
        DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
        DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
        (unsigned long long)lcol->lcol_stat.lcols_min_rss,
        (unsigned long long)lcol->lcol_stat.lcols_max_rss,
        (unsigned long long)lcol->lcol_rss_cap,
        (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
        lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
        DELTA(lcols_scan_count),
        DELTA(lcols_scan_time_complete) / (NANOSEC / MILLISEC));

#undef DELTA

    return (0);
}

/*
 * Record each collection's interval statistics in the statistics file.
*/ static int report_collection_cb(lcollection_t *lcol, void *arg) { lcollection_report_t dc; int fd = (intptr_t)arg; /* * Copy the relevant fields to the collection's record. */ bzero(&dc, sizeof (dc)); dc.lcol_id = lcol->lcol_id; (void) strcpy(dc.lcol_name, lcol->lcol_name); dc.lcol_rss = lcol->lcol_rss; dc.lcol_image_size = lcol->lcol_image_size; dc.lcol_rss_cap = lcol->lcol_rss_cap; dc.lcol_stat = lcol->lcol_stat; if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { lcol->lcol_stat_old = lcol->lcol_stat; } else { debug("can't write %s %s statistics", (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), lcol->lcol_name); } return (0); } /* * Determine the count of pages scanned by the global page scanner, obtained * from the cpu_stat:*::scan kstats. Return zero on success. */ static int get_globally_scanned_pages(uint64_t *scannedp) { kstat_t *ksp; uint64_t scanned = 0; if (kstat_chain_update(kctl) == -1) { warn(gettext("can't update kstat chain")); return (0); } for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) { if (strcmp(ksp->ks_module, "cpu_stat") == 0) { if (kstat_read(kctl, ksp, NULL) != -1) { scanned += ((cpu_stat_t *) ksp->ks_data)->cpu_vminfo.scan; } else { return (-1); } } } *scannedp = scanned; return (0); } /* * Determine if the global page scanner is running, during which no memory * caps should be enforced, to prevent interference with the global page * scanner. 
*/
static boolean_t
is_global_scanner_running()
{
    /* measure delta in page scan count */
    static uint64_t new_sp = 0;
    static uint64_t old_sp = 0;
    boolean_t res = B_FALSE;

    if (get_globally_scanned_pages(&new_sp) == 0) {
        /* old_sp == 0 means this is the first sample; no delta yet */
        if (old_sp != 0 && (new_sp - old_sp) > 0) {
            debug("global memory pressure detected (%llu "
                "pages scanned since last interval)\n",
                (unsigned long long)(new_sp - old_sp));
            res = B_TRUE;
        }
        old_sp = new_sp;
    } else {
        warn(gettext("unable to read cpu statistics"));
        new_sp = old_sp;
    }

    return (res);
}

/*
 * If soft caps are in use, determine if global memory pressure exceeds the
 * configured maximum above which soft caps are enforced.
 */
static boolean_t
must_enforce_soft_caps()
{
    /*
     * Check for changes to the amount of installed physical memory, to
     * compute the current memory pressure.
     */
    update_phys_total();

    /* pressure = percentage of physical memory in use */
    memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) *
        page_size_kb) * 100.0 / phys_total);
    memory_pressure_sample++;
    if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
        memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
        return (B_TRUE);
    }

    return (B_FALSE);
}

/*
 * Update the shared statistics file with each collection's current statistics.
 * Return zero on success.  The new contents are written to a temporary file
 * which then atomically replaces the old statistics file via rename().
 */
static int
update_statistics(void)
{
    int fd, res;
    static char template[LINELEN];

    /*
     * Try to create a directory irrespective of whether it is existing
     * or not.  If it is not there then it will create.  Otherwise any way
     * it will fail at mkstemp call below.
     */
    (void) mkdir(STAT_FILE_DIR, 0755);

    /*
     * Create a temporary file.
     */
    if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
        strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
        debug("temporary file template size too small\n");
        return (-1);
    }
    (void) strcpy(template, rcfg.rcfg_stat_file);
    (void) strcat(template, STAT_TEMPLATE_SUFFIX);
    (void) rfd_reserve(1);
    fd = mkstemp(template);

    /*
     * Write the header and per-collection statistics.
     */
    if (fd >= 0) {
        rcapd_stat_hdr_t rs;

        rs.rs_pid = rcapd_pid;
        rs.rs_time = gethrtime();
        ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
        (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
        rs.rs_pressure_cur = memory_pressure;
        rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
        rs.rs_pressure_sample = memory_pressure_sample;

        if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
            sizeof (rs)) {
            list_walk_collection(report_collection_cb,
                (void *)(intptr_t)fd);
            /*
             * Replace the existing statistics file with this new
             * one.
             */
            res = rename(template, rcfg.rcfg_stat_file);
        } else
            res = -1;
        (void) close(fd);
    } else
        res = -1;

    return (res);
}

/*
 * Verify the statistics file can be created and written to, and die if an
 * existing file may be in use by another rcapd.
 */
static int
verify_statistics(void)
{
    pid_t pid;

    /*
     * Warn if another instance of rcapd might be active.
     */
    (void) rfd_reserve(1);
    pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
    if (pid != rcapd_pid && pid != -1)
        die(gettext("%s exists; rcapd may already be active\n"),
            rcfg.rcfg_stat_file);

    return (update_statistics());
}

/*
 * Accumulate (into the uint64_t pointed to by arg) each collection's excess
 * over its cap, in kilobytes; under-cap collections contribute zero.
 */
static int
sum_excess_cb(lcollection_t *lcol, void *arg)
{
    uint64_t *sum_excess = arg;

    *sum_excess += MAX((int64_t)0,
        (int64_t)(lcol->lcol_rss - lcol->lcol_rss_cap));
    return (0);
}

/*
 * Compute the quantity of memory (in kilobytes) above the cap enforcement
 * pressure.  Set the scan goal to that quantity (or at most the excess).
 */
static void
compute_soft_scan_goal(soft_scan_arg_t *argp)
{
    /*
     * Compute the sum of the collections' excesses, which will be the
     * denominator.
     */
    argp->ssa_sum_excess = 0;
    list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));

    argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
        (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
        sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
        argp->ssa_sum_excess);
}

static void
rcapd_usage(void)
{
    info(gettext("usage: rcapd [-d]\n"));
}

/*
 * If the report interval has elapsed, log interval statistics and rewrite the
 * shared statistics file, then schedule the next report.
 */
void
check_update_statistics(void)
{
    hrtime_t now = gethrtime();

    if (EVENT_TIME(now, next_report)) {
        debug("updating statistics...\n");
        list_walk_collection(simple_report_collection_cb, NULL);
        if (update_statistics() != 0)
            debug("couldn't update statistics");
        next_report = NEXT_REPORT_EVENT_TIME(now,
            rcfg.rcfg_report_interval);
    }
}

static void
verify_and_set_privileges(void)
{
    priv_set_t *required =
        priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);

    /*
     * Ensure the required privileges, suitable for controlling processes,
     * are possessed.
     */
    if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
        PRIV_SET, PRIV_EFFECTIVE, required) != 0)
        die(gettext("can't set requisite privileges"));

    /*
     * Ensure access to /var/run/daemon.
     */
    if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
        die(gettext("cannot become user daemon"));

    priv_freeset(required);
}

/*
 * This function does the top-level work to determine if we should do any
 * memory capping, and if so, it invokes the right call-backs to do the work.
 */
static void
do_capping(hrtime_t now, hrtime_t *next_proc_walk)
{
    boolean_t enforce_caps;
    /* soft cap enforcement flag, depending on memory pressure */
    boolean_t enforce_soft_caps;
    /* avoid interference with kernel's page scanner */
    boolean_t global_scanner_running;
    sample_col_arg_t col_arg;
    soft_scan_arg_t arg;
    uint_t col_types = 0;

    /* check what kind of collections (project/zone) are capped */
    list_walk_collection(col_type_cb, &col_types);
    debug("collection types: 0x%x\n", col_types);

    /* no capped collections, skip checking rss */
    if (col_types == 0)
        return;

    /* Determine if soft caps are enforced.
*/ enforce_soft_caps = must_enforce_soft_caps(); /* Determine if the global page scanner is running. */ global_scanner_running = is_global_scanner_running(); /* * Sample collections' member processes RSSes and recompute * collections' excess. */ rss_sample(B_FALSE, col_types); col_arg.sca_any_over_cap = B_FALSE; col_arg.sca_project_over_cap = B_FALSE; list_walk_collection(rss_sample_col_cb, &col_arg); list_walk_collection(excess_print_cb, NULL); debug("any collection/project over cap = %d, %d\n", col_arg.sca_any_over_cap, col_arg.sca_project_over_cap); if (enforce_soft_caps) debug("memory pressure %d%%\n", memory_pressure); /* * Cap enforcement is determined by the previous conditions. */ enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap && (rcfg.rcfg_memory_cap_enforcement_pressure == 0 || enforce_soft_caps); debug("%senforcing caps\n", enforce_caps ? "" : "not "); /* * If soft caps are in use, determine the size of the portion from each * collection to scan for. */ if (enforce_caps && enforce_soft_caps) compute_soft_scan_goal(&arg); /* * Victimize offending collections. */ if (enforce_caps && (!enforce_soft_caps || (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) { /* * Since at least one collection is over its cap & needs * enforcing, check if it is at least time for a process walk * (we could be well past time since we only walk /proc when * we need to) and if so, update each collections process list * in a single pass through /proc. 
*/ if (EVENT_TIME(now, *next_proc_walk)) { debug("scanning process list...\n"); proc_walk_all(proc_cb); /* insert & mark */ list_walk_all(sweep_process_cb); /* free dead procs */ *next_proc_walk = NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval); } gz_col = NULL; if (enforce_soft_caps) { debug("scan goal is %lldKB\n", (long long)arg.ssa_scan_goal); list_walk_collection(soft_scan_cb, &arg); if (gz_capped && gz_col != NULL) { /* process global zone */ arg.ssa_project_over_cap = col_arg.sca_project_over_cap; soft_scan_gz(gz_col, &arg); } } else { list_walk_collection(scan_cb, NULL); if (gz_capped && gz_col != NULL) { /* process global zone */ scan_gz(gz_col, col_arg.sca_project_over_cap); } } } else if (col_arg.sca_any_over_cap) { list_walk_collection(unenforced_cap_cb, NULL); } } int main(int argc, char *argv[]) { int res; int should_fork = 1; /* fork flag */ hrtime_t now; /* current time */ hrtime_t next; /* time of next event */ int sig; /* signal iteration */ struct rlimit rl; hrtime_t next_proc_walk; /* time of next /proc scan */ hrtime_t next_configuration; /* time of next configuration */ hrtime_t next_rss_sample; /* (latest) time of next RSS sample */ (void) set_message_priority(RCM_INFO); (void) setprogname("rcapd"); rcapd_pid = getpid(); (void) chdir("/"); should_run = 1; ever_ran = 0; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); /* * Parse command-line options. */ while ((res = getopt(argc, argv, "dF")) > 0) switch (res) { case 'd': should_fork = 0; if (debug_mode == 0) { debug_mode = 1; (void) set_message_priority(RCM_DEBUG); } else (void) set_message_priority(RCM_DEBUG_HIGH); break; case 'F': should_fork = 0; break; default: rcapd_usage(); return (E_USAGE); /*NOTREACHED*/ } /* * Read the configuration. 
*/ if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) { warn(gettext("resource caps not configured\n")); return (SMF_EXIT_ERR_CONFIG); } /* * If not debugging, fork and continue operating, changing the * destination of messages to syslog(). */ if (should_fork == 1) { pid_t child; debug("forking\n"); child = fork(); if (child == -1) die(gettext("cannot fork")); if (child > 0) return (0); else { rcapd_pid = getpid(); (void) set_message_destination(RCD_SYSLOG); (void) fclose(stdin); (void) fclose(stdout); (void) fclose(stderr); } /* * Start a new session and detatch from the controlling tty. */ if (setsid() == (pid_t)-1) debug(gettext("setsid() failed; cannot detach from " "terminal")); } finish_configuration(); should_reconfigure = 0; /* * Check that required privileges are possessed. */ verify_and_set_privileges(); now = next_report = next_proc_walk = next_rss_sample = gethrtime(); next_configuration = NEXT_EVENT_TIME(gethrtime(), rcfg.rcfg_reconfiguration_interval); /* * Open the kstat chain. */ kctl = kstat_open(); if (kctl == NULL) die(gettext("can't open kstats")); /* * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can * be effectively managed without revoking descriptors (at 3 per * process). */ rl.rlim_cur = 32 * 1024; rl.rlim_max = 32 * 1024; if (setrlimit(RLIMIT_NOFILE, &rl) != 0 && getrlimit(RLIMIT_NOFILE, &rl) == 0) { rl.rlim_cur = rl.rlim_max; (void) setrlimit(RLIMIT_NOFILE, &rl); } (void) enable_extended_FILE_stdio(-1, -1); if (getrlimit(RLIMIT_NOFILE, &rl) == 0) debug("fd limit: %lu\n", rl.rlim_cur); else debug("fd limit: unknown\n"); get_page_size(); my_zoneid = getzoneid(); /* * Handle those signals whose (default) exit disposition * prevents rcapd from finishing scanning before terminating. 
*/ (void) sigset(SIGINT, terminate_signal); (void) sigset(SIGQUIT, abort_signal); (void) sigset(SIGILL, abort_signal); (void) sigset(SIGEMT, abort_signal); (void) sigset(SIGFPE, abort_signal); (void) sigset(SIGBUS, abort_signal); (void) sigset(SIGSEGV, abort_signal); (void) sigset(SIGSYS, abort_signal); (void) sigset(SIGPIPE, terminate_signal); (void) sigset(SIGALRM, terminate_signal); (void) sigset(SIGTERM, terminate_signal); (void) sigset(SIGUSR1, terminate_signal); (void) sigset(SIGUSR2, terminate_signal); (void) sigset(SIGPOLL, terminate_signal); (void) sigset(SIGVTALRM, terminate_signal); (void) sigset(SIGXCPU, abort_signal); (void) sigset(SIGXFSZ, abort_signal); for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++) (void) sigset(sig, terminate_signal); /* * Install a signal handler for reconfiguration processing. */ (void) sigset(SIGHUP, sighup); /* * Determine which process collections to cap. */ lcollection_update(LCU_COMPLETE); /* * Loop forever, monitoring collections' resident set sizes and * enforcing their caps. Look for changes in caps as well as * responding to requests to reread the configuration. Update * per-collection statistics periodically. */ while (should_run != 0) { struct timespec ts; /* * Announce that rcapd is starting. */ if (ever_ran == 0) { info(gettext("starting\n")); ever_ran = 1; } /* * Check the configuration at every next_configuration interval. * Update the rss data once every next_rss_sample interval. * The condition of global memory pressure is also checked at * the same frequency, if strict caps are in use. */ now = gethrtime(); /* * Detect configuration and cap changes only when SIGHUP * is received. Call reconfigure to apply new configuration * parameters. 
*/ if (should_reconfigure == 1) { reread_configuration(); should_reconfigure = 0; reconfigure(now, &next_configuration, &next_proc_walk, &next_rss_sample); } if (EVENT_TIME(now, next_configuration)) { reconfigure(now, &next_configuration, &next_proc_walk, &next_rss_sample); } /* * Do the main work for enforcing caps. */ if (EVENT_TIME(now, next_rss_sample)) { do_capping(now, &next_proc_walk); next_rss_sample = NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval); } /* * Update the statistics file, if it's time. */ check_update_statistics(); /* * Sleep for some time before repeating. */ now = gethrtime(); next = next_configuration; next = POSITIVE_MIN(next, next_report); next = POSITIVE_MIN(next, next_rss_sample); if (next > now && should_run != 0) { debug("sleeping %-4.2f seconds\n", (float)(next - now) / (float)NANOSEC); hrt2ts(next - now, &ts); (void) nanosleep(&ts, NULL); } } if (termination_signal != 0) debug("exiting due to signal %d\n", termination_signal); if (ever_ran != 0) info(gettext("exiting\n")); /* * Unlink the statistics file before exiting. */ if (rcfg.rcfg_stat_file[0] != 0) (void) unlink(rcfg.rcfg_stat_file); return (E_SUCCESS); }