xref: /titanic_41/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 6f45ec7b0b964c3be967c4880e8867ac1e7763a5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * rcapd is a long-running daemon enforcing project-based resource caps (see
30  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
31  * "collection") may have a memory cap.  A single thread monitors the resource
32  * utilization of capped collections, enforces caps when they are exceeded (and
33  * other conditions are met), and incorporates changes in configuration or
34  * caps.  Each of these actions occurs not more frequently than the rate
35  * specified with rcapadm(1M).
36  */
37 
38 #include <sys/priocntl.h>
39 #include <sys/proc.h>
40 #include <sys/resource.h>
41 #include <sys/sysinfo.h>
42 #include <sys/stat.h>
43 #include <sys/sysmacros.h>
44 #include <sys/time.h>
45 #include <sys/types.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <kstat.h>
50 #include <libintl.h>
51 #include <limits.h>
52 #include <locale.h>
53 #include <priv.h>
54 #include <signal.h>
55 #include <stdarg.h>
56 #include <stdio.h>
57 #include <stdio_ext.h>
58 #include <stdlib.h>
59 #include <strings.h>
60 #include <time.h>
61 #include <unistd.h>
62 #include <zone.h>
63 #include <assert.h>
64 #include <sys/vm_usage.h>
65 #include "rcapd.h"
66 #include "rcapd_mapping.h"
67 #include "rcapd_rfd.h"
68 #include "rcapd_stat.h"
69 #include "utils.h"
70 
71 #define	POSITIVE_MIN(x, y) \
72 	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))
73 #define	NEXT_EVENT_TIME(base, seconds) \
74 	(((int)seconds > 0) ? (base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \
75 	: (hrtime_t)0)
76 #define	NEXT_REPORT_EVENT_TIME(base, seconds) \
77 	((rcfg.rcfg_stat_file[0] != 0) ?  \
78 	    NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)
79 #define	EVENT_TIME(time, eventtime) \
80 	(((time) > (eventtime)) && (eventtime) != 0)
81 #define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
82 #define	DAEMON_UID		1		/* uid to use */
83 
84 #define	CAPPED_PROJECT	0x01
85 #define	CAPPED_ZONE	0x02
86 
87 typedef struct soft_scan_arg {
88 	uint64_t ssa_sum_excess;
89 	int64_t ssa_scan_goal;
90 	boolean_t ssa_project_over_cap;
91 } soft_scan_arg_t;
92 
93 typedef struct sample_col_arg {
94 	boolean_t sca_any_over_cap;
95 	boolean_t sca_project_over_cap;
96 } sample_col_arg_t;
97 
98 
99 static int debug_mode = 0;		/* debug mode flag */
100 static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
101 					/* scanned */
102 static kstat_ctl_t *kctl;		/* kstat chain */
103 static int memory_pressure = 0;		/* physical memory utilization (%) */
104 static int memory_pressure_sample = 0;	/* count of samples */
105 static long page_size_kb = 0;		/* system page size in KB */
106 static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
107 static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
108 static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
109 static hrtime_t next_report;		/* time of next report */
110 static int termination_signal = 0;	/* terminating signal */
111 static zoneid_t my_zoneid = (zoneid_t)-1;
112 static lcollection_t *gz_col;		/* global zone collection */
113 
114 rcfg_t rcfg;
115 /*
116  * Updated when we re-read the collection configurations if this rcapd instance
117  * is running in the global zone and the global zone is capped.
118  */
119 boolean_t gz_capped = B_FALSE;
120 
121 /*
122  * Flags.
123  */
124 static int ever_ran;
125 int should_run;
126 static int should_reconfigure;
127 
128 static int verify_statistics(void);
129 static int update_statistics(void);
130 
131 /*
132  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
133  */
134 static boolean_t
135 proc_issystem(pid_t pid)
136 {
137 	char pc_clname[PC_CLNMSZ];
138 
139 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
140 	    PC_KY_NULL) != -1) {
141 		return (strcmp(pc_clname, "SYS") == 0);
142 	} else {
143 		debug("cannot get class-specific scheduling parameters; "
144 		    "assuming system process\n");
145 		return (B_TRUE);
146 	}
147 }
148 
/*
 * Track the process described by psinfop in the collection it belongs to,
 * marking it as seen for this /proc walk.  If the process is already tracked,
 * only its mark and unscannable flag are refreshed; otherwise a new lprocess_t
 * is allocated and prepended to the collection's process list.
 */
static void
lprocess_insert_mark(psinfo_t *psinfop)
{
	pid_t pid = psinfop->pr_pid;
	/* flag indicating whether the process should be scanned. */
	int unscannable = psinfop->pr_nlwp == 0;
	rcid_t colid;
	lcollection_t *lcol;
	lprocess_t *lproc;

	/*
	 * Determine which collection to put this process into.  We only have
	 * to worry about tracking both zone and project capped processes if
	 * this rcapd instance is running in the global zone, since we'll only
	 * see processes in our own projects in a non-global zone.  In the
	 * global zone, if the process belongs to a non-global zone, we only
	 * need to track it for the capped non-global zone collection.  For
	 * global zone processes, we first attempt to put the process into a
	 * capped project collection.  On the second pass into this function
	 * the projid will be cleared so we will just track the process for the
	 * global zone collection as a whole.
	 */
	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
		colid.rcid_type = RCIDT_PROJECT;
		colid.rcid_val = psinfop->pr_projid;
	} else {
		/* try to add to zone collection */
		colid.rcid_type = RCIDT_ZONE;
		colid.rcid_val = psinfop->pr_zoneid;
	}

	/* uncapped collections are not tracked; nothing to do */
	if ((lcol = lcollection_find(&colid)) == NULL)
		return;

	/*
	 * If the process is already being tracked, update the unscannable flag,
	 * as determined by the caller, from the process's psinfo.
	 */
	lproc = lcol->lcol_lprocess;
	while (lproc != NULL) {
		if (lproc->lpc_pid == pid) {
			lproc->lpc_mark = 1;
			/* only transition scannable -> unscannable, never back */
			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
				debug("process %d: became unscannable\n",
				    (int)lproc->lpc_pid);
				lproc->lpc_unscannable = 1;
			}
			return;
		}
		lproc = lproc->lpc_next;
	}

	/*
	 * We've fallen off the list without finding our current process;
	 * insert it at the list head.
	 */
	if ((lproc = malloc(sizeof (*lproc))) == NULL)
		debug("insufficient memory to track new process %d", (int)pid);
	else {
		(void) bzero(lproc, sizeof (*lproc));
		lproc->lpc_pid = pid;
		lproc->lpc_mark = 1;
		lproc->lpc_collection = lcol;
		/* fds are lazily opened; -1 means "not open" */
		lproc->lpc_psinfo_fd = -1;
		lproc->lpc_pgdata_fd = -1;
		lproc->lpc_xmap_fd = -1;

		/*
		 * If the caller didn't flag this process as unscannable
		 * already, do some more checking.
		 */
		lproc->lpc_unscannable = unscannable || proc_issystem(pid);

#ifdef DEBUG
		/*
		 * Verify the sanity of lprocess.  It should not contain the
		 * process we are about to prepend.
		 */
		if (lcollection_member(lcol, lproc)) {
			lprocess_t *cur = lcol->lcol_lprocess;
			debug("The collection %lld already has these members, "
			    "including me, %d!\n",
			    (long long)lcol->lcol_id.rcid_val,
			    (int)lproc->lpc_pid);
			while (cur != NULL) {
				debug("\t%d\n", (int)cur->lpc_pid);
				cur = cur->lpc_next;
			}
			info(gettext("process already on lprocess\n"));
			abort();
		}
#endif /* DEBUG */
		/* prepend to the collection's doubly-linked process list */
		lproc->lpc_next = lcol->lcol_lprocess;
		if (lproc->lpc_next != NULL)
			lproc->lpc_next->lpc_prev = lproc;
		lproc->lpc_prev = NULL;
		lcol->lcol_lprocess = lproc;

		debug("tracking %s %ld %d %s%s\n",
		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
		    (long)colid.rcid_val,
		    (int)pid, psinfop->pr_psargs,
		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
		lcol->lcol_stat.lcols_proc_in++;
	}
}
255 
256 static int
257 list_walk_process_cb(lcollection_t *lcol, void *arg)
258 {
259 	int (*cb)(lcollection_t *, lprocess_t *) =
260 	    (int(*)(lcollection_t *, lprocess_t *))arg;
261 	lprocess_t *member;
262 	lprocess_t *next;
263 
264 	member = lcol->lcol_lprocess;
265 	while (member != NULL) {
266 		pid_t pid = member->lpc_pid;
267 		next = member->lpc_next;
268 
269 		debug_high("list_walk_all lpc %d\n", (int)pid);
270 		if (cb(lcol, member) != 0) {
271 			debug_high("list_walk_all aborted at lpc %d\n",
272 			    (int)pid);
273 			return (1);
274 		}
275 		member = next;
276 	}
277 
278 	return (0);
279 }
280 
281 /*
282  * Invoke the given callback for each process in each collection.  Callbacks
283  * are allowed to change the linkage of the process on which they act.
284  */
285 static void
286 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
287 {
288 	list_walk_collection(list_walk_process_cb, (void *)cb);
289 }
290 
291 static void
292 revoke_psinfo(rfd_t *rfd)
293 {
294 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
295 
296 	if (lpc != NULL) {
297 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
298 		ASSERT(lpc->lpc_psinfo_fd != -1);
299 		lpc->lpc_psinfo_fd = -1;
300 	} else
301 		debug("revoking psinfo fd for unknown process\n");
302 }
303 
304 /*
305  * Retrieve a process's psinfo via an already-opened or new file descriptor.
306  * The supplied descriptor will be closed on failure.  An optional callback
307  * will be invoked with the last descriptor tried, and a supplied callback
308  * argument, as its arguments, such that the new descriptor may be cached, or
309  * an old one may be invalidated.  If the result of the callback is zero, the
310  * the caller is to assume responsibility for the file descriptor, to close it
311  * with rfd_close().
312  *
313  * On failure, a nonzero value is returned.
314  */
315 int
316 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
317     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
318 {
319 	int fd;
320 	int can_try_uncached;
321 
322 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
323 
324 	do {
325 		if (cached_fd >= 0) {
326 			fd = cached_fd;
327 			can_try_uncached = 1;
328 			debug_high("%d/psinfo, trying cached fd %d\n",
329 			    (int)pid, fd);
330 		} else {
331 			char pathbuf[PROC_PATH_MAX];
332 
333 			can_try_uncached = 0;
334 			(void) snprintf(pathbuf, sizeof (pathbuf),
335 			    "/proc/%d/psinfo", (int)pid);
336 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
337 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
338 				debug("cannot open %s", pathbuf);
339 				break;
340 			} else
341 				debug_high("opened %s, fd %d\n", pathbuf, fd);
342 		}
343 
344 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
345 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
346 			break;
347 		else {
348 			debug_high("closed fd %d\n", fd);
349 			if (rfd_close(fd) != 0)
350 				debug("could not close fd %d", fd);
351 			fd = cached_fd = -1;
352 		}
353 	} while (can_try_uncached == 1);
354 
355 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
356 		if (fd >= 0) {
357 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
358 			    "uncached" : "cached", fd);
359 			if (rfd_close(fd) != 0)
360 				debug("could not close fd %d", fd);
361 		}
362 
363 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
364 	    fd_update_cb != NULL ? "cached" : "uncached");
365 	return ((fd >= 0) ? 0 : -1);
366 }
367 
368 /*
369  * Retrieve the collection membership of all processes and update the psinfo of
370  * those non-system, non-zombie ones in collections.  For global zone processes,
371  * we first attempt to put the process into a capped project collection.  We
372  * also want to track the process for the global zone collection as a whole.
373  */
374 static void
375 proc_cb(const pid_t pid)
376 {
377 	psinfo_t psinfo;
378 
379 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
380 		lprocess_insert_mark(&psinfo);
381 		if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
382 			/*
383 			 * We also want to track this process for the global
384 			 * zone as a whole so add it to the global zone
385 			 * collection as well.
386 			 */
387 			psinfo.pr_projid = -1;
388 			lprocess_insert_mark(&psinfo);
389 		}
390 	}
391 }
392 
393 /*
394  * Cache the process' psinfo fd, taking responsibility for freeing it.
395  */
396 int
397 lprocess_update_psinfo_fd_cb(void *arg, int fd)
398 {
399 	lprocess_t *lpc = arg;
400 
401 	lpc->lpc_psinfo_fd = fd;
402 	return (0);
403 }
404 
405 /*
406  * Get the system pagesize.
407  */
408 static void
409 get_page_size(void)
410 {
411 	page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
412 	debug("physical page size: %luKB\n", page_size_kb);
413 }
414 
415 static void
416 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
417 {
418 	hrtime_t diff = t2 - t1;
419 
420 	if (diff < MILLISEC)
421 		debug("%s: %lld nanoseconds\n", msg, diff);
422 	else if (diff < MICROSEC)
423 		debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
424 	else if (diff < NANOSEC)
425 		debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
426 	else
427 		debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
428 }
429 
430 /*
431  * Get the zone's & project's RSS from the kernel.
432  */
433 static void
434 rss_sample(boolean_t my_zone_only, uint_t col_types)
435 {
436 	size_t nres;
437 	size_t i;
438 	uint_t flags;
439 	hrtime_t t1, t2;
440 
441 	if (my_zone_only) {
442 		flags = VMUSAGE_ZONE;
443 	} else {
444 		flags = 0;
445 		if (col_types & CAPPED_PROJECT)
446 			flags |= VMUSAGE_PROJECTS;
447 		if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
448 			flags |= VMUSAGE_ALL_ZONES;
449 	}
450 
451 	debug("vmusage sample flags 0x%x\n", flags);
452 	if (flags == 0)
453 		return;
454 
455 again:
456 	/* try the current buffer to see if the list will fit */
457 	nres = vmu_vals_len;
458 	t1 = gethrtime();
459 	if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
460 	    vmu_vals, &nres) != 0) {
461 		if (errno != EOVERFLOW) {
462 			warn(gettext("can't read RSS from kernel\n"));
463 			return;
464 		}
465 	}
466 	t2 = gethrtime();
467 	tm_fmt("getvmusage time", t1, t2);
468 
469 	debug("kernel nres %lu\n", (ulong_t)nres);
470 
471 	if (nres > vmu_vals_len) {
472 		/* array size is now too small, increase it and try again */
473 		free(vmu_vals);
474 
475 		if ((vmu_vals = (vmusage_t *)calloc(nres,
476 		    sizeof (vmusage_t))) == NULL) {
477 			warn(gettext("out of memory: could not read RSS from "
478 			    "kernel\n"));
479 			vmu_vals_len = nvmu_vals = 0;
480 			return;
481 		}
482 		vmu_vals_len = nres;
483 		goto again;
484 	}
485 
486 	nvmu_vals = nres;
487 
488 	debug("vmusage_sample\n");
489 	for (i = 0; i < nvmu_vals; i++) {
490 		debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
491 		    "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
492 		    vmu_vals[i].vmu_type,
493 		    (unsigned long long)vmu_vals[i].vmu_rss_all,
494 		    (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
495 		    (unsigned long long)vmu_vals[i].vmu_swap_all);
496 	}
497 }
498 
499 static void
500 update_col_rss(lcollection_t *lcol)
501 {
502 	int i;
503 
504 	lcol->lcol_rss = 0;
505 	lcol->lcol_image_size = 0;
506 
507 	for (i = 0; i < nvmu_vals; i++) {
508 		if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
509 			continue;
510 
511 		if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
512 		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
513 			continue;
514 
515 		if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
516 		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
517 			continue;
518 
519 		/* we found the right RSS entry, update the collection vals */
520 		lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
521 		lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
522 		break;
523 	}
524 }
525 
526 /*
527  * Sample the collection RSS, updating the collection's statistics with the
528  * results.  Also, sum the rss of all capped projects & return true if
529  * the collection is over cap.
530  */
531 static int
532 rss_sample_col_cb(lcollection_t *lcol, void *arg)
533 {
534 	int64_t excess;
535 	uint64_t rss;
536 	sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
537 
538 	update_col_rss(lcol);
539 
540 	lcol->lcol_stat.lcols_rss_sample++;
541 	rss = lcol->lcol_rss;
542 	excess = rss - lcol->lcol_rss_cap;
543 	if (excess > 0) {
544 		lcol->lcol_stat.lcols_rss_act_sum += rss;
545 		col_argp->sca_any_over_cap = B_TRUE;
546 		if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
547 			col_argp->sca_project_over_cap = B_TRUE;
548 	}
549 	lcol->lcol_stat.lcols_rss_sum += rss;
550 
551 	if (lcol->lcol_stat.lcols_min_rss > rss)
552 		lcol->lcol_stat.lcols_min_rss = rss;
553 	if (lcol->lcol_stat.lcols_max_rss < rss)
554 		lcol->lcol_stat.lcols_max_rss = rss;
555 
556 	return (0);
557 }
558 
559 /*
560  * Determine if we have capped projects, capped zones or both.
561  */
562 static int
563 col_type_cb(lcollection_t *lcol, void *arg)
564 {
565 	uint_t *col_type = (uint_t *)arg;
566 
567 	/* skip uncapped collections */
568 	if (lcol->lcol_rss_cap == 0)
569 		return (1);
570 
571 	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
572 		*col_type |= CAPPED_PROJECT;
573 	else
574 		*col_type |= CAPPED_ZONE;
575 
576 	/* once we know everything is capped, we can stop looking */
577 	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
578 		return (1);
579 
580 	return (0);
581 }
582 
583 /*
584  * Open /proc and walk entries.
585  */
586 static void
587 proc_walk_all(void (*cb)(const pid_t))
588 {
589 	DIR *pdir;
590 	struct dirent *dirent;
591 	pid_t pid;
592 
593 	(void) rfd_reserve(1);
594 	if ((pdir = opendir("/proc")) == NULL)
595 		die(gettext("couldn't open /proc!"));
596 
597 	while ((dirent = readdir(pdir)) != NULL) {
598 		if (strcmp(".", dirent->d_name) == 0 ||
599 		    strcmp("..", dirent->d_name) == 0)
600 			continue;
601 		pid = atoi(dirent->d_name);
602 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
603 		if (pid == rcapd_pid)
604 			continue;
605 		else
606 			cb(pid);
607 	}
608 	(void) closedir(pdir);
609 }
610 
611 /*
612  * Clear unmarked callback.
613  */
614 /*ARGSUSED*/
615 static int
616 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
617 {
618 	if (lpc->lpc_mark) {
619 		lpc->lpc_mark = 0;
620 	} else {
621 		debug("process %d finished\n", (int)lpc->lpc_pid);
622 		lprocess_free(lpc);
623 	}
624 
625 	return (0);
626 }
627 
628 /*
629  * Print, for debugging purposes, a collection's recently-sampled RSS and
630  * excess.
631  */
632 /*ARGSUSED*/
633 static int
634 excess_print_cb(lcollection_t *lcol, void *arg)
635 {
636 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
637 
638 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
639 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
640 	    lcol->lcol_name,
641 	    (unsigned long long)lcol->lcol_rss,
642 	    (unsigned long long)lcol->lcol_rss_cap,
643 	    (long long)excess);
644 
645 	return (0);
646 }
647 
648 /*
649  * Scan those collections which have exceeded their caps.
650  *
651  * If we're running in the global zone it might have a cap.  We don't want to
652  * do any capping for the global zone yet since we might get under the cap by
653  * just capping the projects in the global zone.
654  */
655 /*ARGSUSED*/
656 static int
657 scan_cb(lcollection_t *lcol, void *arg)
658 {
659 	int64_t excess;
660 
661 	/* skip over global zone collection for now but keep track for later */
662 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
663 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
664 		gz_col = lcol;
665 		return (0);
666 	}
667 
668 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
669 		scan(lcol, excess);
670 		lcol->lcol_stat.lcols_scan++;
671 	}
672 
673 	return (0);
674 }
675 
676 /*
677  * Scan the global zone collection and see if it still exceeds its cap.
678  * We take into account the effects of capping any global zone projects here.
679  */
680 static void
681 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
682 {
683 	int64_t excess;
684 
685 	/*
686 	 * If we had projects over their cap and the global zone was also over
687 	 * its cap then we need to get the up-to-date global zone rss to
688 	 * determine if we are still over the global zone cap.  We might have
689 	 * gone under while we scanned the capped projects.  If there were no
690 	 * projects over cap then we can use the rss value we already have for
691 	 * the global zone.
692 	 */
693 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
694 	if (project_over_cap && excess > 0) {
695 		rss_sample(B_TRUE, CAPPED_ZONE);
696 		update_col_rss(lcol);
697 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
698 	}
699 
700 	if (excess > 0) {
701 		debug("global zone excess %lldKB\n", (long long)excess);
702 		scan(lcol, excess);
703 		lcol->lcol_stat.lcols_scan++;
704 	}
705 }
706 
707 /*
708  * Do a soft scan of those collections which have excesses.  A soft scan is one
709  * in which the cap enforcement pressure is taken into account.  The difference
710  * between the utilized physical memory and the cap enforcement pressure will
711  * be scanned-for, and each collection will be scanned proportionally by their
712  * present excesses.
713  */
714 static int
715 soft_scan_cb(lcollection_t *lcol, void *a)
716 {
717 	int64_t excess;
718 	soft_scan_arg_t *arg = a;
719 
720 	/* skip over global zone collection for now but keep track for later */
721 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
722 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
723 		gz_col = lcol;
724 		return (0);
725 	}
726 
727 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
728 		int64_t adjusted_excess =
729 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
730 
731 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
732 		    "scanning %lld\n",
733 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
734 		    "project" : "zone"),
735 		    (long)lcol->lcol_id.rcid_val,
736 		    (long long)excess, (long long)arg->ssa_scan_goal,
737 		    (unsigned long long)arg->ssa_sum_excess,
738 		    (long long)adjusted_excess);
739 
740 		scan(lcol, adjusted_excess);
741 		lcol->lcol_stat.lcols_scan++;
742 	}
743 
744 	return (0);
745 }
746 
747 static void
748 soft_scan_gz(lcollection_t *lcol, void *a)
749 {
750 	int64_t excess;
751 	soft_scan_arg_t *arg = a;
752 
753 	/*
754 	 * If we had projects over their cap and the global zone was also over
755 	 * its cap then we need to get the up-to-date global zone rss to
756 	 * determine if we are still over the global zone cap.  We might have
757 	 * gone under while we scanned the capped projects.  If there were no
758 	 * projects over cap then we can use the rss value we already have for
759 	 * the global zone.
760 	 */
761 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
762 	if (arg->ssa_project_over_cap && excess > 0) {
763 		rss_sample(B_TRUE, CAPPED_ZONE);
764 		update_col_rss(lcol);
765 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
766 	}
767 
768 	if (excess > 0) {
769 		int64_t adjusted_excess =
770 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
771 
772 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
773 		    "scanning %lld\n",
774 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
775 		    "project" : "zone"),
776 		    (long)lcol->lcol_id.rcid_val,
777 		    (long long)excess, (long long)arg->ssa_scan_goal,
778 		    (unsigned long long)arg->ssa_sum_excess,
779 		    (long long)adjusted_excess);
780 
781 		scan(lcol, adjusted_excess);
782 		lcol->lcol_stat.lcols_scan++;
783 	}
784 }
785 
786 /*
787  * When a scan could happen, but caps aren't enforced tick the
788  * lcols_unenforced_cap counter.
789  */
790 /*ARGSUSED*/
791 static int
792 unenforced_cap_cb(lcollection_t *lcol, void *arg)
793 {
794 	lcol->lcol_stat.lcols_unenforced_cap++;
795 
796 	return (0);
797 }
798 
799 /*
800  * Update the count of physically installed memory.
801  */
802 static void
803 update_phys_total(void)
804 {
805 	uint64_t old_phys_total;
806 
807 	old_phys_total = phys_total;
808 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
809 	if (phys_total != old_phys_total)
810 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
811 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
812 }
813 
814 /*
815  * Unlink a process from its collection, updating relevant statistics, and
816  * freeing its associated memory.
817  */
818 void
819 lprocess_free(lprocess_t *lpc)
820 {
821 	pid_t pid;
822 
823 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
824 
825 	if (lpc->lpc_prev != NULL)
826 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
827 	if (lpc->lpc_next != NULL)
828 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
829 	if (lpc->lpc_collection->lcol_lprocess == lpc)
830 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
831 		    lpc ? lpc->lpc_next : NULL);
832 	lpc->lpc_next = lpc->lpc_prev = NULL;
833 
834 	if (lpc->lpc_prpageheader != NULL)
835 		free(lpc->lpc_prpageheader);
836 	if (lpc->lpc_xmap != NULL)
837 		free(lpc->lpc_xmap);
838 	if (lpc->lpc_psinfo_fd >= 0) {
839 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
840 			debug("could not close %d lpc_psinfo_fd %d",
841 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
842 		lpc->lpc_psinfo_fd = -1;
843 	}
844 	if (lpc->lpc_pgdata_fd >= 0) {
845 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
846 			debug("could not close %d lpc_pgdata_fd %d",
847 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
848 		lpc->lpc_pgdata_fd = -1;
849 	}
850 	if (lpc->lpc_xmap_fd >= 0) {
851 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
852 			debug("could not close %d lpc_xmap_fd %d",
853 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
854 		lpc->lpc_xmap_fd = -1;
855 	}
856 	if (lpc->lpc_ignore != NULL)
857 		lmapping_free(&lpc->lpc_ignore);
858 	pid = lpc->lpc_pid;
859 	free(lpc);
860 	debug_high("process %d freed\n", (int)pid);
861 }
862 
863 /*
864  * Collection clear callback.
865  */
866 /*ARGSUSED*/
867 static int
868 collection_clear_cb(lcollection_t *lcol, void *arg)
869 {
870 	lcol->lcol_mark = 0;
871 
872 	return (0);
873 }
874 
875 /*
876  * Respond to a terminating signal by setting a termination flag.
877  */
878 /*ARGSUSED*/
879 static void
880 terminate_signal(int signal)
881 {
882 	if (termination_signal == 0)
883 		termination_signal = signal;
884 	should_run = 0;
885 }
886 
887 /*
888  * Handle any synchronous or asynchronous signals that would ordinarily cause a
889  * process to abort.
890  */
891 /*ARGSUSED*/
892 static void
893 abort_signal(int signal)
894 {
895 	/*
896 	 * Allow the scanner to make a last-ditch effort to resume any stopped
897 	 * processes.
898 	 */
899 	scan_abort();
900 	abort();
901 }
902 
903 /*
904  * Clean up collections which have been removed due to configuration.  Unlink
905  * the collection from lcollection and free it.
906  */
907 /*ARGSUSED*/
908 static int
909 collection_sweep_cb(lcollection_t *lcol, void *arg)
910 {
911 	if (lcol->lcol_mark == 0) {
912 		debug("freeing %s %s\n",
913 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
914 		    "project" : "zone"), lcol->lcol_name);
915 		lcollection_free(lcol);
916 	}
917 
918 	return (0);
919 }
920 
921 /*
922  * Set those variables which depend on the global configuration.
923  */
924 static void
925 finish_configuration(void)
926 {
927 	/*
928 	 * Warn that any lnode (or non-project) mode specification (by an SRM
929 	 * 1.3 configuration file, for example) is ignored.
930 	 */
931 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
932 		warn(gettext("%s mode specification ignored -- using project"
933 		    " mode\n"), rcfg.rcfg_mode_name);
934 		rcfg.rcfg_mode_name = "project";
935 		rcfg.rcfg_mode = rctype_project;
936 	}
937 }
938 
939 /*
940  * Cause the configuration file to be reread and applied.
941  */
942 static void
943 reread_configuration_file(void)
944 {
945 	rcfg_t rcfg_new;
946 	struct stat st;
947 
948 	if (stat(rcfg.rcfg_filename, &st) == 0 && st.st_mtime ==
949 	    rcfg.rcfg_last_modification)
950 		return;
951 
952 	if (rcfg_read(rcfg.rcfg_filename, rcfg.rcfg_fd, &rcfg_new,
953 	    update_statistics) != 0)
954 		warn(gettext("can't reread configuration"));
955 	else {
956 		/*
957 		 * The configuration file has been read.  Remove existing
958 		 * collections in case there is a change in collection type.
959 		 */
960 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
961 			list_walk_collection(collection_clear_cb, NULL);
962 			list_walk_collection(collection_sweep_cb, NULL);
963 		}
964 
965 		/*
966 		 * Make the newly-read configuration the global one, and update
967 		 * any variables that depend on it.
968 		 */
969 		rcfg = rcfg_new;
970 		finish_configuration();
971 	}
972 }
973 
974 /*
975  * Reread the configuration filex, then examine changes, additions, and
976  * deletions to cap definitions.
977  */
978 static void
979 reconfigure(hrtime_t now, hrtime_t *next_configuration,
980     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
981 {
982 	debug("reconfigure...\n");
983 
984 	/*
985 	 * Reread the configuration data.
986 	 */
987 	reread_configuration_file();
988 
989 	/*
990 	 * Walk the lcollection, marking active collections so inactive ones
991 	 * can be freed.
992 	 */
993 	list_walk_collection(collection_clear_cb, NULL);
994 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
995 	list_walk_collection(collection_sweep_cb, NULL);
996 
997 	*next_configuration = NEXT_EVENT_TIME(now,
998 	    rcfg.rcfg_reconfiguration_interval);
999 
1000 	/*
1001 	 * Reset each event time to the shorter of the previous and new
1002 	 * intervals.
1003 	 */
1004 	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
1005 		next_report = now;
1006 	else
1007 		next_report = POSITIVE_MIN(next_report,
1008 		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
1009 
1010 	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1011 		*next_proc_walk = now;
1012 	else
1013 		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1014 		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1015 
1016 	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1017 		*next_rss_sample = now;
1018 	else
1019 		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1020 		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1021 }
1022 
1023 /*
1024  * Respond to SIGHUP by triggering the rereading the configuration file and cap
1025  * definitions.
1026  */
1027 /*ARGSUSED*/
1028 static void
1029 sighup(int signal)
1030 {
1031 	should_reconfigure = 1;
1032 }
1033 
1034 /*
1035  * Print, for debugging purposes, each collection's interval statistics.
1036  */
1037 /*ARGSUSED*/
1038 static int
1039 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1040 {
1041 #define	DELTA(field) \
1042 	(unsigned long long)( \
1043 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1044 
1045 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1046 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
1047 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1048 	    "%llu scans over %llu ms\n",
1049 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1050 	    lcol->lcol_name,
1051 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1052 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1053 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1054 	    (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1055 	    (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1056 	    (unsigned long long)lcol->lcol_rss_cap,
1057 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1058 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1059 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
1060 	    / MILLISEC));
1061 
1062 #undef DELTA
1063 
1064 	return (0);
1065 }
1066 
1067 /*
1068  * Record each collection's interval statistics in the statistics file.
1069  */
1070 static int
1071 report_collection_cb(lcollection_t *lcol, void *arg)
1072 {
1073 	lcollection_report_t dc;
1074 	int fd = (intptr_t)arg;
1075 
1076 	/*
1077 	 * Copy the relevant fields to the collection's record.
1078 	 */
1079 	bzero(&dc, sizeof (dc));
1080 	dc.lcol_id = lcol->lcol_id;
1081 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
1082 	dc.lcol_rss = lcol->lcol_rss;
1083 	dc.lcol_image_size = lcol->lcol_image_size;
1084 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
1085 	dc.lcol_stat = lcol->lcol_stat;
1086 
1087 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1088 		lcol->lcol_stat_old = lcol->lcol_stat;
1089 	} else {
1090 		debug("can't write %s %s statistics",
1091 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1092 		    "project" : "zone"),
1093 		    lcol->lcol_name);
1094 	}
1095 
1096 	return (0);
1097 }
1098 
1099 /*
1100  * Determine the count of pages scanned by the global page scanner, obtained
1101  * from the cpu_stat:*::scan kstats.  Return zero on success.
1102  */
1103 static int
1104 get_globally_scanned_pages(uint64_t *scannedp)
1105 {
1106 	kstat_t *ksp;
1107 	uint64_t scanned = 0;
1108 
1109 	if (kstat_chain_update(kctl) == -1) {
1110 		warn(gettext("can't update kstat chain"));
1111 		return (0);
1112 	}
1113 
1114 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1115 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1116 			if (kstat_read(kctl, ksp, NULL) != -1) {
1117 				scanned += ((cpu_stat_t *)
1118 				    ksp->ks_data)->cpu_vminfo.scan;
1119 			} else {
1120 				return (-1);
1121 			}
1122 		}
1123 	}
1124 
1125 	*scannedp = scanned;
1126 	return (0);
1127 }
1128 
1129 /*
1130  * Determine if the global page scanner is running, during which no memory
1131  * caps should be enforced, to prevent interference with the global page
1132  * scanner.
1133  */
1134 static boolean_t
1135 is_global_scanner_running()
1136 {
1137 	/* measure delta in page scan count */
1138 	static uint64_t new_sp = 0;
1139 	static uint64_t old_sp = 0;
1140 	boolean_t res = B_FALSE;
1141 
1142 	if (get_globally_scanned_pages(&new_sp) == 0) {
1143 		if (old_sp != 0 && (new_sp - old_sp) > 0) {
1144 			debug("global memory pressure detected (%llu "
1145 			    "pages scanned since last interval)\n",
1146 			    (unsigned long long)(new_sp - old_sp));
1147 			res = B_TRUE;
1148 		}
1149 		old_sp = new_sp;
1150 	} else {
1151 		warn(gettext("unable to read cpu statistics"));
1152 		new_sp = old_sp;
1153 	}
1154 
1155 	return (res);
1156 }
1157 
1158 /*
1159  * If soft caps are in use, determine if global memory pressure exceeds the
1160  * configured maximum above which soft caps are enforced.
1161  */
1162 static boolean_t
1163 must_enforce_soft_caps()
1164 {
1165 	/*
1166 	 * Check for changes to the amount of installed physical memory, to
1167 	 * compute the current memory pressure.
1168 	 */
1169 	update_phys_total();
1170 
1171 	memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1172 	    * 100.0 / phys_total);
1173 	memory_pressure_sample++;
1174 	if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1175 	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1176 		return (B_TRUE);
1177 	}
1178 
1179 	return (B_FALSE);
1180 }
1181 
1182 /*
1183  * Update the shared statistics file with each collection's current statistics.
1184  * Return zero on success.
1185  */
1186 static int
1187 update_statistics(void)
1188 {
1189 	int fd, res;
1190 	static char template[LINELEN];
1191 
1192 	/*
1193 	 * Try to create a directory irrespective of whether it is existing
1194 	 * or not. If it is not there then it will create. Otherwise any way
1195 	 * it will fail at mkstemp call below.
1196 	 */
1197 	(void) mkdir(STAT_FILE_DIR, 0755);
1198 
1199 	/*
1200 	 * Create a temporary file.
1201 	 */
1202 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1203 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1204 		debug("temporary file template size too small\n");
1205 		return (-1);
1206 	}
1207 	(void) strcpy(template, rcfg.rcfg_stat_file);
1208 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
1209 	(void) rfd_reserve(1);
1210 	fd = mkstemp(template);
1211 
1212 	/*
1213 	 * Write the header and per-collection statistics.
1214 	 */
1215 	if (fd >= 0) {
1216 		rcapd_stat_hdr_t rs;
1217 
1218 		rs.rs_pid = rcapd_pid;
1219 		rs.rs_time = gethrtime();
1220 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1221 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1222 		rs.rs_pressure_cur = memory_pressure;
1223 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1224 		rs.rs_pressure_sample = memory_pressure_sample;
1225 
1226 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1227 		    sizeof (rs)) {
1228 			list_walk_collection(report_collection_cb,
1229 				(void *)(intptr_t)fd);
1230 			/*
1231 			 * Replace the existing statistics file with this new
1232 			 * one.
1233 			 */
1234 			res = rename(template, rcfg.rcfg_stat_file);
1235 		} else
1236 			res = -1;
1237 		(void) close(fd);
1238 	} else
1239 		res = -1;
1240 
1241 	return (res);
1242 }
1243 
1244 /*
1245  * Verify the statistics file can be created and written to, and die if an
1246  * existing file may be in use by another rcapd.
1247  */
1248 static int
1249 verify_statistics(void)
1250 {
1251 	pid_t pid;
1252 
1253 	/*
1254 	 * Warn if another instance of rcapd might be active.
1255 	 */
1256 	(void) rfd_reserve(1);
1257 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1258 	if (pid != rcapd_pid && pid != -1)
1259 		die(gettext("%s exists; rcapd may already be active\n"),
1260 		    rcfg.rcfg_stat_file);
1261 
1262 	return (update_statistics());
1263 }
1264 
1265 static int
1266 sum_excess_cb(lcollection_t *lcol, void *arg)
1267 {
1268 	uint64_t *sum_excess = arg;
1269 
1270 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1271 	    lcol->lcol_rss_cap));
1272 	return (0);
1273 }
1274 
1275 /*
1276  * Compute the quantity of memory (in kilobytes) above the cap enforcement
1277  * pressure.  Set the scan goal to that quantity (or at most the excess).
1278  */
1279 static void
1280 compute_soft_scan_goal(soft_scan_arg_t *argp)
1281 {
1282 	/*
1283 	 * Compute the sum of the collections' excesses, which will be the
1284 	 * denominator.
1285 	 */
1286 	argp->ssa_sum_excess = 0;
1287 	list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1288 
1289 	argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1290 	    (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1291 	    sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1292 	    argp->ssa_sum_excess);
1293 }
1294 
/*
 * Print the command-line usage message.  (The undocumented -F flag is
 * accepted by getopt in main() but deliberately not advertised here.)
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
1300 
1301 void
1302 check_update_statistics(void)
1303 {
1304 	hrtime_t now = gethrtime();
1305 
1306 	if (EVENT_TIME(now, next_report)) {
1307 		debug("updating statistics...\n");
1308 		list_walk_collection(simple_report_collection_cb, NULL);
1309 		if (update_statistics() != 0)
1310 			debug("couldn't update statistics");
1311 		next_report = NEXT_REPORT_EVENT_TIME(now,
1312 		    rcfg.rcfg_report_interval);
1313 	}
1314 }
1315 
1316 static void
1317 verify_and_set_privileges(void)
1318 {
1319 	priv_set_t *required =
1320 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1321 
1322 	/*
1323 	 * Ensure the required privileges, suitable for controlling processes,
1324 	 * are possessed.
1325 	 */
1326 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1327 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1328 		die(gettext("can't set requisite privileges"));
1329 
1330 	/*
1331 	 * Ensure access to /var/run/daemon.
1332 	 */
1333 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1334 		die(gettext("cannot become user daemon"));
1335 
1336 	priv_freeset(required);
1337 }
1338 
1339 /*
1340  * This function does the top-level work to determine if we should do any
1341  * memory capping, and if so, it invokes the right call-backs to do the work.
1342  */
1343 static void
1344 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1345 {
1346 	boolean_t enforce_caps;
1347 	/* soft cap enforcement flag, depending on memory pressure */
1348 	boolean_t enforce_soft_caps;
1349 	/* avoid interference with kernel's page scanner */
1350 	boolean_t global_scanner_running;
1351 	sample_col_arg_t col_arg;
1352 	soft_scan_arg_t arg;
1353 	uint_t col_types = 0;
1354 
1355 	/* check what kind of collections (project/zone) are capped */
1356 	list_walk_collection(col_type_cb, &col_types);
1357 	debug("collection types: 0x%x\n", col_types);
1358 
1359 	/* no capped collections, skip checking rss */
1360 	if (col_types == 0)
1361 		return;
1362 
1363 	/* Determine if soft caps are enforced. */
1364 	enforce_soft_caps = must_enforce_soft_caps();
1365 
1366 	/* Determine if the global page scanner is running. */
1367 	global_scanner_running = is_global_scanner_running();
1368 
1369 	/*
1370 	 * Sample collections' member processes RSSes and recompute
1371 	 * collections' excess.
1372 	 */
1373 	rss_sample(B_FALSE, col_types);
1374 
1375 	col_arg.sca_any_over_cap = B_FALSE;
1376 	col_arg.sca_project_over_cap = B_FALSE;
1377 	list_walk_collection(rss_sample_col_cb, &col_arg);
1378 	list_walk_collection(excess_print_cb, NULL);
1379 	debug("any collection/project over cap = %d, %d\n",
1380 	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1381 
1382 	if (enforce_soft_caps)
1383 		debug("memory pressure %d%%\n", memory_pressure);
1384 
1385 	/*
1386 	 * Cap enforcement is determined by the previous conditions.
1387 	 */
1388 	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1389 	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1390 	    enforce_soft_caps);
1391 
1392 	debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1393 
1394 	/*
1395 	 * If soft caps are in use, determine the size of the portion from each
1396 	 * collection to scan for.
1397 	 */
1398 	if (enforce_caps && enforce_soft_caps)
1399 		compute_soft_scan_goal(&arg);
1400 
1401 	/*
1402 	 * Victimize offending collections.
1403 	 */
1404 	if (enforce_caps && (!enforce_soft_caps ||
1405 	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1406 
1407 		/*
1408 		 * Since at least one collection is over its cap & needs
1409 		 * enforcing, check if it is at least time for a process walk
1410 		 * (we could be well past time since we only walk /proc when
1411 		 * we need to) and if so, update each collections process list
1412 		 * in a single pass through /proc.
1413 		 */
1414 		if (EVENT_TIME(now, *next_proc_walk)) {
1415 			debug("scanning process list...\n");
1416 			proc_walk_all(proc_cb);		 /* insert & mark */
1417 			list_walk_all(sweep_process_cb); /* free dead procs */
1418 			*next_proc_walk = NEXT_EVENT_TIME(now,
1419 			    rcfg.rcfg_proc_walk_interval);
1420 		}
1421 
1422 		gz_col = NULL;
1423 		if (enforce_soft_caps) {
1424 			debug("scan goal is %lldKB\n",
1425 			    (long long)arg.ssa_scan_goal);
1426 			list_walk_collection(soft_scan_cb, &arg);
1427 			if (gz_capped && gz_col != NULL) {
1428 				/* process global zone */
1429 				arg.ssa_project_over_cap =
1430 				    col_arg.sca_project_over_cap;
1431 				soft_scan_gz(gz_col, &arg);
1432 			}
1433 		} else {
1434 			list_walk_collection(scan_cb, NULL);
1435 			if (gz_capped && gz_col != NULL) {
1436 				/* process global zone */
1437 				scan_gz(gz_col, col_arg.sca_project_over_cap);
1438 			}
1439 		}
1440 	} else if (col_arg.sca_any_over_cap) {
1441 		list_walk_collection(unenforced_cap_cb, NULL);
1442 	}
1443 }
1444 
/*
 * rcapd entry point:  parse options, optionally daemonize, read (or create)
 * the configuration, drop to the daemon identity with minimal privileges,
 * install signal handlers, and then loop monitoring collections' RSS and
 * enforcing caps until a termination signal arrives.
 */
int
main(int argc, char *argv[])
{
	int res;
	int should_fork = 1;	/* fork flag */
	hrtime_t now;		/* current time */
	hrtime_t next;		/* time of next event */
	int sig;		/* signal iteration */
	struct rlimit rl;
	hrtime_t next_proc_walk;	/* time of next /proc scan */
	hrtime_t next_configuration;	/* time of next configuration */
	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */

	(void) set_message_priority(RCM_INFO);
	(void) setprogname("rcapd");
	rcapd_pid = getpid();
	(void) chdir("/");
	should_run = 1;
	ever_ran = 0;

	(void) setlocale(LC_ALL, "");
	(void) textdomain(TEXT_DOMAIN);

	/*
	 * Parse command-line options.  -d runs in the foreground with
	 * debug output (given twice, at higher verbosity); -F runs in the
	 * foreground without changing message priority.
	 */
	while ((res = getopt(argc, argv, "dF")) > 0)
		switch (res) {
		case 'd':
			should_fork = 0;
			if (debug_mode == 0) {
				debug_mode = 1;
				(void) set_message_priority(RCM_DEBUG);
			} else
				(void) set_message_priority(RCM_DEBUG_HIGH);
			break;
		case 'F':
			should_fork = 0;
			break;
		default:
			rcapd_usage();
			return (E_USAGE);
			/*NOTREACHED*/
		}

	/*
	 * If not debugging, fork and continue operating, changing the
	 * destination of messages to syslog().
	 */
	if (should_fork == 1) {
		pid_t child;
		debug("forking\n");
		child = fork();
		if (child == -1)
			die(gettext("cannot fork"));
		if (child > 0)
			return (0);	/* parent exits immediately */
		else {
			rcapd_pid = getpid();
			(void) set_message_destination(RCD_SYSLOG);
			(void) fclose(stdin);
			(void) fclose(stdout);
			(void) fclose(stderr);
		}
		/*
		 * Start a new session and detach from the controlling tty.
		 */
		if (setsid() == (pid_t)-1)
			debug(gettext("setsid() failed; cannot detach from "
			    "terminal"));
	}

	/*
	 * Read the configuration file.
	 */
	if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, verify_statistics)
	    != 0) {
		/*
		 * A configuration file may not exist if rcapd is started
		 * by enabling the smf rcap service, so attempt to create
		 * a default file.
		 */
		create_config_file(NULL);

		/*
		 * A real failure if still can't read the
		 * configuration file
		 */
		if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg,
		    verify_statistics) != 0)
			die(gettext("resource caps not configured %s"),
			    RCAPD_DEFAULT_CONF_FILE);
	}
	finish_configuration();
	should_reconfigure = 0;

	/*
	 * Check that required privileges are possessed.
	 */
	verify_and_set_privileges();

	/* Schedule the first occurrence of each periodic event. */
	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
	next_configuration = NEXT_EVENT_TIME(gethrtime(),
	    rcfg.rcfg_reconfiguration_interval);

	/*
	 * Open the kstat chain.
	 */
	kctl = kstat_open();
	if (kctl == NULL)
		die(gettext("can't open kstats"));

	/*
	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
	 * be effectively managed without revoking descriptors (at 3 per
	 * process).
	 */
	rl.rlim_cur = 32 * 1024;
	rl.rlim_max = 32 * 1024;
	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
		/* fall back to the highest limit currently permitted */
		rl.rlim_cur = rl.rlim_max;
		(void) setrlimit(RLIMIT_NOFILE, &rl);
	}
	(void) enable_extended_FILE_stdio(-1, -1);

	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		debug("fd limit: %lu\n", rl.rlim_cur);
	else
		debug("fd limit: unknown\n");

	get_page_size();
	my_zoneid = getzoneid();

	/*
	 * Handle those signals whose (default) exit disposition
	 * prevents rcapd from finishing scanning before terminating.
	 */
	(void) sigset(SIGINT, terminate_signal);
	(void) sigset(SIGQUIT, abort_signal);
	(void) sigset(SIGILL, abort_signal);
	(void) sigset(SIGEMT, abort_signal);
	(void) sigset(SIGFPE, abort_signal);
	(void) sigset(SIGBUS, abort_signal);
	(void) sigset(SIGSEGV, abort_signal);
	(void) sigset(SIGSYS, abort_signal);
	(void) sigset(SIGPIPE, terminate_signal);
	(void) sigset(SIGALRM, terminate_signal);
	(void) sigset(SIGTERM, terminate_signal);
	(void) sigset(SIGUSR1, terminate_signal);
	(void) sigset(SIGUSR2, terminate_signal);
	(void) sigset(SIGPOLL, terminate_signal);
	(void) sigset(SIGVTALRM, terminate_signal);
	(void) sigset(SIGXCPU, abort_signal);
	(void) sigset(SIGXFSZ, abort_signal);
	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
		(void) sigset(sig, terminate_signal);

	/*
	 * Install a signal handler for reconfiguration processing.
	 */
	(void) sigset(SIGHUP, sighup);

	/*
	 * Determine which process collections to cap.
	 */
	lcollection_update(LCU_COMPLETE);

	/*
	 * Loop forever, monitoring collections' resident set sizes and
	 * enforcing their caps.  Look for changes in caps as well as
	 * responding to requests to reread the configuration.  Update
	 * per-collection statistics periodically.
	 */
	while (should_run != 0) {
		struct timespec ts;

		/*
		 * Announce that rcapd is starting.
		 */
		if (ever_ran == 0) {
			info(gettext("starting\n"));
			ever_ran = 1;
		}

		/*
		 * Check the configuration at every next_configuration interval.
		 * Update the rss data once every next_rss_sample interval.
		 * The condition of global memory pressure is also checked at
		 * the same frequency, if strict caps are in use.
		 */
		now = gethrtime();

		/*
		 * Detect configuration and cap changes at every
		 * reconfiguration_interval, or when SIGHUP has been received.
		 */
		if (EVENT_TIME(now, next_configuration) ||
		    should_reconfigure == 1) {
			reconfigure(now, &next_configuration, &next_proc_walk,
			    &next_rss_sample);
			should_reconfigure = 0;
		}

		/*
		 * Do the main work for enforcing caps.
		 */
		if (EVENT_TIME(now, next_rss_sample)) {
			do_capping(now, &next_proc_walk);

			next_rss_sample = NEXT_EVENT_TIME(now,
			    rcfg.rcfg_rss_sample_interval);
		}

		/*
		 * Update the statistics file, if it's time.
		 */
		check_update_statistics();

		/*
		 * Sleep until the earliest of the next configuration,
		 * report, or RSS sample events.
		 */
		now = gethrtime();
		next = next_configuration;
		next = POSITIVE_MIN(next, next_report);
		next = POSITIVE_MIN(next, next_rss_sample);
		if (next > now && should_run != 0) {
			debug("sleeping %-4.2f seconds\n", (float)(next -
			    now) / (float)NANOSEC);
			hrt2ts(next - now, &ts);
			(void) nanosleep(&ts, NULL);
		}
	}
	if (termination_signal != 0)
		debug("exiting due to signal %d\n", termination_signal);
	if (ever_ran != 0)
		info(gettext("exiting\n"));

	/*
	 * Unlink the statistics file before exiting.
	 */
	if (rcfg.rcfg_stat_file[0] != 0)
		(void) unlink(rcfg.rcfg_stat_file);

	return (E_SUCCESS);
}
1691