xref: /titanic_41/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 39b361b2ebefcef5612a54ae5cbd2179e19be296)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * rcapd is a long-running daemon enforcing project-based resource caps (see
30  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
31  * "collection") may have a memory cap.  A single thread monitors the resource
32  * utilization of capped collections, enforces caps when they are exceeded (and
33  * other conditions are met), and incorporates changes in configuration or
34  * caps.  Each of these actions occurs not more frequently than the rate
35  * specified with rcapadm(1M).
36  */
37 
38 #include <sys/priocntl.h>
39 #include <sys/proc.h>
40 #include <sys/resource.h>
41 #include <sys/sysinfo.h>
42 #include <sys/stat.h>
43 #include <sys/sysmacros.h>
44 #include <sys/time.h>
45 #include <sys/types.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <kstat.h>
50 #include <libintl.h>
51 #include <limits.h>
52 #include <locale.h>
53 #include <priv.h>
54 #include <signal.h>
55 #include <stdarg.h>
56 #include <stdio.h>
57 #include <stdio_ext.h>
58 #include <stdlib.h>
59 #include <libscf.h>
60 #include <strings.h>
61 #include <time.h>
62 #include <unistd.h>
63 #include <zone.h>
64 #include <assert.h>
65 #include <sys/vm_usage.h>
66 #include "rcapd.h"
67 #include "rcapd_mapping.h"
68 #include "rcapd_rfd.h"
69 #include "rcapd_stat.h"
70 #include "utils.h"
71 
/*
 * Smaller of two values, considering only positive ones: when either value
 * is not positive, the other one is the result.  Arguments are evaluated
 * more than once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))
/*
 * Time of an event occurring the given number of seconds after base, or zero
 * when the interval is not positive.  Both arguments are parenthesized so
 * that expression arguments are cast and multiplied as a whole.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)
/*
 * As NEXT_EVENT_TIME(), but zero whenever no statistics file is configured.
 * Note that the base argument is not used: the interval is always measured
 * from the current gethrtime() value.
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)
/*
 * True when eventtime is nonzero and time has passed it.
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)
#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/*
 * Bits describing which kinds of capped collections are present.
 */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
87 
/*
 * Argument passed to the soft-scan routines (soft_scan_cb() and
 * soft_scan_gz()).
 */
typedef struct soft_scan_arg {
	/* divisor used when apportioning the scan goal among collections */
	/* (presumably the sum of all collections' excesses -- TODO confirm) */
	uint64_t ssa_sum_excess;
	/* total amount to scan for, split in proportion to each excess */
	int64_t ssa_scan_goal;
	/* when set, soft_scan_gz() resamples the global zone RSS first */
	boolean_t ssa_project_over_cap;
} soft_scan_arg_t;
93 
/*
 * Result flags filled in by rss_sample_col_cb() while collections are
 * sampled.
 */
typedef struct sample_col_arg {
	/* set when any sampled collection's RSS exceeds its cap */
	boolean_t sca_any_over_cap;
	/* set when a project collection's RSS exceeds its cap */
	boolean_t sca_project_over_cap;
} sample_col_arg_t;
98 
99 
static int debug_mode = 0;		/* debug mode flag */
static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
					/* scanned */
static kstat_ctl_t *kctl;		/* kstat chain */
static int memory_pressure = 0;		/* physical memory utilization (%) */
static int memory_pressure_sample = 0;	/* count of samples */
static long page_size_kb = 0;		/* system page size in KB */
static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
static hrtime_t next_report;		/* time of next report */
static int termination_signal = 0;	/* terminating signal */
/* zone id compared against each process's pr_zoneid; -1 until assigned */
static zoneid_t my_zoneid = (zoneid_t)-1;
static lcollection_t *gz_col;		/* global zone collection */

/* Current global configuration; replaced when the configuration is reread. */
rcfg_t rcfg;
/*
 * Updated when we re-read the collection configurations if this rcapd instance
 * is running in the global zone and the global zone is capped.
 */
boolean_t gz_capped = B_FALSE;

/*
 * Flags.
 */
/* NOTE(review): ever_ran is not referenced in this part of the file */
static int ever_ran;
int should_run;				/* cleared by terminate_signal() */
static int should_reconfigure;		/* set by sighup() */

/* Forward declarations of statistics routines defined later in the file. */
static int verify_statistics(void);
static int update_statistics(void);
131 
132 /*
133  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
134  */
135 static boolean_t
136 proc_issystem(pid_t pid)
137 {
138 	char pc_clname[PC_CLNMSZ];
139 
140 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
141 	    PC_KY_NULL) != -1) {
142 		return (strcmp(pc_clname, "SYS") == 0);
143 	} else {
144 		debug("cannot get class-specific scheduling parameters; "
145 		    "assuming system process\n");
146 		return (B_TRUE);
147 	}
148 }
149 
150 static void
151 lprocess_insert_mark(psinfo_t *psinfop)
152 {
153 	pid_t pid = psinfop->pr_pid;
154 	/* flag indicating whether the process should be scanned. */
155 	int unscannable = psinfop->pr_nlwp == 0;
156 	rcid_t colid;
157 	lcollection_t *lcol;
158 	lprocess_t *lproc;
159 
160 	/*
161 	 * Determine which collection to put this process into.  We only have
162 	 * to worry about tracking both zone and project capped processes if
163 	 * this rcapd instance is running in the global zone, since we'll only
164 	 * see processes in our own projects in a non-global zone.  In the
165 	 * global zone, if the process belongs to a non-global zone, we only
166 	 * need to track it for the capped non-global zone collection.  For
167 	 * global zone processes, we first attempt to put the process into a
168 	 * capped project collection.  On the second pass into this function
169 	 * the projid will be cleared so we will just track the process for the
170 	 * global zone collection as a whole.
171 	 */
172 	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
173 		colid.rcid_type = RCIDT_PROJECT;
174 		colid.rcid_val = psinfop->pr_projid;
175 	} else {
176 		/* try to add to zone collection */
177 		colid.rcid_type = RCIDT_ZONE;
178 		colid.rcid_val = psinfop->pr_zoneid;
179 	}
180 
181 	if ((lcol = lcollection_find(&colid)) == NULL)
182 		return;
183 
184 	/*
185 	 * If the process is already being tracked, update the unscannable flag,
186 	 * as determined by the caller, from the process's psinfo.
187 	 */
188 	lproc = lcol->lcol_lprocess;
189 	while (lproc != NULL) {
190 		if (lproc->lpc_pid == pid) {
191 			lproc->lpc_mark = 1;
192 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
193 				debug("process %d: became unscannable\n",
194 				    (int)lproc->lpc_pid);
195 				lproc->lpc_unscannable = 1;
196 			}
197 			return;
198 		}
199 		lproc = lproc->lpc_next;
200 	}
201 
202 	/*
203 	 * We've fallen off the list without finding our current process;
204 	 * insert it at the list head.
205 	 */
206 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
207 		debug("insufficient memory to track new process %d", (int)pid);
208 	else {
209 		(void) bzero(lproc, sizeof (*lproc));
210 		lproc->lpc_pid = pid;
211 		lproc->lpc_mark = 1;
212 		lproc->lpc_collection = lcol;
213 		lproc->lpc_psinfo_fd = -1;
214 		lproc->lpc_pgdata_fd = -1;
215 		lproc->lpc_xmap_fd = -1;
216 
217 		/*
218 		 * If the caller didn't flag this process as unscannable
219 		 * already, do some more checking.
220 		 */
221 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
222 
223 #ifdef DEBUG
224 		/*
225 		 * Verify the sanity of lprocess.  It should not contain the
226 		 * process we are about to prepend.
227 		 */
228 		if (lcollection_member(lcol, lproc)) {
229 			lprocess_t *cur = lcol->lcol_lprocess;
230 			debug("The collection %lld already has these members, "
231 			    "including me, %d!\n",
232 			    (long long)lcol->lcol_id.rcid_val,
233 			    (int)lproc->lpc_pid);
234 			while (cur != NULL) {
235 				debug("\t%d\n", (int)cur->lpc_pid);
236 				cur = cur->lpc_next;
237 			}
238 			info(gettext("process already on lprocess\n"));
239 			abort();
240 		}
241 #endif /* DEBUG */
242 		lproc->lpc_next = lcol->lcol_lprocess;
243 		if (lproc->lpc_next != NULL)
244 			lproc->lpc_next->lpc_prev = lproc;
245 		lproc->lpc_prev = NULL;
246 		lcol->lcol_lprocess = lproc;
247 
248 		debug("tracking %s %ld %d %s%s\n",
249 		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
250 		    (long)colid.rcid_val,
251 		    (int)pid, psinfop->pr_psargs,
252 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
253 		lcol->lcol_stat.lcols_proc_in++;
254 	}
255 }
256 
257 static int
258 list_walk_process_cb(lcollection_t *lcol, void *arg)
259 {
260 	int (*cb)(lcollection_t *, lprocess_t *) =
261 	    (int(*)(lcollection_t *, lprocess_t *))arg;
262 	lprocess_t *member;
263 	lprocess_t *next;
264 
265 	member = lcol->lcol_lprocess;
266 	while (member != NULL) {
267 		pid_t pid = member->lpc_pid;
268 		next = member->lpc_next;
269 
270 		debug_high("list_walk_all lpc %d\n", (int)pid);
271 		if (cb(lcol, member) != 0) {
272 			debug_high("list_walk_all aborted at lpc %d\n",
273 			    (int)pid);
274 			return (1);
275 		}
276 		member = next;
277 	}
278 
279 	return (0);
280 }
281 
282 /*
283  * Invoke the given callback for each process in each collection.  Callbacks
284  * are allowed to change the linkage of the process on which they act.
285  */
286 static void
287 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
288 {
289 	list_walk_collection(list_walk_process_cb, (void *)cb);
290 }
291 
292 static void
293 revoke_psinfo(rfd_t *rfd)
294 {
295 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
296 
297 	if (lpc != NULL) {
298 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
299 		ASSERT(lpc->lpc_psinfo_fd != -1);
300 		lpc->lpc_psinfo_fd = -1;
301 	} else
302 		debug("revoking psinfo fd for unknown process\n");
303 }
304 
305 /*
306  * Retrieve a process's psinfo via an already-opened or new file descriptor.
307  * The supplied descriptor will be closed on failure.  An optional callback
308  * will be invoked with the last descriptor tried, and a supplied callback
309  * argument, as its arguments, such that the new descriptor may be cached, or
310  * an old one may be invalidated.  If the result of the callback is zero, the
311  * the caller is to assume responsibility for the file descriptor, to close it
312  * with rfd_close().
313  *
314  * On failure, a nonzero value is returned.
315  */
316 int
317 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
318     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
319 {
320 	int fd;
321 	int can_try_uncached;
322 
323 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
324 
325 	do {
326 		if (cached_fd >= 0) {
327 			fd = cached_fd;
328 			can_try_uncached = 1;
329 			debug_high("%d/psinfo, trying cached fd %d\n",
330 			    (int)pid, fd);
331 		} else {
332 			char pathbuf[PROC_PATH_MAX];
333 
334 			can_try_uncached = 0;
335 			(void) snprintf(pathbuf, sizeof (pathbuf),
336 			    "/proc/%d/psinfo", (int)pid);
337 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
338 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
339 				debug("cannot open %s", pathbuf);
340 				break;
341 			} else
342 				debug_high("opened %s, fd %d\n", pathbuf, fd);
343 		}
344 
345 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
346 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
347 			break;
348 		else {
349 			debug_high("closed fd %d\n", fd);
350 			if (rfd_close(fd) != 0)
351 				debug("could not close fd %d", fd);
352 			fd = cached_fd = -1;
353 		}
354 	} while (can_try_uncached == 1);
355 
356 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
357 		if (fd >= 0) {
358 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
359 			    "uncached" : "cached", fd);
360 			if (rfd_close(fd) != 0)
361 				debug("could not close fd %d", fd);
362 		}
363 
364 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
365 	    fd_update_cb != NULL ? "cached" : "uncached");
366 	return ((fd >= 0) ? 0 : -1);
367 }
368 
369 /*
370  * Retrieve the collection membership of all processes and update the psinfo of
371  * those non-system, non-zombie ones in collections.  For global zone processes,
372  * we first attempt to put the process into a capped project collection.  We
373  * also want to track the process for the global zone collection as a whole.
374  */
375 static void
376 proc_cb(const pid_t pid)
377 {
378 	psinfo_t psinfo;
379 
380 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
381 		lprocess_insert_mark(&psinfo);
382 		if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
383 			/*
384 			 * We also want to track this process for the global
385 			 * zone as a whole so add it to the global zone
386 			 * collection as well.
387 			 */
388 			psinfo.pr_projid = -1;
389 			lprocess_insert_mark(&psinfo);
390 		}
391 	}
392 }
393 
394 /*
395  * Cache the process' psinfo fd, taking responsibility for freeing it.
396  */
397 int
398 lprocess_update_psinfo_fd_cb(void *arg, int fd)
399 {
400 	lprocess_t *lpc = arg;
401 
402 	lpc->lpc_psinfo_fd = fd;
403 	return (0);
404 }
405 
406 /*
407  * Get the system pagesize.
408  */
409 static void
410 get_page_size(void)
411 {
412 	page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
413 	debug("physical page size: %luKB\n", page_size_kb);
414 }
415 
416 static void
417 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
418 {
419 	hrtime_t diff = t2 - t1;
420 
421 	if (diff < MILLISEC)
422 		debug("%s: %lld nanoseconds\n", msg, diff);
423 	else if (diff < MICROSEC)
424 		debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
425 	else if (diff < NANOSEC)
426 		debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
427 	else
428 		debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
429 }
430 
431 /*
432  * Get the zone's & project's RSS from the kernel.
433  */
434 static void
435 rss_sample(boolean_t my_zone_only, uint_t col_types)
436 {
437 	size_t nres;
438 	size_t i;
439 	uint_t flags;
440 	hrtime_t t1, t2;
441 
442 	if (my_zone_only) {
443 		flags = VMUSAGE_ZONE;
444 	} else {
445 		flags = 0;
446 		if (col_types & CAPPED_PROJECT)
447 			flags |= VMUSAGE_PROJECTS;
448 		if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
449 			flags |= VMUSAGE_ALL_ZONES;
450 	}
451 
452 	debug("vmusage sample flags 0x%x\n", flags);
453 	if (flags == 0)
454 		return;
455 
456 again:
457 	/* try the current buffer to see if the list will fit */
458 	nres = vmu_vals_len;
459 	t1 = gethrtime();
460 	if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
461 	    vmu_vals, &nres) != 0) {
462 		if (errno != EOVERFLOW) {
463 			warn(gettext("can't read RSS from kernel\n"));
464 			return;
465 		}
466 	}
467 	t2 = gethrtime();
468 	tm_fmt("getvmusage time", t1, t2);
469 
470 	debug("kernel nres %lu\n", (ulong_t)nres);
471 
472 	if (nres > vmu_vals_len) {
473 		/* array size is now too small, increase it and try again */
474 		free(vmu_vals);
475 
476 		if ((vmu_vals = (vmusage_t *)calloc(nres,
477 		    sizeof (vmusage_t))) == NULL) {
478 			warn(gettext("out of memory: could not read RSS from "
479 			    "kernel\n"));
480 			vmu_vals_len = nvmu_vals = 0;
481 			return;
482 		}
483 		vmu_vals_len = nres;
484 		goto again;
485 	}
486 
487 	nvmu_vals = nres;
488 
489 	debug("vmusage_sample\n");
490 	for (i = 0; i < nvmu_vals; i++) {
491 		debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
492 		    "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
493 		    vmu_vals[i].vmu_type,
494 		    (unsigned long long)vmu_vals[i].vmu_rss_all,
495 		    (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
496 		    (unsigned long long)vmu_vals[i].vmu_swap_all);
497 	}
498 }
499 
500 static void
501 update_col_rss(lcollection_t *lcol)
502 {
503 	int i;
504 
505 	lcol->lcol_rss = 0;
506 	lcol->lcol_image_size = 0;
507 
508 	for (i = 0; i < nvmu_vals; i++) {
509 		if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
510 			continue;
511 
512 		if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
513 		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
514 			continue;
515 
516 		if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
517 		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
518 			continue;
519 
520 		/* we found the right RSS entry, update the collection vals */
521 		lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
522 		lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
523 		break;
524 	}
525 }
526 
527 /*
528  * Sample the collection RSS, updating the collection's statistics with the
529  * results.  Also, sum the rss of all capped projects & return true if
530  * the collection is over cap.
531  */
532 static int
533 rss_sample_col_cb(lcollection_t *lcol, void *arg)
534 {
535 	int64_t excess;
536 	uint64_t rss;
537 	sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
538 
539 	update_col_rss(lcol);
540 
541 	lcol->lcol_stat.lcols_rss_sample++;
542 	rss = lcol->lcol_rss;
543 	excess = rss - lcol->lcol_rss_cap;
544 	if (excess > 0) {
545 		lcol->lcol_stat.lcols_rss_act_sum += rss;
546 		col_argp->sca_any_over_cap = B_TRUE;
547 		if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
548 			col_argp->sca_project_over_cap = B_TRUE;
549 	}
550 	lcol->lcol_stat.lcols_rss_sum += rss;
551 
552 	if (lcol->lcol_stat.lcols_min_rss > rss)
553 		lcol->lcol_stat.lcols_min_rss = rss;
554 	if (lcol->lcol_stat.lcols_max_rss < rss)
555 		lcol->lcol_stat.lcols_max_rss = rss;
556 
557 	return (0);
558 }
559 
560 /*
561  * Determine if we have capped projects, capped zones or both.
562  */
563 static int
564 col_type_cb(lcollection_t *lcol, void *arg)
565 {
566 	uint_t *col_type = (uint_t *)arg;
567 
568 	/* skip uncapped collections */
569 	if (lcol->lcol_rss_cap == 0)
570 		return (1);
571 
572 	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
573 		*col_type |= CAPPED_PROJECT;
574 	else
575 		*col_type |= CAPPED_ZONE;
576 
577 	/* once we know everything is capped, we can stop looking */
578 	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
579 		return (1);
580 
581 	return (0);
582 }
583 
584 /*
585  * Open /proc and walk entries.
586  */
587 static void
588 proc_walk_all(void (*cb)(const pid_t))
589 {
590 	DIR *pdir;
591 	struct dirent *dirent;
592 	pid_t pid;
593 
594 	(void) rfd_reserve(1);
595 	if ((pdir = opendir("/proc")) == NULL)
596 		die(gettext("couldn't open /proc!"));
597 
598 	while ((dirent = readdir(pdir)) != NULL) {
599 		if (strcmp(".", dirent->d_name) == 0 ||
600 		    strcmp("..", dirent->d_name) == 0)
601 			continue;
602 		pid = atoi(dirent->d_name);
603 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
604 		if (pid == rcapd_pid)
605 			continue;
606 		else
607 			cb(pid);
608 	}
609 	(void) closedir(pdir);
610 }
611 
612 /*
613  * Clear unmarked callback.
614  */
615 /*ARGSUSED*/
616 static int
617 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
618 {
619 	if (lpc->lpc_mark) {
620 		lpc->lpc_mark = 0;
621 	} else {
622 		debug("process %d finished\n", (int)lpc->lpc_pid);
623 		lprocess_free(lpc);
624 	}
625 
626 	return (0);
627 }
628 
629 /*
630  * Print, for debugging purposes, a collection's recently-sampled RSS and
631  * excess.
632  */
633 /*ARGSUSED*/
634 static int
635 excess_print_cb(lcollection_t *lcol, void *arg)
636 {
637 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
638 
639 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
640 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
641 	    lcol->lcol_name,
642 	    (unsigned long long)lcol->lcol_rss,
643 	    (unsigned long long)lcol->lcol_rss_cap,
644 	    (long long)excess);
645 
646 	return (0);
647 }
648 
649 /*
650  * Scan those collections which have exceeded their caps.
651  *
652  * If we're running in the global zone it might have a cap.  We don't want to
653  * do any capping for the global zone yet since we might get under the cap by
654  * just capping the projects in the global zone.
655  */
656 /*ARGSUSED*/
657 static int
658 scan_cb(lcollection_t *lcol, void *arg)
659 {
660 	int64_t excess;
661 
662 	/* skip over global zone collection for now but keep track for later */
663 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
664 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
665 		gz_col = lcol;
666 		return (0);
667 	}
668 
669 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
670 		scan(lcol, excess);
671 		lcol->lcol_stat.lcols_scan++;
672 	}
673 
674 	return (0);
675 }
676 
677 /*
678  * Scan the global zone collection and see if it still exceeds its cap.
679  * We take into account the effects of capping any global zone projects here.
680  */
681 static void
682 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
683 {
684 	int64_t excess;
685 
686 	/*
687 	 * If we had projects over their cap and the global zone was also over
688 	 * its cap then we need to get the up-to-date global zone rss to
689 	 * determine if we are still over the global zone cap.  We might have
690 	 * gone under while we scanned the capped projects.  If there were no
691 	 * projects over cap then we can use the rss value we already have for
692 	 * the global zone.
693 	 */
694 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
695 	if (project_over_cap && excess > 0) {
696 		rss_sample(B_TRUE, CAPPED_ZONE);
697 		update_col_rss(lcol);
698 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
699 	}
700 
701 	if (excess > 0) {
702 		debug("global zone excess %lldKB\n", (long long)excess);
703 		scan(lcol, excess);
704 		lcol->lcol_stat.lcols_scan++;
705 	}
706 }
707 
708 /*
709  * Do a soft scan of those collections which have excesses.  A soft scan is one
710  * in which the cap enforcement pressure is taken into account.  The difference
711  * between the utilized physical memory and the cap enforcement pressure will
712  * be scanned-for, and each collection will be scanned proportionally by their
713  * present excesses.
714  */
715 static int
716 soft_scan_cb(lcollection_t *lcol, void *a)
717 {
718 	int64_t excess;
719 	soft_scan_arg_t *arg = a;
720 
721 	/* skip over global zone collection for now but keep track for later */
722 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
723 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
724 		gz_col = lcol;
725 		return (0);
726 	}
727 
728 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
729 		int64_t adjusted_excess =
730 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
731 
732 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
733 		    "scanning %lld\n",
734 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
735 		    "project" : "zone"),
736 		    (long)lcol->lcol_id.rcid_val,
737 		    (long long)excess, (long long)arg->ssa_scan_goal,
738 		    (unsigned long long)arg->ssa_sum_excess,
739 		    (long long)adjusted_excess);
740 
741 		scan(lcol, adjusted_excess);
742 		lcol->lcol_stat.lcols_scan++;
743 	}
744 
745 	return (0);
746 }
747 
748 static void
749 soft_scan_gz(lcollection_t *lcol, void *a)
750 {
751 	int64_t excess;
752 	soft_scan_arg_t *arg = a;
753 
754 	/*
755 	 * If we had projects over their cap and the global zone was also over
756 	 * its cap then we need to get the up-to-date global zone rss to
757 	 * determine if we are still over the global zone cap.  We might have
758 	 * gone under while we scanned the capped projects.  If there were no
759 	 * projects over cap then we can use the rss value we already have for
760 	 * the global zone.
761 	 */
762 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
763 	if (arg->ssa_project_over_cap && excess > 0) {
764 		rss_sample(B_TRUE, CAPPED_ZONE);
765 		update_col_rss(lcol);
766 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
767 	}
768 
769 	if (excess > 0) {
770 		int64_t adjusted_excess =
771 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
772 
773 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
774 		    "scanning %lld\n",
775 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
776 		    "project" : "zone"),
777 		    (long)lcol->lcol_id.rcid_val,
778 		    (long long)excess, (long long)arg->ssa_scan_goal,
779 		    (unsigned long long)arg->ssa_sum_excess,
780 		    (long long)adjusted_excess);
781 
782 		scan(lcol, adjusted_excess);
783 		lcol->lcol_stat.lcols_scan++;
784 	}
785 }
786 
787 /*
788  * When a scan could happen, but caps aren't enforced tick the
789  * lcols_unenforced_cap counter.
790  */
791 /*ARGSUSED*/
792 static int
793 unenforced_cap_cb(lcollection_t *lcol, void *arg)
794 {
795 	lcol->lcol_stat.lcols_unenforced_cap++;
796 
797 	return (0);
798 }
799 
800 /*
801  * Update the count of physically installed memory.
802  */
803 static void
804 update_phys_total(void)
805 {
806 	uint64_t old_phys_total;
807 
808 	old_phys_total = phys_total;
809 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
810 	if (phys_total != old_phys_total)
811 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
812 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
813 }
814 
815 /*
816  * Unlink a process from its collection, updating relevant statistics, and
817  * freeing its associated memory.
818  */
819 void
820 lprocess_free(lprocess_t *lpc)
821 {
822 	pid_t pid;
823 
824 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
825 
826 	if (lpc->lpc_prev != NULL)
827 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
828 	if (lpc->lpc_next != NULL)
829 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
830 	if (lpc->lpc_collection->lcol_lprocess == lpc)
831 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
832 		    lpc ? lpc->lpc_next : NULL);
833 	lpc->lpc_next = lpc->lpc_prev = NULL;
834 
835 	if (lpc->lpc_prpageheader != NULL)
836 		free(lpc->lpc_prpageheader);
837 	if (lpc->lpc_xmap != NULL)
838 		free(lpc->lpc_xmap);
839 	if (lpc->lpc_psinfo_fd >= 0) {
840 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
841 			debug("could not close %d lpc_psinfo_fd %d",
842 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
843 		lpc->lpc_psinfo_fd = -1;
844 	}
845 	if (lpc->lpc_pgdata_fd >= 0) {
846 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
847 			debug("could not close %d lpc_pgdata_fd %d",
848 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
849 		lpc->lpc_pgdata_fd = -1;
850 	}
851 	if (lpc->lpc_xmap_fd >= 0) {
852 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
853 			debug("could not close %d lpc_xmap_fd %d",
854 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
855 		lpc->lpc_xmap_fd = -1;
856 	}
857 	if (lpc->lpc_ignore != NULL)
858 		lmapping_free(&lpc->lpc_ignore);
859 	pid = lpc->lpc_pid;
860 	free(lpc);
861 	debug_high("process %d freed\n", (int)pid);
862 }
863 
864 /*
865  * Collection clear callback.
866  */
867 /*ARGSUSED*/
868 static int
869 collection_clear_cb(lcollection_t *lcol, void *arg)
870 {
871 	lcol->lcol_mark = 0;
872 
873 	return (0);
874 }
875 
/*
 * Respond to a terminating signal by setting a termination flag.  Only the
 * first terminating signal received is recorded in termination_signal;
 * should_run is cleared on every invocation.
 */
/*ARGSUSED*/
static void
terminate_signal(int signal)
{
	/* keep the first signal; later ones do not overwrite it */
	if (termination_signal == 0)
		termination_signal = signal;
	should_run = 0;
}
887 
/*
 * Handler for synchronous or asynchronous signals which would ordinarily
 * make the process abort.  Does not return.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Give the scanner one last chance to resume any processes it has
	 * stopped before the daemon goes down.
	 */
	scan_abort();

	abort();
}
903 
904 /*
905  * Clean up collections which have been removed due to configuration.  Unlink
906  * the collection from lcollection and free it.
907  */
908 /*ARGSUSED*/
909 static int
910 collection_sweep_cb(lcollection_t *lcol, void *arg)
911 {
912 	if (lcol->lcol_mark == 0) {
913 		debug("freeing %s %s\n",
914 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
915 		    "project" : "zone"), lcol->lcol_name);
916 		lcollection_free(lcol);
917 	}
918 
919 	return (0);
920 }
921 
922 /*
923  * Set those variables which depend on the global configuration.
924  */
925 static void
926 finish_configuration(void)
927 {
928 	/*
929 	 * Warn that any lnode (or non-project) mode specification (by an SRM
930 	 * 1.3 configuration file, for example) is ignored.
931 	 */
932 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
933 		warn(gettext("%s mode specification ignored -- using project"
934 		    " mode\n"), rcfg.rcfg_mode_name);
935 		rcfg.rcfg_mode_name = "project";
936 		rcfg.rcfg_mode = rctype_project;
937 	}
938 }
939 
940 /*
941  * Cause the configuration to be reread and applied.
942  */
943 static void
944 reread_configuration(void)
945 {
946 	rcfg_t rcfg_new;
947 
948 	if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
949 		warn(gettext("can't reread configuration \n"));
950 		exit(SMF_EXIT_ERR_CONFIG);
951 	} else {
952 		/*
953 		 * Done reading configuration.  Remove existing
954 		 * collections in case there is a change in collection type.
955 		 */
956 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
957 			list_walk_collection(collection_clear_cb, NULL);
958 			list_walk_collection(collection_sweep_cb, NULL);
959 		}
960 
961 		/*
962 		 * Make the newly-read configuration the global one, and update
963 		 * any variables that depend on it.
964 		 */
965 		rcfg = rcfg_new;
966 		finish_configuration();
967 	}
968 }
969 
970 /*
971  * First, examine changes, additions, and deletions to cap definitions.
972  * Then, set the next event time.
973  */
974 static void
975 reconfigure(hrtime_t now, hrtime_t *next_configuration,
976     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
977 {
978 	debug("reconfigure...\n");
979 
980 	/*
981 	 * Walk the lcollection, marking active collections so inactive ones
982 	 * can be freed.
983 	 */
984 	list_walk_collection(collection_clear_cb, NULL);
985 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
986 	list_walk_collection(collection_sweep_cb, NULL);
987 
988 	*next_configuration = NEXT_EVENT_TIME(now,
989 	    rcfg.rcfg_reconfiguration_interval);
990 
991 	/*
992 	 * Reset each event time to the shorter of the previous and new
993 	 * intervals.
994 	 */
995 	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
996 		next_report = now;
997 	else
998 		next_report = POSITIVE_MIN(next_report,
999 		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
1000 
1001 	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1002 		*next_proc_walk = now;
1003 	else
1004 		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1005 		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1006 
1007 	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1008 		*next_rss_sample = now;
1009 	else
1010 		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1011 		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1012 }
1013 
1014 /*
1015  * Respond to SIGHUP by triggering the rereading the configuration and cap
1016  * definitions.
1017  */
1018 /*ARGSUSED*/
1019 static void
1020 sighup(int signal)
1021 {
1022 	should_reconfigure = 1;
1023 }
1024 
1025 /*
1026  * Print, for debugging purposes, each collection's interval statistics.
1027  */
1028 /*ARGSUSED*/
1029 static int
1030 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1031 {
1032 #define	DELTA(field) \
1033 	(unsigned long long)( \
1034 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1035 
1036 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1037 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
1038 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1039 	    "%llu scans over %llu ms\n",
1040 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1041 	    lcol->lcol_name,
1042 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1043 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1044 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1045 	    (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1046 	    (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1047 	    (unsigned long long)lcol->lcol_rss_cap,
1048 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1049 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1050 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
1051 	    / MILLISEC));
1052 
1053 #undef DELTA
1054 
1055 	return (0);
1056 }
1057 
1058 /*
1059  * Record each collection's interval statistics in the statistics file.
1060  */
1061 static int
1062 report_collection_cb(lcollection_t *lcol, void *arg)
1063 {
1064 	lcollection_report_t dc;
1065 	int fd = (intptr_t)arg;
1066 
1067 	/*
1068 	 * Copy the relevant fields to the collection's record.
1069 	 */
1070 	bzero(&dc, sizeof (dc));
1071 	dc.lcol_id = lcol->lcol_id;
1072 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
1073 	dc.lcol_rss = lcol->lcol_rss;
1074 	dc.lcol_image_size = lcol->lcol_image_size;
1075 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
1076 	dc.lcol_stat = lcol->lcol_stat;
1077 
1078 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1079 		lcol->lcol_stat_old = lcol->lcol_stat;
1080 	} else {
1081 		debug("can't write %s %s statistics",
1082 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1083 		    "project" : "zone"),
1084 		    lcol->lcol_name);
1085 	}
1086 
1087 	return (0);
1088 }
1089 
1090 /*
1091  * Determine the count of pages scanned by the global page scanner, obtained
1092  * from the cpu_stat:*::scan kstats.  Return zero on success.
1093  */
1094 static int
1095 get_globally_scanned_pages(uint64_t *scannedp)
1096 {
1097 	kstat_t *ksp;
1098 	uint64_t scanned = 0;
1099 
1100 	if (kstat_chain_update(kctl) == -1) {
1101 		warn(gettext("can't update kstat chain"));
1102 		return (0);
1103 	}
1104 
1105 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1106 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1107 			if (kstat_read(kctl, ksp, NULL) != -1) {
1108 				scanned += ((cpu_stat_t *)
1109 				    ksp->ks_data)->cpu_vminfo.scan;
1110 			} else {
1111 				return (-1);
1112 			}
1113 		}
1114 	}
1115 
1116 	*scannedp = scanned;
1117 	return (0);
1118 }
1119 
1120 /*
1121  * Determine if the global page scanner is running, during which no memory
1122  * caps should be enforced, to prevent interference with the global page
1123  * scanner.
1124  */
1125 static boolean_t
1126 is_global_scanner_running()
1127 {
1128 	/* measure delta in page scan count */
1129 	static uint64_t new_sp = 0;
1130 	static uint64_t old_sp = 0;
1131 	boolean_t res = B_FALSE;
1132 
1133 	if (get_globally_scanned_pages(&new_sp) == 0) {
1134 		if (old_sp != 0 && (new_sp - old_sp) > 0) {
1135 			debug("global memory pressure detected (%llu "
1136 			    "pages scanned since last interval)\n",
1137 			    (unsigned long long)(new_sp - old_sp));
1138 			res = B_TRUE;
1139 		}
1140 		old_sp = new_sp;
1141 	} else {
1142 		warn(gettext("unable to read cpu statistics"));
1143 		new_sp = old_sp;
1144 	}
1145 
1146 	return (res);
1147 }
1148 
1149 /*
1150  * If soft caps are in use, determine if global memory pressure exceeds the
1151  * configured maximum above which soft caps are enforced.
1152  */
1153 static boolean_t
1154 must_enforce_soft_caps()
1155 {
1156 	/*
1157 	 * Check for changes to the amount of installed physical memory, to
1158 	 * compute the current memory pressure.
1159 	 */
1160 	update_phys_total();
1161 
1162 	memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1163 	    * 100.0 / phys_total);
1164 	memory_pressure_sample++;
1165 	if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1166 	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1167 		return (B_TRUE);
1168 	}
1169 
1170 	return (B_FALSE);
1171 }
1172 
1173 /*
1174  * Update the shared statistics file with each collection's current statistics.
1175  * Return zero on success.
1176  */
1177 static int
1178 update_statistics(void)
1179 {
1180 	int fd, res;
1181 	static char template[LINELEN];
1182 
1183 	/*
1184 	 * Try to create a directory irrespective of whether it is existing
1185 	 * or not. If it is not there then it will create. Otherwise any way
1186 	 * it will fail at mkstemp call below.
1187 	 */
1188 	(void) mkdir(STAT_FILE_DIR, 0755);
1189 
1190 	/*
1191 	 * Create a temporary file.
1192 	 */
1193 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1194 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1195 		debug("temporary file template size too small\n");
1196 		return (-1);
1197 	}
1198 	(void) strcpy(template, rcfg.rcfg_stat_file);
1199 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
1200 	(void) rfd_reserve(1);
1201 	fd = mkstemp(template);
1202 
1203 	/*
1204 	 * Write the header and per-collection statistics.
1205 	 */
1206 	if (fd >= 0) {
1207 		rcapd_stat_hdr_t rs;
1208 
1209 		rs.rs_pid = rcapd_pid;
1210 		rs.rs_time = gethrtime();
1211 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1212 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1213 		rs.rs_pressure_cur = memory_pressure;
1214 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1215 		rs.rs_pressure_sample = memory_pressure_sample;
1216 
1217 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1218 		    sizeof (rs)) {
1219 			list_walk_collection(report_collection_cb,
1220 				(void *)(intptr_t)fd);
1221 			/*
1222 			 * Replace the existing statistics file with this new
1223 			 * one.
1224 			 */
1225 			res = rename(template, rcfg.rcfg_stat_file);
1226 		} else
1227 			res = -1;
1228 		(void) close(fd);
1229 	} else
1230 		res = -1;
1231 
1232 	return (res);
1233 }
1234 
1235 /*
1236  * Verify the statistics file can be created and written to, and die if an
1237  * existing file may be in use by another rcapd.
1238  */
1239 static int
1240 verify_statistics(void)
1241 {
1242 	pid_t pid;
1243 
1244 	/*
1245 	 * Warn if another instance of rcapd might be active.
1246 	 */
1247 	(void) rfd_reserve(1);
1248 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1249 	if (pid != rcapd_pid && pid != -1)
1250 		die(gettext("%s exists; rcapd may already be active\n"),
1251 		    rcfg.rcfg_stat_file);
1252 
1253 	return (update_statistics());
1254 }
1255 
1256 static int
1257 sum_excess_cb(lcollection_t *lcol, void *arg)
1258 {
1259 	uint64_t *sum_excess = arg;
1260 
1261 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1262 	    lcol->lcol_rss_cap));
1263 	return (0);
1264 }
1265 
1266 /*
1267  * Compute the quantity of memory (in kilobytes) above the cap enforcement
1268  * pressure.  Set the scan goal to that quantity (or at most the excess).
1269  */
1270 static void
1271 compute_soft_scan_goal(soft_scan_arg_t *argp)
1272 {
1273 	/*
1274 	 * Compute the sum of the collections' excesses, which will be the
1275 	 * denominator.
1276 	 */
1277 	argp->ssa_sum_excess = 0;
1278 	list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1279 
1280 	argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1281 	    (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1282 	    sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1283 	    argp->ssa_sum_excess);
1284 }
1285 
/*
 * Emit the command-line usage summary as an informational message.
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
1291 
1292 void
1293 check_update_statistics(void)
1294 {
1295 	hrtime_t now = gethrtime();
1296 
1297 	if (EVENT_TIME(now, next_report)) {
1298 		debug("updating statistics...\n");
1299 		list_walk_collection(simple_report_collection_cb, NULL);
1300 		if (update_statistics() != 0)
1301 			debug("couldn't update statistics");
1302 		next_report = NEXT_REPORT_EVENT_TIME(now,
1303 		    rcfg.rcfg_report_interval);
1304 	}
1305 }
1306 
1307 static void
1308 verify_and_set_privileges(void)
1309 {
1310 	priv_set_t *required =
1311 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1312 
1313 	/*
1314 	 * Ensure the required privileges, suitable for controlling processes,
1315 	 * are possessed.
1316 	 */
1317 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1318 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1319 		die(gettext("can't set requisite privileges"));
1320 
1321 	/*
1322 	 * Ensure access to /var/run/daemon.
1323 	 */
1324 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1325 		die(gettext("cannot become user daemon"));
1326 
1327 	priv_freeset(required);
1328 }
1329 
1330 /*
1331  * This function does the top-level work to determine if we should do any
1332  * memory capping, and if so, it invokes the right call-backs to do the work.
1333  */
1334 static void
1335 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1336 {
1337 	boolean_t enforce_caps;
1338 	/* soft cap enforcement flag, depending on memory pressure */
1339 	boolean_t enforce_soft_caps;
1340 	/* avoid interference with kernel's page scanner */
1341 	boolean_t global_scanner_running;
1342 	sample_col_arg_t col_arg;
1343 	soft_scan_arg_t arg;
1344 	uint_t col_types = 0;
1345 
1346 	/* check what kind of collections (project/zone) are capped */
1347 	list_walk_collection(col_type_cb, &col_types);
1348 	debug("collection types: 0x%x\n", col_types);
1349 
1350 	/* no capped collections, skip checking rss */
1351 	if (col_types == 0)
1352 		return;
1353 
1354 	/* Determine if soft caps are enforced. */
1355 	enforce_soft_caps = must_enforce_soft_caps();
1356 
1357 	/* Determine if the global page scanner is running. */
1358 	global_scanner_running = is_global_scanner_running();
1359 
1360 	/*
1361 	 * Sample collections' member processes RSSes and recompute
1362 	 * collections' excess.
1363 	 */
1364 	rss_sample(B_FALSE, col_types);
1365 
1366 	col_arg.sca_any_over_cap = B_FALSE;
1367 	col_arg.sca_project_over_cap = B_FALSE;
1368 	list_walk_collection(rss_sample_col_cb, &col_arg);
1369 	list_walk_collection(excess_print_cb, NULL);
1370 	debug("any collection/project over cap = %d, %d\n",
1371 	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1372 
1373 	if (enforce_soft_caps)
1374 		debug("memory pressure %d%%\n", memory_pressure);
1375 
1376 	/*
1377 	 * Cap enforcement is determined by the previous conditions.
1378 	 */
1379 	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1380 	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1381 	    enforce_soft_caps);
1382 
1383 	debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1384 
1385 	/*
1386 	 * If soft caps are in use, determine the size of the portion from each
1387 	 * collection to scan for.
1388 	 */
1389 	if (enforce_caps && enforce_soft_caps)
1390 		compute_soft_scan_goal(&arg);
1391 
1392 	/*
1393 	 * Victimize offending collections.
1394 	 */
1395 	if (enforce_caps && (!enforce_soft_caps ||
1396 	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1397 
1398 		/*
1399 		 * Since at least one collection is over its cap & needs
1400 		 * enforcing, check if it is at least time for a process walk
1401 		 * (we could be well past time since we only walk /proc when
1402 		 * we need to) and if so, update each collections process list
1403 		 * in a single pass through /proc.
1404 		 */
1405 		if (EVENT_TIME(now, *next_proc_walk)) {
1406 			debug("scanning process list...\n");
1407 			proc_walk_all(proc_cb);		 /* insert & mark */
1408 			list_walk_all(sweep_process_cb); /* free dead procs */
1409 			*next_proc_walk = NEXT_EVENT_TIME(now,
1410 			    rcfg.rcfg_proc_walk_interval);
1411 		}
1412 
1413 		gz_col = NULL;
1414 		if (enforce_soft_caps) {
1415 			debug("scan goal is %lldKB\n",
1416 			    (long long)arg.ssa_scan_goal);
1417 			list_walk_collection(soft_scan_cb, &arg);
1418 			if (gz_capped && gz_col != NULL) {
1419 				/* process global zone */
1420 				arg.ssa_project_over_cap =
1421 				    col_arg.sca_project_over_cap;
1422 				soft_scan_gz(gz_col, &arg);
1423 			}
1424 		} else {
1425 			list_walk_collection(scan_cb, NULL);
1426 			if (gz_capped && gz_col != NULL) {
1427 				/* process global zone */
1428 				scan_gz(gz_col, col_arg.sca_project_over_cap);
1429 			}
1430 		}
1431 	} else if (col_arg.sca_any_over_cap) {
1432 		list_walk_collection(unenforced_cap_cb, NULL);
1433 	}
1434 }
1435 
1436 int
1437 main(int argc, char *argv[])
1438 {
1439 	int res;
1440 	int should_fork = 1;	/* fork flag */
1441 	hrtime_t now;		/* current time */
1442 	hrtime_t next;		/* time of next event */
1443 	int sig;		/* signal iteration */
1444 	struct rlimit rl;
1445 	hrtime_t next_proc_walk;	/* time of next /proc scan */
1446 	hrtime_t next_configuration;	/* time of next configuration */
1447 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
1448 
1449 	(void) set_message_priority(RCM_INFO);
1450 	(void) setprogname("rcapd");
1451 	rcapd_pid = getpid();
1452 	(void) chdir("/");
1453 	should_run = 1;
1454 	ever_ran = 0;
1455 
1456 	(void) setlocale(LC_ALL, "");
1457 	(void) textdomain(TEXT_DOMAIN);
1458 
1459 	/*
1460 	 * Parse command-line options.
1461 	 */
1462 	while ((res = getopt(argc, argv, "dF")) > 0)
1463 		switch (res) {
1464 		case 'd':
1465 			should_fork = 0;
1466 			if (debug_mode == 0) {
1467 				debug_mode = 1;
1468 				(void) set_message_priority(RCM_DEBUG);
1469 			} else
1470 				(void) set_message_priority(RCM_DEBUG_HIGH);
1471 			break;
1472 		case 'F':
1473 			should_fork = 0;
1474 			break;
1475 		default:
1476 			rcapd_usage();
1477 			return (E_USAGE);
1478 			/*NOTREACHED*/
1479 		}
1480 
1481 	/*
1482 	 * Read the configuration.
1483 	 */
1484 	if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
1485 		warn(gettext("resource caps not configured\n"));
1486 		return (SMF_EXIT_ERR_CONFIG);
1487 	}
1488 
1489 	/*
1490 	 * If not debugging, fork and continue operating, changing the
1491 	 * destination of messages to syslog().
1492 	 */
1493 	if (should_fork == 1) {
1494 		pid_t child;
1495 		debug("forking\n");
1496 		child = fork();
1497 		if (child == -1)
1498 			die(gettext("cannot fork"));
1499 		if (child > 0)
1500 			return (0);
1501 		else {
1502 			rcapd_pid = getpid();
1503 			(void) set_message_destination(RCD_SYSLOG);
1504 			(void) fclose(stdin);
1505 			(void) fclose(stdout);
1506 			(void) fclose(stderr);
1507 		}
1508 		/*
1509 		 * Start a new session and detatch from the controlling tty.
1510 		 */
1511 		if (setsid() == (pid_t)-1)
1512 			debug(gettext("setsid() failed; cannot detach from "
1513 			    "terminal"));
1514 	}
1515 
1516 	finish_configuration();
1517 	should_reconfigure = 0;
1518 
1519 	/*
1520 	 * Check that required privileges are possessed.
1521 	 */
1522 	verify_and_set_privileges();
1523 
1524 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1525 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
1526 	    rcfg.rcfg_reconfiguration_interval);
1527 
1528 	/*
1529 	 * Open the kstat chain.
1530 	 */
1531 	kctl = kstat_open();
1532 	if (kctl == NULL)
1533 		die(gettext("can't open kstats"));
1534 
1535 	/*
1536 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1537 	 * be effectively managed without revoking descriptors (at 3 per
1538 	 * process).
1539 	 */
1540 	rl.rlim_cur = 32 * 1024;
1541 	rl.rlim_max = 32 * 1024;
1542 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1543 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1544 		rl.rlim_cur = rl.rlim_max;
1545 		(void) setrlimit(RLIMIT_NOFILE, &rl);
1546 	}
1547 	(void) enable_extended_FILE_stdio(-1, -1);
1548 
1549 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1550 		debug("fd limit: %lu\n", rl.rlim_cur);
1551 	else
1552 		debug("fd limit: unknown\n");
1553 
1554 	get_page_size();
1555 	my_zoneid = getzoneid();
1556 
1557 	/*
1558 	 * Handle those signals whose (default) exit disposition
1559 	 * prevents rcapd from finishing scanning before terminating.
1560 	 */
1561 	(void) sigset(SIGINT, terminate_signal);
1562 	(void) sigset(SIGQUIT, abort_signal);
1563 	(void) sigset(SIGILL, abort_signal);
1564 	(void) sigset(SIGEMT, abort_signal);
1565 	(void) sigset(SIGFPE, abort_signal);
1566 	(void) sigset(SIGBUS, abort_signal);
1567 	(void) sigset(SIGSEGV, abort_signal);
1568 	(void) sigset(SIGSYS, abort_signal);
1569 	(void) sigset(SIGPIPE, terminate_signal);
1570 	(void) sigset(SIGALRM, terminate_signal);
1571 	(void) sigset(SIGTERM, terminate_signal);
1572 	(void) sigset(SIGUSR1, terminate_signal);
1573 	(void) sigset(SIGUSR2, terminate_signal);
1574 	(void) sigset(SIGPOLL, terminate_signal);
1575 	(void) sigset(SIGVTALRM, terminate_signal);
1576 	(void) sigset(SIGXCPU, abort_signal);
1577 	(void) sigset(SIGXFSZ, abort_signal);
1578 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1579 		(void) sigset(sig, terminate_signal);
1580 
1581 	/*
1582 	 * Install a signal handler for reconfiguration processing.
1583 	 */
1584 	(void) sigset(SIGHUP, sighup);
1585 
1586 	/*
1587 	 * Determine which process collections to cap.
1588 	 */
1589 	lcollection_update(LCU_COMPLETE);
1590 
1591 	/*
1592 	 * Loop forever, monitoring collections' resident set sizes and
1593 	 * enforcing their caps.  Look for changes in caps as well as
1594 	 * responding to requests to reread the configuration.  Update
1595 	 * per-collection statistics periodically.
1596 	 */
1597 	while (should_run != 0) {
1598 		struct timespec ts;
1599 
1600 		/*
1601 		 * Announce that rcapd is starting.
1602 		 */
1603 		if (ever_ran == 0) {
1604 			info(gettext("starting\n"));
1605 			ever_ran = 1;
1606 		}
1607 
1608 		/*
1609 		 * Check the configuration at every next_configuration interval.
1610 		 * Update the rss data once every next_rss_sample interval.
1611 		 * The condition of global memory pressure is also checked at
1612 		 * the same frequency, if strict caps are in use.
1613 		 */
1614 		now = gethrtime();
1615 
1616 		/*
1617 		 * Detect configuration and cap changes only when SIGHUP
1618 		 * is received. Call reconfigure to apply new configuration
1619 		 * parameters.
1620 		 */
1621 		if (should_reconfigure == 1) {
1622 			reread_configuration();
1623 			should_reconfigure = 0;
1624 			reconfigure(now, &next_configuration, &next_proc_walk,
1625 			    &next_rss_sample);
1626 		}
1627 
1628 		if (EVENT_TIME(now, next_configuration)) {
1629 			reconfigure(now, &next_configuration, &next_proc_walk,
1630 			    &next_rss_sample);
1631 		}
1632 
1633 		/*
1634 		 * Do the main work for enforcing caps.
1635 		 */
1636 		if (EVENT_TIME(now, next_rss_sample)) {
1637 			do_capping(now, &next_proc_walk);
1638 
1639 			next_rss_sample = NEXT_EVENT_TIME(now,
1640 			    rcfg.rcfg_rss_sample_interval);
1641 		}
1642 
1643 		/*
1644 		 * Update the statistics file, if it's time.
1645 		 */
1646 		check_update_statistics();
1647 
1648 		/*
1649 		 * Sleep for some time before repeating.
1650 		 */
1651 		now = gethrtime();
1652 		next = next_configuration;
1653 		next = POSITIVE_MIN(next, next_report);
1654 		next = POSITIVE_MIN(next, next_rss_sample);
1655 		if (next > now && should_run != 0) {
1656 			debug("sleeping %-4.2f seconds\n", (float)(next -
1657 			    now) / (float)NANOSEC);
1658 			hrt2ts(next - now, &ts);
1659 			(void) nanosleep(&ts, NULL);
1660 		}
1661 	}
1662 	if (termination_signal != 0)
1663 		debug("exiting due to signal %d\n", termination_signal);
1664 	if (ever_ran != 0)
1665 		info(gettext("exiting\n"));
1666 
1667 	/*
1668 	 * Unlink the statistics file before exiting.
1669 	 */
1670 	if (rcfg.rcfg_stat_file[0] != 0)
1671 		(void) unlink(rcfg.rcfg_stat_file);
1672 
1673 	return (E_SUCCESS);
1674 }
1675