xref: /titanic_44/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 5fd03bc0f2e00e7ba02316c2e08f45d52aab15db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * rcapd is a long-running daemon enforcing project-based resource caps (see
28  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
29  * "collection") may have a memory cap.  A single thread monitors the resource
30  * utilization of capped collections, enforces caps when they are exceeded (and
31  * other conditions are met), and incorporates changes in configuration or
32  * caps.  Each of these actions occurs not more frequently than the rate
33  * specified with rcapadm(1M).
34  */
35 
36 #include <sys/priocntl.h>
37 #include <sys/proc.h>
38 #include <sys/resource.h>
39 #include <sys/sysinfo.h>
40 #include <sys/stat.h>
41 #include <sys/sysmacros.h>
42 #include <sys/time.h>
43 #include <sys/types.h>
44 #include <dirent.h>
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <kstat.h>
48 #include <libintl.h>
49 #include <limits.h>
50 #include <locale.h>
51 #include <priv.h>
52 #include <signal.h>
53 #include <stdarg.h>
54 #include <stdio.h>
55 #include <stdio_ext.h>
56 #include <stdlib.h>
57 #include <libscf.h>
58 #include <strings.h>
59 #include <time.h>
60 #include <unistd.h>
61 #include <zone.h>
62 #include <assert.h>
63 #include <sys/vm_usage.h>
64 #include "rcapd.h"
65 #include "rcapd_mapping.h"
66 #include "rcapd_rfd.h"
67 #include "rcapd_stat.h"
68 #include "utils.h"
69 
/*
 * Smaller of two values, ignoring any that is not positive: if x is not
 * positive the result is y, and if y is not positive the result is x.
 * Arguments may be evaluated more than once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * Time of an event occurring the given number of seconds after base, or zero
 * (meaning "never") if the interval is not positive.  Every argument is
 * parenthesized so that compound expressions expand as intended.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * Time of the next report, or zero if no statistics file is configured.
 * Note that the base argument is accepted but not used: the interval is
 * always measured from the current gethrtime().
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), (seconds)) : (hrtime_t)0)

/*
 * True if the event scheduled for eventtime is due at the given time.  An
 * eventtime of zero never fires.
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bits describing which kinds of capped collections are configured. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
85 
86 typedef struct soft_scan_arg {
87 	uint64_t ssa_sum_excess;
88 	int64_t ssa_scan_goal;
89 	boolean_t ssa_project_over_cap;
90 } soft_scan_arg_t;
91 
92 typedef struct sample_col_arg {
93 	boolean_t sca_any_over_cap;
94 	boolean_t sca_project_over_cap;
95 } sample_col_arg_t;
96 
97 
98 static int debug_mode = 0;		/* debug mode flag */
99 static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
100 					/* scanned */
101 static kstat_ctl_t *kctl;		/* kstat chain */
102 static int memory_pressure = 0;		/* physical memory utilization (%) */
103 static int memory_pressure_sample = 0;	/* count of samples */
104 static long page_size_kb = 0;		/* system page size in KB */
105 static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
106 static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
107 static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
108 static hrtime_t next_report;		/* time of next report */
109 static int termination_signal = 0;	/* terminating signal */
110 static zoneid_t my_zoneid = (zoneid_t)-1;
111 static lcollection_t *gz_col;		/* global zone collection */
112 
113 rcfg_t rcfg;
114 /*
115  * Updated when we re-read the collection configurations if this rcapd instance
116  * is running in the global zone and the global zone is capped.
117  */
118 boolean_t gz_capped = B_FALSE;
119 
120 /*
121  * Flags.
122  */
123 static int ever_ran;
124 int should_run;
125 static int should_reconfigure;
126 
127 static int verify_statistics(void);
128 static int update_statistics(void);
129 
130 /*
131  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
132  */
133 static boolean_t
134 proc_issystem(pid_t pid)
135 {
136 	char pc_clname[PC_CLNMSZ];
137 
138 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
139 	    PC_KY_NULL) != -1) {
140 		return (strcmp(pc_clname, "SYS") == 0);
141 	} else {
142 		debug("cannot get class-specific scheduling parameters; "
143 		    "assuming system process\n");
144 		return (B_TRUE);
145 	}
146 }
147 
148 static void
149 lprocess_insert_mark(psinfo_t *psinfop)
150 {
151 	pid_t pid = psinfop->pr_pid;
152 	/* flag indicating whether the process should be scanned. */
153 	int unscannable = psinfop->pr_nlwp == 0;
154 	rcid_t colid;
155 	lcollection_t *lcol;
156 	lprocess_t *lproc;
157 
158 	/*
159 	 * Determine which collection to put this process into.  We only have
160 	 * to worry about tracking both zone and project capped processes if
161 	 * this rcapd instance is running in the global zone, since we'll only
162 	 * see processes in our own projects in a non-global zone.  In the
163 	 * global zone, if the process belongs to a non-global zone, we only
164 	 * need to track it for the capped non-global zone collection.  For
165 	 * global zone processes, we first attempt to put the process into a
166 	 * capped project collection.  On the second pass into this function
167 	 * the projid will be cleared so we will just track the process for the
168 	 * global zone collection as a whole.
169 	 */
170 	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
171 		colid.rcid_type = RCIDT_PROJECT;
172 		colid.rcid_val = psinfop->pr_projid;
173 	} else {
174 		/* try to add to zone collection */
175 		colid.rcid_type = RCIDT_ZONE;
176 		colid.rcid_val = psinfop->pr_zoneid;
177 	}
178 
179 	if ((lcol = lcollection_find(&colid)) == NULL)
180 		return;
181 
182 	/*
183 	 * If the process is already being tracked, update the unscannable flag,
184 	 * as determined by the caller, from the process's psinfo.
185 	 */
186 	lproc = lcol->lcol_lprocess;
187 	while (lproc != NULL) {
188 		if (lproc->lpc_pid == pid) {
189 			lproc->lpc_mark = 1;
190 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
191 				debug("process %d: became unscannable\n",
192 				    (int)lproc->lpc_pid);
193 				lproc->lpc_unscannable = 1;
194 			}
195 			return;
196 		}
197 		lproc = lproc->lpc_next;
198 	}
199 
200 	/*
201 	 * We've fallen off the list without finding our current process;
202 	 * insert it at the list head.
203 	 */
204 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
205 		debug("insufficient memory to track new process %d", (int)pid);
206 	else {
207 		(void) bzero(lproc, sizeof (*lproc));
208 		lproc->lpc_pid = pid;
209 		lproc->lpc_mark = 1;
210 		lproc->lpc_collection = lcol;
211 		lproc->lpc_psinfo_fd = -1;
212 		lproc->lpc_pgdata_fd = -1;
213 		lproc->lpc_xmap_fd = -1;
214 
215 		/*
216 		 * If the caller didn't flag this process as unscannable
217 		 * already, do some more checking.
218 		 */
219 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
220 
221 #ifdef DEBUG
222 		/*
223 		 * Verify the sanity of lprocess.  It should not contain the
224 		 * process we are about to prepend.
225 		 */
226 		if (lcollection_member(lcol, lproc)) {
227 			lprocess_t *cur = lcol->lcol_lprocess;
228 			debug("The collection %lld already has these members, "
229 			    "including me, %d!\n",
230 			    (long long)lcol->lcol_id.rcid_val,
231 			    (int)lproc->lpc_pid);
232 			while (cur != NULL) {
233 				debug("\t%d\n", (int)cur->lpc_pid);
234 				cur = cur->lpc_next;
235 			}
236 			info(gettext("process already on lprocess\n"));
237 			abort();
238 		}
239 #endif /* DEBUG */
240 		lproc->lpc_next = lcol->lcol_lprocess;
241 		if (lproc->lpc_next != NULL)
242 			lproc->lpc_next->lpc_prev = lproc;
243 		lproc->lpc_prev = NULL;
244 		lcol->lcol_lprocess = lproc;
245 
246 		debug("tracking %s %ld %d %s%s\n",
247 		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
248 		    (long)colid.rcid_val,
249 		    (int)pid, psinfop->pr_psargs,
250 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
251 		lcol->lcol_stat.lcols_proc_in++;
252 	}
253 }
254 
255 static int
256 list_walk_process_cb(lcollection_t *lcol, void *arg)
257 {
258 	int (*cb)(lcollection_t *, lprocess_t *) =
259 	    (int(*)(lcollection_t *, lprocess_t *))arg;
260 	lprocess_t *member;
261 	lprocess_t *next;
262 
263 	member = lcol->lcol_lprocess;
264 	while (member != NULL) {
265 		pid_t pid = member->lpc_pid;
266 		next = member->lpc_next;
267 
268 		debug_high("list_walk_all lpc %d\n", (int)pid);
269 		if (cb(lcol, member) != 0) {
270 			debug_high("list_walk_all aborted at lpc %d\n",
271 			    (int)pid);
272 			return (1);
273 		}
274 		member = next;
275 	}
276 
277 	return (0);
278 }
279 
280 /*
281  * Invoke the given callback for each process in each collection.  Callbacks
282  * are allowed to change the linkage of the process on which they act.
283  */
284 static void
285 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
286 {
287 	list_walk_collection(list_walk_process_cb, (void *)cb);
288 }
289 
290 static void
291 revoke_psinfo(rfd_t *rfd)
292 {
293 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
294 
295 	if (lpc != NULL) {
296 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
297 		ASSERT(lpc->lpc_psinfo_fd != -1);
298 		lpc->lpc_psinfo_fd = -1;
299 	} else
300 		debug("revoking psinfo fd for unknown process\n");
301 }
302 
303 /*
304  * Retrieve a process's psinfo via an already-opened or new file descriptor.
305  * The supplied descriptor will be closed on failure.  An optional callback
306  * will be invoked with the last descriptor tried, and a supplied callback
307  * argument, as its arguments, such that the new descriptor may be cached, or
308  * an old one may be invalidated.  If the result of the callback is zero, the
309  * the caller is to assume responsibility for the file descriptor, to close it
310  * with rfd_close().
311  *
312  * On failure, a nonzero value is returned.
313  */
314 int
315 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
316     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
317 {
318 	int fd;
319 	int can_try_uncached;
320 
321 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
322 
323 	do {
324 		if (cached_fd >= 0) {
325 			fd = cached_fd;
326 			can_try_uncached = 1;
327 			debug_high("%d/psinfo, trying cached fd %d\n",
328 			    (int)pid, fd);
329 		} else {
330 			char pathbuf[PROC_PATH_MAX];
331 
332 			can_try_uncached = 0;
333 			(void) snprintf(pathbuf, sizeof (pathbuf),
334 			    "/proc/%d/psinfo", (int)pid);
335 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
336 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
337 				debug("cannot open %s", pathbuf);
338 				break;
339 			} else
340 				debug_high("opened %s, fd %d\n", pathbuf, fd);
341 		}
342 
343 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
344 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
345 			break;
346 		else {
347 			debug_high("closed fd %d\n", fd);
348 			if (rfd_close(fd) != 0)
349 				debug("could not close fd %d", fd);
350 			fd = cached_fd = -1;
351 		}
352 	} while (can_try_uncached == 1);
353 
354 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
355 		if (fd >= 0) {
356 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
357 			    "uncached" : "cached", fd);
358 			if (rfd_close(fd) != 0)
359 				debug("could not close fd %d", fd);
360 		}
361 
362 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
363 	    fd_update_cb != NULL ? "cached" : "uncached");
364 	return ((fd >= 0) ? 0 : -1);
365 }
366 
367 /*
368  * Retrieve the collection membership of all processes and update the psinfo of
369  * those non-system, non-zombie ones in collections.  For global zone processes,
370  * we first attempt to put the process into a capped project collection.  We
371  * also want to track the process for the global zone collection as a whole.
372  */
373 static void
374 proc_cb(const pid_t pid)
375 {
376 	psinfo_t psinfo;
377 
378 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
379 		lprocess_insert_mark(&psinfo);
380 		if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
381 			/*
382 			 * We also want to track this process for the global
383 			 * zone as a whole so add it to the global zone
384 			 * collection as well.
385 			 */
386 			psinfo.pr_projid = -1;
387 			lprocess_insert_mark(&psinfo);
388 		}
389 	}
390 }
391 
392 /*
393  * Cache the process' psinfo fd, taking responsibility for freeing it.
394  */
395 int
396 lprocess_update_psinfo_fd_cb(void *arg, int fd)
397 {
398 	lprocess_t *lpc = arg;
399 
400 	lpc->lpc_psinfo_fd = fd;
401 	return (0);
402 }
403 
404 /*
405  * Get the system pagesize.
406  */
407 static void
408 get_page_size(void)
409 {
410 	page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
411 	debug("physical page size: %luKB\n", page_size_kb);
412 }
413 
414 static void
415 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
416 {
417 	hrtime_t diff = t2 - t1;
418 
419 	if (diff < MILLISEC)
420 		debug("%s: %lld nanoseconds\n", msg, diff);
421 	else if (diff < MICROSEC)
422 		debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
423 	else if (diff < NANOSEC)
424 		debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
425 	else
426 		debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
427 }
428 
429 /*
430  * Get the zone's & project's RSS from the kernel.
431  */
432 static void
433 rss_sample(boolean_t my_zone_only, uint_t col_types)
434 {
435 	size_t nres;
436 	size_t i;
437 	uint_t flags;
438 	hrtime_t t1, t2;
439 
440 	if (my_zone_only) {
441 		flags = VMUSAGE_ZONE;
442 	} else {
443 		flags = 0;
444 		if (col_types & CAPPED_PROJECT)
445 			flags |= VMUSAGE_PROJECTS;
446 		if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
447 			flags |= VMUSAGE_ALL_ZONES;
448 	}
449 
450 	debug("vmusage sample flags 0x%x\n", flags);
451 	if (flags == 0)
452 		return;
453 
454 again:
455 	/* try the current buffer to see if the list will fit */
456 	nres = vmu_vals_len;
457 	t1 = gethrtime();
458 	if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
459 	    vmu_vals, &nres) != 0) {
460 		if (errno != EOVERFLOW) {
461 			warn(gettext("can't read RSS from kernel\n"));
462 			return;
463 		}
464 	}
465 	t2 = gethrtime();
466 	tm_fmt("getvmusage time", t1, t2);
467 
468 	debug("kernel nres %lu\n", (ulong_t)nres);
469 
470 	if (nres > vmu_vals_len) {
471 		/* array size is now too small, increase it and try again */
472 		free(vmu_vals);
473 
474 		if ((vmu_vals = (vmusage_t *)calloc(nres,
475 		    sizeof (vmusage_t))) == NULL) {
476 			warn(gettext("out of memory: could not read RSS from "
477 			    "kernel\n"));
478 			vmu_vals_len = nvmu_vals = 0;
479 			return;
480 		}
481 		vmu_vals_len = nres;
482 		goto again;
483 	}
484 
485 	nvmu_vals = nres;
486 
487 	debug("vmusage_sample\n");
488 	for (i = 0; i < nvmu_vals; i++) {
489 		debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
490 		    "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
491 		    vmu_vals[i].vmu_type,
492 		    (unsigned long long)vmu_vals[i].vmu_rss_all,
493 		    (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
494 		    (unsigned long long)vmu_vals[i].vmu_swap_all);
495 	}
496 }
497 
498 static void
499 update_col_rss(lcollection_t *lcol)
500 {
501 	int i;
502 
503 	lcol->lcol_rss = 0;
504 	lcol->lcol_image_size = 0;
505 
506 	for (i = 0; i < nvmu_vals; i++) {
507 		if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
508 			continue;
509 
510 		if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
511 		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
512 			continue;
513 
514 		if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
515 		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
516 			continue;
517 
518 		/* we found the right RSS entry, update the collection vals */
519 		lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
520 		lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
521 		break;
522 	}
523 }
524 
525 /*
526  * Sample the collection RSS, updating the collection's statistics with the
527  * results.  Also, sum the rss of all capped projects & return true if
528  * the collection is over cap.
529  */
530 static int
531 rss_sample_col_cb(lcollection_t *lcol, void *arg)
532 {
533 	int64_t excess;
534 	uint64_t rss;
535 	sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
536 
537 	update_col_rss(lcol);
538 
539 	lcol->lcol_stat.lcols_rss_sample++;
540 	rss = lcol->lcol_rss;
541 	excess = rss - lcol->lcol_rss_cap;
542 	if (excess > 0) {
543 		lcol->lcol_stat.lcols_rss_act_sum += rss;
544 		col_argp->sca_any_over_cap = B_TRUE;
545 		if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
546 			col_argp->sca_project_over_cap = B_TRUE;
547 	}
548 	lcol->lcol_stat.lcols_rss_sum += rss;
549 
550 	if (lcol->lcol_stat.lcols_min_rss > rss)
551 		lcol->lcol_stat.lcols_min_rss = rss;
552 	if (lcol->lcol_stat.lcols_max_rss < rss)
553 		lcol->lcol_stat.lcols_max_rss = rss;
554 
555 	return (0);
556 }
557 
558 /*
559  * Determine if we have capped projects, capped zones or both.
560  */
561 static int
562 col_type_cb(lcollection_t *lcol, void *arg)
563 {
564 	uint_t *col_type = (uint_t *)arg;
565 
566 	/* skip uncapped collections */
567 	if (lcol->lcol_rss_cap == 0)
568 		return (1);
569 
570 	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
571 		*col_type |= CAPPED_PROJECT;
572 	else
573 		*col_type |= CAPPED_ZONE;
574 
575 	/* once we know everything is capped, we can stop looking */
576 	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
577 		return (1);
578 
579 	return (0);
580 }
581 
582 /*
583  * Open /proc and walk entries.
584  */
585 static void
586 proc_walk_all(void (*cb)(const pid_t))
587 {
588 	DIR *pdir;
589 	struct dirent *dirent;
590 	pid_t pid;
591 
592 	(void) rfd_reserve(1);
593 	if ((pdir = opendir("/proc")) == NULL)
594 		die(gettext("couldn't open /proc!"));
595 
596 	while ((dirent = readdir(pdir)) != NULL) {
597 		if (strcmp(".", dirent->d_name) == 0 ||
598 		    strcmp("..", dirent->d_name) == 0)
599 			continue;
600 		pid = atoi(dirent->d_name);
601 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
602 		if (pid == rcapd_pid)
603 			continue;
604 		else
605 			cb(pid);
606 	}
607 	(void) closedir(pdir);
608 }
609 
610 /*
611  * Clear unmarked callback.
612  */
613 /*ARGSUSED*/
614 static int
615 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
616 {
617 	if (lpc->lpc_mark) {
618 		lpc->lpc_mark = 0;
619 	} else {
620 		debug("process %d finished\n", (int)lpc->lpc_pid);
621 		lprocess_free(lpc);
622 	}
623 
624 	return (0);
625 }
626 
627 /*
628  * Print, for debugging purposes, a collection's recently-sampled RSS and
629  * excess.
630  */
631 /*ARGSUSED*/
632 static int
633 excess_print_cb(lcollection_t *lcol, void *arg)
634 {
635 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
636 
637 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
638 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
639 	    lcol->lcol_name,
640 	    (unsigned long long)lcol->lcol_rss,
641 	    (unsigned long long)lcol->lcol_rss_cap,
642 	    (long long)excess);
643 
644 	return (0);
645 }
646 
647 /*
648  * Scan those collections which have exceeded their caps.
649  *
650  * If we're running in the global zone it might have a cap.  We don't want to
651  * do any capping for the global zone yet since we might get under the cap by
652  * just capping the projects in the global zone.
653  */
654 /*ARGSUSED*/
655 static int
656 scan_cb(lcollection_t *lcol, void *arg)
657 {
658 	int64_t excess;
659 
660 	/* skip over global zone collection for now but keep track for later */
661 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
662 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
663 		gz_col = lcol;
664 		return (0);
665 	}
666 
667 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
668 		scan(lcol, excess);
669 		lcol->lcol_stat.lcols_scan++;
670 	}
671 
672 	return (0);
673 }
674 
675 /*
676  * Scan the global zone collection and see if it still exceeds its cap.
677  * We take into account the effects of capping any global zone projects here.
678  */
679 static void
680 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
681 {
682 	int64_t excess;
683 
684 	/*
685 	 * If we had projects over their cap and the global zone was also over
686 	 * its cap then we need to get the up-to-date global zone rss to
687 	 * determine if we are still over the global zone cap.  We might have
688 	 * gone under while we scanned the capped projects.  If there were no
689 	 * projects over cap then we can use the rss value we already have for
690 	 * the global zone.
691 	 */
692 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
693 	if (project_over_cap && excess > 0) {
694 		rss_sample(B_TRUE, CAPPED_ZONE);
695 		update_col_rss(lcol);
696 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
697 	}
698 
699 	if (excess > 0) {
700 		debug("global zone excess %lldKB\n", (long long)excess);
701 		scan(lcol, excess);
702 		lcol->lcol_stat.lcols_scan++;
703 	}
704 }
705 
706 /*
707  * Do a soft scan of those collections which have excesses.  A soft scan is one
708  * in which the cap enforcement pressure is taken into account.  The difference
709  * between the utilized physical memory and the cap enforcement pressure will
710  * be scanned-for, and each collection will be scanned proportionally by their
711  * present excesses.
712  */
713 static int
714 soft_scan_cb(lcollection_t *lcol, void *a)
715 {
716 	int64_t excess;
717 	soft_scan_arg_t *arg = a;
718 
719 	/* skip over global zone collection for now but keep track for later */
720 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
721 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
722 		gz_col = lcol;
723 		return (0);
724 	}
725 
726 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
727 		int64_t adjusted_excess =
728 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
729 
730 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
731 		    "scanning %lld\n",
732 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
733 		    "project" : "zone"),
734 		    (long)lcol->lcol_id.rcid_val,
735 		    (long long)excess, (long long)arg->ssa_scan_goal,
736 		    (unsigned long long)arg->ssa_sum_excess,
737 		    (long long)adjusted_excess);
738 
739 		scan(lcol, adjusted_excess);
740 		lcol->lcol_stat.lcols_scan++;
741 	}
742 
743 	return (0);
744 }
745 
746 static void
747 soft_scan_gz(lcollection_t *lcol, void *a)
748 {
749 	int64_t excess;
750 	soft_scan_arg_t *arg = a;
751 
752 	/*
753 	 * If we had projects over their cap and the global zone was also over
754 	 * its cap then we need to get the up-to-date global zone rss to
755 	 * determine if we are still over the global zone cap.  We might have
756 	 * gone under while we scanned the capped projects.  If there were no
757 	 * projects over cap then we can use the rss value we already have for
758 	 * the global zone.
759 	 */
760 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
761 	if (arg->ssa_project_over_cap && excess > 0) {
762 		rss_sample(B_TRUE, CAPPED_ZONE);
763 		update_col_rss(lcol);
764 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
765 	}
766 
767 	if (excess > 0) {
768 		int64_t adjusted_excess =
769 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
770 
771 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
772 		    "scanning %lld\n",
773 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
774 		    "project" : "zone"),
775 		    (long)lcol->lcol_id.rcid_val,
776 		    (long long)excess, (long long)arg->ssa_scan_goal,
777 		    (unsigned long long)arg->ssa_sum_excess,
778 		    (long long)adjusted_excess);
779 
780 		scan(lcol, adjusted_excess);
781 		lcol->lcol_stat.lcols_scan++;
782 	}
783 }
784 
785 /*
786  * When a scan could happen, but caps aren't enforced tick the
787  * lcols_unenforced_cap counter.
788  */
789 /*ARGSUSED*/
790 static int
791 unenforced_cap_cb(lcollection_t *lcol, void *arg)
792 {
793 	lcol->lcol_stat.lcols_unenforced_cap++;
794 
795 	return (0);
796 }
797 
798 /*
799  * Update the count of physically installed memory.
800  */
801 static void
802 update_phys_total(void)
803 {
804 	uint64_t old_phys_total;
805 
806 	old_phys_total = phys_total;
807 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
808 	if (phys_total != old_phys_total)
809 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
810 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
811 }
812 
813 /*
814  * Unlink a process from its collection, updating relevant statistics, and
815  * freeing its associated memory.
816  */
817 void
818 lprocess_free(lprocess_t *lpc)
819 {
820 	pid_t pid;
821 
822 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
823 
824 	if (lpc->lpc_prev != NULL)
825 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
826 	if (lpc->lpc_next != NULL)
827 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
828 	if (lpc->lpc_collection->lcol_lprocess == lpc)
829 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
830 		    lpc ? lpc->lpc_next : NULL);
831 	lpc->lpc_next = lpc->lpc_prev = NULL;
832 
833 	if (lpc->lpc_prpageheader != NULL)
834 		free(lpc->lpc_prpageheader);
835 	if (lpc->lpc_xmap != NULL)
836 		free(lpc->lpc_xmap);
837 	if (lpc->lpc_psinfo_fd >= 0) {
838 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
839 			debug("could not close %d lpc_psinfo_fd %d",
840 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
841 		lpc->lpc_psinfo_fd = -1;
842 	}
843 	if (lpc->lpc_pgdata_fd >= 0) {
844 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
845 			debug("could not close %d lpc_pgdata_fd %d",
846 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
847 		lpc->lpc_pgdata_fd = -1;
848 	}
849 	if (lpc->lpc_xmap_fd >= 0) {
850 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
851 			debug("could not close %d lpc_xmap_fd %d",
852 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
853 		lpc->lpc_xmap_fd = -1;
854 	}
855 	if (lpc->lpc_ignore != NULL)
856 		lmapping_free(&lpc->lpc_ignore);
857 	pid = lpc->lpc_pid;
858 	free(lpc);
859 	debug_high("process %d freed\n", (int)pid);
860 }
861 
862 /*
863  * Collection clear callback.
864  */
865 /*ARGSUSED*/
866 static int
867 collection_clear_cb(lcollection_t *lcol, void *arg)
868 {
869 	lcol->lcol_mark = 0;
870 
871 	return (0);
872 }
873 
874 /*
875  * Respond to a terminating signal by setting a termination flag.
876  */
877 /*ARGSUSED*/
878 static void
879 terminate_signal(int signal)
880 {
881 	if (termination_signal == 0)
882 		termination_signal = signal;
883 	should_run = 0;
884 }
885 
/*
 * Handler for synchronous or asynchronous signals that would ordinarily make
 * the process abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Give the scanner a last chance to resume any processes it has
	 * stopped before going down.
	 */
	scan_abort();

	abort();
}
901 
902 /*
903  * Clean up collections which have been removed due to configuration.  Unlink
904  * the collection from lcollection and free it.
905  */
906 /*ARGSUSED*/
907 static int
908 collection_sweep_cb(lcollection_t *lcol, void *arg)
909 {
910 	if (lcol->lcol_mark == 0) {
911 		debug("freeing %s %s\n",
912 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
913 		    "project" : "zone"), lcol->lcol_name);
914 		lcollection_free(lcol);
915 	}
916 
917 	return (0);
918 }
919 
920 /*
921  * Set those variables which depend on the global configuration.
922  */
923 static void
924 finish_configuration(void)
925 {
926 	/*
927 	 * Warn that any lnode (or non-project) mode specification (by an SRM
928 	 * 1.3 configuration file, for example) is ignored.
929 	 */
930 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
931 		warn(gettext("%s mode specification ignored -- using project"
932 		    " mode\n"), rcfg.rcfg_mode_name);
933 		rcfg.rcfg_mode_name = "project";
934 		rcfg.rcfg_mode = rctype_project;
935 	}
936 }
937 
938 /*
939  * Cause the configuration to be reread and applied.
940  */
941 static void
942 reread_configuration(void)
943 {
944 	rcfg_t rcfg_new;
945 
946 	if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
947 		warn(gettext("can't reread configuration \n"));
948 		exit(SMF_EXIT_ERR_CONFIG);
949 	} else {
950 		/*
951 		 * Done reading configuration.  Remove existing
952 		 * collections in case there is a change in collection type.
953 		 */
954 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
955 			list_walk_collection(collection_clear_cb, NULL);
956 			list_walk_collection(collection_sweep_cb, NULL);
957 		}
958 
959 		/*
960 		 * Make the newly-read configuration the global one, and update
961 		 * any variables that depend on it.
962 		 */
963 		rcfg = rcfg_new;
964 		finish_configuration();
965 	}
966 }
967 
968 /*
969  * First, examine changes, additions, and deletions to cap definitions.
970  * Then, set the next event time.
971  */
972 static void
973 reconfigure(hrtime_t now, hrtime_t *next_configuration,
974     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
975 {
976 	debug("reconfigure...\n");
977 
978 	/*
979 	 * Walk the lcollection, marking active collections so inactive ones
980 	 * can be freed.
981 	 */
982 	list_walk_collection(collection_clear_cb, NULL);
983 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
984 	list_walk_collection(collection_sweep_cb, NULL);
985 
986 	*next_configuration = NEXT_EVENT_TIME(now,
987 	    rcfg.rcfg_reconfiguration_interval);
988 
989 	/*
990 	 * Reset each event time to the shorter of the previous and new
991 	 * intervals.
992 	 */
993 	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
994 		next_report = now;
995 	else
996 		next_report = POSITIVE_MIN(next_report,
997 		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
998 
999 	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1000 		*next_proc_walk = now;
1001 	else
1002 		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1003 		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1004 
1005 	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1006 		*next_rss_sample = now;
1007 	else
1008 		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1009 		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1010 }
1011 
1012 /*
1013  * Respond to SIGHUP by triggering the rereading the configuration and cap
1014  * definitions.
1015  */
1016 /*ARGSUSED*/
1017 static void
1018 sighup(int signal)
1019 {
1020 	should_reconfigure = 1;
1021 }
1022 
1023 /*
1024  * Print, for debugging purposes, each collection's interval statistics.
1025  */
1026 /*ARGSUSED*/
1027 static int
1028 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1029 {
1030 #define	DELTA(field) \
1031 	(unsigned long long)( \
1032 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1033 
1034 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1035 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
1036 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1037 	    "%llu scans over %llu ms\n",
1038 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1039 	    lcol->lcol_name,
1040 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1041 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1042 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1043 	    (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1044 	    (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1045 	    (unsigned long long)lcol->lcol_rss_cap,
1046 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1047 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1048 	    DELTA(lcols_scan_count),
1049 	    NSEC2MSEC(DELTA(lcols_scan_time_complete)));
1050 
1051 #undef DELTA
1052 
1053 	return (0);
1054 }
1055 
1056 /*
1057  * Record each collection's interval statistics in the statistics file.
1058  */
1059 static int
1060 report_collection_cb(lcollection_t *lcol, void *arg)
1061 {
1062 	lcollection_report_t dc;
1063 	int fd = (intptr_t)arg;
1064 
1065 	/*
1066 	 * Copy the relevant fields to the collection's record.
1067 	 */
1068 	bzero(&dc, sizeof (dc));
1069 	dc.lcol_id = lcol->lcol_id;
1070 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
1071 	dc.lcol_rss = lcol->lcol_rss;
1072 	dc.lcol_image_size = lcol->lcol_image_size;
1073 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
1074 	dc.lcol_stat = lcol->lcol_stat;
1075 
1076 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1077 		lcol->lcol_stat_old = lcol->lcol_stat;
1078 	} else {
1079 		debug("can't write %s %s statistics",
1080 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1081 		    "project" : "zone"),
1082 		    lcol->lcol_name);
1083 	}
1084 
1085 	return (0);
1086 }
1087 
1088 /*
1089  * Determine the count of pages scanned by the global page scanner, obtained
1090  * from the cpu_stat:*::scan kstats.  Return zero on success.
1091  */
1092 static int
1093 get_globally_scanned_pages(uint64_t *scannedp)
1094 {
1095 	kstat_t *ksp;
1096 	uint64_t scanned = 0;
1097 
1098 	if (kstat_chain_update(kctl) == -1) {
1099 		warn(gettext("can't update kstat chain"));
1100 		return (0);
1101 	}
1102 
1103 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1104 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1105 			if (kstat_read(kctl, ksp, NULL) != -1) {
1106 				scanned += ((cpu_stat_t *)
1107 				    ksp->ks_data)->cpu_vminfo.scan;
1108 			} else {
1109 				return (-1);
1110 			}
1111 		}
1112 	}
1113 
1114 	*scannedp = scanned;
1115 	return (0);
1116 }
1117 
1118 /*
1119  * Determine if the global page scanner is running, during which no memory
1120  * caps should be enforced, to prevent interference with the global page
1121  * scanner.
1122  */
1123 static boolean_t
1124 is_global_scanner_running()
1125 {
1126 	/* measure delta in page scan count */
1127 	static uint64_t new_sp = 0;
1128 	static uint64_t old_sp = 0;
1129 	boolean_t res = B_FALSE;
1130 
1131 	if (get_globally_scanned_pages(&new_sp) == 0) {
1132 		if (old_sp != 0 && (new_sp - old_sp) > 0) {
1133 			debug("global memory pressure detected (%llu "
1134 			    "pages scanned since last interval)\n",
1135 			    (unsigned long long)(new_sp - old_sp));
1136 			res = B_TRUE;
1137 		}
1138 		old_sp = new_sp;
1139 	} else {
1140 		warn(gettext("unable to read cpu statistics"));
1141 		new_sp = old_sp;
1142 	}
1143 
1144 	return (res);
1145 }
1146 
1147 /*
1148  * If soft caps are in use, determine if global memory pressure exceeds the
1149  * configured maximum above which soft caps are enforced.
1150  */
1151 static boolean_t
1152 must_enforce_soft_caps()
1153 {
1154 	/*
1155 	 * Check for changes to the amount of installed physical memory, to
1156 	 * compute the current memory pressure.
1157 	 */
1158 	update_phys_total();
1159 
1160 	memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1161 	    * 100.0 / phys_total);
1162 	memory_pressure_sample++;
1163 	if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1164 	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1165 		return (B_TRUE);
1166 	}
1167 
1168 	return (B_FALSE);
1169 }
1170 
1171 /*
1172  * Update the shared statistics file with each collection's current statistics.
1173  * Return zero on success.
1174  */
1175 static int
1176 update_statistics(void)
1177 {
1178 	int fd, res;
1179 	static char template[LINELEN];
1180 
1181 	/*
1182 	 * Try to create a directory irrespective of whether it is existing
1183 	 * or not. If it is not there then it will create. Otherwise any way
1184 	 * it will fail at mkstemp call below.
1185 	 */
1186 	(void) mkdir(STAT_FILE_DIR, 0755);
1187 
1188 	/*
1189 	 * Create a temporary file.
1190 	 */
1191 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1192 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1193 		debug("temporary file template size too small\n");
1194 		return (-1);
1195 	}
1196 	(void) strcpy(template, rcfg.rcfg_stat_file);
1197 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
1198 	(void) rfd_reserve(1);
1199 	fd = mkstemp(template);
1200 
1201 	/*
1202 	 * Write the header and per-collection statistics.
1203 	 */
1204 	if (fd >= 0) {
1205 		rcapd_stat_hdr_t rs;
1206 
1207 		rs.rs_pid = rcapd_pid;
1208 		rs.rs_time = gethrtime();
1209 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1210 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1211 		rs.rs_pressure_cur = memory_pressure;
1212 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1213 		rs.rs_pressure_sample = memory_pressure_sample;
1214 
1215 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1216 		    sizeof (rs)) {
1217 			list_walk_collection(report_collection_cb,
1218 			    (void *)(intptr_t)fd);
1219 			/*
1220 			 * Replace the existing statistics file with this new
1221 			 * one.
1222 			 */
1223 			res = rename(template, rcfg.rcfg_stat_file);
1224 		} else
1225 			res = -1;
1226 		(void) close(fd);
1227 	} else
1228 		res = -1;
1229 
1230 	return (res);
1231 }
1232 
1233 /*
1234  * Verify the statistics file can be created and written to, and die if an
1235  * existing file may be in use by another rcapd.
1236  */
1237 static int
1238 verify_statistics(void)
1239 {
1240 	pid_t pid;
1241 
1242 	/*
1243 	 * Warn if another instance of rcapd might be active.
1244 	 */
1245 	(void) rfd_reserve(1);
1246 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1247 	if (pid != rcapd_pid && pid != -1)
1248 		die(gettext("%s exists; rcapd may already be active\n"),
1249 		    rcfg.rcfg_stat_file);
1250 
1251 	return (update_statistics());
1252 }
1253 
1254 static int
1255 sum_excess_cb(lcollection_t *lcol, void *arg)
1256 {
1257 	uint64_t *sum_excess = arg;
1258 
1259 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1260 	    lcol->lcol_rss_cap));
1261 	return (0);
1262 }
1263 
1264 /*
1265  * Compute the quantity of memory (in kilobytes) above the cap enforcement
1266  * pressure.  Set the scan goal to that quantity (or at most the excess).
1267  */
1268 static void
1269 compute_soft_scan_goal(soft_scan_arg_t *argp)
1270 {
1271 	/*
1272 	 * Compute the sum of the collections' excesses, which will be the
1273 	 * denominator.
1274 	 */
1275 	argp->ssa_sum_excess = 0;
1276 	list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1277 
1278 	argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1279 	    (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1280 	    sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1281 	    argp->ssa_sum_excess);
1282 }
1283 
/*
 * Emit the command-line synopsis as an informational message.
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
1289 
1290 void
1291 check_update_statistics(void)
1292 {
1293 	hrtime_t now = gethrtime();
1294 
1295 	if (EVENT_TIME(now, next_report)) {
1296 		debug("updating statistics...\n");
1297 		list_walk_collection(simple_report_collection_cb, NULL);
1298 		if (update_statistics() != 0)
1299 			debug("couldn't update statistics");
1300 		next_report = NEXT_REPORT_EVENT_TIME(now,
1301 		    rcfg.rcfg_report_interval);
1302 	}
1303 }
1304 
1305 static void
1306 verify_and_set_privileges(void)
1307 {
1308 	priv_set_t *required =
1309 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1310 
1311 	/*
1312 	 * Ensure the required privileges, suitable for controlling processes,
1313 	 * are possessed.
1314 	 */
1315 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1316 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1317 		die(gettext("can't set requisite privileges"));
1318 
1319 	/*
1320 	 * Ensure access to /var/run/daemon.
1321 	 */
1322 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1323 		die(gettext("cannot become user daemon"));
1324 
1325 	priv_freeset(required);
1326 }
1327 
1328 /*
1329  * This function does the top-level work to determine if we should do any
1330  * memory capping, and if so, it invokes the right call-backs to do the work.
1331  */
1332 static void
1333 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1334 {
1335 	boolean_t enforce_caps;
1336 	/* soft cap enforcement flag, depending on memory pressure */
1337 	boolean_t enforce_soft_caps;
1338 	/* avoid interference with kernel's page scanner */
1339 	boolean_t global_scanner_running;
1340 	sample_col_arg_t col_arg;
1341 	soft_scan_arg_t arg;
1342 	uint_t col_types = 0;
1343 
1344 	/* check what kind of collections (project/zone) are capped */
1345 	list_walk_collection(col_type_cb, &col_types);
1346 	debug("collection types: 0x%x\n", col_types);
1347 
1348 	/* no capped collections, skip checking rss */
1349 	if (col_types == 0)
1350 		return;
1351 
1352 	/* Determine if soft caps are enforced. */
1353 	enforce_soft_caps = must_enforce_soft_caps();
1354 
1355 	/* Determine if the global page scanner is running. */
1356 	global_scanner_running = is_global_scanner_running();
1357 
1358 	/*
1359 	 * Sample collections' member processes RSSes and recompute
1360 	 * collections' excess.
1361 	 */
1362 	rss_sample(B_FALSE, col_types);
1363 
1364 	col_arg.sca_any_over_cap = B_FALSE;
1365 	col_arg.sca_project_over_cap = B_FALSE;
1366 	list_walk_collection(rss_sample_col_cb, &col_arg);
1367 	list_walk_collection(excess_print_cb, NULL);
1368 	debug("any collection/project over cap = %d, %d\n",
1369 	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1370 
1371 	if (enforce_soft_caps)
1372 		debug("memory pressure %d%%\n", memory_pressure);
1373 
1374 	/*
1375 	 * Cap enforcement is determined by the previous conditions.
1376 	 */
1377 	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1378 	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1379 	    enforce_soft_caps);
1380 
1381 	debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1382 
1383 	/*
1384 	 * If soft caps are in use, determine the size of the portion from each
1385 	 * collection to scan for.
1386 	 */
1387 	if (enforce_caps && enforce_soft_caps)
1388 		compute_soft_scan_goal(&arg);
1389 
1390 	/*
1391 	 * Victimize offending collections.
1392 	 */
1393 	if (enforce_caps && (!enforce_soft_caps ||
1394 	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1395 
1396 		/*
1397 		 * Since at least one collection is over its cap & needs
1398 		 * enforcing, check if it is at least time for a process walk
1399 		 * (we could be well past time since we only walk /proc when
1400 		 * we need to) and if so, update each collections process list
1401 		 * in a single pass through /proc.
1402 		 */
1403 		if (EVENT_TIME(now, *next_proc_walk)) {
1404 			debug("scanning process list...\n");
1405 			proc_walk_all(proc_cb);		 /* insert & mark */
1406 			list_walk_all(sweep_process_cb); /* free dead procs */
1407 			*next_proc_walk = NEXT_EVENT_TIME(now,
1408 			    rcfg.rcfg_proc_walk_interval);
1409 		}
1410 
1411 		gz_col = NULL;
1412 		if (enforce_soft_caps) {
1413 			debug("scan goal is %lldKB\n",
1414 			    (long long)arg.ssa_scan_goal);
1415 			list_walk_collection(soft_scan_cb, &arg);
1416 			if (gz_capped && gz_col != NULL) {
1417 				/* process global zone */
1418 				arg.ssa_project_over_cap =
1419 				    col_arg.sca_project_over_cap;
1420 				soft_scan_gz(gz_col, &arg);
1421 			}
1422 		} else {
1423 			list_walk_collection(scan_cb, NULL);
1424 			if (gz_capped && gz_col != NULL) {
1425 				/* process global zone */
1426 				scan_gz(gz_col, col_arg.sca_project_over_cap);
1427 			}
1428 		}
1429 	} else if (col_arg.sca_any_over_cap) {
1430 		list_walk_collection(unenforced_cap_cb, NULL);
1431 	}
1432 }
1433 
1434 int
1435 main(int argc, char *argv[])
1436 {
1437 	int res;
1438 	int should_fork = 1;	/* fork flag */
1439 	hrtime_t now;		/* current time */
1440 	hrtime_t next;		/* time of next event */
1441 	int sig;		/* signal iteration */
1442 	struct rlimit rl;
1443 	hrtime_t next_proc_walk;	/* time of next /proc scan */
1444 	hrtime_t next_configuration;	/* time of next configuration */
1445 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
1446 
1447 	(void) set_message_priority(RCM_INFO);
1448 	(void) setpname("rcapd");
1449 	rcapd_pid = getpid();
1450 	(void) chdir("/");
1451 	should_run = 1;
1452 	ever_ran = 0;
1453 
1454 	(void) setlocale(LC_ALL, "");
1455 	(void) textdomain(TEXT_DOMAIN);
1456 
1457 	/*
1458 	 * Parse command-line options.
1459 	 */
1460 	while ((res = getopt(argc, argv, "dF")) > 0)
1461 		switch (res) {
1462 		case 'd':
1463 			should_fork = 0;
1464 			if (debug_mode == 0) {
1465 				debug_mode = 1;
1466 				(void) set_message_priority(RCM_DEBUG);
1467 			} else
1468 				(void) set_message_priority(RCM_DEBUG_HIGH);
1469 			break;
1470 		case 'F':
1471 			should_fork = 0;
1472 			break;
1473 		default:
1474 			rcapd_usage();
1475 			return (E_USAGE);
1476 			/*NOTREACHED*/
1477 		}
1478 
1479 	/*
1480 	 * Read the configuration.
1481 	 */
1482 	if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
1483 		warn(gettext("resource caps not configured\n"));
1484 		return (SMF_EXIT_ERR_CONFIG);
1485 	}
1486 
1487 	/*
1488 	 * If not debugging, fork and continue operating, changing the
1489 	 * destination of messages to syslog().
1490 	 */
1491 	if (should_fork == 1) {
1492 		pid_t child;
1493 		debug("forking\n");
1494 		child = fork();
1495 		if (child == -1)
1496 			die(gettext("cannot fork"));
1497 		if (child > 0)
1498 			return (0);
1499 		else {
1500 			rcapd_pid = getpid();
1501 			(void) set_message_destination(RCD_SYSLOG);
1502 			(void) fclose(stdin);
1503 			(void) fclose(stdout);
1504 			(void) fclose(stderr);
1505 		}
1506 		/*
1507 		 * Start a new session and detatch from the controlling tty.
1508 		 */
1509 		if (setsid() == (pid_t)-1)
1510 			debug(gettext("setsid() failed; cannot detach from "
1511 			    "terminal"));
1512 	}
1513 
1514 	finish_configuration();
1515 	should_reconfigure = 0;
1516 
1517 	/*
1518 	 * Check that required privileges are possessed.
1519 	 */
1520 	verify_and_set_privileges();
1521 
1522 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1523 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
1524 	    rcfg.rcfg_reconfiguration_interval);
1525 
1526 	/*
1527 	 * Open the kstat chain.
1528 	 */
1529 	kctl = kstat_open();
1530 	if (kctl == NULL)
1531 		die(gettext("can't open kstats"));
1532 
1533 	/*
1534 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1535 	 * be effectively managed without revoking descriptors (at 3 per
1536 	 * process).
1537 	 */
1538 	rl.rlim_cur = 32 * 1024;
1539 	rl.rlim_max = 32 * 1024;
1540 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1541 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1542 		rl.rlim_cur = rl.rlim_max;
1543 		(void) setrlimit(RLIMIT_NOFILE, &rl);
1544 	}
1545 	(void) enable_extended_FILE_stdio(-1, -1);
1546 
1547 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1548 		debug("fd limit: %lu\n", rl.rlim_cur);
1549 	else
1550 		debug("fd limit: unknown\n");
1551 
1552 	get_page_size();
1553 	my_zoneid = getzoneid();
1554 
1555 	/*
1556 	 * Handle those signals whose (default) exit disposition
1557 	 * prevents rcapd from finishing scanning before terminating.
1558 	 */
1559 	(void) sigset(SIGINT, terminate_signal);
1560 	(void) sigset(SIGQUIT, abort_signal);
1561 	(void) sigset(SIGILL, abort_signal);
1562 	(void) sigset(SIGEMT, abort_signal);
1563 	(void) sigset(SIGFPE, abort_signal);
1564 	(void) sigset(SIGBUS, abort_signal);
1565 	(void) sigset(SIGSEGV, abort_signal);
1566 	(void) sigset(SIGSYS, abort_signal);
1567 	(void) sigset(SIGPIPE, terminate_signal);
1568 	(void) sigset(SIGALRM, terminate_signal);
1569 	(void) sigset(SIGTERM, terminate_signal);
1570 	(void) sigset(SIGUSR1, terminate_signal);
1571 	(void) sigset(SIGUSR2, terminate_signal);
1572 	(void) sigset(SIGPOLL, terminate_signal);
1573 	(void) sigset(SIGVTALRM, terminate_signal);
1574 	(void) sigset(SIGXCPU, abort_signal);
1575 	(void) sigset(SIGXFSZ, abort_signal);
1576 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1577 		(void) sigset(sig, terminate_signal);
1578 
1579 	/*
1580 	 * Install a signal handler for reconfiguration processing.
1581 	 */
1582 	(void) sigset(SIGHUP, sighup);
1583 
1584 	/*
1585 	 * Determine which process collections to cap.
1586 	 */
1587 	lcollection_update(LCU_COMPLETE);
1588 
1589 	/*
1590 	 * Loop forever, monitoring collections' resident set sizes and
1591 	 * enforcing their caps.  Look for changes in caps as well as
1592 	 * responding to requests to reread the configuration.  Update
1593 	 * per-collection statistics periodically.
1594 	 */
1595 	while (should_run != 0) {
1596 		struct timespec ts;
1597 
1598 		/*
1599 		 * Announce that rcapd is starting.
1600 		 */
1601 		if (ever_ran == 0) {
1602 			info(gettext("starting\n"));
1603 			ever_ran = 1;
1604 		}
1605 
1606 		/*
1607 		 * Check the configuration at every next_configuration interval.
1608 		 * Update the rss data once every next_rss_sample interval.
1609 		 * The condition of global memory pressure is also checked at
1610 		 * the same frequency, if strict caps are in use.
1611 		 */
1612 		now = gethrtime();
1613 
1614 		/*
1615 		 * Detect configuration and cap changes only when SIGHUP
1616 		 * is received. Call reconfigure to apply new configuration
1617 		 * parameters.
1618 		 */
1619 		if (should_reconfigure == 1) {
1620 			reread_configuration();
1621 			should_reconfigure = 0;
1622 			reconfigure(now, &next_configuration, &next_proc_walk,
1623 			    &next_rss_sample);
1624 		}
1625 
1626 		if (EVENT_TIME(now, next_configuration)) {
1627 			reconfigure(now, &next_configuration, &next_proc_walk,
1628 			    &next_rss_sample);
1629 		}
1630 
1631 		/*
1632 		 * Do the main work for enforcing caps.
1633 		 */
1634 		if (EVENT_TIME(now, next_rss_sample)) {
1635 			do_capping(now, &next_proc_walk);
1636 
1637 			next_rss_sample = NEXT_EVENT_TIME(now,
1638 			    rcfg.rcfg_rss_sample_interval);
1639 		}
1640 
1641 		/*
1642 		 * Update the statistics file, if it's time.
1643 		 */
1644 		check_update_statistics();
1645 
1646 		/*
1647 		 * Sleep for some time before repeating.
1648 		 */
1649 		now = gethrtime();
1650 		next = next_configuration;
1651 		next = POSITIVE_MIN(next, next_report);
1652 		next = POSITIVE_MIN(next, next_rss_sample);
1653 		if (next > now && should_run != 0) {
1654 			debug("sleeping %-4.2f seconds\n", (float)(next -
1655 			    now) / (float)NANOSEC);
1656 			hrt2ts(next - now, &ts);
1657 			(void) nanosleep(&ts, NULL);
1658 		}
1659 	}
1660 	if (termination_signal != 0)
1661 		debug("exiting due to signal %d\n", termination_signal);
1662 	if (ever_ran != 0)
1663 		info(gettext("exiting\n"));
1664 
1665 	/*
1666 	 * Unlink the statistics file before exiting.
1667 	 */
1668 	if (rcfg.rcfg_stat_file[0] != 0)
1669 		(void) unlink(rcfg.rcfg_stat_file);
1670 
1671 	return (E_SUCCESS);
1672 }
1673