xref: /illumos-gate/usr/src/cmd/rcap/rcapd/rcapd_scanner.c (revision 45ede40b2394db7967e59f19288fae9b62efd4aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/mman.h>
27 #include <sys/param.h>
28 #include <sys/stat.h>
29 #include <sys/types.h>
30 #include <assert.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <libproc.h>
34 #include <limits.h>
35 #include <procfs.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <strings.h>
39 #include <time.h>
40 #include <unistd.h>
41 #include "rcapd.h"
42 #include "rcapd_rfd.h"
43 #include "rcapd_mapping.h"
44 #include "utils.h"
45 
/*
 * Forward declaration: lpc_xmap_update() is defined near the end of this file
 * but is called earlier, from scan().
 */
static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
/*
 * Not defined in this file.  Used only by the DEBUG cross-check in
 * merge_current_pagedata(), which treats a nonzero return as "the two mapping
 * lists differ".
 */
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */
50 
/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.  scan() passes this count to rfd_reserve() immediately before
 * calling Pgrab().
 */
#define	PGRAB_FD_COUNT		10
56 
/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.  A cursor is initialized with
 * set_prpageheader_cur() (or set_prpageheader_cur_addr()) and then moved with
 * advance_prpageheader_cur() (page by page within a mapping) and
 * advance_prpageheader_cur_nextmapping() (mapping by mapping).
 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* number of this mapping (-1 before first) */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
				/* if unknown (no matching xmap entry, or */
				/* the matching segment is MA_SHARED) */
} prpageheader_cur_t;
76 
/*
 * Set by scan() when it grabs a victim with Pgrab() and reset to NULL once the
 * victim has been released; scan_abort() releases whatever is held here.
 */
static struct ps_prochandle *scan_pr;	/* currently-scanned process's handle */
78 
/*
 * Verbosity class of a scanner debug message.  st_debug() suppresses a
 * STDL_HIGH message unless the message priority is at least RCM_DEBUG_HIGH,
 * and any other message unless it is at least RCM_DEBUG.
 */
typedef enum {
	STDL_NORMAL,
	STDL_HIGH
} st_debug_level_t;
83 
84 /*
85  * Output a scanning-related debug message.
86  */
87 /*PRINTFLIKE3*/ /*ARGSUSED*/
88 static void
89 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
90 {
91 #ifdef DEBUG_MSG
92 	va_list alist;
93 	char *buf;
94 	size_t len;
95 
96 	if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
97 	    : RCM_DEBUG))
98 		return;
99 
100 	len = strlen(msg) + LINELEN;
101 	buf = malloc(len);
102 	if (buf == NULL)
103 		return;
104 	(void) snprintf(buf, len, "%s %s scanner %s",
105 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
106 	    lcol->lcol_name, msg);
107 
108 	va_start(alist, msg);
109 	vdprintfe(RCM_DEBUG, buf, alist);
110 	va_end(alist);
111 
112 	free(buf);
113 #endif /* DEBUG_MSG */
114 }
115 
116 /*
117  * Determine the collection's current victim, based on its last.  The last will
118  * be returned, or, if invalid, any other valid process, if the collection has
119  * any.
120  */
121 static lprocess_t *
122 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
123 {
124 	if (lpc == NULL || !lcollection_member(lcol, lpc))
125 		lpc = lcol->lcol_lprocess;
126 
127 	/*
128 	 * Find the next scannable process, and make it the victim.
129 	 */
130 	while (lpc != NULL && lpc->lpc_unscannable != 0)
131 		lpc = lpc->lpc_next;
132 
133 	return (lpc);
134 }
135 
136 /*
137  * Get a process's combined current pagedata (per-page referenced and modified
138  * bits) and set the supplied pointer to it.  The caller is responsible for
139  * freeing the data.  If the pagedata is unreadable, a nonzero value is
140  * returned, and errno is set.  Otherwise, 0 is returned.
141  */
142 static int
143 get_pagedata(prpageheader_t **pghpp, int fd)
144 {
145 	int res;
146 	struct stat st;
147 
148 redo:
149 	errno = 0;
150 	if (fstat(fd, &st) != 0) {
151 		debug("cannot stat pagedata\n");
152 		return (-1);
153 	}
154 
155 	errno = 0;
156 	*pghpp = malloc(st.st_size);
157 	if (*pghpp == NULL) {
158 		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
159 		return (-1);
160 	}
161 	(void) bzero(*pghpp, st.st_size);
162 
163 	errno = 0;
164 	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
165 		free(*pghpp);
166 		*pghpp = NULL;
167 		if (res > 0 || errno == E2BIG) {
168 			debug("pagedata changed size, retrying\n");
169 			goto redo;
170 		} else {
171 			debug("cannot read pagedata");
172 			return (-1);
173 		}
174 	}
175 
176 	return (0);
177 }
178 
179 /*
180  * Return the count of kilobytes of pages represented by the given pagedata
181  * which meet the given criteria, having pages which are in all of the states
182  * specified by the mask, and in none of the states in the notmask.  If the
183  * CP_CLEAR flag is set, the pagedata will also be cleared.
184  */
185 #define	CP_CLEAR	1
186 static uint64_t
187 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
188 {
189 	int map;
190 	caddr_t cur, end;
191 	prpageheader_t pgh = *pghp;
192 	prasmap_t *asmapp;
193 	uint64_t count = 0;
194 
195 	cur = (caddr_t)pghp + sizeof (*pghp);
196 	for (map = 0; map < pgh.pr_nmap; map++) {
197 		asmapp = (prasmap_t *)(uintptr_t)cur;
198 		cur += sizeof (*asmapp);
199 		end = cur + asmapp->pr_npage;
200 		while (cur < end) {
201 			if ((*cur & mask) == mask && (*cur & notmask) == 0)
202 				count += asmapp->pr_pagesize / 1024;
203 			if ((flags & CP_CLEAR) != 0)
204 				*cur = 0;
205 			cur++;
206 		}
207 
208 		/*
209 		 * Skip to next 64-bit-aligned address to get the next
210 		 * prasmap_t.
211 		 */
212 		cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
213 	}
214 
215 	return (count);
216 }
217 
218 /*
219  * Return the amount of memory (in kilobytes) that hasn't been referenced or
220  * modified, which memory which will be paged out first.  Should be written to
221  * exclude nonresident pages when sufficient interfaces exist.
222  */
223 static uint64_t
224 unrm_size(lprocess_t *lpc)
225 {
226 	return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
227 	    0, PG_MODIFIED | PG_REFERENCED));
228 }
229 
230 /*
231  * Advance a prpageheader_cur_t to the address space's next mapping, returning
232  * its address, or NULL if there is none.  Any known nonpageable or nonresident
233  * mappings will be skipped over.
234  */
235 static uintptr_t
236 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
237 {
238 	prasmap_t *pap;
239 	int i;
240 
241 next:
242 	ASSERT(pcp->pr_map < pcp->pr_nmap);
243 	if ((pcp->pr_map + 1) == pcp->pr_nmap)
244 		return ((uintptr_t)NULL);
245 	pcp->pr_map++;
246 	if (pcp->pr_pgoff < pcp->pr_npage) {
247 		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
248 		    ((uintptr_t)pcp->pr_pdaddr +
249 		    (pcp->pr_npage - pcp->pr_pgoff));
250 		pcp->pr_pgoff = pcp->pr_npage;
251 	}
252 	/*
253 	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
254 	 */
255 	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
256 	pap = (prasmap_t *)pcp->pr_pdaddr;
257 	pcp->pr_pgoff = 0;
258 	pcp->pr_npage = pap->pr_npage;
259 	pcp->pr_pagesize = pap->pr_pagesize;
260 	pcp->pr_addr = pap->pr_vaddr;
261 	pcp->pr_pdaddr = pap + 1;
262 
263 	/*
264 	 * Skip any known nonpageable mappings.  Currently, the only one
265 	 * detected is the schedctl page.
266 	 */
267 	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
268 	    MA_ANON)) == 0 && pap->pr_npage == 1) {
269 		debug("identified nonpageable schedctl mapping at %p\n",
270 		    (void *)pcp->pr_addr);
271 		goto next;
272 	}
273 
274 	/*
275 	 * Skip mappings with no resident pages.  If the xmap does not
276 	 * correspond to the pagedata for any reason, it will be ignored.
277 	 */
278 	pcp->pr_rss = -1;
279 	pcp->pr_pg_rss = -1;
280 	for (i = 0; i < pcp->pr_nxmap; i++) {
281 		prxmap_t *xmap = &pcp->pr_xmap[i];
282 
283 		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
284 		    (pcp->pr_npage * pcp->pr_pagesize)) {
285 			pcp->pr_rss = xmap->pr_rss;
286 			/*
287 			 * Remove COW pages from the pageable RSS count.
288 			 */
289 			if ((xmap->pr_mflags & MA_SHARED) == 0)
290 				pcp->pr_pg_rss = xmap->pr_anon;
291 			break;
292 		}
293 	}
294 	if (pcp->pr_rss == 0) {
295 		debug("identified nonresident mapping at 0x%p\n",
296 		    (void *)pcp->pr_addr);
297 		goto next;
298 	} else if (pcp->pr_pg_rss == 0) {
299 		debug("identified unpageable mapping at 0x%p\n",
300 		    (void *)pcp->pr_addr);
301 		goto next;
302 	}
303 
304 	return (pcp->pr_addr);
305 }
306 
307 /*
308  * Advance a prpageheader_cur_t to the mapping's next page, returning its
309  * address, or NULL if there is none.
310  */
311 static void *
312 advance_prpageheader_cur(prpageheader_cur_t *pcp)
313 {
314 	ASSERT(pcp->pr_pgoff < pcp->pr_npage);
315 	if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
316 		return (NULL);
317 	pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
318 	pcp->pr_pgoff++;
319 
320 	ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
321 	return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
322 }
323 
324 /*
325  * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
326  * of an address space.
327  */
328 static void *
329 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
330     prxmap_t *xmap, int nxmap)
331 {
332 	bzero(pcp, sizeof (*pcp));
333 	pcp->pr_nmap = php->pr_nmap;
334 	pcp->pr_map = -1;
335 	pcp->pr_prpageheader = php;
336 	pcp->pr_xmap = xmap;
337 	pcp->pr_nxmap = nxmap;
338 	pcp->pr_pdaddr = (prpageheader_t *)php + 1;
339 
340 	return ((void *)advance_prpageheader_cur_nextmapping(pcp));
341 }
342 
343 /*
344  * Position a prpageheader_cur_t to the mapped address greater or equal to the
345  * given value.
346  */
347 static void *
348 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
349     prxmap_t *xmap, int nxmap, void *naddr)
350 {
351 	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
352 
353 	while (addr != NULL && addr <= naddr)
354 		if (naddr < (void *)((caddr_t)pcp->pr_addr +
355 		    pcp->pr_pagesize * pcp->pr_npage)) {
356 			uint64_t pgdiff = ((uintptr_t)naddr -
357 			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
358 			pcp->pr_pgoff += pgdiff;
359 			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
360 			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
361 			    pcp->pr_pgoff;
362 			break;
363 		} else
364 			addr =
365 			    (void *)advance_prpageheader_cur_nextmapping(pcp);
366 
367 	return (addr);
368 }
369 
370 static void
371 revoke_pagedata(rfd_t *rfd)
372 {
373 	lprocess_t *lpc = rfd->rfd_data;
374 
375 	st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
376 	    " process %d\n", (int)lpc->lpc_pid);
377 	ASSERT(lpc->lpc_pgdata_fd != -1);
378 	lpc->lpc_pgdata_fd = -1;
379 }
380 
381 #ifdef DEBUG
382 static void
383 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
384 {
385 	prpageheader_cur_t cur;
386 	void *addr;
387 
388 	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
389 	ASSERT(*lm == NULL);
390 	while (addr != NULL) {
391 		(void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
392 		    cur.pr_pagesize);
393 		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
394 	}
395 }
396 
397 static void
398 lmapping_dump(lmapping_t *lm)
399 {
400 	debug("lm: %p\n", (void *)lm);
401 	while (lm != NULL) {
402 		debug("\t(%p, %llx\n", (void *)lm->lm_addr,
403 		    (unsigned long long)lm->lm_size);
404 		lm = lm->lm_next;
405 	}
406 }
407 #endif /* DEBUG */
408 
/*
 * OR two pagedata snapshots (prpageheader_t) which are supposedly snapshots
 * of the same address space: each per-page byte of src is ORed into the
 * corresponding byte of dst.  Intersecting mappings with different page sizes
 * are tolerated but not normalized (not accurate).  If the mappings of the two
 * snapshots differ in any regard, the supplied mappings_changed flag will be
 * set to 1; otherwise it is set to 0.
 *
 * Both snapshots are walked in lock step with their own cursors.  Only
 * mappings which start at the same address in both are merged.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) {
		/*
		 * Merge page by page for as long as both cursors report the
		 * same address.  Both become NULL together when the mappings
		 * have the same number of pages.
		 */
		while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		/*
		 * If the addresses differ here, the two cursors did not stay
		 * aligned through the end of the mapping (for instance, one
		 * mapping ended before the other, or the starting addresses
		 * never matched).
		 */
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
		/*
		 * Resynchronize: advance whichever cursor is at the lower
		 * address until both are at the same mapping address or one
		 * of them runs out of mappings.
		 */
		while (src_addr != dst_addr && src_addr != (uintptr_t)NULL &&
		    dst_addr != (uintptr_t)NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}
457 
458 /*
459  * Merge the current pagedata with that on hand.  If the pagedata is
460  * unretrievable for any reason, such as the process having exited or being a
461  * zombie, a nonzero value is returned, the process should be marked
462  * unscannable, and future attempts to scan it should be avoided, since the
463  * symptom is probably permament.  If the mappings of either pagedata
464  * differ in any respect, the supplied callback will be invoked once.
465  */
466 static int
467 merge_current_pagedata(lprocess_t *lpc,
468     void(*mappings_changed_cb) (lprocess_t *))
469 {
470 	prpageheader_t *pghp;
471 	int mappings_changed = 0;
472 	uint64_t cnt;
473 
474 	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
475 	    0) {
476 		char pathbuf[PROC_PATH_MAX];
477 
478 		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
479 		    (int)lpc->lpc_pid);
480 		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
481 		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
482 		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
483 			return (-1);
484 		debug("starting/resuming pagedata collection for %d\n",
485 		    (int)lpc->lpc_pid);
486 	}
487 
488 	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
489 	if (cnt != 0 || lpc->lpc_rss != 0)
490 		debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
491 		    (int)lpc->lpc_pid, (unsigned long long)cnt,
492 		    (unsigned long long)lpc->lpc_rss);
493 	if (lpc->lpc_prpageheader != NULL) {
494 		/*
495 		 * OR the two snapshots.
496 		 */
497 #ifdef DEBUG
498 		lmapping_t *old = NULL;
499 		lmapping_t *new = NULL;
500 
501 		mklmapping(&new, pghp);
502 		mklmapping(&old, lpc->lpc_prpageheader);
503 #endif /* DEBUG */
504 		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
505 #ifdef DEBUG
506 		if (((mappings_changed != 0) ^
507 		    (lmapping_dump_diff(old, new) != 0))) {
508 			debug("lmapping_changed inconsistent with lmapping\n");
509 			debug("old\n");
510 			lmapping_dump(old);
511 			debug("new\n");
512 			lmapping_dump(new);
513 			debug("ignored\n");
514 			lmapping_dump(lpc->lpc_ignore);
515 			ASSERT(0);
516 		}
517 		lmapping_free(&new);
518 		lmapping_free(&old);
519 #endif /* DEBUG */
520 		free(lpc->lpc_prpageheader);
521 	} else
522 		mappings_changed = 1;
523 	lpc->lpc_prpageheader = pghp;
524 
525 	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
526 	if (cnt != 0 || lpc->lpc_rss != 0)
527 		debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
528 		    (int)lpc->lpc_pid, (unsigned long long)cnt,
529 		    (unsigned long long)lpc->lpc_rss);
530 	if (mappings_changed != 0) {
531 		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
532 		if (mappings_changed_cb != NULL)
533 			mappings_changed_cb(lpc);
534 	}
535 	return (0);
536 }
537 
538 /*
539  * Attempt to page out a region of the given process's address space.  May
540  * return nonzero if not all of the pages may are pageable, for any reason.
541  */
542 static int
543 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
544 {
545 	int res;
546 
547 	if (end <= start)
548 		return (0);
549 
550 	errno = 0;
551 	res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
552 	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
553 	debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
554 
555 	/*
556 	 * EBUSY indicates none of the pages have backing store allocated, or
557 	 * some pages were locked, which are less interesting than other
558 	 * conditions, which are noted.
559 	 */
560 	if (res != 0)
561 		if (errno == EBUSY)
562 			res = 0;
563 		else
564 			debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
565 			    (void *)start, (long long)(end - start), errno);
566 
567 	return (res);
568 }
569 
570 /*
571  * Compute the delta of the victim process's RSS since the last call.  If the
572  * psinfo cannot be obtained, no work is done, and no error is returned; it is
573  * up to the caller to detect the process' termination via other means.
574  */
575 static int64_t
576 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
577 {
578 	int64_t d_rss = 0;
579 
580 	if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
581 	    lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
582 		d_rss = (int64_t)new_psinfo->pr_rssize -
583 		    (int64_t)old_psinfo->pr_rssize;
584 		if (d_rss < 0)
585 			vic->lpc_collection->lcol_stat.lcols_pg_eff +=
586 			    (- d_rss);
587 		*old_psinfo = *new_psinfo;
588 	}
589 
590 	return (d_rss);
591 }
592 
593 static void
594 unignore_mappings(lprocess_t *lpc)
595 {
596 	lmapping_free(&lpc->lpc_ignore);
597 }
598 
599 static void
600 unignore_referenced_mappings(lprocess_t *lpc)
601 {
602 	prpageheader_cur_t cur;
603 	void *vicaddr;
604 
605 	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
606 	while (vicaddr != NULL) {
607 		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
608 		    != 0) {
609 			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
610 			    cur.pr_npage * cur.pr_pagesize) == 0)
611 				debug("removed mapping 0x%p+0t%llukB from"
612 				    " ignored set\n", (void *)cur.pr_addr,
613 				    (unsigned long long)(cur.pr_npage *
614 				    cur.pr_pagesize / 1024));
615 			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
616 			    &cur);
617 		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
618 			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
619 			    &cur);
620 	}
621 }
622 
623 /*
624  * Resume scanning, starting with the last victim, if it is still valid, or any
625  * other one, otherwise.
626  */
627 void
628 scan(lcollection_t *lcol, int64_t excess)
629 {
630 	lprocess_t *vic, *lpc;
631 	void *vicaddr, *endaddr, *nvicaddr;
632 	prpageheader_cur_t cur;
633 	psinfo_t old_psinfo, new_psinfo;
634 	hrtime_t scan_start;
635 	int res, resumed;
636 	uint64_t col_unrm_size;
637 
638 	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
639 	    (long long)excess);
640 
641 	/*
642 	 * Determine the address to start scanning at, depending on whether
643 	 * scanning can be resumed.
644 	 */
645 	endaddr = NULL;
646 	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
647 	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
648 		vicaddr = lcol->lcol_resaddr;
649 		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
650 		    (int)vic->lpc_pid);
651 		resumed = 1;
652 	} else {
653 		vicaddr = NULL;
654 		resumed = 0;
655 	}
656 
657 	scan_start = gethrtime();
658 	/*
659 	 * Obtain the most current pagedata for the processes that might be
660 	 * scanned, and remove from the ignored set any mappings which have
661 	 * referenced or modified pages (in the hopes that the pageability of
662 	 * the mapping's pages may have changed).  Determine if the
663 	 * unreferenced and unmodified portion is impossibly small to suffice
664 	 * to reduce the excess completely.  If so, ignore these bits so that
665 	 * even working set will be paged out.
666 	 */
667 	col_unrm_size = 0;
668 	lpc = vic;
669 	while (lpc != NULL && should_run) {
670 		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
671 			st_debug(STDL_NORMAL, lcol, "process %d:"
672 			    " exited/temporarily unscannable",
673 			    (int)lpc->lpc_pid);
674 			goto next;
675 		}
676 		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
677 		    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
678 		    (unsigned long long)lpc->lpc_size);
679 		col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
680 
681 		if ((lcol->lcol_stat.lcols_scan_count %
682 		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
683 			/*
684 			 * Periodically clear the set of ignored mappings.
685 			 * This will allow processes whose ignored segments'
686 			 * pageability have changed (without a corresponding
687 			 * reference or modification to a page) to be
688 			 * recognized.
689 			 */
690 			if (lcol->lcol_stat.lcols_scan_count > 0)
691 				unignore_mappings(lpc);
692 		} else {
693 			/*
694 			 * Ensure mappings with referenced or modified pages
695 			 * are not in the ignored set.  Their usage might mean
696 			 * the condition which made them unpageable is gone.
697 			 */
698 			unignore_referenced_mappings(lpc);
699 		}
700 next:
701 		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
702 		    lpc->lpc_next) : NULL;
703 	}
704 	if (col_unrm_size < excess) {
705 		lpc = vic;
706 		debug("will not reduce excess with only unreferenced pages\n");
707 		while (lpc != NULL && should_run) {
708 			if (lpc->lpc_prpageheader != NULL) {
709 				(void) count_pages(lpc->lpc_prpageheader,
710 				    CP_CLEAR, 0, 0);
711 				if (lpc->lpc_pgdata_fd >= 0) {
712 					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
713 						debug("coud not close %d"
714 						    " lpc_pgdata_fd %d",
715 						    (int)lpc->lpc_pid,
716 						    lpc->lpc_pgdata_fd);
717 					lpc->lpc_pgdata_fd = -1;
718 				}
719 			}
720 			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
721 			    lpc->lpc_next) : NULL;
722 		}
723 	}
724 
725 	/*
726 	 * Examine each process for pages to remove until the excess is
727 	 * reduced.
728 	 */
729 	while (vic != NULL && excess > 0 && should_run) {
730 		/*
731 		 * Skip processes whose death was reported when the merging of
732 		 * pagedata was attempted.
733 		 */
734 		if (vic->lpc_prpageheader == NULL)
735 			goto nextproc;
736 
737 		/*
738 		 * Obtain optional segment residency information.
739 		 */
740 		if (lpc_xmap_update(vic) != 0)
741 			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
742 			    " unreadable; ignoring", (int)vic->lpc_pid);
743 
744 #ifdef DEBUG_MSG
745 		{
746 			void *ovicaddr = vicaddr;
747 #endif /* DEBUG_MSG */
748 		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
749 		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
750 #ifdef DEBUG_MSG
751 			st_debug(STDL_NORMAL, lcol, "trying to resume from"
752 			    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
753 		}
754 #endif /* DEBUG_MSG */
755 
756 		/*
757 		 * Take control of the victim.
758 		 */
759 		if (get_psinfo(vic->lpc_pid, &old_psinfo,
760 		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
761 		    vic, vic) != 0) {
762 			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
763 			    (int)vic->lpc_pid);
764 			goto nextproc;
765 		}
766 		(void) rfd_reserve(PGRAB_FD_COUNT);
767 		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
768 			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
769 			    (int)vic->lpc_pid, res);
770 			goto nextproc;
771 		}
772 		if (Pcreate_agent(scan_pr) != 0) {
773 			st_debug(STDL_NORMAL, lcol, "cannot control %d",
774 			    (int)vic->lpc_pid);
775 			goto nextproc;
776 		}
777 		/*
778 		 * Be very pessimistic about the state of the agent LWP --
779 		 * verify it's actually stopped.
780 		 */
781 		errno = 0;
782 		while (Pstate(scan_pr) == PS_RUN)
783 			(void) Pwait(scan_pr, 0);
784 		if (Pstate(scan_pr) != PS_STOP) {
785 			st_debug(STDL_NORMAL, lcol, "agent not in expected"
786 			    " state (%d)", Pstate(scan_pr));
787 			goto nextproc;
788 		}
789 
790 		/*
791 		 * Within the victim's address space, find contiguous ranges of
792 		 * unreferenced pages to page out.
793 		 */
794 		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
795 		    (int)vic->lpc_pid);
796 		while (excess > 0 && vicaddr != NULL && should_run) {
797 			/*
798 			 * Skip mappings in the ignored set.  Mappings get
799 			 * placed in the ignored set when all their resident
800 			 * pages are unreference and unmodified, yet unpageable
801 			 * -- such as when they are locked, or involved in
802 			 * asynchronous I/O.  They will be scanned again when
803 			 * some page is referenced or modified.
804 			 */
805 			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
806 			    cur.pr_npage * cur.pr_pagesize)) {
807 				debug("ignored mapping at 0x%p\n",
808 				    (void *)cur.pr_addr);
809 				/*
810 				 * Update statistics.
811 				 */
812 				lcol->lcol_stat.lcols_pg_att +=
813 				    cur.pr_npage * cur.pr_pagesize / 1024;
814 
815 				vicaddr = (void *)
816 				    advance_prpageheader_cur_nextmapping(&cur);
817 				continue;
818 			}
819 
820 			/*
821 			 * Determine a range of unreferenced pages to page out,
822 			 * and clear the R/M bits in the preceding referenced
823 			 * range.
824 			 */
825 			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
826 			    " npage %llu\n", vicaddr,
827 			    (unsigned long long)cur.pr_npage);
828 			while (vicaddr != NULL &&
829 			    *(caddr_t)cur.pr_pdaddr != 0) {
830 				*(caddr_t)cur.pr_pdaddr = 0;
831 				vicaddr = advance_prpageheader_cur(&cur);
832 			}
833 			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
834 			    " %p\n", vicaddr, cur.pr_pdaddr);
835 			if (vicaddr == NULL) {
836 				/*
837 				 * The end of mapping was reached before any
838 				 * unreferenced pages were seen.
839 				 */
840 				vicaddr = (void *)
841 				    advance_prpageheader_cur_nextmapping(&cur);
842 				continue;
843 			}
844 			do
845 				endaddr = advance_prpageheader_cur(&cur);
846 			while (endaddr != NULL &&
847 			    *(caddr_t)cur.pr_pdaddr == 0 &&
848 			    (((intptr_t)endaddr - (intptr_t)vicaddr) /
849 			    1024) < excess)
850 				;
851 			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
852 			    endaddr, *(caddr_t)cur.pr_pdaddr);
853 
854 			/*
855 			 * Page out from vicaddr to the end of the mapping, or
856 			 * endaddr if set, then continue scanning after
857 			 * endaddr, or the next mapping, if not set.
858 			 */
859 			nvicaddr = endaddr;
860 			if (endaddr == NULL)
861 				endaddr = (caddr_t)cur.pr_addr +
862 				    cur.pr_pagesize * cur.pr_npage;
863 			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
864 			    0) {
865 				int64_t d_rss, att;
866 				int willignore = 0;
867 
868 				excess += (d_rss = rss_delta(
869 				    &new_psinfo, &old_psinfo, vic));
870 
871 				/*
872 				 * If this pageout attempt was unsuccessful
873 				 * (the resident portion was not affected), and
874 				 * was for the whole mapping, put it in the
875 				 * ignored set, so it will not be scanned again
876 				 * until some page is referenced or modified.
877 				 */
878 				if (d_rss >= 0 && (void *)cur.pr_addr ==
879 				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
880 				    == ((uintptr_t)endaddr -
881 				    (uintptr_t)vicaddr)) {
882 					if (lmapping_insert(
883 					    &vic->lpc_ignore,
884 					    cur.pr_addr,
885 					    cur.pr_pagesize *
886 					    cur.pr_npage) != 0)
887 						debug("not enough memory to add"
888 						    " mapping at %p to ignored"
889 						    " set\n",
890 						    (void *)cur.pr_addr);
891 					willignore = 1;
892 				}
893 
894 				/*
895 				 * Update statistics.
896 				 */
897 				lcol->lcol_stat.lcols_pg_att += (att =
898 				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
899 				    1024);
900 				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
901 				    "+0t(%llu/%llu)kB%s\n", vicaddr,
902 				    (unsigned long long)((d_rss <
903 				    0) ? - d_rss : 0), (unsigned long long)att,
904 				    willignore ? " (will ignore)" : "");
905 			} else {
906 				st_debug(STDL_NORMAL, lcol,
907 				    "process %d: exited/unscannable\n",
908 				    (int)vic->lpc_pid);
909 				vic->lpc_unscannable = 1;
910 				goto nextproc;
911 			}
912 
913 			/*
914 			 * Update the statistics file, if it's time.
915 			 */
916 			check_update_statistics();
917 
918 			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
919 			    *)advance_prpageheader_cur_nextmapping(&cur);
920 		}
921 		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
922 		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
923 		    (long long)excess);
924 nextproc:
925 		/*
926 		 * If a process was grabbed, release it, destroying its agent.
927 		 */
928 		if (scan_pr != NULL) {
929 			(void) Prelease(scan_pr, 0);
930 			scan_pr = NULL;
931 		}
932 		lcol->lcol_victim = vic;
933 		/*
934 		 * Scan the collection at most once.  Only if scanning was not
935 		 * aborted for any reason, and the end of lprocess has not been
936 		 * reached, determine the next victim and scan it.
937 		 */
938 		if (vic != NULL) {
939 			if (vic->lpc_next != NULL) {
940 				/*
941 				 * Determine the next process to be scanned.
942 				 */
943 				if (excess > 0) {
944 					vic = get_valid_victim(lcol,
945 					    vic->lpc_next);
946 					vicaddr = 0;
947 				}
948 			} else {
949 				/*
950 				 * A complete scan of the collection was made,
951 				 * so tick the scan counter and stop scanning
952 				 * until the next request.
953 				 */
954 				lcol->lcol_stat.lcols_scan_count++;
955 				lcol->lcol_stat.lcols_scan_time_complete
956 				    = lcol->lcol_stat.lcols_scan_time;
957 				/*
958 				 * If an excess still exists, tick the
959 				 * "ineffective scan" counter, signalling that
960 				 * the cap may be uneforceable.
961 				 */
962 				if (resumed == 0 && excess > 0)
963 					lcol->lcol_stat
964 					    .lcols_scan_ineffective++;
965 				/*
966 				 * Scanning should start at the beginning of
967 				 * the process list at the next request.
968 				 */
969 				if (excess > 0)
970 					vic = NULL;
971 			}
972 		}
973 	}
974 	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
975 	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
976 	    (long long)excess);
977 
978 	lcol->lcol_resaddr = vicaddr;
979 	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
980 		lcol->lcol_victim = get_valid_victim(lcol,
981 		    lcol->lcol_victim->lpc_next);
982 	}
983 }
984 
985 /*
986  * Abort the scan in progress, and destroy the agent LWP of any grabbed
987  * processes.
988  */
989 void
990 scan_abort(void)
991 {
992 	if (scan_pr != NULL)
993 		(void) Prelease(scan_pr, 0);
994 }
995 
996 static void
997 revoke_xmap(rfd_t *rfd)
998 {
999 	lprocess_t *lpc = rfd->rfd_data;
1000 
1001 	debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1002 	ASSERT(lpc->lpc_xmap_fd != -1);
1003 	lpc->lpc_xmap_fd = -1;
1004 }
1005 
1006 /*
1007  * Retrieve the process's current xmap , which is used to determine the size of
1008  * the resident portion of its segments.  Return zero if successful.
1009  */
1010 static int
1011 lpc_xmap_update(lprocess_t *lpc)
1012 {
1013 	int res;
1014 	struct stat st;
1015 
1016 	free(lpc->lpc_xmap);
1017 	lpc->lpc_xmap = NULL;
1018 	lpc->lpc_nxmap = -1;
1019 
1020 	if (lpc->lpc_xmap_fd == -1) {
1021 		char pathbuf[PROC_PATH_MAX];
1022 
1023 		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1024 		    (int)lpc->lpc_pid);
1025 		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1026 		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1027 			return (-1);
1028 	}
1029 
1030 redo:
1031 	errno = 0;
1032 	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1033 		debug("cannot stat xmap\n");
1034 		(void) rfd_close(lpc->lpc_xmap_fd);
1035 		lpc->lpc_xmap_fd = -1;
1036 		return (-1);
1037 	}
1038 
1039 	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1040 		debug("xmap wrong size\n");
1041 		(void) rfd_close(lpc->lpc_xmap_fd);
1042 		lpc->lpc_xmap_fd = -1;
1043 		return (-1);
1044 	}
1045 
1046 	lpc->lpc_xmap = malloc(st.st_size);
1047 	if (lpc->lpc_xmap == NULL) {
1048 		debug("cannot malloc() %ld bytes for xmap", st.st_size);
1049 		(void) rfd_close(lpc->lpc_xmap_fd);
1050 		lpc->lpc_xmap_fd = -1;
1051 		return (-1);
1052 	}
1053 
1054 	if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1055 	    st.st_size) {
1056 		free(lpc->lpc_xmap);
1057 		lpc->lpc_xmap = NULL;
1058 		if (res > 0) {
1059 			debug("xmap changed size, retrying\n");
1060 			goto redo;
1061 		} else {
1062 			debug("cannot read xmap");
1063 			return (-1);
1064 		}
1065 	}
1066 	lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1067 
1068 	return (0);
1069 }
1070