xref: /titanic_50/usr/src/cmd/rcap/rcapd/rcapd_scanner.c (revision 35fe197b91640f2efc8c0b3849eee882e373c729)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/mman.h>
30 #include <sys/param.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 #include <assert.h>
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <libproc.h>
37 #include <limits.h>
38 #include <procfs.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <strings.h>
42 #include <time.h>
43 #include <unistd.h>
44 #include "rcapd.h"
45 #include "rcapd_rfd.h"
46 #include "rcapd_mapping.h"
47 #include "utils.h"
48 
/* Forward declaration; the definition appears at the end of this file. */
static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
/* Not defined in this file; used below to cross-check OR_pagedata(). */
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */
53 
54 /*
55  * The number of file descriptors required to grab a process and create an
56  * agent in it.
57  */
58 #define	PGRAB_FD_COUNT		10
59 
60 /*
61  * Record a position in an address space as it corresponds to a prpageheader_t
62  * and affiliated structures.
63  */
64 typedef struct prpageheader_cur {
65 	int pr_nmap;		/* number of mappings in address space */
66 	int pr_map;		/* number of this mapping */
67 	uint64_t pr_pgoff;	/* page offset into mapping */
68 	uint64_t pr_npage;	/* number of pages in mapping */
69 	uint64_t pr_pagesize;	/* page size of mapping */
70 	uintptr_t pr_addr;	/* base of mapping */
71 	prpageheader_t *pr_prpageheader;	/* associated page header */
72 	void *pr_pdaddr;	/* address of page's byte in pagedata */
73 	prxmap_t *pr_xmap;	/* array containing per-segment information */
74 	int pr_nxmap;		/* number of xmaps in array */
75 	int64_t pr_rss;		/* number of resident pages in mapping, */
76 				/* or -1 if xmap is out of sync */
77 	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
78 } prpageheader_cur_t;
79 
/*
 * Set by scan() while a victim is grabbed and reset to NULL once it has been
 * released; scan_abort() uses it to release a process grabbed mid-scan.
 */
static struct ps_prochandle *scan_pr;	/* currently-scanned process's handle */

/* Verbosity class of a message passed to st_debug(). */
typedef enum {
	STDL_NORMAL,
	STDL_HIGH
} st_debug_level_t;
86 
87 /*
88  * Output a scanning-related debug message.
89  */
90 /*PRINTFLIKE3*/ /*ARGSUSED*/
91 static void
92 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
93 {
94 #ifdef DEBUG_MSG
95 	va_list alist;
96 	char *buf;
97 	size_t len;
98 
99 	if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
100 	    : RCM_DEBUG))
101 		return;
102 
103 	len = strlen(msg) + LINELEN;
104 	buf = malloc(len);
105 	if (buf == NULL)
106 		return;
107 	(void) snprintf(buf, len, "%s %s scanner %s", rcfg.rcfg_mode_name,
108 	    lcol->lcol_name, msg);
109 
110 	va_start(alist, msg);
111 	vdprintfe(RCM_DEBUG, buf, alist);
112 	va_end(alist);
113 
114 	free(buf);
115 #endif /* DEBUG_MSG */
116 }
117 
118 /*
119  * Determine the collection's current victim, based on its last.  The last will
120  * be returned, or, if invalid, any other valid process, if the collection has
121  * any.
122  */
123 static lprocess_t *
124 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
125 {
126 	if (lpc == NULL || !lcollection_member(lcol, lpc))
127 		lpc = lcol->lcol_lprocess;
128 
129 	/*
130 	 * Find the next scannable process, and make it the victim.
131 	 */
132 	while (lpc != NULL && lpc->lpc_unscannable != 0)
133 		lpc = lpc->lpc_next;
134 
135 	return (lpc);
136 }
137 
138 /*
139  * Get a process's combined current pagedata (per-page referenced and modified
140  * bits) and set the supplied pointer to it.  The caller is responsible for
141  * freeing the data.  If the pagedata is unreadable, a nonzero value is
142  * returned, and errno is set.  Otherwise, 0 is returned.
143  */
144 static int
145 get_pagedata(prpageheader_t **pghpp, int fd)
146 {
147 	int res;
148 	struct stat st;
149 
150 redo:
151 	errno = 0;
152 	if (fstat(fd, &st) != 0) {
153 		debug("cannot stat pagedata\n");
154 		return (-1);
155 	}
156 
157 	errno = 0;
158 	*pghpp = malloc(st.st_size);
159 	if (*pghpp == NULL) {
160 		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
161 		return (-1);
162 	}
163 	(void) bzero(*pghpp, st.st_size);
164 
165 	errno = 0;
166 	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
167 		free(*pghpp);
168 		*pghpp = NULL;
169 		if (res > 0 || errno == E2BIG) {
170 			debug("pagedata changed size, retrying\n");
171 			goto redo;
172 		} else {
173 			debug("cannot read pagedata");
174 			return (-1);
175 		}
176 	}
177 
178 	return (0);
179 }
180 
181 /*
182  * Return the count of kilobytes of pages represented by the given pagedata
183  * which meet the given criteria, having pages which are in all of the states
184  * specified by the mask, and in none of the states in the notmask.  If the
185  * CP_CLEAR flag is set, the pagedata will also be cleared.
186  */
187 #define	CP_CLEAR	1
188 static uint64_t
189 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
190 {
191 	int map;
192 	caddr_t cur, end;
193 	prpageheader_t pgh = *pghp;
194 	prasmap_t *asmapp;
195 	uint64_t count = 0;
196 
197 	cur = (caddr_t)pghp + sizeof (*pghp);
198 	for (map = 0; map < pgh.pr_nmap; map++) {
199 		asmapp = (prasmap_t *)(uintptr_t)cur;
200 		cur += sizeof (*asmapp);
201 		end = cur + asmapp->pr_npage;
202 		while (cur < end) {
203 			if ((*cur & mask) == mask && (*cur & notmask) == 0)
204 				count += asmapp->pr_pagesize / 1024;
205 			if ((flags & CP_CLEAR) != 0)
206 				*cur = 0;
207 			cur++;
208 		}
209 
210 		/*
211 		 * Skip to next 64-bit-aligned address to get the next
212 		 * prasmap_t.
213 		 */
214 		cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
215 	}
216 
217 	return (count);
218 }
219 
220 /*
221  * Return the amount of memory (in kilobytes) that hasn't been referenced or
222  * modified, which memory which will be paged out first.  Should be written to
223  * exclude nonresident pages when sufficient interfaces exist.
224  */
225 static uint64_t
226 unrm_size(lprocess_t *lpc)
227 {
228 	return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
229 	    0, PG_MODIFIED | PG_REFERENCED));
230 }
231 
232 /*
233  * Advance a prpageheader_cur_t to the address space's next mapping, returning
234  * its address, or NULL if there is none.  Any known nonpageable or nonresident
235  * mappings will be skipped over.
236  */
237 static uintptr_t
238 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
239 {
240 	prasmap_t *pap;
241 	int i;
242 
243 next:
244 	ASSERT(pcp->pr_map < pcp->pr_nmap);
245 	if ((pcp->pr_map + 1) == pcp->pr_nmap)
246 		return (NULL);
247 	pcp->pr_map++;
248 	if (pcp->pr_pgoff < pcp->pr_npage) {
249 		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
250 		    ((uintptr_t)pcp->pr_pdaddr +
251 		    (pcp->pr_npage - pcp->pr_pgoff));
252 		pcp->pr_pgoff = pcp->pr_npage;
253 	}
254 	/*
255 	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
256 	 */
257 	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
258 	pap = (prasmap_t *)pcp->pr_pdaddr;
259 	pcp->pr_pgoff = 0;
260 	pcp->pr_npage = pap->pr_npage;
261 	pcp->pr_pagesize = pap->pr_pagesize;
262 	pcp->pr_addr = pap->pr_vaddr;
263 	pcp->pr_pdaddr = pap + 1;
264 
265 	/*
266 	 * Skip any known nonpageable mappings.  Currently, the only one
267 	 * detected is the schedctl page.
268 	 */
269 	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
270 	    MA_ANON)) == 0 && pap->pr_npage == 1) {
271 		debug("identified nonpageable schedctl mapping at %p\n",
272 		    (void *)pcp->pr_addr);
273 		goto next;
274 	}
275 
276 	/*
277 	 * Skip mappings with no resident pages.  If the xmap does not
278 	 * correspond to the pagedata for any reason, it will be ignored.
279 	 */
280 	pcp->pr_rss = -1;
281 	pcp->pr_pg_rss = -1;
282 	for (i = 0; i < pcp->pr_nxmap; i++) {
283 		prxmap_t *xmap = &pcp->pr_xmap[i];
284 
285 		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
286 		    (pcp->pr_npage * pcp->pr_pagesize)) {
287 			pcp->pr_rss = xmap->pr_rss;
288 			/*
289 			 * Remove COW pages from the pageable RSS count.
290 			 */
291 			if ((xmap->pr_mflags & MA_SHARED) == 0)
292 				pcp->pr_pg_rss = xmap->pr_anon;
293 			break;
294 		}
295 	}
296 	if (pcp->pr_rss == 0) {
297 		debug("identified nonresident mapping at 0x%p\n",
298 		    (void *)pcp->pr_addr);
299 		goto next;
300 	} else if (pcp->pr_pg_rss == 0) {
301 		debug("identified unpageable mapping at 0x%p\n",
302 		    (void *)pcp->pr_addr);
303 		goto next;
304 	}
305 
306 	return (pcp->pr_addr);
307 }
308 
309 /*
310  * Advance a prpageheader_cur_t to the mapping's next page, returning its
311  * address, or NULL if there is none.
312  */
313 static void *
314 advance_prpageheader_cur(prpageheader_cur_t *pcp)
315 {
316 	ASSERT(pcp->pr_pgoff < pcp->pr_npage);
317 	if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
318 		return (NULL);
319 	pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
320 	pcp->pr_pgoff++;
321 
322 	ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
323 	return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
324 }
325 
326 /*
327  * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
328  * of an address space.
329  */
330 static void *
331 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
332     prxmap_t *xmap, int nxmap)
333 {
334 	bzero(pcp, sizeof (*pcp));
335 	pcp->pr_nmap = php->pr_nmap;
336 	pcp->pr_map = -1;
337 	pcp->pr_prpageheader = php;
338 	pcp->pr_xmap = xmap;
339 	pcp->pr_nxmap = nxmap;
340 	pcp->pr_pdaddr = (prpageheader_t *)php + 1;
341 
342 	return ((void *)advance_prpageheader_cur_nextmapping(pcp));
343 }
344 
345 /*
346  * Position a prpageheader_cur_t to the mapped address greater or equal to the
347  * given value.
348  */
349 static void *
350 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
351     prxmap_t *xmap, int nxmap, void *naddr)
352 {
353 	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
354 
355 	while (addr != NULL && addr <= naddr)
356 		if (naddr < (void *)((caddr_t)pcp->pr_addr +
357 		    pcp->pr_pagesize * pcp->pr_npage)) {
358 			uint64_t pgdiff = ((uintptr_t)naddr -
359 			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
360 			pcp->pr_pgoff += pgdiff;
361 			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
362 			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
363 			    pcp->pr_pgoff;
364 			break;
365 		} else
366 			addr =
367 			    (void *)advance_prpageheader_cur_nextmapping(pcp);
368 
369 	return (addr);
370 }
371 
372 static void
373 revoke_pagedata(rfd_t *rfd)
374 {
375 	lprocess_t *lpc = rfd->rfd_data;
376 
377 	st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
378 	    " process %d\n", (int)lpc->lpc_pid);
379 	ASSERT(lpc->lpc_pgdata_fd != -1);
380 	lpc->lpc_pgdata_fd = -1;
381 }
382 
383 #ifdef DEBUG
384 static void
385 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
386 {
387 	prpageheader_cur_t cur;
388 	void *addr;
389 
390 	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
391 	ASSERT(*lm == NULL);
392 	while (addr != NULL) {
393 		(void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
394 		    cur.pr_pagesize);
395 		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
396 	}
397 }
398 
399 static void
400 lmapping_dump(lmapping_t *lm)
401 {
402 	debug("lm: %p\n", (void *)lm);
403 	while (lm != NULL) {
404 		debug("\t(%p, %llx\n", (void *)lm->lm_addr,
405 		    (unsigned long long)lm->lm_size);
406 		lm = lm->lm_next;
407 	}
408 }
409 #endif /* DEBUG */
410 
/*
 * OR two prpagedata_t which are supposedly snapshots of the same address
 * space.  Intersecting mappings with different page sizes are tolerated but
 * not normalized (not accurate).  If the mappings of the two snapshots differ
 * in any regard, the supplied mappings_changed flag will be set.
 *
 * src is only read; the per-page bytes of dst are updated in place.
 * *mappings_changedp is always written: 1 if a difference was noticed while
 * both cursors still had mappings left, 0 otherwise.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != NULL && dst_addr != NULL) {
		/*
		 * Walk both mappings page by page for as long as the two
		 * cursors report the same address.
		 */
		while (src_addr == dst_addr && src_addr != NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		/*
		 * Both being NULL here means the two mappings ended together;
		 * anything else means they diverged.
		 */
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
		/*
		 * Resynchronize: advance whichever cursor is at the lower
		 * address until both are on a mapping with the same base, or
		 * one of them runs out.
		 */
		while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
		    NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}
459 
460 /*
461  * Merge the current pagedata with that on hand.  If the pagedata is
462  * unretrievable for any reason, such as the process having exited or being a
463  * zombie, a nonzero value is returned, the process should be marked
464  * unscannable, and future attempts to scan it should be avoided, since the
465  * symptom is probably permament.  If the mappings of either pagedata
466  * differ in any respect, the supplied callback will be invoked once.
467  */
468 static int
469 merge_current_pagedata(lprocess_t *lpc,
470     void(*mappings_changed_cb) (lprocess_t *))
471 {
472 	prpageheader_t *pghp;
473 	int mappings_changed = 0;
474 
475 	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
476 	    0) {
477 		char pathbuf[PROC_PATH_MAX];
478 
479 		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
480 		    (int)lpc->lpc_pid);
481 		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
482 		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
483 		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
484 			return (-1);
485 		debug("starting/resuming pagedata collection for %d\n",
486 		    (int)lpc->lpc_pid);
487 	}
488 	debug("process %d: %llu/%llukB r/m'd since last read\n",
489 	    (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0,
490 	    PG_MODIFIED | PG_REFERENCED, 0), (unsigned long long)lpc->lpc_rss);
491 	if (lpc->lpc_prpageheader != NULL) {
492 		/*
493 		 * OR the two snapshots.
494 		 */
495 #ifdef DEBUG
496 		lmapping_t *old = NULL;
497 		lmapping_t *new = NULL;
498 
499 		mklmapping(&new, pghp);
500 		mklmapping(&old, lpc->lpc_prpageheader);
501 #endif /* DEBUG */
502 		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
503 #ifdef DEBUG
504 		if (((mappings_changed != 0) ^
505 		    (lmapping_dump_diff(old, new) != 0))) {
506 			debug("lmapping_changed inconsistent with lmapping\n");
507 			debug("old\n");
508 			lmapping_dump(old);
509 			debug("new\n");
510 			lmapping_dump(new);
511 			debug("ignored\n");
512 			lmapping_dump(lpc->lpc_ignore);
513 			ASSERT(0);
514 		}
515 		lmapping_free(&new);
516 		lmapping_free(&old);
517 #endif /* DEBUG */
518 		free(lpc->lpc_prpageheader);
519 	} else
520 		mappings_changed = 1;
521 	lpc->lpc_prpageheader = pghp;
522 	debug("process %d: %llu/%llukB r/m'd since hand swept\n",
523 	    (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0,
524 	    PG_MODIFIED | PG_REFERENCED, 0),
525 	    (unsigned long long)lpc->lpc_rss);
526 	if (mappings_changed != 0) {
527 		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
528 		if (mappings_changed_cb != NULL)
529 			mappings_changed_cb(lpc);
530 	}
531 	return (0);
532 }
533 
534 /*
535  * Attempt to page out a region of the given process's address space.  May
536  * return nonzero if not all of the pages may are pageable, for any reason.
537  */
538 static int
539 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
540 {
541 	int res;
542 
543 	if (end <= start)
544 		return (0);
545 
546 	errno = 0;
547 	res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
548 	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
549 	debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
550 
551 	/*
552 	 * EBUSY indicates none of the pages have backing store allocated, or
553 	 * some pages were locked, which are less interesting than other
554 	 * conditions, which are noted.
555 	 */
556 	if (res != 0)
557 		if (errno == EBUSY)
558 			res = 0;
559 		else
560 			debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
561 			    (void *)start, (long long)(end - start), errno);
562 
563 	return (res);
564 }
565 
566 /*
567  * Compute the delta of the victim process's RSS since the last call.  If the
568  * psinfo cannot be obtained, no work is done, and no error is returned; it is
569  * up to the caller to detect the process' termination via other means.
570  */
571 static int64_t
572 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
573 {
574 	int64_t d_rss = 0;
575 
576 	if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
577 	    lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
578 		d_rss = (int64_t)new_psinfo->pr_rssize -
579 		    (int64_t)old_psinfo->pr_rssize;
580 		if (d_rss < 0)
581 			vic->lpc_collection->lcol_stat.lcols_pg_eff +=
582 			    (- d_rss);
583 		*old_psinfo = *new_psinfo;
584 	}
585 
586 	return (d_rss);
587 }
588 
589 static void
590 unignore_mappings(lprocess_t *lpc)
591 {
592 	debug("clearing ignored set\n");
593 	lmapping_free(&lpc->lpc_ignore);
594 }
595 
596 static void
597 unignore_referenced_mappings(lprocess_t *lpc)
598 {
599 	prpageheader_cur_t cur;
600 	void *vicaddr;
601 
602 	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
603 	while (vicaddr != NULL) {
604 		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
605 		    != 0) {
606 			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
607 			    cur.pr_npage * cur.pr_pagesize) == 0)
608 				debug("removed mapping 0x%p+0t%llukB from"
609 				    " ignored set\n", (void *)cur.pr_addr,
610 				    (unsigned long long)(cur.pr_npage *
611 				    cur.pr_pagesize / 1024));
612 			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
613 			    &cur);
614 		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
615 			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
616 			    &cur);
617 	}
618 }
619 
620 /*
621  * Resume scanning, starting with the last victim, if it is still valid, or any
622  * other one, otherwise.
623  */
624 void
625 scan(lcollection_t *lcol, int64_t excess)
626 {
627 	lprocess_t *vic, *lpc;
628 	void *vicaddr, *endaddr, *nvicaddr;
629 	prpageheader_cur_t cur;
630 	psinfo_t old_psinfo, new_psinfo;
631 	hrtime_t scan_start;
632 	int res, resumed;
633 	uint64_t col_unrm_size;
634 
635 	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
636 	    (long long)excess);
637 
638 	/*
639 	 * Determine the address to start scanning at, depending on whether
640 	 * scanning can be resumed.
641 	 */
642 	endaddr = NULL;
643 	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
644 	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
645 		vicaddr = lcol->lcol_resaddr;
646 		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
647 		    (int)vic->lpc_pid);
648 		resumed = 1;
649 	} else {
650 		vicaddr = NULL;
651 		resumed = 0;
652 	}
653 
654 	scan_start = gethrtime();
655 	/*
656 	 * Obtain the most current pagedata for the processes that might be
657 	 * scanned, and remove from the ignored set any mappings which have
658 	 * referenced or modified pages (in the hopes that the pageability of
659 	 * the mapping's pages may have changed).  Determine if the
660 	 * unreferenced and unmodified portion is impossibly small to suffice
661 	 * to reduce the excess completely.  If so, ignore these bits so that
662 	 * even working set will be paged out.
663 	 */
664 	col_unrm_size = 0;
665 	lpc = vic;
666 	while (lpc != NULL && should_run) {
667 		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
668 			st_debug(STDL_NORMAL, lcol, "process %d:"
669 			    " exited/temporarily unscannable",
670 			    (int)lpc->lpc_pid);
671 			goto next;
672 		}
673 		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
674 		    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
675 		    (unsigned long long)lpc->lpc_size);
676 		col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
677 
678 		if ((lcol->lcol_stat.lcols_scan_count %
679 		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
680 			/*
681 			 * Periodically clear the set of ignored mappings.
682 			 * This will allow processes whose ignored segments'
683 			 * pageability have changed (without a corresponding
684 			 * reference or modification to a page) to be
685 			 * recognized.
686 			 */
687 			if (lcol->lcol_stat.lcols_scan_count > 0)
688 				unignore_mappings(lpc);
689 		} else {
690 			/*
691 			 * Ensure mappings with referenced or modified pages
692 			 * are not in the ignored set.  Their usage might mean
693 			 * the condition which made them unpageable is gone.
694 			 */
695 			unignore_referenced_mappings(lpc);
696 		}
697 next:
698 		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
699 		    lpc->lpc_next) : NULL;
700 	}
701 	if (col_unrm_size < excess) {
702 		lpc = vic;
703 		debug("will not reduce excess with only unreferenced pages\n");
704 		while (lpc != NULL && should_run) {
705 			if (lpc->lpc_prpageheader != NULL) {
706 				(void) count_pages(lpc->lpc_prpageheader,
707 				    CP_CLEAR, 0, 0);
708 				if (lpc->lpc_pgdata_fd >= 0) {
709 					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
710 						debug("coud not close %d"
711 						    " lpc_pgdata_fd %d",
712 						    (int)lpc->lpc_pid,
713 						    lpc->lpc_pgdata_fd);
714 					lpc->lpc_pgdata_fd = -1;
715 				}
716 			}
717 			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
718 			    lpc->lpc_next) : NULL;
719 		}
720 	}
721 
722 	/*
723 	 * Examine each process for pages to remove until the excess is
724 	 * reduced.
725 	 */
726 	while (vic != NULL && excess > 0 && should_run) {
727 		/*
728 		 * Skip processes whose death was reported when the merging of
729 		 * pagedata was attempted.
730 		 */
731 		if (vic->lpc_prpageheader == NULL)
732 			goto nextproc;
733 
734 		/*
735 		 * Obtain optional segment residency information.
736 		 */
737 		if (lpc_xmap_update(vic) != 0)
738 			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
739 			    " unreadable; ignoring", (int)vic->lpc_pid);
740 
741 #ifdef DEBUG_MSG
742 		{
743 			void *ovicaddr = vicaddr;
744 #endif /* DEBUG_MSG */
745 		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
746 		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
747 #ifdef DEBUG_MSG
748 			st_debug(STDL_NORMAL, lcol, "trying to resume from"
749 			    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
750 		}
751 #endif /* DEBUG_MSG */
752 
753 		/*
754 		 * Take control of the victim.
755 		 */
756 		if (get_psinfo(vic->lpc_pid, &old_psinfo,
757 		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
758 		    vic, vic) != 0) {
759 			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
760 			    (int)vic->lpc_pid);
761 			goto nextproc;
762 		}
763 		(void) rfd_reserve(PGRAB_FD_COUNT);
764 		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
765 			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
766 			    (int)vic->lpc_pid, res);
767 			goto nextproc;
768 		}
769 		if (Pcreate_agent(scan_pr) != 0) {
770 			st_debug(STDL_NORMAL, lcol, "cannot control %d",
771 			    (int)vic->lpc_pid);
772 			goto nextproc;
773 		}
774 		/*
775 		 * Be very pessimistic about the state of the agent LWP --
776 		 * verify it's actually stopped.
777 		 */
778 		errno = 0;
779 		while (Pstate(scan_pr) == PS_RUN)
780 			(void) Pwait(scan_pr, 0);
781 		if (Pstate(scan_pr) != PS_STOP) {
782 			st_debug(STDL_NORMAL, lcol, "agent not in expected"
783 			    " state (%d)", Pstate(scan_pr));
784 			goto nextproc;
785 		}
786 
787 		/*
788 		 * Within the victim's address space, find contiguous ranges of
789 		 * unreferenced pages to page out.
790 		 */
791 		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
792 		    (int)vic->lpc_pid);
793 		while (excess > 0 && vicaddr != NULL && should_run) {
794 			/*
795 			 * Skip mappings in the ignored set.  Mappings get
796 			 * placed in the ignored set when all their resident
797 			 * pages are unreference and unmodified, yet unpageable
798 			 * -- such as when they are locked, or involved in
799 			 * asynchronous I/O.  They will be scanned again when
800 			 * some page is referenced or modified.
801 			 */
802 			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
803 			    cur.pr_npage * cur.pr_pagesize)) {
804 				debug("ignored mapping at 0x%p\n",
805 				    (void *)cur.pr_addr);
806 				/*
807 				 * Update statistics.
808 				 */
809 				lcol->lcol_stat.lcols_pg_att +=
810 				    cur.pr_npage * cur.pr_pagesize / 1024;
811 
812 				vicaddr = (void *)
813 				    advance_prpageheader_cur_nextmapping(&cur);
814 				continue;
815 			}
816 
817 			/*
818 			 * Determine a range of unreferenced pages to page out,
819 			 * and clear the R/M bits in the preceding referenced
820 			 * range.
821 			 */
822 			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
823 			    " npage %llu\n", vicaddr,
824 			    (unsigned long long)cur.pr_npage);
825 			while (vicaddr != NULL &&
826 			    *(caddr_t)cur.pr_pdaddr != 0) {
827 				*(caddr_t)cur.pr_pdaddr = 0;
828 				vicaddr = advance_prpageheader_cur(&cur);
829 			}
830 			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
831 			    " %p\n", vicaddr, cur.pr_pdaddr);
832 			if (vicaddr == NULL) {
833 				/*
834 				 * The end of mapping was reached before any
835 				 * unreferenced pages were seen.
836 				 */
837 				vicaddr = (void *)
838 				    advance_prpageheader_cur_nextmapping(&cur);
839 				continue;
840 			}
841 			do
842 				endaddr = advance_prpageheader_cur(&cur);
843 			while (endaddr != NULL &&
844 			    *(caddr_t)cur.pr_pdaddr == 0 &&
845 			    (((intptr_t)endaddr - (intptr_t)vicaddr) /
846 				1024) < excess);
847 			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
848 			    endaddr, *(caddr_t)cur.pr_pdaddr);
849 
850 			/*
851 			 * Page out from vicaddr to the end of the mapping, or
852 			 * endaddr if set, then continue scanning after
853 			 * endaddr, or the next mapping, if not set.
854 			 */
855 			nvicaddr = endaddr;
856 			if (endaddr == NULL)
857 				endaddr = (caddr_t)cur.pr_addr +
858 				    cur.pr_pagesize * cur.pr_npage;
859 			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
860 			    0) {
861 				int64_t d_rss, att;
862 				int willignore = 0;
863 
864 				excess += (d_rss = rss_delta(
865 				    &new_psinfo, &old_psinfo, vic));
866 
867 				/*
868 				 * If this pageout attempt was unsuccessful
869 				 * (the resident portion was not affected), and
870 				 * was for the whole mapping, put it in the
871 				 * ignored set, so it will not be scanned again
872 				 * until some page is referenced or modified.
873 				 */
874 				if (d_rss >= 0 && (void *)cur.pr_addr ==
875 				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
876 				    == ((uintptr_t)endaddr -
877 				    (uintptr_t)vicaddr)) {
878 					if (lmapping_insert(
879 					    &vic->lpc_ignore,
880 					    cur.pr_addr,
881 					    cur.pr_pagesize *
882 					    cur.pr_npage) != 0)
883 						debug("not enough memory to add"
884 						    " mapping at %p to ignored"
885 						    " set\n",
886 						    (void *)cur.pr_addr);
887 					willignore = 1;
888 				}
889 
890 				/*
891 				 * Update statistics.
892 				 */
893 				lcol->lcol_stat.lcols_pg_att += (att =
894 				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
895 				    1024);
896 				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
897 				    "+0t(%llu/%llu)kB%s\n", vicaddr,
898 				    (unsigned long long)((d_rss <
899 				    0) ? - d_rss : 0), (unsigned long long)att,
900 				    willignore ? " (will ignore)" : "");
901 			} else {
902 				st_debug(STDL_NORMAL, lcol,
903 				    "process %d: exited/unscannable\n",
904 				    (int)vic->lpc_pid);
905 				vic->lpc_unscannable = 1;
906 				goto nextproc;
907 			}
908 
909 			/*
910 			 * Update the statistics file, if it's time.
911 			 */
912 			check_update_statistics();
913 
914 			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
915 			    *)advance_prpageheader_cur_nextmapping(&cur);
916 		}
917 		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
918 		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
919 		    (long long)excess);
920 nextproc:
921 		/*
922 		 * If a process was grabbed, release it, destroying its agent.
923 		 */
924 		if (scan_pr != NULL) {
925 			(void) Prelease(scan_pr, 0);
926 			scan_pr = NULL;
927 		}
928 		lcol->lcol_victim = vic;
929 		/*
930 		 * Scan the collection at most once.  Only if scanning was not
931 		 * aborted for any reason, and the end of lprocess has not been
932 		 * reached, determine the next victim and scan it.
933 		 */
934 		if (vic != NULL) {
935 			if (vic->lpc_next != NULL) {
936 				/*
937 				 * Determine the next process to be scanned.
938 				 */
939 				if (excess > 0) {
940 					vic = get_valid_victim(lcol,
941 					    vic->lpc_next);
942 					vicaddr = 0;
943 				}
944 			} else {
945 				/*
946 				 * A complete scan of the collection was made,
947 				 * so tick the scan counter and stop scanning
948 				 * until the next request.
949 				 */
950 				lcol->lcol_stat.lcols_scan_count++;
951 				lcol->lcol_stat.lcols_scan_time_complete
952 				    = lcol->lcol_stat.lcols_scan_time;
953 				/*
954 				 * If an excess still exists, tick the
955 				 * "ineffective scan" counter, signalling that
956 				 * the cap may be uneforceable.
957 				 */
958 				if (resumed == 0 && excess > 0)
959 					lcol->lcol_stat
960 					    .lcols_scan_ineffective++;
961 				/*
962 				 * Scanning should start at the beginning of
963 				 * the process list at the next request.
964 				 */
965 				if (excess > 0)
966 					vic = NULL;
967 			}
968 		}
969 	}
970 	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
971 	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
972 	    (long long)excess);
973 
974 	lcol->lcol_resaddr = vicaddr;
975 	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
976 		lcol->lcol_victim = get_valid_victim(lcol,
977 		    lcol->lcol_victim->lpc_next);
978 	}
979 }
980 
981 /*
982  * Abort the scan in progress, and destroy the agent LWP of any grabbed
983  * processes.
984  */
985 void
986 scan_abort(void)
987 {
988 	if (scan_pr != NULL)
989 		(void) Prelease(scan_pr, NULL);
990 }
991 
992 static void
993 revoke_xmap(rfd_t *rfd)
994 {
995 	lprocess_t *lpc = rfd->rfd_data;
996 
997 	debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
998 	ASSERT(lpc->lpc_xmap_fd != -1);
999 	lpc->lpc_xmap_fd = -1;
1000 }
1001 
1002 /*
1003  * Retrieve the process's current xmap , which is used to determine the size of
1004  * the resident portion of its segments.  Return zero if successful.
1005  */
1006 static int
1007 lpc_xmap_update(lprocess_t *lpc)
1008 {
1009 	int res;
1010 	struct stat st;
1011 
1012 	free(lpc->lpc_xmap);
1013 	lpc->lpc_xmap = NULL;
1014 	lpc->lpc_nxmap = -1;
1015 
1016 	if (lpc->lpc_xmap_fd == -1) {
1017 		char pathbuf[PROC_PATH_MAX];
1018 
1019 		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1020 		    (int)lpc->lpc_pid);
1021 		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1022 		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1023 			return (-1);
1024 	}
1025 
1026 redo:
1027 	errno = 0;
1028 	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1029 		debug("cannot stat xmap\n");
1030 		(void) rfd_close(lpc->lpc_xmap_fd);
1031 		lpc->lpc_xmap_fd = -1;
1032 		return (-1);
1033 	}
1034 
1035 	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1036 		debug("xmap wrong size\n");
1037 		(void) rfd_close(lpc->lpc_xmap_fd);
1038 		lpc->lpc_xmap_fd = -1;
1039 		return (-1);
1040 	}
1041 
1042 	lpc->lpc_xmap = malloc(st.st_size);
1043 	if (lpc->lpc_xmap == NULL) {
1044 		debug("cannot malloc() %ld bytes for xmap", st.st_size);
1045 		(void) rfd_close(lpc->lpc_xmap_fd);
1046 		lpc->lpc_xmap_fd = -1;
1047 		return (-1);
1048 	}
1049 
1050 	if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1051 	    st.st_size) {
1052 		free(lpc->lpc_xmap);
1053 		lpc->lpc_xmap = NULL;
1054 		if (res > 0) {
1055 			debug("xmap changed size, retrying\n");
1056 			goto redo;
1057 		} else {
1058 			debug("cannot read xmap");
1059 			return (-1);
1060 		}
1061 	}
1062 	lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1063 
1064 	return (0);
1065 }
1066