1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/mman.h>
27 #include <sys/param.h>
28 #include <sys/stat.h>
29 #include <sys/types.h>
30 #include <assert.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <libproc.h>
34 #include <limits.h>
35 #include <procfs.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <strings.h>
39 #include <time.h>
40 #include <unistd.h>
41 #include "rcapd.h"
42 #include "rcapd_rfd.h"
43 #include "rcapd_mapping.h"
44 #include "utils.h"
45
/*
 * Forward declaration: lpc_xmap_update() is defined at the end of this file
 * but is called earlier, from scan().
 */
static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
/*
 * Not defined in this file.  Used by merge_current_pagedata() in DEBUG
 * builds to cross-check the mappings_changed result of OR_pagedata().
 */
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */
50
/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.  scan() passes this to rfd_reserve() before calling Pgrab().
 */
#define	PGRAB_FD_COUNT		10
56
/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.  A cursor is initialized with
 * set_prpageheader_cur() (or set_prpageheader_cur_addr()) and is moved with
 * advance_prpageheader_cur() and advance_prpageheader_cur_nextmapping().
 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* index of this mapping (-1 before first) */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or */
				/* -1 if it was not taken from the xmap */
} prpageheader_cur_t;
76
/*
 * Handle of the process currently grabbed by scan() via Pgrab(), or NULL.
 * It lives at file scope so that scan_abort() can release it as well.
 */
static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */
78
/*
 * Verbosity classes accepted by st_debug().  STDL_NORMAL messages are gated
 * on the RCM_DEBUG message priority, STDL_HIGH ones on RCM_DEBUG_HIGH.
 */
typedef enum {
	STDL_NORMAL,
	STDL_HIGH
} st_debug_level_t;
83
84 /*
85 * Output a scanning-related debug message.
86 */
87 /*PRINTFLIKE3*/ /*ARGSUSED*/
88 static void
st_debug(st_debug_level_t level,lcollection_t * lcol,char * msg,...)89 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
90 {
91 #ifdef DEBUG_MSG
92 va_list alist;
93 char *buf;
94 size_t len;
95
96 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
97 : RCM_DEBUG))
98 return;
99
100 len = strlen(msg) + LINELEN;
101 buf = malloc(len);
102 if (buf == NULL)
103 return;
104 (void) snprintf(buf, len, "%s %s scanner %s",
105 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
106 lcol->lcol_name, msg);
107
108 va_start(alist, msg);
109 vdprintfe(RCM_DEBUG, buf, alist);
110 va_end(alist);
111
112 free(buf);
113 #endif /* DEBUG_MSG */
114 }
115
116 /*
117 * Determine the collection's current victim, based on its last. The last will
118 * be returned, or, if invalid, any other valid process, if the collection has
119 * any.
120 */
121 static lprocess_t *
get_valid_victim(lcollection_t * lcol,lprocess_t * lpc)122 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
123 {
124 if (lpc == NULL || !lcollection_member(lcol, lpc))
125 lpc = lcol->lcol_lprocess;
126
127 /*
128 * Find the next scannable process, and make it the victim.
129 */
130 while (lpc != NULL && lpc->lpc_unscannable != 0)
131 lpc = lpc->lpc_next;
132
133 return (lpc);
134 }
135
136 /*
137 * Get a process's combined current pagedata (per-page referenced and modified
138 * bits) and set the supplied pointer to it. The caller is responsible for
139 * freeing the data. If the pagedata is unreadable, a nonzero value is
140 * returned, and errno is set. Otherwise, 0 is returned.
141 */
142 static int
get_pagedata(prpageheader_t ** pghpp,int fd)143 get_pagedata(prpageheader_t **pghpp, int fd)
144 {
145 int res;
146 struct stat st;
147
148 redo:
149 errno = 0;
150 if (fstat(fd, &st) != 0) {
151 debug("cannot stat pagedata\n");
152 return (-1);
153 }
154
155 errno = 0;
156 *pghpp = malloc(st.st_size);
157 if (*pghpp == NULL) {
158 debug("cannot malloc() %ld bytes for pagedata", st.st_size);
159 return (-1);
160 }
161 (void) bzero(*pghpp, st.st_size);
162
163 errno = 0;
164 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
165 free(*pghpp);
166 *pghpp = NULL;
167 if (res > 0 || errno == E2BIG) {
168 debug("pagedata changed size, retrying\n");
169 goto redo;
170 } else {
171 debug("cannot read pagedata");
172 return (-1);
173 }
174 }
175
176 return (0);
177 }
178
179 /*
180 * Return the count of kilobytes of pages represented by the given pagedata
181 * which meet the given criteria, having pages which are in all of the states
182 * specified by the mask, and in none of the states in the notmask. If the
183 * CP_CLEAR flag is set, the pagedata will also be cleared.
184 */
185 #define CP_CLEAR 1
186 static uint64_t
count_pages(prpageheader_t * pghp,int flags,int mask,int notmask)187 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
188 {
189 int map;
190 caddr_t cur, end;
191 prpageheader_t pgh = *pghp;
192 prasmap_t *asmapp;
193 uint64_t count = 0;
194
195 cur = (caddr_t)pghp + sizeof (*pghp);
196 for (map = 0; map < pgh.pr_nmap; map++) {
197 asmapp = (prasmap_t *)(uintptr_t)cur;
198 cur += sizeof (*asmapp);
199 end = cur + asmapp->pr_npage;
200 while (cur < end) {
201 if ((*cur & mask) == mask && (*cur & notmask) == 0)
202 count += asmapp->pr_pagesize / 1024;
203 if ((flags & CP_CLEAR) != 0)
204 *cur = 0;
205 cur++;
206 }
207
208 /*
209 * Skip to next 64-bit-aligned address to get the next
210 * prasmap_t.
211 */
212 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
213 }
214
215 return (count);
216 }
217
218 /*
219 * Return the amount of memory (in kilobytes) that hasn't been referenced or
220 * modified, which memory which will be paged out first. Should be written to
221 * exclude nonresident pages when sufficient interfaces exist.
222 */
223 static uint64_t
unrm_size(lprocess_t * lpc)224 unrm_size(lprocess_t *lpc)
225 {
226 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
227 0, PG_MODIFIED | PG_REFERENCED));
228 }
229
230 /*
231 * Advance a prpageheader_cur_t to the address space's next mapping, returning
232 * its address, or NULL if there is none. Any known nonpageable or nonresident
233 * mappings will be skipped over.
234 */
235 static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t * pcp)236 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
237 {
238 prasmap_t *pap;
239 int i;
240
241 next:
242 ASSERT(pcp->pr_map < pcp->pr_nmap);
243 if ((pcp->pr_map + 1) == pcp->pr_nmap)
244 return ((uintptr_t)NULL);
245 pcp->pr_map++;
246 if (pcp->pr_pgoff < pcp->pr_npage) {
247 pcp->pr_pdaddr = (caddr_t)(uintptr_t)
248 ((uintptr_t)pcp->pr_pdaddr +
249 (pcp->pr_npage - pcp->pr_pgoff));
250 pcp->pr_pgoff = pcp->pr_npage;
251 }
252 /*
253 * Skip to next 64-bit-aligned address to get the next prasmap_t.
254 */
255 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
256 pap = (prasmap_t *)pcp->pr_pdaddr;
257 pcp->pr_pgoff = 0;
258 pcp->pr_npage = pap->pr_npage;
259 pcp->pr_pagesize = pap->pr_pagesize;
260 pcp->pr_addr = pap->pr_vaddr;
261 pcp->pr_pdaddr = pap + 1;
262
263 /*
264 * Skip any known nonpageable mappings. Currently, the only one
265 * detected is the schedctl page.
266 */
267 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
268 MA_ANON)) == 0 && pap->pr_npage == 1) {
269 debug("identified nonpageable schedctl mapping at %p\n",
270 (void *)pcp->pr_addr);
271 goto next;
272 }
273
274 /*
275 * Skip mappings with no resident pages. If the xmap does not
276 * correspond to the pagedata for any reason, it will be ignored.
277 */
278 pcp->pr_rss = -1;
279 pcp->pr_pg_rss = -1;
280 for (i = 0; i < pcp->pr_nxmap; i++) {
281 prxmap_t *xmap = &pcp->pr_xmap[i];
282
283 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
284 (pcp->pr_npage * pcp->pr_pagesize)) {
285 pcp->pr_rss = xmap->pr_rss;
286 /*
287 * Remove COW pages from the pageable RSS count.
288 */
289 if ((xmap->pr_mflags & MA_SHARED) == 0)
290 pcp->pr_pg_rss = xmap->pr_anon;
291 break;
292 }
293 }
294 if (pcp->pr_rss == 0) {
295 debug("identified nonresident mapping at 0x%p\n",
296 (void *)pcp->pr_addr);
297 goto next;
298 } else if (pcp->pr_pg_rss == 0) {
299 debug("identified unpageable mapping at 0x%p\n",
300 (void *)pcp->pr_addr);
301 goto next;
302 }
303
304 return (pcp->pr_addr);
305 }
306
307 /*
308 * Advance a prpageheader_cur_t to the mapping's next page, returning its
309 * address, or NULL if there is none.
310 */
311 static void *
advance_prpageheader_cur(prpageheader_cur_t * pcp)312 advance_prpageheader_cur(prpageheader_cur_t *pcp)
313 {
314 ASSERT(pcp->pr_pgoff < pcp->pr_npage);
315 if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
316 return (NULL);
317 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
318 pcp->pr_pgoff++;
319
320 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
321 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
322 }
323
324 /*
325 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
326 * of an address space.
327 */
328 static void *
set_prpageheader_cur(prpageheader_cur_t * pcp,prpageheader_t * php,prxmap_t * xmap,int nxmap)329 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
330 prxmap_t *xmap, int nxmap)
331 {
332 bzero(pcp, sizeof (*pcp));
333 pcp->pr_nmap = php->pr_nmap;
334 pcp->pr_map = -1;
335 pcp->pr_prpageheader = php;
336 pcp->pr_xmap = xmap;
337 pcp->pr_nxmap = nxmap;
338 pcp->pr_pdaddr = (prpageheader_t *)php + 1;
339
340 return ((void *)advance_prpageheader_cur_nextmapping(pcp));
341 }
342
343 /*
344 * Position a prpageheader_cur_t to the mapped address greater or equal to the
345 * given value.
346 */
347 static void *
set_prpageheader_cur_addr(prpageheader_cur_t * pcp,prpageheader_t * php,prxmap_t * xmap,int nxmap,void * naddr)348 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
349 prxmap_t *xmap, int nxmap, void *naddr)
350 {
351 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
352
353 while (addr != NULL && addr <= naddr)
354 if (naddr < (void *)((caddr_t)pcp->pr_addr +
355 pcp->pr_pagesize * pcp->pr_npage)) {
356 uint64_t pgdiff = ((uintptr_t)naddr -
357 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
358 pcp->pr_pgoff += pgdiff;
359 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
360 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
361 pcp->pr_pgoff;
362 break;
363 } else
364 addr =
365 (void *)advance_prpageheader_cur_nextmapping(pcp);
366
367 return (addr);
368 }
369
370 static void
revoke_pagedata(rfd_t * rfd)371 revoke_pagedata(rfd_t *rfd)
372 {
373 lprocess_t *lpc = rfd->rfd_data;
374
375 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
376 " process %d\n", (int)lpc->lpc_pid);
377 ASSERT(lpc->lpc_pgdata_fd != -1);
378 lpc->lpc_pgdata_fd = -1;
379 }
380
381 #ifdef DEBUG
382 static void
mklmapping(lmapping_t ** lm,prpageheader_t * pgh)383 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
384 {
385 prpageheader_cur_t cur;
386 void *addr;
387
388 addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
389 ASSERT(*lm == NULL);
390 while (addr != NULL) {
391 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
392 cur.pr_pagesize);
393 addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
394 }
395 }
396
397 static void
lmapping_dump(lmapping_t * lm)398 lmapping_dump(lmapping_t *lm)
399 {
400 debug("lm: %p\n", (void *)lm);
401 while (lm != NULL) {
402 debug("\t(%p, %llx\n", (void *)lm->lm_addr,
403 (unsigned long long)lm->lm_size);
404 lm = lm->lm_next;
405 }
406 }
407 #endif /* DEBUG */
408
409 /*
410 * OR two prpagedata_t which are supposedly snapshots of the same address
411 * space. Intersecting mappings with different page sizes are tolerated but
412 * not normalized (not accurate). If the mappings of the two snapshots differ
413 * in any regard, the supplied mappings_changed flag will be set.
414 */
415 static void
OR_pagedata(prpageheader_t * src,prpageheader_t * dst,int * mappings_changedp)416 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
417 {
418 prpageheader_cur_t src_cur;
419 prpageheader_cur_t dst_cur;
420 uintptr_t src_addr;
421 uintptr_t dst_addr;
422 int mappings_changed = 0;
423
424 /*
425 * OR source pagedata with the destination, for pages of intersecting
426 * mappings.
427 */
428 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
429 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
430 while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) {
431 while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) {
432 *(char *)dst_cur.pr_pdaddr |=
433 *(char *)src_cur.pr_pdaddr;
434 src_addr = (uintptr_t)advance_prpageheader_cur(
435 &src_cur);
436 dst_addr = (uintptr_t)advance_prpageheader_cur(
437 &dst_cur);
438 }
439 if (src_addr != dst_addr)
440 mappings_changed = 1;
441 src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
442 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
443 while (src_addr != dst_addr && src_addr != (uintptr_t)NULL &&
444 dst_addr != (uintptr_t)NULL) {
445 mappings_changed = 1;
446 if (src_addr < dst_addr)
447 src_addr = advance_prpageheader_cur_nextmapping(
448 &src_cur);
449 else
450 dst_addr = advance_prpageheader_cur_nextmapping(
451 &dst_cur);
452 }
453 }
454
455 *mappings_changedp = mappings_changed;
456 }
457
458 /*
459 * Merge the current pagedata with that on hand. If the pagedata is
460 * unretrievable for any reason, such as the process having exited or being a
461 * zombie, a nonzero value is returned, the process should be marked
462 * unscannable, and future attempts to scan it should be avoided, since the
463 * symptom is probably permament. If the mappings of either pagedata
464 * differ in any respect, the supplied callback will be invoked once.
465 */
466 static int
merge_current_pagedata(lprocess_t * lpc,void (* mappings_changed_cb)(lprocess_t *))467 merge_current_pagedata(lprocess_t *lpc,
468 void(*mappings_changed_cb) (lprocess_t *))
469 {
470 prpageheader_t *pghp;
471 int mappings_changed = 0;
472 uint64_t cnt;
473
474 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
475 0) {
476 char pathbuf[PROC_PATH_MAX];
477
478 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
479 (int)lpc->lpc_pid);
480 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
481 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
482 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
483 return (-1);
484 debug("starting/resuming pagedata collection for %d\n",
485 (int)lpc->lpc_pid);
486 }
487
488 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
489 if (cnt != 0 || lpc->lpc_rss != 0)
490 debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
491 (int)lpc->lpc_pid, (unsigned long long)cnt,
492 (unsigned long long)lpc->lpc_rss);
493 if (lpc->lpc_prpageheader != NULL) {
494 /*
495 * OR the two snapshots.
496 */
497 #ifdef DEBUG
498 lmapping_t *old = NULL;
499 lmapping_t *new = NULL;
500
501 mklmapping(&new, pghp);
502 mklmapping(&old, lpc->lpc_prpageheader);
503 #endif /* DEBUG */
504 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
505 #ifdef DEBUG
506 if (((mappings_changed != 0) ^
507 (lmapping_dump_diff(old, new) != 0))) {
508 debug("lmapping_changed inconsistent with lmapping\n");
509 debug("old\n");
510 lmapping_dump(old);
511 debug("new\n");
512 lmapping_dump(new);
513 debug("ignored\n");
514 lmapping_dump(lpc->lpc_ignore);
515 ASSERT(0);
516 }
517 lmapping_free(&new);
518 lmapping_free(&old);
519 #endif /* DEBUG */
520 free(lpc->lpc_prpageheader);
521 } else
522 mappings_changed = 1;
523 lpc->lpc_prpageheader = pghp;
524
525 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
526 if (cnt != 0 || lpc->lpc_rss != 0)
527 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
528 (int)lpc->lpc_pid, (unsigned long long)cnt,
529 (unsigned long long)lpc->lpc_rss);
530 if (mappings_changed != 0) {
531 debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
532 if (mappings_changed_cb != NULL)
533 mappings_changed_cb(lpc);
534 }
535 return (0);
536 }
537
538 /*
539 * Attempt to page out a region of the given process's address space. May
540 * return nonzero if not all of the pages may are pageable, for any reason.
541 */
542 static int
pageout(pid_t pid,struct ps_prochandle * Pr,caddr_t start,caddr_t end)543 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
544 {
545 int res;
546
547 if (end <= start)
548 return (0);
549
550 errno = 0;
551 res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
552 (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
553 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
554
555 /*
556 * EBUSY indicates none of the pages have backing store allocated, or
557 * some pages were locked, which are less interesting than other
558 * conditions, which are noted.
559 */
560 if (res != 0)
561 if (errno == EBUSY)
562 res = 0;
563 else
564 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
565 (void *)start, (long long)(end - start), errno);
566
567 return (res);
568 }
569
570 /*
571 * Compute the delta of the victim process's RSS since the last call. If the
572 * psinfo cannot be obtained, no work is done, and no error is returned; it is
573 * up to the caller to detect the process' termination via other means.
574 */
575 static int64_t
rss_delta(psinfo_t * new_psinfo,psinfo_t * old_psinfo,lprocess_t * vic)576 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
577 {
578 int64_t d_rss = 0;
579
580 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
581 lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
582 d_rss = (int64_t)new_psinfo->pr_rssize -
583 (int64_t)old_psinfo->pr_rssize;
584 if (d_rss < 0)
585 vic->lpc_collection->lcol_stat.lcols_pg_eff +=
586 (- d_rss);
587 *old_psinfo = *new_psinfo;
588 }
589
590 return (d_rss);
591 }
592
/*
 * Empty the process's set of ignored mappings by freeing the whole list.
 * Its signature matches the mappings-changed callback taken by
 * merge_current_pagedata(), to which scan() passes it.
 */
static void
unignore_mappings(lprocess_t *lpc)
{
	lmapping_free(&lpc->lpc_ignore);
}
598
599 static void
unignore_referenced_mappings(lprocess_t * lpc)600 unignore_referenced_mappings(lprocess_t *lpc)
601 {
602 prpageheader_cur_t cur;
603 void *vicaddr;
604
605 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
606 while (vicaddr != NULL) {
607 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
608 != 0) {
609 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
610 cur.pr_npage * cur.pr_pagesize) == 0)
611 debug("removed mapping 0x%p+0t%llukB from"
612 " ignored set\n", (void *)cur.pr_addr,
613 (unsigned long long)(cur.pr_npage *
614 cur.pr_pagesize / 1024));
615 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
616 &cur);
617 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
618 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
619 &cur);
620 }
621 }
622
623 /*
624 * Resume scanning, starting with the last victim, if it is still valid, or any
625 * other one, otherwise.
626 */
627 void
scan(lcollection_t * lcol,int64_t excess)628 scan(lcollection_t *lcol, int64_t excess)
629 {
630 lprocess_t *vic, *lpc;
631 void *vicaddr, *endaddr, *nvicaddr;
632 prpageheader_cur_t cur;
633 psinfo_t old_psinfo, new_psinfo;
634 hrtime_t scan_start;
635 int res, resumed;
636 uint64_t col_unrm_size;
637
638 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
639 (long long)excess);
640
641 /*
642 * Determine the address to start scanning at, depending on whether
643 * scanning can be resumed.
644 */
645 endaddr = NULL;
646 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
647 lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
648 vicaddr = lcol->lcol_resaddr;
649 st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
650 (int)vic->lpc_pid);
651 resumed = 1;
652 } else {
653 vicaddr = NULL;
654 resumed = 0;
655 }
656
657 scan_start = gethrtime();
658 /*
659 * Obtain the most current pagedata for the processes that might be
660 * scanned, and remove from the ignored set any mappings which have
661 * referenced or modified pages (in the hopes that the pageability of
662 * the mapping's pages may have changed). Determine if the
663 * unreferenced and unmodified portion is impossibly small to suffice
664 * to reduce the excess completely. If so, ignore these bits so that
665 * even working set will be paged out.
666 */
667 col_unrm_size = 0;
668 lpc = vic;
669 while (lpc != NULL && should_run) {
670 if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
671 st_debug(STDL_NORMAL, lcol, "process %d:"
672 " exited/temporarily unscannable",
673 (int)lpc->lpc_pid);
674 goto next;
675 }
676 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
677 (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
678 (unsigned long long)lpc->lpc_size);
679 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
680
681 if ((lcol->lcol_stat.lcols_scan_count %
682 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
683 /*
684 * Periodically clear the set of ignored mappings.
685 * This will allow processes whose ignored segments'
686 * pageability have changed (without a corresponding
687 * reference or modification to a page) to be
688 * recognized.
689 */
690 if (lcol->lcol_stat.lcols_scan_count > 0)
691 unignore_mappings(lpc);
692 } else {
693 /*
694 * Ensure mappings with referenced or modified pages
695 * are not in the ignored set. Their usage might mean
696 * the condition which made them unpageable is gone.
697 */
698 unignore_referenced_mappings(lpc);
699 }
700 next:
701 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
702 lpc->lpc_next) : NULL;
703 }
704 if (col_unrm_size < excess) {
705 lpc = vic;
706 debug("will not reduce excess with only unreferenced pages\n");
707 while (lpc != NULL && should_run) {
708 if (lpc->lpc_prpageheader != NULL) {
709 (void) count_pages(lpc->lpc_prpageheader,
710 CP_CLEAR, 0, 0);
711 if (lpc->lpc_pgdata_fd >= 0) {
712 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
713 debug("coud not close %d"
714 " lpc_pgdata_fd %d",
715 (int)lpc->lpc_pid,
716 lpc->lpc_pgdata_fd);
717 lpc->lpc_pgdata_fd = -1;
718 }
719 }
720 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
721 lpc->lpc_next) : NULL;
722 }
723 }
724
725 /*
726 * Examine each process for pages to remove until the excess is
727 * reduced.
728 */
729 while (vic != NULL && excess > 0 && should_run) {
730 /*
731 * Skip processes whose death was reported when the merging of
732 * pagedata was attempted.
733 */
734 if (vic->lpc_prpageheader == NULL)
735 goto nextproc;
736
737 /*
738 * Obtain optional segment residency information.
739 */
740 if (lpc_xmap_update(vic) != 0)
741 st_debug(STDL_NORMAL, lcol, "process %d: xmap"
742 " unreadable; ignoring", (int)vic->lpc_pid);
743
744 #ifdef DEBUG_MSG
745 {
746 void *ovicaddr = vicaddr;
747 #endif /* DEBUG_MSG */
748 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
749 vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
750 #ifdef DEBUG_MSG
751 st_debug(STDL_NORMAL, lcol, "trying to resume from"
752 " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
753 }
754 #endif /* DEBUG_MSG */
755
756 /*
757 * Take control of the victim.
758 */
759 if (get_psinfo(vic->lpc_pid, &old_psinfo,
760 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
761 vic, vic) != 0) {
762 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
763 (int)vic->lpc_pid);
764 goto nextproc;
765 }
766 (void) rfd_reserve(PGRAB_FD_COUNT);
767 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
768 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
769 (int)vic->lpc_pid, res);
770 goto nextproc;
771 }
772 if (Pcreate_agent(scan_pr) != 0) {
773 st_debug(STDL_NORMAL, lcol, "cannot control %d",
774 (int)vic->lpc_pid);
775 goto nextproc;
776 }
777 /*
778 * Be very pessimistic about the state of the agent LWP --
779 * verify it's actually stopped.
780 */
781 errno = 0;
782 while (Pstate(scan_pr) == PS_RUN)
783 (void) Pwait(scan_pr, 0);
784 if (Pstate(scan_pr) != PS_STOP) {
785 st_debug(STDL_NORMAL, lcol, "agent not in expected"
786 " state (%d)", Pstate(scan_pr));
787 goto nextproc;
788 }
789
790 /*
791 * Within the victim's address space, find contiguous ranges of
792 * unreferenced pages to page out.
793 */
794 st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
795 (int)vic->lpc_pid);
796 while (excess > 0 && vicaddr != NULL && should_run) {
797 /*
798 * Skip mappings in the ignored set. Mappings get
799 * placed in the ignored set when all their resident
800 * pages are unreference and unmodified, yet unpageable
801 * -- such as when they are locked, or involved in
802 * asynchronous I/O. They will be scanned again when
803 * some page is referenced or modified.
804 */
805 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
806 cur.pr_npage * cur.pr_pagesize)) {
807 debug("ignored mapping at 0x%p\n",
808 (void *)cur.pr_addr);
809 /*
810 * Update statistics.
811 */
812 lcol->lcol_stat.lcols_pg_att +=
813 cur.pr_npage * cur.pr_pagesize / 1024;
814
815 vicaddr = (void *)
816 advance_prpageheader_cur_nextmapping(&cur);
817 continue;
818 }
819
820 /*
821 * Determine a range of unreferenced pages to page out,
822 * and clear the R/M bits in the preceding referenced
823 * range.
824 */
825 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
826 " npage %llu\n", vicaddr,
827 (unsigned long long)cur.pr_npage);
828 while (vicaddr != NULL &&
829 *(caddr_t)cur.pr_pdaddr != 0) {
830 *(caddr_t)cur.pr_pdaddr = 0;
831 vicaddr = advance_prpageheader_cur(&cur);
832 }
833 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
834 " %p\n", vicaddr, cur.pr_pdaddr);
835 if (vicaddr == NULL) {
836 /*
837 * The end of mapping was reached before any
838 * unreferenced pages were seen.
839 */
840 vicaddr = (void *)
841 advance_prpageheader_cur_nextmapping(&cur);
842 continue;
843 }
844 do
845 endaddr = advance_prpageheader_cur(&cur);
846 while (endaddr != NULL &&
847 *(caddr_t)cur.pr_pdaddr == 0 &&
848 (((intptr_t)endaddr - (intptr_t)vicaddr) /
849 1024) < excess)
850 ;
851 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
852 endaddr, *(caddr_t)cur.pr_pdaddr);
853
854 /*
855 * Page out from vicaddr to the end of the mapping, or
856 * endaddr if set, then continue scanning after
857 * endaddr, or the next mapping, if not set.
858 */
859 nvicaddr = endaddr;
860 if (endaddr == NULL)
861 endaddr = (caddr_t)cur.pr_addr +
862 cur.pr_pagesize * cur.pr_npage;
863 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
864 0) {
865 int64_t d_rss, att;
866 int willignore = 0;
867
868 excess += (d_rss = rss_delta(
869 &new_psinfo, &old_psinfo, vic));
870
871 /*
872 * If this pageout attempt was unsuccessful
873 * (the resident portion was not affected), and
874 * was for the whole mapping, put it in the
875 * ignored set, so it will not be scanned again
876 * until some page is referenced or modified.
877 */
878 if (d_rss >= 0 && (void *)cur.pr_addr ==
879 vicaddr && (cur.pr_pagesize * cur.pr_npage)
880 == ((uintptr_t)endaddr -
881 (uintptr_t)vicaddr)) {
882 if (lmapping_insert(
883 &vic->lpc_ignore,
884 cur.pr_addr,
885 cur.pr_pagesize *
886 cur.pr_npage) != 0)
887 debug("not enough memory to add"
888 " mapping at %p to ignored"
889 " set\n",
890 (void *)cur.pr_addr);
891 willignore = 1;
892 }
893
894 /*
895 * Update statistics.
896 */
897 lcol->lcol_stat.lcols_pg_att += (att =
898 ((intptr_t)endaddr - (intptr_t)vicaddr) /
899 1024);
900 st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
901 "+0t(%llu/%llu)kB%s\n", vicaddr,
902 (unsigned long long)((d_rss <
903 0) ? - d_rss : 0), (unsigned long long)att,
904 willignore ? " (will ignore)" : "");
905 } else {
906 st_debug(STDL_NORMAL, lcol,
907 "process %d: exited/unscannable\n",
908 (int)vic->lpc_pid);
909 vic->lpc_unscannable = 1;
910 goto nextproc;
911 }
912
913 /*
914 * Update the statistics file, if it's time.
915 */
916 check_update_statistics();
917
918 vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
919 *)advance_prpageheader_cur_nextmapping(&cur);
920 }
921 excess += rss_delta(&new_psinfo, &old_psinfo, vic);
922 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
923 (long long)excess);
924 nextproc:
925 /*
926 * If a process was grabbed, release it, destroying its agent.
927 */
928 if (scan_pr != NULL) {
929 (void) Prelease(scan_pr, 0);
930 scan_pr = NULL;
931 }
932 lcol->lcol_victim = vic;
933 /*
934 * Scan the collection at most once. Only if scanning was not
935 * aborted for any reason, and the end of lprocess has not been
936 * reached, determine the next victim and scan it.
937 */
938 if (vic != NULL) {
939 if (vic->lpc_next != NULL) {
940 /*
941 * Determine the next process to be scanned.
942 */
943 if (excess > 0) {
944 vic = get_valid_victim(lcol,
945 vic->lpc_next);
946 vicaddr = 0;
947 }
948 } else {
949 /*
950 * A complete scan of the collection was made,
951 * so tick the scan counter and stop scanning
952 * until the next request.
953 */
954 lcol->lcol_stat.lcols_scan_count++;
955 lcol->lcol_stat.lcols_scan_time_complete
956 = lcol->lcol_stat.lcols_scan_time;
957 /*
958 * If an excess still exists, tick the
959 * "ineffective scan" counter, signalling that
960 * the cap may be uneforceable.
961 */
962 if (resumed == 0 && excess > 0)
963 lcol->lcol_stat
964 .lcols_scan_ineffective++;
965 /*
966 * Scanning should start at the beginning of
967 * the process list at the next request.
968 */
969 if (excess > 0)
970 vic = NULL;
971 }
972 }
973 }
974 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
975 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
976 (long long)excess);
977
978 lcol->lcol_resaddr = vicaddr;
979 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
980 lcol->lcol_victim = get_valid_victim(lcol,
981 lcol->lcol_victim->lpc_next);
982 }
983 }
984
985 /*
986 * Abort the scan in progress, and destroy the agent LWP of any grabbed
987 * processes.
988 */
989 void
scan_abort(void)990 scan_abort(void)
991 {
992 if (scan_pr != NULL)
993 (void) Prelease(scan_pr, 0);
994 }
995
996 static void
revoke_xmap(rfd_t * rfd)997 revoke_xmap(rfd_t *rfd)
998 {
999 lprocess_t *lpc = rfd->rfd_data;
1000
1001 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1002 ASSERT(lpc->lpc_xmap_fd != -1);
1003 lpc->lpc_xmap_fd = -1;
1004 }
1005
1006 /*
1007 * Retrieve the process's current xmap , which is used to determine the size of
1008 * the resident portion of its segments. Return zero if successful.
1009 */
1010 static int
lpc_xmap_update(lprocess_t * lpc)1011 lpc_xmap_update(lprocess_t *lpc)
1012 {
1013 int res;
1014 struct stat st;
1015
1016 free(lpc->lpc_xmap);
1017 lpc->lpc_xmap = NULL;
1018 lpc->lpc_nxmap = -1;
1019
1020 if (lpc->lpc_xmap_fd == -1) {
1021 char pathbuf[PROC_PATH_MAX];
1022
1023 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1024 (int)lpc->lpc_pid);
1025 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1026 revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1027 return (-1);
1028 }
1029
1030 redo:
1031 errno = 0;
1032 if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1033 debug("cannot stat xmap\n");
1034 (void) rfd_close(lpc->lpc_xmap_fd);
1035 lpc->lpc_xmap_fd = -1;
1036 return (-1);
1037 }
1038
1039 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1040 debug("xmap wrong size\n");
1041 (void) rfd_close(lpc->lpc_xmap_fd);
1042 lpc->lpc_xmap_fd = -1;
1043 return (-1);
1044 }
1045
1046 lpc->lpc_xmap = malloc(st.st_size);
1047 if (lpc->lpc_xmap == NULL) {
1048 debug("cannot malloc() %ld bytes for xmap", st.st_size);
1049 (void) rfd_close(lpc->lpc_xmap_fd);
1050 lpc->lpc_xmap_fd = -1;
1051 return (-1);
1052 }
1053
1054 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1055 st.st_size) {
1056 free(lpc->lpc_xmap);
1057 lpc->lpc_xmap = NULL;
1058 if (res > 0) {
1059 debug("xmap changed size, retrying\n");
1060 goto redo;
1061 } else {
1062 debug("cannot read xmap");
1063 return (-1);
1064 }
1065 }
1066 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1067
1068 return (0);
1069 }
1070