1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/mman.h> 29 #include <sys/param.h> 30 #include <sys/stat.h> 31 #include <sys/types.h> 32 #include <assert.h> 33 #include <errno.h> 34 #include <fcntl.h> 35 #include <libproc.h> 36 #include <limits.h> 37 #include <procfs.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <strings.h> 41 #include <time.h> 42 #include <unistd.h> 43 #include "rcapd.h" 44 #include "rcapd_rfd.h" 45 #include "rcapd_mapping.h" 46 #include "utils.h" 47 48 static int lpc_xmap_update(lprocess_t *); 49 #ifdef DEBUG 50 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2); 51 #endif /* DEBUG */ 52 53 /* 54 * The number of file descriptors required to grab a process and create an 55 * agent in it. 56 */ 57 #define PGRAB_FD_COUNT 10 58 59 /* 60 * Record a position in an address space as it corresponds to a prpageheader_t 61 * and affiliated structures. 62 */ 63 typedef struct prpageheader_cur { 64 int pr_nmap; /* number of mappings in address space */ 65 int pr_map; /* number of this mapping */ 66 uint64_t pr_pgoff; /* page offset into mapping */ 67 uint64_t pr_npage; /* number of pages in mapping */ 68 uint64_t pr_pagesize; /* page size of mapping */ 69 uintptr_t pr_addr; /* base of mapping */ 70 prpageheader_t *pr_prpageheader; /* associated page header */ 71 void *pr_pdaddr; /* address of page's byte in pagedata */ 72 prxmap_t *pr_xmap; /* array containing per-segment information */ 73 int pr_nxmap; /* number of xmaps in array */ 74 int64_t pr_rss; /* number of resident pages in mapping, */ 75 /* or -1 if xmap is out of sync */ 76 int64_t pr_pg_rss; /* number of pageable pages in mapping, or -1 */ 77 } prpageheader_cur_t; 78 79 static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */ 80 81 typedef enum { 82 STDL_NORMAL, 83 STDL_HIGH 84 } st_debug_level_t; 85 86 /* 87 * Output a scanning-related debug message. 88 */ 89 /*PRINTFLIKE3*/ /*ARGSUSED*/ 90 static void 91 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...) 92 { 93 #ifdef DEBUG_MSG 94 va_list alist; 95 char *buf; 96 size_t len; 97 98 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH 99 : RCM_DEBUG)) 100 return; 101 102 len = strlen(msg) + LINELEN; 103 buf = malloc(len); 104 if (buf == NULL) 105 return; 106 (void) snprintf(buf, len, "%s %s scanner %s", 107 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 108 lcol->lcol_name, msg); 109 110 va_start(alist, msg); 111 vdprintfe(RCM_DEBUG, buf, alist); 112 va_end(alist); 113 114 free(buf); 115 #endif /* DEBUG_MSG */ 116 } 117 118 /* 119 * Determine the collection's current victim, based on its last. The last will 120 * be returned, or, if invalid, any other valid process, if the collection has 121 * any. 122 */ 123 static lprocess_t * 124 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc) 125 { 126 if (lpc == NULL || !lcollection_member(lcol, lpc)) 127 lpc = lcol->lcol_lprocess; 128 129 /* 130 * Find the next scannable process, and make it the victim. 131 */ 132 while (lpc != NULL && lpc->lpc_unscannable != 0) 133 lpc = lpc->lpc_next; 134 135 return (lpc); 136 } 137 138 /* 139 * Get a process's combined current pagedata (per-page referenced and modified 140 * bits) and set the supplied pointer to it. The caller is responsible for 141 * freeing the data. If the pagedata is unreadable, a nonzero value is 142 * returned, and errno is set. Otherwise, 0 is returned. 143 */ 144 static int 145 get_pagedata(prpageheader_t **pghpp, int fd) 146 { 147 int res; 148 struct stat st; 149 150 redo: 151 errno = 0; 152 if (fstat(fd, &st) != 0) { 153 debug("cannot stat pagedata\n"); 154 return (-1); 155 } 156 157 errno = 0; 158 *pghpp = malloc(st.st_size); 159 if (*pghpp == NULL) { 160 debug("cannot malloc() %ld bytes for pagedata", st.st_size); 161 return (-1); 162 } 163 (void) bzero(*pghpp, st.st_size); 164 165 errno = 0; 166 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) { 167 free(*pghpp); 168 *pghpp = NULL; 169 if (res > 0 || errno == E2BIG) { 170 debug("pagedata changed size, retrying\n"); 171 goto redo; 172 } else { 173 debug("cannot read pagedata"); 174 return (-1); 175 } 176 } 177 178 return (0); 179 } 180 181 /* 182 * Return the count of kilobytes of pages represented by the given pagedata 183 * which meet the given criteria, having pages which are in all of the states 184 * specified by the mask, and in none of the states in the notmask. If the 185 * CP_CLEAR flag is set, the pagedata will also be cleared. 186 */ 187 #define CP_CLEAR 1 188 static uint64_t 189 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask) 190 { 191 int map; 192 caddr_t cur, end; 193 prpageheader_t pgh = *pghp; 194 prasmap_t *asmapp; 195 uint64_t count = 0; 196 197 cur = (caddr_t)pghp + sizeof (*pghp); 198 for (map = 0; map < pgh.pr_nmap; map++) { 199 asmapp = (prasmap_t *)(uintptr_t)cur; 200 cur += sizeof (*asmapp); 201 end = cur + asmapp->pr_npage; 202 while (cur < end) { 203 if ((*cur & mask) == mask && (*cur & notmask) == 0) 204 count += asmapp->pr_pagesize / 1024; 205 if ((flags & CP_CLEAR) != 0) 206 *cur = 0; 207 cur++; 208 } 209 210 /* 211 * Skip to next 64-bit-aligned address to get the next 212 * prasmap_t. 213 */ 214 cur = (caddr_t)((intptr_t)(cur + 7) & ~7); 215 } 216 217 return (count); 218 } 219 220 /* 221 * Return the amount of memory (in kilobytes) that hasn't been referenced or 222 * modified, which memory which will be paged out first. Should be written to 223 * exclude nonresident pages when sufficient interfaces exist. 224 */ 225 static uint64_t 226 unrm_size(lprocess_t *lpc) 227 { 228 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR, 229 0, PG_MODIFIED | PG_REFERENCED)); 230 } 231 232 /* 233 * Advance a prpageheader_cur_t to the address space's next mapping, returning 234 * its address, or NULL if there is none. Any known nonpageable or nonresident 235 * mappings will be skipped over. 236 */ 237 static uintptr_t 238 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp) 239 { 240 prasmap_t *pap; 241 int i; 242 243 next: 244 ASSERT(pcp->pr_map < pcp->pr_nmap); 245 if ((pcp->pr_map + 1) == pcp->pr_nmap) 246 return (NULL); 247 pcp->pr_map++; 248 if (pcp->pr_pgoff < pcp->pr_npage) { 249 pcp->pr_pdaddr = (caddr_t)(uintptr_t) 250 ((uintptr_t)pcp->pr_pdaddr + 251 (pcp->pr_npage - pcp->pr_pgoff)); 252 pcp->pr_pgoff = pcp->pr_npage; 253 } 254 /* 255 * Skip to next 64-bit-aligned address to get the next prasmap_t. 256 */ 257 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7); 258 pap = (prasmap_t *)pcp->pr_pdaddr; 259 pcp->pr_pgoff = 0; 260 pcp->pr_npage = pap->pr_npage; 261 pcp->pr_pagesize = pap->pr_pagesize; 262 pcp->pr_addr = pap->pr_vaddr; 263 pcp->pr_pdaddr = pap + 1; 264 265 /* 266 * Skip any known nonpageable mappings. Currently, the only one 267 * detected is the schedctl page. 268 */ 269 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC | 270 MA_ANON)) == 0 && pap->pr_npage == 1) { 271 debug("identified nonpageable schedctl mapping at %p\n", 272 (void *)pcp->pr_addr); 273 goto next; 274 } 275 276 /* 277 * Skip mappings with no resident pages. If the xmap does not 278 * correspond to the pagedata for any reason, it will be ignored. 279 */ 280 pcp->pr_rss = -1; 281 pcp->pr_pg_rss = -1; 282 for (i = 0; i < pcp->pr_nxmap; i++) { 283 prxmap_t *xmap = &pcp->pr_xmap[i]; 284 285 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size == 286 (pcp->pr_npage * pcp->pr_pagesize)) { 287 pcp->pr_rss = xmap->pr_rss; 288 /* 289 * Remove COW pages from the pageable RSS count. 290 */ 291 if ((xmap->pr_mflags & MA_SHARED) == 0) 292 pcp->pr_pg_rss = xmap->pr_anon; 293 break; 294 } 295 } 296 if (pcp->pr_rss == 0) { 297 debug("identified nonresident mapping at 0x%p\n", 298 (void *)pcp->pr_addr); 299 goto next; 300 } else if (pcp->pr_pg_rss == 0) { 301 debug("identified unpageable mapping at 0x%p\n", 302 (void *)pcp->pr_addr); 303 goto next; 304 } 305 306 return (pcp->pr_addr); 307 } 308 309 /* 310 * Advance a prpageheader_cur_t to the mapping's next page, returning its 311 * address, or NULL if there is none. 312 */ 313 static void * 314 advance_prpageheader_cur(prpageheader_cur_t *pcp) 315 { 316 ASSERT(pcp->pr_pgoff < pcp->pr_npage); 317 if ((pcp->pr_pgoff + 1) == pcp->pr_npage) 318 return (NULL); 319 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1; 320 pcp->pr_pgoff++; 321 322 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0); 323 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize); 324 } 325 326 /* 327 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping 328 * of an address space. 329 */ 330 static void * 331 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php, 332 prxmap_t *xmap, int nxmap) 333 { 334 bzero(pcp, sizeof (*pcp)); 335 pcp->pr_nmap = php->pr_nmap; 336 pcp->pr_map = -1; 337 pcp->pr_prpageheader = php; 338 pcp->pr_xmap = xmap; 339 pcp->pr_nxmap = nxmap; 340 pcp->pr_pdaddr = (prpageheader_t *)php + 1; 341 342 return ((void *)advance_prpageheader_cur_nextmapping(pcp)); 343 } 344 345 /* 346 * Position a prpageheader_cur_t to the mapped address greater or equal to the 347 * given value. 348 */ 349 static void * 350 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php, 351 prxmap_t *xmap, int nxmap, void *naddr) 352 { 353 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap); 354 355 while (addr != NULL && addr <= naddr) 356 if (naddr < (void *)((caddr_t)pcp->pr_addr + 357 pcp->pr_pagesize * pcp->pr_npage)) { 358 uint64_t pgdiff = ((uintptr_t)naddr - 359 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize; 360 pcp->pr_pgoff += pgdiff; 361 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff; 362 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize * 363 pcp->pr_pgoff; 364 break; 365 } else 366 addr = 367 (void *)advance_prpageheader_cur_nextmapping(pcp); 368 369 return (addr); 370 } 371 372 static void 373 revoke_pagedata(rfd_t *rfd) 374 { 375 lprocess_t *lpc = rfd->rfd_data; 376 377 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for" 378 " process %d\n", (int)lpc->lpc_pid); 379 ASSERT(lpc->lpc_pgdata_fd != -1); 380 lpc->lpc_pgdata_fd = -1; 381 } 382 383 #ifdef DEBUG 384 static void 385 mklmapping(lmapping_t **lm, prpageheader_t *pgh) 386 { 387 prpageheader_cur_t cur; 388 void *addr; 389 390 addr = set_prpageheader_cur(&cur, pgh, NULL, -1); 391 ASSERT(*lm == NULL); 392 while (addr != NULL) { 393 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage * 394 cur.pr_pagesize); 395 addr = (void *)advance_prpageheader_cur_nextmapping(&cur); 396 } 397 } 398 399 static void 400 lmapping_dump(lmapping_t *lm) 401 { 402 debug("lm: %p\n", (void *)lm); 403 while (lm != NULL) { 404 debug("\t(%p, %llx\n", (void *)lm->lm_addr, 405 (unsigned long long)lm->lm_size); 406 lm = lm->lm_next; 407 } 408 } 409 #endif /* DEBUG */ 410 411 /* 412 * OR two prpagedata_t which are supposedly snapshots of the same address 413 * space. Intersecting mappings with different page sizes are tolerated but 414 * not normalized (not accurate). If the mappings of the two snapshots differ 415 * in any regard, the supplied mappings_changed flag will be set. 416 */ 417 static void 418 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp) 419 { 420 prpageheader_cur_t src_cur; 421 prpageheader_cur_t dst_cur; 422 uintptr_t src_addr; 423 uintptr_t dst_addr; 424 int mappings_changed = 0; 425 426 /* 427 * OR source pagedata with the destination, for pages of intersecting 428 * mappings. 429 */ 430 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1); 431 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1); 432 while (src_addr != NULL && dst_addr != NULL) { 433 while (src_addr == dst_addr && src_addr != NULL) { 434 *(char *)dst_cur.pr_pdaddr |= 435 *(char *)src_cur.pr_pdaddr; 436 src_addr = (uintptr_t)advance_prpageheader_cur( 437 &src_cur); 438 dst_addr = (uintptr_t)advance_prpageheader_cur( 439 &dst_cur); 440 } 441 if (src_addr != dst_addr) 442 mappings_changed = 1; 443 src_addr = advance_prpageheader_cur_nextmapping(&src_cur); 444 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur); 445 while (src_addr != dst_addr && src_addr != NULL && dst_addr != 446 NULL) { 447 mappings_changed = 1; 448 if (src_addr < dst_addr) 449 src_addr = advance_prpageheader_cur_nextmapping( 450 &src_cur); 451 else 452 dst_addr = advance_prpageheader_cur_nextmapping( 453 &dst_cur); 454 } 455 } 456 457 *mappings_changedp = mappings_changed; 458 } 459 460 /* 461 * Merge the current pagedata with that on hand. If the pagedata is 462 * unretrievable for any reason, such as the process having exited or being a 463 * zombie, a nonzero value is returned, the process should be marked 464 * unscannable, and future attempts to scan it should be avoided, since the 465 * symptom is probably permament. If the mappings of either pagedata 466 * differ in any respect, the supplied callback will be invoked once. 467 */ 468 static int 469 merge_current_pagedata(lprocess_t *lpc, 470 void(*mappings_changed_cb) (lprocess_t *)) 471 { 472 prpageheader_t *pghp; 473 int mappings_changed = 0; 474 uint64_t cnt; 475 476 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 477 0) { 478 char pathbuf[PROC_PATH_MAX]; 479 480 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata", 481 (int)lpc->lpc_pid); 482 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA, 483 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 || 484 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0) 485 return (-1); 486 debug("starting/resuming pagedata collection for %d\n", 487 (int)lpc->lpc_pid); 488 } 489 490 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); 491 if (cnt != 0 || lpc->lpc_rss != 0) 492 debug("process %d: %llu/%llukB rfd/mdfd since last read\n", 493 (int)lpc->lpc_pid, (unsigned long long)cnt, 494 (unsigned long long)lpc->lpc_rss); 495 if (lpc->lpc_prpageheader != NULL) { 496 /* 497 * OR the two snapshots. 498 */ 499 #ifdef DEBUG 500 lmapping_t *old = NULL; 501 lmapping_t *new = NULL; 502 503 mklmapping(&new, pghp); 504 mklmapping(&old, lpc->lpc_prpageheader); 505 #endif /* DEBUG */ 506 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed); 507 #ifdef DEBUG 508 if (((mappings_changed != 0) ^ 509 (lmapping_dump_diff(old, new) != 0))) { 510 debug("lmapping_changed inconsistent with lmapping\n"); 511 debug("old\n"); 512 lmapping_dump(old); 513 debug("new\n"); 514 lmapping_dump(new); 515 debug("ignored\n"); 516 lmapping_dump(lpc->lpc_ignore); 517 ASSERT(0); 518 } 519 lmapping_free(&new); 520 lmapping_free(&old); 521 #endif /* DEBUG */ 522 free(lpc->lpc_prpageheader); 523 } else 524 mappings_changed = 1; 525 lpc->lpc_prpageheader = pghp; 526 527 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); 528 if (cnt != 0 || lpc->lpc_rss != 0) 529 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n", 530 (int)lpc->lpc_pid, (unsigned long long)cnt, 531 (unsigned long long)lpc->lpc_rss); 532 if (mappings_changed != 0) { 533 debug("process %d: mappings changed\n", (int)lpc->lpc_pid); 534 if (mappings_changed_cb != NULL) 535 mappings_changed_cb(lpc); 536 } 537 return (0); 538 } 539 540 /* 541 * Attempt to page out a region of the given process's address space. May 542 * return nonzero if not all of the pages may are pageable, for any reason. 543 */ 544 static int 545 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end) 546 { 547 int res; 548 549 if (end <= start) 550 return (0); 551 552 errno = 0; 553 res = pr_memcntl(Pr, start, (end - start), MC_SYNC, 554 (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); 555 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res); 556 557 /* 558 * EBUSY indicates none of the pages have backing store allocated, or 559 * some pages were locked, which are less interesting than other 560 * conditions, which are noted. 561 */ 562 if (res != 0) 563 if (errno == EBUSY) 564 res = 0; 565 else 566 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid, 567 (void *)start, (long long)(end - start), errno); 568 569 return (res); 570 } 571 572 /* 573 * Compute the delta of the victim process's RSS since the last call. If the 574 * psinfo cannot be obtained, no work is done, and no error is returned; it is 575 * up to the caller to detect the process' termination via other means. 576 */ 577 static int64_t 578 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic) 579 { 580 int64_t d_rss = 0; 581 582 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd, 583 lprocess_update_psinfo_fd_cb, vic, vic) == 0) { 584 d_rss = (int64_t)new_psinfo->pr_rssize - 585 (int64_t)old_psinfo->pr_rssize; 586 if (d_rss < 0) 587 vic->lpc_collection->lcol_stat.lcols_pg_eff += 588 (- d_rss); 589 *old_psinfo = *new_psinfo; 590 } 591 592 return (d_rss); 593 } 594 595 static void 596 unignore_mappings(lprocess_t *lpc) 597 { 598 lmapping_free(&lpc->lpc_ignore); 599 } 600 601 static void 602 unignore_referenced_mappings(lprocess_t *lpc) 603 { 604 prpageheader_cur_t cur; 605 void *vicaddr; 606 607 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1); 608 while (vicaddr != NULL) { 609 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED)) 610 != 0) { 611 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr, 612 cur.pr_npage * cur.pr_pagesize) == 0) 613 debug("removed mapping 0x%p+0t%llukB from" 614 " ignored set\n", (void *)cur.pr_addr, 615 (unsigned long long)(cur.pr_npage * 616 cur.pr_pagesize / 1024)); 617 vicaddr = (void *)advance_prpageheader_cur_nextmapping( 618 &cur); 619 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL) 620 vicaddr = (void *)advance_prpageheader_cur_nextmapping( 621 &cur); 622 } 623 } 624 625 /* 626 * Resume scanning, starting with the last victim, if it is still valid, or any 627 * other one, otherwise. 628 */ 629 void 630 scan(lcollection_t *lcol, int64_t excess) 631 { 632 lprocess_t *vic, *lpc; 633 void *vicaddr, *endaddr, *nvicaddr; 634 prpageheader_cur_t cur; 635 psinfo_t old_psinfo, new_psinfo; 636 hrtime_t scan_start; 637 int res, resumed; 638 uint64_t col_unrm_size; 639 640 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n", 641 (long long)excess); 642 643 /* 644 * Determine the address to start scanning at, depending on whether 645 * scanning can be resumed. 646 */ 647 endaddr = NULL; 648 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) == 649 lcol->lcol_victim && lcol->lcol_resaddr != NULL) { 650 vicaddr = lcol->lcol_resaddr; 651 st_debug(STDL_NORMAL, lcol, "resuming process %d\n", 652 (int)vic->lpc_pid); 653 resumed = 1; 654 } else { 655 vicaddr = NULL; 656 resumed = 0; 657 } 658 659 scan_start = gethrtime(); 660 /* 661 * Obtain the most current pagedata for the processes that might be 662 * scanned, and remove from the ignored set any mappings which have 663 * referenced or modified pages (in the hopes that the pageability of 664 * the mapping's pages may have changed). Determine if the 665 * unreferenced and unmodified portion is impossibly small to suffice 666 * to reduce the excess completely. If so, ignore these bits so that 667 * even working set will be paged out. 668 */ 669 col_unrm_size = 0; 670 lpc = vic; 671 while (lpc != NULL && should_run) { 672 if (merge_current_pagedata(lpc, unignore_mappings) != 0) { 673 st_debug(STDL_NORMAL, lcol, "process %d:" 674 " exited/temporarily unscannable", 675 (int)lpc->lpc_pid); 676 goto next; 677 } 678 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid, 679 (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)), 680 (unsigned long long)lpc->lpc_size); 681 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc); 682 683 if ((lcol->lcol_stat.lcols_scan_count % 684 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) { 685 /* 686 * Periodically clear the set of ignored mappings. 687 * This will allow processes whose ignored segments' 688 * pageability have changed (without a corresponding 689 * reference or modification to a page) to be 690 * recognized. 691 */ 692 if (lcol->lcol_stat.lcols_scan_count > 0) 693 unignore_mappings(lpc); 694 } else { 695 /* 696 * Ensure mappings with referenced or modified pages 697 * are not in the ignored set. Their usage might mean 698 * the condition which made them unpageable is gone. 699 */ 700 unignore_referenced_mappings(lpc); 701 } 702 next: 703 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol, 704 lpc->lpc_next) : NULL; 705 } 706 if (col_unrm_size < excess) { 707 lpc = vic; 708 debug("will not reduce excess with only unreferenced pages\n"); 709 while (lpc != NULL && should_run) { 710 if (lpc->lpc_prpageheader != NULL) { 711 (void) count_pages(lpc->lpc_prpageheader, 712 CP_CLEAR, 0, 0); 713 if (lpc->lpc_pgdata_fd >= 0) { 714 if (rfd_close(lpc->lpc_pgdata_fd) != 0) 715 debug("coud not close %d" 716 " lpc_pgdata_fd %d", 717 (int)lpc->lpc_pid, 718 lpc->lpc_pgdata_fd); 719 lpc->lpc_pgdata_fd = -1; 720 } 721 } 722 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol, 723 lpc->lpc_next) : NULL; 724 } 725 } 726 727 /* 728 * Examine each process for pages to remove until the excess is 729 * reduced. 730 */ 731 while (vic != NULL && excess > 0 && should_run) { 732 /* 733 * Skip processes whose death was reported when the merging of 734 * pagedata was attempted. 735 */ 736 if (vic->lpc_prpageheader == NULL) 737 goto nextproc; 738 739 /* 740 * Obtain optional segment residency information. 741 */ 742 if (lpc_xmap_update(vic) != 0) 743 st_debug(STDL_NORMAL, lcol, "process %d: xmap" 744 " unreadable; ignoring", (int)vic->lpc_pid); 745 746 #ifdef DEBUG_MSG 747 { 748 void *ovicaddr = vicaddr; 749 #endif /* DEBUG_MSG */ 750 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader, 751 vic->lpc_xmap, vic->lpc_nxmap, vicaddr); 752 #ifdef DEBUG_MSG 753 st_debug(STDL_NORMAL, lcol, "trying to resume from" 754 " 0x%p, next 0x%p\n", ovicaddr, vicaddr); 755 } 756 #endif /* DEBUG_MSG */ 757 758 /* 759 * Take control of the victim. 760 */ 761 if (get_psinfo(vic->lpc_pid, &old_psinfo, 762 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb, 763 vic, vic) != 0) { 764 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo", 765 (int)vic->lpc_pid); 766 goto nextproc; 767 } 768 (void) rfd_reserve(PGRAB_FD_COUNT); 769 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) { 770 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)", 771 (int)vic->lpc_pid, res); 772 goto nextproc; 773 } 774 if (Pcreate_agent(scan_pr) != 0) { 775 st_debug(STDL_NORMAL, lcol, "cannot control %d", 776 (int)vic->lpc_pid); 777 goto nextproc; 778 } 779 /* 780 * Be very pessimistic about the state of the agent LWP -- 781 * verify it's actually stopped. 782 */ 783 errno = 0; 784 while (Pstate(scan_pr) == PS_RUN) 785 (void) Pwait(scan_pr, 0); 786 if (Pstate(scan_pr) != PS_STOP) { 787 st_debug(STDL_NORMAL, lcol, "agent not in expected" 788 " state (%d)", Pstate(scan_pr)); 789 goto nextproc; 790 } 791 792 /* 793 * Within the victim's address space, find contiguous ranges of 794 * unreferenced pages to page out. 795 */ 796 st_debug(STDL_NORMAL, lcol, "paging out process %d\n", 797 (int)vic->lpc_pid); 798 while (excess > 0 && vicaddr != NULL && should_run) { 799 /* 800 * Skip mappings in the ignored set. Mappings get 801 * placed in the ignored set when all their resident 802 * pages are unreference and unmodified, yet unpageable 803 * -- such as when they are locked, or involved in 804 * asynchronous I/O. They will be scanned again when 805 * some page is referenced or modified. 806 */ 807 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr, 808 cur.pr_npage * cur.pr_pagesize)) { 809 debug("ignored mapping at 0x%p\n", 810 (void *)cur.pr_addr); 811 /* 812 * Update statistics. 813 */ 814 lcol->lcol_stat.lcols_pg_att += 815 cur.pr_npage * cur.pr_pagesize / 1024; 816 817 vicaddr = (void *) 818 advance_prpageheader_cur_nextmapping(&cur); 819 continue; 820 } 821 822 /* 823 * Determine a range of unreferenced pages to page out, 824 * and clear the R/M bits in the preceding referenced 825 * range. 826 */ 827 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p," 828 " npage %llu\n", vicaddr, 829 (unsigned long long)cur.pr_npage); 830 while (vicaddr != NULL && 831 *(caddr_t)cur.pr_pdaddr != 0) { 832 *(caddr_t)cur.pr_pdaddr = 0; 833 vicaddr = advance_prpageheader_cur(&cur); 834 } 835 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr" 836 " %p\n", vicaddr, cur.pr_pdaddr); 837 if (vicaddr == NULL) { 838 /* 839 * The end of mapping was reached before any 840 * unreferenced pages were seen. 841 */ 842 vicaddr = (void *) 843 advance_prpageheader_cur_nextmapping(&cur); 844 continue; 845 } 846 do 847 endaddr = advance_prpageheader_cur(&cur); 848 while (endaddr != NULL && 849 *(caddr_t)cur.pr_pdaddr == 0 && 850 (((intptr_t)endaddr - (intptr_t)vicaddr) / 851 1024) < excess); 852 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n", 853 endaddr, *(caddr_t)cur.pr_pdaddr); 854 855 /* 856 * Page out from vicaddr to the end of the mapping, or 857 * endaddr if set, then continue scanning after 858 * endaddr, or the next mapping, if not set. 859 */ 860 nvicaddr = endaddr; 861 if (endaddr == NULL) 862 endaddr = (caddr_t)cur.pr_addr + 863 cur.pr_pagesize * cur.pr_npage; 864 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) == 865 0) { 866 int64_t d_rss, att; 867 int willignore = 0; 868 869 excess += (d_rss = rss_delta( 870 &new_psinfo, &old_psinfo, vic)); 871 872 /* 873 * If this pageout attempt was unsuccessful 874 * (the resident portion was not affected), and 875 * was for the whole mapping, put it in the 876 * ignored set, so it will not be scanned again 877 * until some page is referenced or modified. 878 */ 879 if (d_rss >= 0 && (void *)cur.pr_addr == 880 vicaddr && (cur.pr_pagesize * cur.pr_npage) 881 == ((uintptr_t)endaddr - 882 (uintptr_t)vicaddr)) { 883 if (lmapping_insert( 884 &vic->lpc_ignore, 885 cur.pr_addr, 886 cur.pr_pagesize * 887 cur.pr_npage) != 0) 888 debug("not enough memory to add" 889 " mapping at %p to ignored" 890 " set\n", 891 (void *)cur.pr_addr); 892 willignore = 1; 893 } 894 895 /* 896 * Update statistics. 897 */ 898 lcol->lcol_stat.lcols_pg_att += (att = 899 ((intptr_t)endaddr - (intptr_t)vicaddr) / 900 1024); 901 st_debug(STDL_NORMAL, lcol, "paged out 0x%p" 902 "+0t(%llu/%llu)kB%s\n", vicaddr, 903 (unsigned long long)((d_rss < 904 0) ? - d_rss : 0), (unsigned long long)att, 905 willignore ? " (will ignore)" : ""); 906 } else { 907 st_debug(STDL_NORMAL, lcol, 908 "process %d: exited/unscannable\n", 909 (int)vic->lpc_pid); 910 vic->lpc_unscannable = 1; 911 goto nextproc; 912 } 913 914 /* 915 * Update the statistics file, if it's time. 916 */ 917 check_update_statistics(); 918 919 vicaddr = (nvicaddr != NULL) ? nvicaddr : (void 920 *)advance_prpageheader_cur_nextmapping(&cur); 921 } 922 excess += rss_delta(&new_psinfo, &old_psinfo, vic); 923 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n", 924 (long long)excess); 925 nextproc: 926 /* 927 * If a process was grabbed, release it, destroying its agent. 928 */ 929 if (scan_pr != NULL) { 930 (void) Prelease(scan_pr, 0); 931 scan_pr = NULL; 932 } 933 lcol->lcol_victim = vic; 934 /* 935 * Scan the collection at most once. Only if scanning was not 936 * aborted for any reason, and the end of lprocess has not been 937 * reached, determine the next victim and scan it. 938 */ 939 if (vic != NULL) { 940 if (vic->lpc_next != NULL) { 941 /* 942 * Determine the next process to be scanned. 943 */ 944 if (excess > 0) { 945 vic = get_valid_victim(lcol, 946 vic->lpc_next); 947 vicaddr = 0; 948 } 949 } else { 950 /* 951 * A complete scan of the collection was made, 952 * so tick the scan counter and stop scanning 953 * until the next request. 954 */ 955 lcol->lcol_stat.lcols_scan_count++; 956 lcol->lcol_stat.lcols_scan_time_complete 957 = lcol->lcol_stat.lcols_scan_time; 958 /* 959 * If an excess still exists, tick the 960 * "ineffective scan" counter, signalling that 961 * the cap may be uneforceable. 962 */ 963 if (resumed == 0 && excess > 0) 964 lcol->lcol_stat 965 .lcols_scan_ineffective++; 966 /* 967 * Scanning should start at the beginning of 968 * the process list at the next request. 969 */ 970 if (excess > 0) 971 vic = NULL; 972 } 973 } 974 } 975 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start); 976 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n", 977 (long long)excess); 978 979 lcol->lcol_resaddr = vicaddr; 980 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) { 981 lcol->lcol_victim = get_valid_victim(lcol, 982 lcol->lcol_victim->lpc_next); 983 } 984 } 985 986 /* 987 * Abort the scan in progress, and destroy the agent LWP of any grabbed 988 * processes. 989 */ 990 void 991 scan_abort(void) 992 { 993 if (scan_pr != NULL) 994 (void) Prelease(scan_pr, NULL); 995 } 996 997 static void 998 revoke_xmap(rfd_t *rfd) 999 { 1000 lprocess_t *lpc = rfd->rfd_data; 1001 1002 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid); 1003 ASSERT(lpc->lpc_xmap_fd != -1); 1004 lpc->lpc_xmap_fd = -1; 1005 } 1006 1007 /* 1008 * Retrieve the process's current xmap , which is used to determine the size of 1009 * the resident portion of its segments. Return zero if successful. 1010 */ 1011 static int 1012 lpc_xmap_update(lprocess_t *lpc) 1013 { 1014 int res; 1015 struct stat st; 1016 1017 free(lpc->lpc_xmap); 1018 lpc->lpc_xmap = NULL; 1019 lpc->lpc_nxmap = -1; 1020 1021 if (lpc->lpc_xmap_fd == -1) { 1022 char pathbuf[PROC_PATH_MAX]; 1023 1024 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap", 1025 (int)lpc->lpc_pid); 1026 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP, 1027 revoke_xmap, lpc, O_RDONLY, 0)) < 0) 1028 return (-1); 1029 } 1030 1031 redo: 1032 errno = 0; 1033 if (fstat(lpc->lpc_xmap_fd, &st) != 0) { 1034 debug("cannot stat xmap\n"); 1035 (void) rfd_close(lpc->lpc_xmap_fd); 1036 lpc->lpc_xmap_fd = -1; 1037 return (-1); 1038 } 1039 1040 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) { 1041 debug("xmap wrong size\n"); 1042 (void) rfd_close(lpc->lpc_xmap_fd); 1043 lpc->lpc_xmap_fd = -1; 1044 return (-1); 1045 } 1046 1047 lpc->lpc_xmap = malloc(st.st_size); 1048 if (lpc->lpc_xmap == NULL) { 1049 debug("cannot malloc() %ld bytes for xmap", st.st_size); 1050 (void) rfd_close(lpc->lpc_xmap_fd); 1051 lpc->lpc_xmap_fd = -1; 1052 return (-1); 1053 } 1054 1055 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) != 1056 st.st_size) { 1057 free(lpc->lpc_xmap); 1058 lpc->lpc_xmap = NULL; 1059 if (res > 0) { 1060 debug("xmap changed size, retrying\n"); 1061 goto redo; 1062 } else { 1063 debug("cannot read xmap"); 1064 return (-1); 1065 } 1066 } 1067 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap); 1068 1069 return (0); 1070 } 1071