1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/mman.h> 27 #include <sys/param.h> 28 #include <sys/stat.h> 29 #include <sys/types.h> 30 #include <assert.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <libproc.h> 34 #include <limits.h> 35 #include <procfs.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <strings.h> 39 #include <time.h> 40 #include <unistd.h> 41 #include "rcapd.h" 42 #include "rcapd_rfd.h" 43 #include "rcapd_mapping.h" 44 #include "utils.h" 45 46 static int lpc_xmap_update(lprocess_t *); 47 #ifdef DEBUG 48 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2); 49 #endif /* DEBUG */ 50 51 /* 52 * The number of file descriptors required to grab a process and create an 53 * agent in it. 54 */ 55 #define PGRAB_FD_COUNT 10 56 57 /* 58 * Record a position in an address space as it corresponds to a prpageheader_t 59 * and affiliated structures. 60 */ 61 typedef struct prpageheader_cur { 62 int pr_nmap; /* number of mappings in address space */ 63 int pr_map; /* number of this mapping */ 64 uint64_t pr_pgoff; /* page offset into mapping */ 65 uint64_t pr_npage; /* number of pages in mapping */ 66 uint64_t pr_pagesize; /* page size of mapping */ 67 uintptr_t pr_addr; /* base of mapping */ 68 prpageheader_t *pr_prpageheader; /* associated page header */ 69 void *pr_pdaddr; /* address of page's byte in pagedata */ 70 prxmap_t *pr_xmap; /* array containing per-segment information */ 71 int pr_nxmap; /* number of xmaps in array */ 72 int64_t pr_rss; /* number of resident pages in mapping, */ 73 /* or -1 if xmap is out of sync */ 74 int64_t pr_pg_rss; /* number of pageable pages in mapping, or -1 */ 75 } prpageheader_cur_t; 76 77 static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */ 78 79 typedef enum { 80 STDL_NORMAL, 81 STDL_HIGH 82 } st_debug_level_t; 83 84 /* 85 * Output a scanning-related debug message. 86 */ 87 /*PRINTFLIKE3*/ /*ARGSUSED*/ 88 static void 89 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...) 90 { 91 #ifdef DEBUG_MSG 92 va_list alist; 93 char *buf; 94 size_t len; 95 96 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH 97 : RCM_DEBUG)) 98 return; 99 100 len = strlen(msg) + LINELEN; 101 buf = malloc(len); 102 if (buf == NULL) 103 return; 104 (void) snprintf(buf, len, "%s %s scanner %s", 105 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), 106 lcol->lcol_name, msg); 107 108 va_start(alist, msg); 109 vdprintfe(RCM_DEBUG, buf, alist); 110 va_end(alist); 111 112 free(buf); 113 #endif /* DEBUG_MSG */ 114 } 115 116 /* 117 * Determine the collection's current victim, based on its last. The last will 118 * be returned, or, if invalid, any other valid process, if the collection has 119 * any. 120 */ 121 static lprocess_t * 122 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc) 123 { 124 if (lpc == NULL || !lcollection_member(lcol, lpc)) 125 lpc = lcol->lcol_lprocess; 126 127 /* 128 * Find the next scannable process, and make it the victim. 129 */ 130 while (lpc != NULL && lpc->lpc_unscannable != 0) 131 lpc = lpc->lpc_next; 132 133 return (lpc); 134 } 135 136 /* 137 * Get a process's combined current pagedata (per-page referenced and modified 138 * bits) and set the supplied pointer to it. The caller is responsible for 139 * freeing the data. If the pagedata is unreadable, a nonzero value is 140 * returned, and errno is set. Otherwise, 0 is returned. 141 */ 142 static int 143 get_pagedata(prpageheader_t **pghpp, int fd) 144 { 145 int res; 146 struct stat st; 147 148 redo: 149 errno = 0; 150 if (fstat(fd, &st) != 0) { 151 debug("cannot stat pagedata\n"); 152 return (-1); 153 } 154 155 errno = 0; 156 *pghpp = malloc(st.st_size); 157 if (*pghpp == NULL) { 158 debug("cannot malloc() %ld bytes for pagedata", st.st_size); 159 return (-1); 160 } 161 (void) bzero(*pghpp, st.st_size); 162 163 errno = 0; 164 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) { 165 free(*pghpp); 166 *pghpp = NULL; 167 if (res > 0 || errno == E2BIG) { 168 debug("pagedata changed size, retrying\n"); 169 goto redo; 170 } else { 171 debug("cannot read pagedata"); 172 return (-1); 173 } 174 } 175 176 return (0); 177 } 178 179 /* 180 * Return the count of kilobytes of pages represented by the given pagedata 181 * which meet the given criteria, having pages which are in all of the states 182 * specified by the mask, and in none of the states in the notmask. If the 183 * CP_CLEAR flag is set, the pagedata will also be cleared. 184 */ 185 #define CP_CLEAR 1 186 static uint64_t 187 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask) 188 { 189 int map; 190 caddr_t cur, end; 191 prpageheader_t pgh = *pghp; 192 prasmap_t *asmapp; 193 uint64_t count = 0; 194 195 cur = (caddr_t)pghp + sizeof (*pghp); 196 for (map = 0; map < pgh.pr_nmap; map++) { 197 asmapp = (prasmap_t *)(uintptr_t)cur; 198 cur += sizeof (*asmapp); 199 end = cur + asmapp->pr_npage; 200 while (cur < end) { 201 if ((*cur & mask) == mask && (*cur & notmask) == 0) 202 count += asmapp->pr_pagesize / 1024; 203 if ((flags & CP_CLEAR) != 0) 204 *cur = 0; 205 cur++; 206 } 207 208 /* 209 * Skip to next 64-bit-aligned address to get the next 210 * prasmap_t. 211 */ 212 cur = (caddr_t)((intptr_t)(cur + 7) & ~7); 213 } 214 215 return (count); 216 } 217 218 /* 219 * Return the amount of memory (in kilobytes) that hasn't been referenced or 220 * modified, which memory which will be paged out first. Should be written to 221 * exclude nonresident pages when sufficient interfaces exist. 222 */ 223 static uint64_t 224 unrm_size(lprocess_t *lpc) 225 { 226 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR, 227 0, PG_MODIFIED | PG_REFERENCED)); 228 } 229 230 /* 231 * Advance a prpageheader_cur_t to the address space's next mapping, returning 232 * its address, or NULL if there is none. Any known nonpageable or nonresident 233 * mappings will be skipped over. 234 */ 235 static uintptr_t 236 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp) 237 { 238 prasmap_t *pap; 239 int i; 240 241 next: 242 ASSERT(pcp->pr_map < pcp->pr_nmap); 243 if ((pcp->pr_map + 1) == pcp->pr_nmap) 244 return ((uintptr_t)NULL); 245 pcp->pr_map++; 246 if (pcp->pr_pgoff < pcp->pr_npage) { 247 pcp->pr_pdaddr = (caddr_t)(uintptr_t) 248 ((uintptr_t)pcp->pr_pdaddr + 249 (pcp->pr_npage - pcp->pr_pgoff)); 250 pcp->pr_pgoff = pcp->pr_npage; 251 } 252 /* 253 * Skip to next 64-bit-aligned address to get the next prasmap_t. 254 */ 255 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7); 256 pap = (prasmap_t *)pcp->pr_pdaddr; 257 pcp->pr_pgoff = 0; 258 pcp->pr_npage = pap->pr_npage; 259 pcp->pr_pagesize = pap->pr_pagesize; 260 pcp->pr_addr = pap->pr_vaddr; 261 pcp->pr_pdaddr = pap + 1; 262 263 /* 264 * Skip any known nonpageable mappings. Currently, the only one 265 * detected is the schedctl page. 266 */ 267 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC | 268 MA_ANON)) == 0 && pap->pr_npage == 1) { 269 debug("identified nonpageable schedctl mapping at %p\n", 270 (void *)pcp->pr_addr); 271 goto next; 272 } 273 274 /* 275 * Skip mappings with no resident pages. If the xmap does not 276 * correspond to the pagedata for any reason, it will be ignored. 277 */ 278 pcp->pr_rss = -1; 279 pcp->pr_pg_rss = -1; 280 for (i = 0; i < pcp->pr_nxmap; i++) { 281 prxmap_t *xmap = &pcp->pr_xmap[i]; 282 283 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size == 284 (pcp->pr_npage * pcp->pr_pagesize)) { 285 pcp->pr_rss = xmap->pr_rss; 286 /* 287 * Remove COW pages from the pageable RSS count. 288 */ 289 if ((xmap->pr_mflags & MA_SHARED) == 0) 290 pcp->pr_pg_rss = xmap->pr_anon; 291 break; 292 } 293 } 294 if (pcp->pr_rss == 0) { 295 debug("identified nonresident mapping at 0x%p\n", 296 (void *)pcp->pr_addr); 297 goto next; 298 } else if (pcp->pr_pg_rss == 0) { 299 debug("identified unpageable mapping at 0x%p\n", 300 (void *)pcp->pr_addr); 301 goto next; 302 } 303 304 return (pcp->pr_addr); 305 } 306 307 /* 308 * Advance a prpageheader_cur_t to the mapping's next page, returning its 309 * address, or NULL if there is none. 310 */ 311 static void * 312 advance_prpageheader_cur(prpageheader_cur_t *pcp) 313 { 314 ASSERT(pcp->pr_pgoff < pcp->pr_npage); 315 if ((pcp->pr_pgoff + 1) == pcp->pr_npage) 316 return (NULL); 317 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1; 318 pcp->pr_pgoff++; 319 320 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0); 321 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize); 322 } 323 324 /* 325 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping 326 * of an address space. 327 */ 328 static void * 329 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php, 330 prxmap_t *xmap, int nxmap) 331 { 332 bzero(pcp, sizeof (*pcp)); 333 pcp->pr_nmap = php->pr_nmap; 334 pcp->pr_map = -1; 335 pcp->pr_prpageheader = php; 336 pcp->pr_xmap = xmap; 337 pcp->pr_nxmap = nxmap; 338 pcp->pr_pdaddr = (prpageheader_t *)php + 1; 339 340 return ((void *)advance_prpageheader_cur_nextmapping(pcp)); 341 } 342 343 /* 344 * Position a prpageheader_cur_t to the mapped address greater or equal to the 345 * given value. 346 */ 347 static void * 348 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php, 349 prxmap_t *xmap, int nxmap, void *naddr) 350 { 351 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap); 352 353 while (addr != NULL && addr <= naddr) 354 if (naddr < (void *)((caddr_t)pcp->pr_addr + 355 pcp->pr_pagesize * pcp->pr_npage)) { 356 uint64_t pgdiff = ((uintptr_t)naddr - 357 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize; 358 pcp->pr_pgoff += pgdiff; 359 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff; 360 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize * 361 pcp->pr_pgoff; 362 break; 363 } else 364 addr = 365 (void *)advance_prpageheader_cur_nextmapping(pcp); 366 367 return (addr); 368 } 369 370 static void 371 revoke_pagedata(rfd_t *rfd) 372 { 373 lprocess_t *lpc = rfd->rfd_data; 374 375 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for" 376 " process %d\n", (int)lpc->lpc_pid); 377 ASSERT(lpc->lpc_pgdata_fd != -1); 378 lpc->lpc_pgdata_fd = -1; 379 } 380 381 #ifdef DEBUG 382 static void 383 mklmapping(lmapping_t **lm, prpageheader_t *pgh) 384 { 385 prpageheader_cur_t cur; 386 void *addr; 387 388 addr = set_prpageheader_cur(&cur, pgh, NULL, -1); 389 ASSERT(*lm == NULL); 390 while (addr != NULL) { 391 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage * 392 cur.pr_pagesize); 393 addr = (void *)advance_prpageheader_cur_nextmapping(&cur); 394 } 395 } 396 397 static void 398 lmapping_dump(lmapping_t *lm) 399 { 400 debug("lm: %p\n", (void *)lm); 401 while (lm != NULL) { 402 debug("\t(%p, %llx\n", (void *)lm->lm_addr, 403 (unsigned long long)lm->lm_size); 404 lm = lm->lm_next; 405 } 406 } 407 #endif /* DEBUG */ 408 409 /* 410 * OR two prpagedata_t which are supposedly snapshots of the same address 411 * space. Intersecting mappings with different page sizes are tolerated but 412 * not normalized (not accurate). If the mappings of the two snapshots differ 413 * in any regard, the supplied mappings_changed flag will be set. 414 */ 415 static void 416 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp) 417 { 418 prpageheader_cur_t src_cur; 419 prpageheader_cur_t dst_cur; 420 uintptr_t src_addr; 421 uintptr_t dst_addr; 422 int mappings_changed = 0; 423 424 /* 425 * OR source pagedata with the destination, for pages of intersecting 426 * mappings. 427 */ 428 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1); 429 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1); 430 while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) { 431 while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) { 432 *(char *)dst_cur.pr_pdaddr |= 433 *(char *)src_cur.pr_pdaddr; 434 src_addr = (uintptr_t)advance_prpageheader_cur( 435 &src_cur); 436 dst_addr = (uintptr_t)advance_prpageheader_cur( 437 &dst_cur); 438 } 439 if (src_addr != dst_addr) 440 mappings_changed = 1; 441 src_addr = advance_prpageheader_cur_nextmapping(&src_cur); 442 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur); 443 while (src_addr != dst_addr && src_addr != (uintptr_t)NULL && 444 dst_addr != (uintptr_t)NULL) { 445 mappings_changed = 1; 446 if (src_addr < dst_addr) 447 src_addr = advance_prpageheader_cur_nextmapping( 448 &src_cur); 449 else 450 dst_addr = advance_prpageheader_cur_nextmapping( 451 &dst_cur); 452 } 453 } 454 455 *mappings_changedp = mappings_changed; 456 } 457 458 /* 459 * Merge the current pagedata with that on hand. If the pagedata is 460 * unretrievable for any reason, such as the process having exited or being a 461 * zombie, a nonzero value is returned, the process should be marked 462 * unscannable, and future attempts to scan it should be avoided, since the 463 * symptom is probably permament. If the mappings of either pagedata 464 * differ in any respect, the supplied callback will be invoked once. 465 */ 466 static int 467 merge_current_pagedata(lprocess_t *lpc, 468 void(*mappings_changed_cb) (lprocess_t *)) 469 { 470 prpageheader_t *pghp; 471 int mappings_changed = 0; 472 uint64_t cnt; 473 474 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 475 0) { 476 char pathbuf[PROC_PATH_MAX]; 477 478 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata", 479 (int)lpc->lpc_pid); 480 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA, 481 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 || 482 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0) 483 return (-1); 484 debug("starting/resuming pagedata collection for %d\n", 485 (int)lpc->lpc_pid); 486 } 487 488 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); 489 if (cnt != 0 || lpc->lpc_rss != 0) 490 debug("process %d: %llu/%llukB rfd/mdfd since last read\n", 491 (int)lpc->lpc_pid, (unsigned long long)cnt, 492 (unsigned long long)lpc->lpc_rss); 493 if (lpc->lpc_prpageheader != NULL) { 494 /* 495 * OR the two snapshots. 496 */ 497 #ifdef DEBUG 498 lmapping_t *old = NULL; 499 lmapping_t *new = NULL; 500 501 mklmapping(&new, pghp); 502 mklmapping(&old, lpc->lpc_prpageheader); 503 #endif /* DEBUG */ 504 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed); 505 #ifdef DEBUG 506 if (((mappings_changed != 0) ^ 507 (lmapping_dump_diff(old, new) != 0))) { 508 debug("lmapping_changed inconsistent with lmapping\n"); 509 debug("old\n"); 510 lmapping_dump(old); 511 debug("new\n"); 512 lmapping_dump(new); 513 debug("ignored\n"); 514 lmapping_dump(lpc->lpc_ignore); 515 ASSERT(0); 516 } 517 lmapping_free(&new); 518 lmapping_free(&old); 519 #endif /* DEBUG */ 520 free(lpc->lpc_prpageheader); 521 } else 522 mappings_changed = 1; 523 lpc->lpc_prpageheader = pghp; 524 525 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); 526 if (cnt != 0 || lpc->lpc_rss != 0) 527 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n", 528 (int)lpc->lpc_pid, (unsigned long long)cnt, 529 (unsigned long long)lpc->lpc_rss); 530 if (mappings_changed != 0) { 531 debug("process %d: mappings changed\n", (int)lpc->lpc_pid); 532 if (mappings_changed_cb != NULL) 533 mappings_changed_cb(lpc); 534 } 535 return (0); 536 } 537 538 /* 539 * Attempt to page out a region of the given process's address space. May 540 * return nonzero if not all of the pages may are pageable, for any reason. 541 */ 542 static int 543 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end) 544 { 545 int res; 546 547 if (end <= start) 548 return (0); 549 550 errno = 0; 551 res = pr_memcntl(Pr, start, (end - start), MC_SYNC, 552 (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); 553 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res); 554 555 /* 556 * EBUSY indicates none of the pages have backing store allocated, or 557 * some pages were locked, which are less interesting than other 558 * conditions, which are noted. 559 */ 560 if (res != 0) 561 if (errno == EBUSY) 562 res = 0; 563 else 564 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid, 565 (void *)start, (long long)(end - start), errno); 566 567 return (res); 568 } 569 570 /* 571 * Compute the delta of the victim process's RSS since the last call. If the 572 * psinfo cannot be obtained, no work is done, and no error is returned; it is 573 * up to the caller to detect the process' termination via other means. 574 */ 575 static int64_t 576 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic) 577 { 578 int64_t d_rss = 0; 579 580 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd, 581 lprocess_update_psinfo_fd_cb, vic, vic) == 0) { 582 d_rss = (int64_t)new_psinfo->pr_rssize - 583 (int64_t)old_psinfo->pr_rssize; 584 if (d_rss < 0) 585 vic->lpc_collection->lcol_stat.lcols_pg_eff += 586 (- d_rss); 587 *old_psinfo = *new_psinfo; 588 } 589 590 return (d_rss); 591 } 592 593 static void 594 unignore_mappings(lprocess_t *lpc) 595 { 596 lmapping_free(&lpc->lpc_ignore); 597 } 598 599 static void 600 unignore_referenced_mappings(lprocess_t *lpc) 601 { 602 prpageheader_cur_t cur; 603 void *vicaddr; 604 605 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1); 606 while (vicaddr != NULL) { 607 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED)) 608 != 0) { 609 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr, 610 cur.pr_npage * cur.pr_pagesize) == 0) 611 debug("removed mapping 0x%p+0t%llukB from" 612 " ignored set\n", (void *)cur.pr_addr, 613 (unsigned long long)(cur.pr_npage * 614 cur.pr_pagesize / 1024)); 615 vicaddr = (void *)advance_prpageheader_cur_nextmapping( 616 &cur); 617 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL) 618 vicaddr = (void *)advance_prpageheader_cur_nextmapping( 619 &cur); 620 } 621 } 622 623 /* 624 * Resume scanning, starting with the last victim, if it is still valid, or any 625 * other one, otherwise. 626 */ 627 void 628 scan(lcollection_t *lcol, int64_t excess) 629 { 630 lprocess_t *vic, *lpc; 631 void *vicaddr, *endaddr, *nvicaddr; 632 prpageheader_cur_t cur; 633 psinfo_t old_psinfo, new_psinfo; 634 hrtime_t scan_start; 635 int res, resumed; 636 uint64_t col_unrm_size; 637 638 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n", 639 (long long)excess); 640 641 /* 642 * Determine the address to start scanning at, depending on whether 643 * scanning can be resumed. 644 */ 645 endaddr = NULL; 646 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) == 647 lcol->lcol_victim && lcol->lcol_resaddr != NULL) { 648 vicaddr = lcol->lcol_resaddr; 649 st_debug(STDL_NORMAL, lcol, "resuming process %d\n", 650 (int)vic->lpc_pid); 651 resumed = 1; 652 } else { 653 vicaddr = NULL; 654 resumed = 0; 655 } 656 657 scan_start = gethrtime(); 658 /* 659 * Obtain the most current pagedata for the processes that might be 660 * scanned, and remove from the ignored set any mappings which have 661 * referenced or modified pages (in the hopes that the pageability of 662 * the mapping's pages may have changed). Determine if the 663 * unreferenced and unmodified portion is impossibly small to suffice 664 * to reduce the excess completely. If so, ignore these bits so that 665 * even working set will be paged out. 666 */ 667 col_unrm_size = 0; 668 lpc = vic; 669 while (lpc != NULL && should_run) { 670 if (merge_current_pagedata(lpc, unignore_mappings) != 0) { 671 st_debug(STDL_NORMAL, lcol, "process %d:" 672 " exited/temporarily unscannable", 673 (int)lpc->lpc_pid); 674 goto next; 675 } 676 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid, 677 (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)), 678 (unsigned long long)lpc->lpc_size); 679 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc); 680 681 if ((lcol->lcol_stat.lcols_scan_count % 682 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) { 683 /* 684 * Periodically clear the set of ignored mappings. 685 * This will allow processes whose ignored segments' 686 * pageability have changed (without a corresponding 687 * reference or modification to a page) to be 688 * recognized. 689 */ 690 if (lcol->lcol_stat.lcols_scan_count > 0) 691 unignore_mappings(lpc); 692 } else { 693 /* 694 * Ensure mappings with referenced or modified pages 695 * are not in the ignored set. Their usage might mean 696 * the condition which made them unpageable is gone. 697 */ 698 unignore_referenced_mappings(lpc); 699 } 700 next: 701 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol, 702 lpc->lpc_next) : NULL; 703 } 704 if (col_unrm_size < excess) { 705 lpc = vic; 706 debug("will not reduce excess with only unreferenced pages\n"); 707 while (lpc != NULL && should_run) { 708 if (lpc->lpc_prpageheader != NULL) { 709 (void) count_pages(lpc->lpc_prpageheader, 710 CP_CLEAR, 0, 0); 711 if (lpc->lpc_pgdata_fd >= 0) { 712 if (rfd_close(lpc->lpc_pgdata_fd) != 0) 713 debug("coud not close %d" 714 " lpc_pgdata_fd %d", 715 (int)lpc->lpc_pid, 716 lpc->lpc_pgdata_fd); 717 lpc->lpc_pgdata_fd = -1; 718 } 719 } 720 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol, 721 lpc->lpc_next) : NULL; 722 } 723 } 724 725 /* 726 * Examine each process for pages to remove until the excess is 727 * reduced. 728 */ 729 while (vic != NULL && excess > 0 && should_run) { 730 /* 731 * Skip processes whose death was reported when the merging of 732 * pagedata was attempted. 733 */ 734 if (vic->lpc_prpageheader == NULL) 735 goto nextproc; 736 737 /* 738 * Obtain optional segment residency information. 739 */ 740 if (lpc_xmap_update(vic) != 0) 741 st_debug(STDL_NORMAL, lcol, "process %d: xmap" 742 " unreadable; ignoring", (int)vic->lpc_pid); 743 744 #ifdef DEBUG_MSG 745 { 746 void *ovicaddr = vicaddr; 747 #endif /* DEBUG_MSG */ 748 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader, 749 vic->lpc_xmap, vic->lpc_nxmap, vicaddr); 750 #ifdef DEBUG_MSG 751 st_debug(STDL_NORMAL, lcol, "trying to resume from" 752 " 0x%p, next 0x%p\n", ovicaddr, vicaddr); 753 } 754 #endif /* DEBUG_MSG */ 755 756 /* 757 * Take control of the victim. 758 */ 759 if (get_psinfo(vic->lpc_pid, &old_psinfo, 760 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb, 761 vic, vic) != 0) { 762 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo", 763 (int)vic->lpc_pid); 764 goto nextproc; 765 } 766 (void) rfd_reserve(PGRAB_FD_COUNT); 767 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) { 768 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)", 769 (int)vic->lpc_pid, res); 770 goto nextproc; 771 } 772 if (Pcreate_agent(scan_pr) != 0) { 773 st_debug(STDL_NORMAL, lcol, "cannot control %d", 774 (int)vic->lpc_pid); 775 goto nextproc; 776 } 777 /* 778 * Be very pessimistic about the state of the agent LWP -- 779 * verify it's actually stopped. 780 */ 781 errno = 0; 782 while (Pstate(scan_pr) == PS_RUN) 783 (void) Pwait(scan_pr, 0); 784 if (Pstate(scan_pr) != PS_STOP) { 785 st_debug(STDL_NORMAL, lcol, "agent not in expected" 786 " state (%d)", Pstate(scan_pr)); 787 goto nextproc; 788 } 789 790 /* 791 * Within the victim's address space, find contiguous ranges of 792 * unreferenced pages to page out. 793 */ 794 st_debug(STDL_NORMAL, lcol, "paging out process %d\n", 795 (int)vic->lpc_pid); 796 while (excess > 0 && vicaddr != NULL && should_run) { 797 /* 798 * Skip mappings in the ignored set. Mappings get 799 * placed in the ignored set when all their resident 800 * pages are unreference and unmodified, yet unpageable 801 * -- such as when they are locked, or involved in 802 * asynchronous I/O. They will be scanned again when 803 * some page is referenced or modified. 804 */ 805 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr, 806 cur.pr_npage * cur.pr_pagesize)) { 807 debug("ignored mapping at 0x%p\n", 808 (void *)cur.pr_addr); 809 /* 810 * Update statistics. 811 */ 812 lcol->lcol_stat.lcols_pg_att += 813 cur.pr_npage * cur.pr_pagesize / 1024; 814 815 vicaddr = (void *) 816 advance_prpageheader_cur_nextmapping(&cur); 817 continue; 818 } 819 820 /* 821 * Determine a range of unreferenced pages to page out, 822 * and clear the R/M bits in the preceding referenced 823 * range. 824 */ 825 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p," 826 " npage %llu\n", vicaddr, 827 (unsigned long long)cur.pr_npage); 828 while (vicaddr != NULL && 829 *(caddr_t)cur.pr_pdaddr != 0) { 830 *(caddr_t)cur.pr_pdaddr = 0; 831 vicaddr = advance_prpageheader_cur(&cur); 832 } 833 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr" 834 " %p\n", vicaddr, cur.pr_pdaddr); 835 if (vicaddr == NULL) { 836 /* 837 * The end of mapping was reached before any 838 * unreferenced pages were seen. 839 */ 840 vicaddr = (void *) 841 advance_prpageheader_cur_nextmapping(&cur); 842 continue; 843 } 844 do 845 endaddr = advance_prpageheader_cur(&cur); 846 while (endaddr != NULL && 847 *(caddr_t)cur.pr_pdaddr == 0 && 848 (((intptr_t)endaddr - (intptr_t)vicaddr) / 849 1024) < excess) 850 ; 851 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n", 852 endaddr, *(caddr_t)cur.pr_pdaddr); 853 854 /* 855 * Page out from vicaddr to the end of the mapping, or 856 * endaddr if set, then continue scanning after 857 * endaddr, or the next mapping, if not set. 858 */ 859 nvicaddr = endaddr; 860 if (endaddr == NULL) 861 endaddr = (caddr_t)cur.pr_addr + 862 cur.pr_pagesize * cur.pr_npage; 863 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) == 864 0) { 865 int64_t d_rss, att; 866 int willignore = 0; 867 868 excess += (d_rss = rss_delta( 869 &new_psinfo, &old_psinfo, vic)); 870 871 /* 872 * If this pageout attempt was unsuccessful 873 * (the resident portion was not affected), and 874 * was for the whole mapping, put it in the 875 * ignored set, so it will not be scanned again 876 * until some page is referenced or modified. 877 */ 878 if (d_rss >= 0 && (void *)cur.pr_addr == 879 vicaddr && (cur.pr_pagesize * cur.pr_npage) 880 == ((uintptr_t)endaddr - 881 (uintptr_t)vicaddr)) { 882 if (lmapping_insert( 883 &vic->lpc_ignore, 884 cur.pr_addr, 885 cur.pr_pagesize * 886 cur.pr_npage) != 0) 887 debug("not enough memory to add" 888 " mapping at %p to ignored" 889 " set\n", 890 (void *)cur.pr_addr); 891 willignore = 1; 892 } 893 894 /* 895 * Update statistics. 896 */ 897 lcol->lcol_stat.lcols_pg_att += (att = 898 ((intptr_t)endaddr - (intptr_t)vicaddr) / 899 1024); 900 st_debug(STDL_NORMAL, lcol, "paged out 0x%p" 901 "+0t(%llu/%llu)kB%s\n", vicaddr, 902 (unsigned long long)((d_rss < 903 0) ? - d_rss : 0), (unsigned long long)att, 904 willignore ? " (will ignore)" : ""); 905 } else { 906 st_debug(STDL_NORMAL, lcol, 907 "process %d: exited/unscannable\n", 908 (int)vic->lpc_pid); 909 vic->lpc_unscannable = 1; 910 goto nextproc; 911 } 912 913 /* 914 * Update the statistics file, if it's time. 915 */ 916 check_update_statistics(); 917 918 vicaddr = (nvicaddr != NULL) ? nvicaddr : (void 919 *)advance_prpageheader_cur_nextmapping(&cur); 920 } 921 excess += rss_delta(&new_psinfo, &old_psinfo, vic); 922 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n", 923 (long long)excess); 924 nextproc: 925 /* 926 * If a process was grabbed, release it, destroying its agent. 927 */ 928 if (scan_pr != NULL) { 929 (void) Prelease(scan_pr, 0); 930 scan_pr = NULL; 931 } 932 lcol->lcol_victim = vic; 933 /* 934 * Scan the collection at most once. Only if scanning was not 935 * aborted for any reason, and the end of lprocess has not been 936 * reached, determine the next victim and scan it. 937 */ 938 if (vic != NULL) { 939 if (vic->lpc_next != NULL) { 940 /* 941 * Determine the next process to be scanned. 942 */ 943 if (excess > 0) { 944 vic = get_valid_victim(lcol, 945 vic->lpc_next); 946 vicaddr = 0; 947 } 948 } else { 949 /* 950 * A complete scan of the collection was made, 951 * so tick the scan counter and stop scanning 952 * until the next request. 953 */ 954 lcol->lcol_stat.lcols_scan_count++; 955 lcol->lcol_stat.lcols_scan_time_complete 956 = lcol->lcol_stat.lcols_scan_time; 957 /* 958 * If an excess still exists, tick the 959 * "ineffective scan" counter, signalling that 960 * the cap may be uneforceable. 961 */ 962 if (resumed == 0 && excess > 0) 963 lcol->lcol_stat 964 .lcols_scan_ineffective++; 965 /* 966 * Scanning should start at the beginning of 967 * the process list at the next request. 968 */ 969 if (excess > 0) 970 vic = NULL; 971 } 972 } 973 } 974 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start); 975 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n", 976 (long long)excess); 977 978 lcol->lcol_resaddr = vicaddr; 979 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) { 980 lcol->lcol_victim = get_valid_victim(lcol, 981 lcol->lcol_victim->lpc_next); 982 } 983 } 984 985 /* 986 * Abort the scan in progress, and destroy the agent LWP of any grabbed 987 * processes. 988 */ 989 void 990 scan_abort(void) 991 { 992 if (scan_pr != NULL) 993 (void) Prelease(scan_pr, 0); 994 } 995 996 static void 997 revoke_xmap(rfd_t *rfd) 998 { 999 lprocess_t *lpc = rfd->rfd_data; 1000 1001 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid); 1002 ASSERT(lpc->lpc_xmap_fd != -1); 1003 lpc->lpc_xmap_fd = -1; 1004 } 1005 1006 /* 1007 * Retrieve the process's current xmap , which is used to determine the size of 1008 * the resident portion of its segments. Return zero if successful. 1009 */ 1010 static int 1011 lpc_xmap_update(lprocess_t *lpc) 1012 { 1013 int res; 1014 struct stat st; 1015 1016 free(lpc->lpc_xmap); 1017 lpc->lpc_xmap = NULL; 1018 lpc->lpc_nxmap = -1; 1019 1020 if (lpc->lpc_xmap_fd == -1) { 1021 char pathbuf[PROC_PATH_MAX]; 1022 1023 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap", 1024 (int)lpc->lpc_pid); 1025 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP, 1026 revoke_xmap, lpc, O_RDONLY, 0)) < 0) 1027 return (-1); 1028 } 1029 1030 redo: 1031 errno = 0; 1032 if (fstat(lpc->lpc_xmap_fd, &st) != 0) { 1033 debug("cannot stat xmap\n"); 1034 (void) rfd_close(lpc->lpc_xmap_fd); 1035 lpc->lpc_xmap_fd = -1; 1036 return (-1); 1037 } 1038 1039 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) { 1040 debug("xmap wrong size\n"); 1041 (void) rfd_close(lpc->lpc_xmap_fd); 1042 lpc->lpc_xmap_fd = -1; 1043 return (-1); 1044 } 1045 1046 lpc->lpc_xmap = malloc(st.st_size); 1047 if (lpc->lpc_xmap == NULL) { 1048 debug("cannot malloc() %ld bytes for xmap", st.st_size); 1049 (void) rfd_close(lpc->lpc_xmap_fd); 1050 lpc->lpc_xmap_fd = -1; 1051 return (-1); 1052 } 1053 1054 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) != 1055 st.st_size) { 1056 free(lpc->lpc_xmap); 1057 lpc->lpc_xmap = NULL; 1058 if (res > 0) { 1059 debug("xmap changed size, retrying\n"); 1060 goto redo; 1061 } else { 1062 debug("cannot read xmap"); 1063 return (-1); 1064 } 1065 } 1066 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap); 1067 1068 return (0); 1069 } 1070