/*
 * Copyright (c) 1991, 1993
 *        The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the University of
 *        California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *        from: @(#)vm_glue.c        8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *        Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *        School of Computer Science
 *        Carnegie Mellon University
 *        Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

#include <sys/user.h>

extern int maxslp;

/*
 * System initialization
 *
 * Note: proc0 from proc.h
 */
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)

/*
 * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
 *
 * Note: run scheduling should be divorced from the vm system.
 */
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)

#ifndef NO_SWAPPING
static void swapout(struct proc *);
static void vm_proc_swapin(struct proc *p);
static void vm_proc_swapout(struct proc *p);
#endif

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory is actually readable or writable.  In most cases
 * just checking the vm_map_entry is sufficient within the kernel's address
 * space.
 */
int
kernacc(addr, len, rw)
        void *addr;
        int len, rw;
{
        boolean_t rv;
        vm_offset_t saddr, eaddr;
        vm_prot_t prot;

        KASSERT((rw & ~VM_PROT_ALL) == 0,
            ("illegal ``rw'' argument to kernacc (%x)\n", rw));
        prot = rw;
        saddr = trunc_page((vm_offset_t)addr);
        eaddr = round_page((vm_offset_t)addr + len);
        vm_map_lock_read(kernel_map);
        rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
        vm_map_unlock_read(kernel_map);
        return (rv == TRUE);
}

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory is actually readable or writable.  vmapbuf(),
 * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
 * used in conjunction with this call.
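 *
 * A typical caller pattern (an illustrative sketch only; uaddr, kbuf,
 * len and error are hypothetical names, not identifiers from this file)
 * pairs the map check with the actual access, which must still be
 * allowed to fail, because the map check alone does not guarantee that
 * the pages can be faulted in at access time:
 *
 *        if (useracc(uaddr, len, VM_PROT_READ)) {
 *                error = copyin(uaddr, kbuf, len);
 *                if (error == 0) {
 *                        ... use the copied data ...
 *                }
 *        }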
 */
int
useracc(addr, len, rw)
        void *addr;
        int len, rw;
{
        boolean_t rv;
        vm_prot_t prot;
        vm_map_t map;

        KASSERT((rw & ~VM_PROT_ALL) == 0,
            ("illegal ``rw'' argument to useracc (%x)\n", rw));
        prot = rw;
        map = &curproc->p_vmspace->vm_map;
        if ((vm_offset_t)addr + len > vm_map_max(map) ||
            (vm_offset_t)addr + len < (vm_offset_t)addr) {
                return (FALSE);
        }
        vm_map_lock_read(map);
        rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
            round_page((vm_offset_t)addr + len), prot);
        vm_map_unlock_read(map);
        return (rv == TRUE);
}

/*
 * MPSAFE
 */
void
vslock(addr, len)
        void *addr;
        u_int len;
{

        vm_map_wire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr),
            round_page((vm_offset_t)addr + len),
            VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
}

/*
 * MPSAFE
 */
void
vsunlock(addr, len)
        void *addr;
        u_int len;
{

        vm_map_unwire(&curproc->p_vmspace->vm_map,
            trunc_page((vm_offset_t)addr),
            round_page((vm_offset_t)addr + len),
            VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
}

/*
 * Create the U area for a new process.
 * This routine directly affects the fork perf for a process.
 */
void
vm_proc_new(struct proc *p)
{
        vm_page_t ma[UAREA_PAGES];
        vm_object_t upobj;
        vm_offset_t up;
        vm_page_t m;
        u_int i;

        /*
         * Get a kernel virtual address for the U area for this process.
         */
        up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
        if (up == 0)
                panic("vm_proc_new: upage allocation failed");
        p->p_uarea = (struct user *)up;

        /*
         * Allocate object and page(s) for the U area.
         */
        upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
        p->p_upages_obj = upobj;
        VM_OBJECT_LOCK(upobj);
        for (i = 0; i < UAREA_PAGES; i++) {
                m = vm_page_grab(upobj, i,
                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
                ma[i] = m;

                vm_page_lock_queues();
                vm_page_wakeup(m);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_unlock_queues();
        }
        VM_OBJECT_UNLOCK(upobj);

        /*
         * Enter the pages into the kernel address space.
         */
        pmap_qenter(up, ma, UAREA_PAGES);
}

/*
 * Dispose of the U area for a process that has exited.
 * This routine directly impacts the exit perf of a process.
 * XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called.
 */
void
vm_proc_dispose(struct proc *p)
{
        vm_object_t upobj;
        vm_offset_t up;
        vm_page_t m;

        upobj = p->p_upages_obj;
        VM_OBJECT_LOCK(upobj);
        if (upobj->resident_page_count != UAREA_PAGES)
                panic("vm_proc_dispose: incorrect number of pages in upobj");
        vm_page_lock_queues();
        while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) {
                vm_page_busy(m);
                vm_page_unwire(m, 0);
                vm_page_free(m);
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(upobj);
        up = (vm_offset_t)p->p_uarea;
        pmap_qremove(up, UAREA_PAGES);
        kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE);
        vm_object_deallocate(upobj);
}

#ifndef NO_SWAPPING
/*
 * Allow the U area for a process to be prejudicially paged out.
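 *
 * Concretely, the resident U-area pages are dirtied, unwired and unmapped
 * from the kernel, leaving them eligible for reclamation by the pagedaemon;
 * vm_proc_swapin() reverses the operation.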
 */
static void
vm_proc_swapout(struct proc *p)
{
        vm_object_t upobj;
        vm_offset_t up;
        vm_page_t m;

        upobj = p->p_upages_obj;
        VM_OBJECT_LOCK(upobj);
        if (upobj->resident_page_count != UAREA_PAGES)
                panic("vm_proc_swapout: incorrect number of pages in upobj");
        vm_page_lock_queues();
        TAILQ_FOREACH(m, &upobj->memq, listq) {
                vm_page_dirty(m);
                vm_page_unwire(m, 0);
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(upobj);
        up = (vm_offset_t)p->p_uarea;
        pmap_qremove(up, UAREA_PAGES);
}

/*
 * Bring the U area for a specified process back in.
 */
static void
vm_proc_swapin(struct proc *p)
{
        vm_page_t ma[UAREA_PAGES];
        vm_object_t upobj;
        vm_offset_t up;
        vm_page_t m;
        int rv;
        int i;

        upobj = p->p_upages_obj;
        VM_OBJECT_LOCK(upobj);
        for (i = 0; i < UAREA_PAGES; i++) {
                m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
                if (m->valid != VM_PAGE_BITS_ALL) {
                        rv = vm_pager_get_pages(upobj, &m, 1, 0);
                        if (rv != VM_PAGER_OK)
                                panic("vm_proc_swapin: cannot get upage");
                }
                ma[i] = m;
        }
        if (upobj->resident_page_count != UAREA_PAGES)
                panic("vm_proc_swapin: lost pages from upobj");
        vm_page_lock_queues();
        TAILQ_FOREACH(m, &upobj->memq, listq) {
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_wire(m);
                vm_page_wakeup(m);
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(upobj);
        up = (vm_offset_t)p->p_uarea;
        pmap_qenter(up, ma, UAREA_PAGES);
}

/*
 * Swap in the UAREAs of all processes swapped out to the given device.
 * The pages in the UAREA are marked dirty and their swap metadata is freed.
 */
void
vm_proc_swapin_all(struct swdevt *devidx)
{
        struct proc *p;
        vm_object_t object;
        vm_page_t m;

retry:
        sx_slock(&allproc_lock);
        FOREACH_PROC_IN_SYSTEM(p) {
                PROC_LOCK(p);
                object = p->p_upages_obj;
                if (object != NULL) {
                        VM_OBJECT_LOCK(object);
                        if (swap_pager_isswapped(object, devidx)) {
                                VM_OBJECT_UNLOCK(object);
                                sx_sunlock(&allproc_lock);
                                faultin(p);
                                PROC_UNLOCK(p);
                                VM_OBJECT_LOCK(object);
                                vm_page_lock_queues();
                                TAILQ_FOREACH(m, &object->memq, listq)
                                        vm_page_dirty(m);
                                vm_page_unlock_queues();
                                swap_pager_freespace(object, 0,
                                    object->un_pager.swp.swp_bcount);
                                VM_OBJECT_UNLOCK(object);
                                goto retry;
                        }
                        VM_OBJECT_UNLOCK(object);
                }
                PROC_UNLOCK(p);
        }
        sx_sunlock(&allproc_lock);
}
#endif

#ifndef KSTACK_MAX_PAGES
#define KSTACK_MAX_PAGES 32
#endif

/*
 * Create the kernel stack (including pcb for i386) for a new thread.
 * This routine directly affects the fork perf for a process and
 * thread creation performance.
 */
void
vm_thread_new(struct thread *td, int pages)
{
        vm_object_t ksobj;
        vm_offset_t ks;
        vm_page_t m, ma[KSTACK_MAX_PAGES];
        int i;

        /* Bounds check */
        if (pages <= 1)
                pages = KSTACK_PAGES;
        else if (pages > KSTACK_MAX_PAGES)
                pages = KSTACK_MAX_PAGES;
        /*
         * Allocate an object for the kstack.
         */
        ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
        td->td_kstack_obj = ksobj;
        /*
         * Get a kernel virtual address for this thread's kstack.
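         *
         * The allocation below also reserves KSTACK_GUARD_PAGES page(s)
         * at the low end; they are kept unmapped (see the pmap_qremove()
         * below) so that a kernel stack overflow faults on the guard
         * rather than silently overwriting adjacent memory.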
         */
        ks = kmem_alloc_nofault(kernel_map,
            (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
        if (ks == 0)
                panic("vm_thread_new: kstack allocation failed");
        if (KSTACK_GUARD_PAGES != 0) {
                pmap_qremove(ks, KSTACK_GUARD_PAGES);
                ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
        }
        td->td_kstack = ks;
        /*
         * Knowing the number of pages allocated is useful when you
         * want to deallocate them.
         */
        td->td_kstack_pages = pages;
        /*
         * For the length of the stack, link in a real page of ram for each
         * page of stack.
         */
        VM_OBJECT_LOCK(ksobj);
        for (i = 0; i < pages; i++) {
                /*
                 * Get a kernel stack page.
                 */
                m = vm_page_grab(ksobj, i,
                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
                ma[i] = m;
                vm_page_lock_queues();
                vm_page_wakeup(m);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_unlock_queues();
        }
        VM_OBJECT_UNLOCK(ksobj);
        pmap_qenter(ks, ma, pages);
}

/*
 * Dispose of a thread's kernel stack.
 */
void
vm_thread_dispose(struct thread *td)
{
        vm_object_t ksobj;
        vm_offset_t ks;
        vm_page_t m;
        int i, pages;

        pages = td->td_kstack_pages;
        ksobj = td->td_kstack_obj;
        ks = td->td_kstack;
        pmap_qremove(ks, pages);
        VM_OBJECT_LOCK(ksobj);
        for (i = 0; i < pages; i++) {
                m = vm_page_lookup(ksobj, i);
                if (m == NULL)
                        panic("vm_thread_dispose: kstack already missing?");
                vm_page_lock_queues();
                vm_page_busy(m);
                vm_page_unwire(m, 0);
                vm_page_free(m);
                vm_page_unlock_queues();
        }
        VM_OBJECT_UNLOCK(ksobj);
        vm_object_deallocate(ksobj);
        kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
            (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
}

/*
 * Allow a thread's kernel stack to be paged out.
 */
void
vm_thread_swapout(struct thread *td)
{
        vm_object_t ksobj;
        vm_page_t m;
        int i, pages;

        cpu_thread_swapout(td);
        pages = td->td_kstack_pages;
        ksobj = td->td_kstack_obj;
        pmap_qremove(td->td_kstack, pages);
        VM_OBJECT_LOCK(ksobj);
        for (i = 0; i < pages; i++) {
                m = vm_page_lookup(ksobj, i);
                if (m == NULL)
                        panic("vm_thread_swapout: kstack already missing?");
                vm_page_lock_queues();
                vm_page_dirty(m);
                vm_page_unwire(m, 0);
                vm_page_unlock_queues();
        }
        VM_OBJECT_UNLOCK(ksobj);
}

/*
 * Bring the kernel stack for a specified thread back in.
 */
void
vm_thread_swapin(struct thread *td)
{
        vm_object_t ksobj;
        vm_page_t m, ma[KSTACK_MAX_PAGES];
        int i, pages, rv;

        pages = td->td_kstack_pages;
        ksobj = td->td_kstack_obj;
        VM_OBJECT_LOCK(ksobj);
        for (i = 0; i < pages; i++) {
                m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
                if (m->valid != VM_PAGE_BITS_ALL) {
                        rv = vm_pager_get_pages(ksobj, &m, 1, 0);
                        if (rv != VM_PAGER_OK)
                                panic("vm_thread_swapin: cannot get kstack for proc: %d",
                                    td->td_proc->p_pid);
                        m = vm_page_lookup(ksobj, i);
                        m->valid = VM_PAGE_BITS_ALL;
                }
                ma[i] = m;
                vm_page_lock_queues();
                vm_page_wire(m);
                vm_page_wakeup(m);
                vm_page_unlock_queues();
        }
        VM_OBJECT_UNLOCK(ksobj);
        pmap_qenter(td->td_kstack, ma, pages);
        cpu_thread_swapin(td);
}

/*
 * Set up a variable-sized alternate kstack.
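 *
 * The current kstack, its object and its size are stashed in the
 * td_altkstack* fields, and vm_thread_new() then installs a fresh stack
 * of the requested size; vm_thread_dispose_altkstack() undoes this.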
 */
void
vm_thread_new_altkstack(struct thread *td, int pages)
{

        td->td_altkstack = td->td_kstack;
        td->td_altkstack_obj = td->td_kstack_obj;
        td->td_altkstack_pages = td->td_kstack_pages;

        vm_thread_new(td, pages);
}

/*
 * Restore the original kstack.
 */
void
vm_thread_dispose_altkstack(struct thread *td)
{

        vm_thread_dispose(td);

        td->td_kstack = td->td_altkstack;
        td->td_kstack_obj = td->td_altkstack_obj;
        td->td_kstack_pages = td->td_altkstack_pages;
        td->td_altkstack = 0;
        td->td_altkstack_obj = NULL;
        td->td_altkstack_pages = 0;
}

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
void
vm_forkproc(td, p2, td2, flags)
        struct thread *td;
        struct proc *p2;
        struct thread *td2;
        int flags;
{
        struct proc *p1 = td->td_proc;
        struct user *up;

        GIANT_REQUIRED;

        if ((flags & RFPROC) == 0) {
                /*
                 * Divorce the memory, if it is shared; essentially
                 * this changes shared memory amongst threads into
                 * COW locally.
                 */
                if ((flags & RFMEM) == 0) {
                        if (p1->p_vmspace->vm_refcnt > 1) {
                                vmspace_unshare(p1);
                        }
                }
                cpu_fork(td, p2, td2, flags);
                return;
        }

        if (flags & RFMEM) {
                p2->p_vmspace = p1->p_vmspace;
                p1->p_vmspace->vm_refcnt++;
        }

        while (vm_page_count_severe()) {
                VM_WAIT;
        }

        if ((flags & RFMEM) == 0) {
                p2->p_vmspace = vmspace_fork(p1->p_vmspace);

                pmap_pinit2(vmspace_pmap(p2->p_vmspace));

                if (p1->p_vmspace->vm_shm)
                        shmfork(p1, p2);
        }

        /* XXXKSE this is unsatisfactory but should be adequate */
        up = p2->p_uarea;
        MPASS(p2->p_sigacts != NULL);

        /*
         * p_stats currently points at fields in the user struct
         * but not at &u, instead at p_addr.  Copy parts of
         * p_stats; zero the rest of p_stats (statistics).
         */
        p2->p_stats = &up->u_stats;
        bzero(&up->u_stats.pstat_startzero,
            (unsigned) ((caddr_t) &up->u_stats.pstat_endzero -
            (caddr_t) &up->u_stats.pstat_startzero));
        bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
            ((caddr_t) &up->u_stats.pstat_endcopy -
            (caddr_t) &up->u_stats.pstat_startcopy));

        /*
         * cpu_fork will copy and update the pcb, set up the kernel stack,
         * and make the child ready to run.
         */
        cpu_fork(td, p2, td2, flags);
}

/*
 * Called after process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(p)
        struct proc *p;
{

        GIANT_REQUIRED;
        vmspace_exitfree(p);                /* and clean-out the vmspace */
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 *
 * XXX should probably act directly on proc0.
 */
static void
vm_init_limits(udata)
        void *udata;
{
        struct proc *p = udata;
        int rss_limit;

        /*
         * Set up the initial limits on process VM.
         * Set the maximum resident set size to be half of (reasonably)
         * available memory.  Since this is a soft limit, it comes into
         * effect only when the system is out of memory - half of main
         * memory helps to favor smaller processes, and reduces thrashing
         * of the object cache.
         */
        p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
        p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
        p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
        p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
        /* limit the limit to no less than 2MB */
        rss_limit = max(cnt.v_free_count, 512);
        p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
        p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}

void
faultin(p)
        struct proc *p;
{
#ifdef NO_SWAPPING

        PROC_LOCK_ASSERT(p, MA_OWNED);
        if ((p->p_sflag & PS_INMEM) == 0)
                panic("faultin: proc swapped out with NO_SWAPPING!");
#else /* !NO_SWAPPING */
        struct thread *td;

        GIANT_REQUIRED;
        PROC_LOCK_ASSERT(p, MA_OWNED);
        /*
         * If another process is swapping in this process,
         * just wait until it finishes.
         */
        if (p->p_sflag & PS_SWAPPINGIN)
                msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
        else if ((p->p_sflag & PS_INMEM) == 0) {
                /*
                 * Don't let another thread swap process p out while we are
                 * busy swapping it in.
                 */
                ++p->p_lock;
                mtx_lock_spin(&sched_lock);
                p->p_sflag |= PS_SWAPPINGIN;
                mtx_unlock_spin(&sched_lock);
                PROC_UNLOCK(p);

                vm_proc_swapin(p);
                FOREACH_THREAD_IN_PROC(p, td)
                        vm_thread_swapin(td);

                PROC_LOCK(p);
                mtx_lock_spin(&sched_lock);
                p->p_sflag &= ~PS_SWAPPINGIN;
                p->p_sflag |= PS_INMEM;
                FOREACH_THREAD_IN_PROC(p, td) {
                        TD_CLR_SWAPPED(td);
                        if (TD_CAN_RUN(td))
                                setrunnable(td);
                }
                mtx_unlock_spin(&sched_lock);

                wakeup(&p->p_sflag);

                /* Allow other threads to swap p out now. */
                --p->p_lock;
        }
#endif /* NO_SWAPPING */
}

/*
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 *
 * XXXKSE - process with the thread with highest priority counts..
 *
 * Giant is still held at this point, to be released in tsleep.
 */
/* ARGSUSED*/
static void
scheduler(dummy)
        void *dummy;
{
        struct proc *p;
        struct thread *td;
        int pri;
        struct proc *pp;
        int ppri;

        mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
        /* GIANT_REQUIRED */

loop:
        if (vm_page_count_min()) {
                VM_WAIT;
                goto loop;
        }

        pp = NULL;
        ppri = INT_MIN;
        sx_slock(&allproc_lock);
        FOREACH_PROC_IN_SYSTEM(p) {
                struct ksegrp *kg;
                if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
                        continue;
                }
                mtx_lock_spin(&sched_lock);
                FOREACH_THREAD_IN_PROC(p, td) {
                        /*
                         * An otherwise runnable thread of a process
                         * swapped out has only the TDI_SWAPPED bit set.
                         */
                        if (td->td_inhibitors == TDI_SWAPPED) {
                                kg = td->td_ksegrp;
                                pri = p->p_swtime + kg->kg_slptime;
                                if ((p->p_sflag & PS_SWAPINREQ) == 0) {
                                        pri -= kg->kg_nice * 8;
                                }

                                /*
                                 * if this ksegrp is higher priority
                                 * and there is enough space, then select
                                 * this process instead of the previous
                                 * selection.
                                 */
                                if (pri > ppri) {
                                        pp = p;
                                        ppri = pri;
                                }
                        }
                }
                mtx_unlock_spin(&sched_lock);
        }
        sx_sunlock(&allproc_lock);

        /*
         * Nothing to do, back to sleep.
         */
        if ((p = pp) == NULL) {
                tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
                goto loop;
        }
        PROC_LOCK(p);

        /*
         * Another process may be bringing or may have already
         * brought this process in while we traverse all threads.
         * Or, this process may even be being swapped out again.
         */
        if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
                PROC_UNLOCK(p);
                goto loop;
        }

        mtx_lock_spin(&sched_lock);
        p->p_sflag &= ~PS_SWAPINREQ;
        mtx_unlock_spin(&sched_lock);

        /*
         * We would like to bring someone in.  (only if there is space).
         * [What checks the space? ]
         */
        faultin(p);
        PROC_UNLOCK(p);
        mtx_lock_spin(&sched_lock);
        p->p_swtime = 0;
        mtx_unlock_spin(&sched_lock);
        goto loop;
}

#ifndef NO_SWAPPING

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0, "Time before a process will be swapped out");

/*
 * Swapout is driven by the pageout daemon.  Very simple, we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */
void
swapout_procs(action)
        int action;
{
        struct proc *p;
        struct thread *td;
        struct ksegrp *kg;
        int didswap = 0;

        GIANT_REQUIRED;

retry:
        sx_slock(&allproc_lock);
        FOREACH_PROC_IN_SYSTEM(p) {
                struct vmspace *vm;
                int minslptime = 100000;

                /*
                 * Watch out for a process in
                 * creation.  It may have no
                 * address space or lock yet.
                 */
                mtx_lock_spin(&sched_lock);
                if (p->p_state == PRS_NEW) {
                        mtx_unlock_spin(&sched_lock);
                        continue;
                }
                mtx_unlock_spin(&sched_lock);

                /*
                 * An aio daemon switches its
                 * address space while running.
                 * Perform a quick check whether
                 * a process has P_SYSTEM.
                 */
                if ((p->p_flag & P_SYSTEM) != 0)
                        continue;

                /*
                 * Do not swapout a process that
                 * is waiting for VM data
                 * structures as there is a possible
                 * deadlock.  Test this first as
                 * this may block.
                 *
                 * Lock the map until swapout
                 * finishes, or a thread of this
                 * process may attempt to alter
                 * the map.
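                 *
                 * A reference is also taken on the vmspace so that it
                 * cannot be released while the process lock is dropped;
                 * the matching vmspace_free() is at the bottom of the loop.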
                 */
                PROC_LOCK(p);
                vm = p->p_vmspace;
                KASSERT(vm != NULL,
                    ("swapout_procs: a process has no address space"));
                ++vm->vm_refcnt;
                PROC_UNLOCK(p);
                if (!vm_map_trylock(&vm->vm_map))
                        goto nextproc1;

                PROC_LOCK(p);
                if (p->p_lock != 0 ||
                    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
                    ) != 0) {
                        goto nextproc2;
                }
                /*
                 * Only aiod changes the vmspace; however, it will be
                 * skipped because of the if statement above checking
                 * for P_SYSTEM.
                 */
                if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
                        goto nextproc2;

                switch (p->p_state) {
                default:
                        /* Don't swap out processes in any sort
                         * of 'special' state. */
                        break;

                case PRS_NORMAL:
                        mtx_lock_spin(&sched_lock);
                        /*
                         * Do not swap out a realtime process.
                         * Check all the thread groups.
                         */
                        FOREACH_KSEGRP_IN_PROC(p, kg) {
                                if (PRI_IS_REALTIME(kg->kg_pri_class))
                                        goto nextproc;

                                /*
                                 * Guarantee swap_idle_threshold1
                                 * time in memory.
                                 */
                                if (kg->kg_slptime < swap_idle_threshold1)
                                        goto nextproc;

                                /*
                                 * Do not swapout a process if it is
                                 * waiting on a critical event of some
                                 * kind or there is a thread whose
                                 * pageable memory may be accessed.
                                 *
                                 * This could be refined to support
                                 * swapping out a thread.
                                 */
                                FOREACH_THREAD_IN_GROUP(kg, td) {
                                        if ((td->td_priority) < PSOCK ||
                                            !thread_safetoswapout(td))
                                                goto nextproc;
                                }
                                /*
                                 * If the system is under memory stress,
                                 * or if we are swapping
                                 * idle processes >= swap_idle_threshold2,
                                 * then swap the process out.
                                 */
                                if (((action & VM_SWAP_NORMAL) == 0) &&
                                    (((action & VM_SWAP_IDLE) == 0) ||
                                    (kg->kg_slptime < swap_idle_threshold2)))
                                        goto nextproc;

                                if (minslptime > kg->kg_slptime)
                                        minslptime = kg->kg_slptime;
                        }

                        /*
                         * If the process has been asleep for awhile and had
                         * most of its pages taken away already, swap it out.
                         */
                        if ((action & VM_SWAP_NORMAL) ||
                            ((action & VM_SWAP_IDLE) &&
                            (minslptime > swap_idle_threshold2))) {
                                swapout(p);
                                didswap++;
                                mtx_unlock_spin(&sched_lock);
                                PROC_UNLOCK(p);
                                vm_map_unlock(&vm->vm_map);
                                vmspace_free(vm);
                                sx_sunlock(&allproc_lock);
                                goto retry;
                        }
nextproc:
                        mtx_unlock_spin(&sched_lock);
                }
nextproc2:
                PROC_UNLOCK(p);
                vm_map_unlock(&vm->vm_map);
nextproc1:
                vmspace_free(vm);
                continue;
        }
        sx_sunlock(&allproc_lock);
        /*
         * If we swapped something out, and another process needed memory,
         * then wakeup the sched process.
         */
        if (didswap)
                wakeup(&proc0);
}

static void
swapout(p)
        struct proc *p;
{
        struct thread *td;

        PROC_LOCK_ASSERT(p, MA_OWNED);
        mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
        printf("swapping out %d\n", p->p_pid);
#endif

        /*
         * The states of this process and its threads may have changed
         * by now.  Assuming that there is only one pageout daemon thread,
         * this process should still be in memory.
         */
        KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
            ("swapout: lost a swapout race?"));

#if defined(INVARIANTS)
        /*
         * Make sure that all threads are safe to be swapped out.
         *
         * Alternatively, we could swap out only safe threads.
         */
        FOREACH_THREAD_IN_PROC(p, td) {
                KASSERT(thread_safetoswapout(td),
                    ("swapout: there is a thread not safe for swapout"));
        }
#endif /* INVARIANTS */

        ++p->p_stats->p_ru.ru_nswap;
        /*
         * remember the process resident count
         */
        p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

        p->p_sflag &= ~PS_INMEM;
        p->p_sflag |= PS_SWAPPINGOUT;
        PROC_UNLOCK(p);
        FOREACH_THREAD_IN_PROC(p, td)
                TD_SET_SWAPPED(td);
        mtx_unlock_spin(&sched_lock);

        vm_proc_swapout(p);
        FOREACH_THREAD_IN_PROC(p, td)
                vm_thread_swapout(td);

        PROC_LOCK(p);
        mtx_lock_spin(&sched_lock);
        p->p_sflag &= ~PS_SWAPPINGOUT;
        p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */