xref: /freebsd/sys/vm/vm_fault.c (revision 0c43d89a0d8e976ca494d4837f4c1f3734d2c300)
1 /*
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  *
10  * This code is derived from software contributed to Berkeley by
11  * The Mach Operating System project at Carnegie-Mellon University.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed by the University of
24  *	California, Berkeley and its contributors.
25  * 4. Neither the name of the University nor the names of its contributors
26  *    may be used to endorse or promote products derived from this software
27  *    without specific prior written permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  *
41  *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
42  *
43  *
44  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
45  * All rights reserved.
46  *
47  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
48  *
49  * Permission to use, copy, modify and distribute this software and
50  * its documentation is hereby granted, provided that both the copyright
51  * notice and this permission notice appear in all copies of the
52  * software, derivative works or modified versions, and any portions
53  * thereof, and that both notices appear in supporting documentation.
54  *
55  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
56  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
57  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
58  *
59  * Carnegie Mellon requests users of this software to return to
60  *
61  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
62  *  School of Computer Science
63  *  Carnegie Mellon University
64  *  Pittsburgh PA 15213-3890
65  *
66  * any improvements or extensions that they make and grant Carnegie the
67  * rights to redistribute these changes.
68  *
69  * $Id: vm_fault.c,v 1.3 1994/08/02 07:55:18 davidg Exp $
70  */
71 
72 /*
73  *	Page fault handling module.
74  */
75 
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/proc.h>
79 #include <sys/resourcevar.h>
80 
81 #include <vm/vm.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pageout.h>
84 
85 
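/*
 * Read-window tuning for vm_fault(): up to VM_FAULT_READ_BEHIND pages
 * before and VM_FAULT_READ_AHEAD pages after the faulting page may be
 * brought in by a single pager request, so VM_FAULT_READ (behind +
 * ahead + the faulting page itself) sizes the marray[] used below.
 */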
86 #define VM_FAULT_READ_AHEAD 4
87 #define VM_FAULT_READ_AHEAD_MIN 1
88 #define VM_FAULT_READ_BEHIND 3
89 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1)
90 extern int swap_pager_full;
91 extern int vm_pageout_proc_limit;
92 
93 /*
94  *	vm_fault:
95  *
96  *	Handle a page fault occurring at the given address,
97  *	requiring the given permissions, in the map specified.
98  *	If successful, the page is inserted into the
99  *	associated physical map.
100  *
101  *	NOTE: the given address should be truncated to the
102  *	proper page address.
103  *
104  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
105  *	a standard error specifying why the fault is fatal is returned.
106  *
107  *
108  *	The map in question must be referenced, and remains so.
109  *	Caller may hold no locks.
110  */
111 int
112 vm_fault(map, vaddr, fault_type, change_wiring)
113 	vm_map_t	map;
114 	vm_offset_t	vaddr;
115 	vm_prot_t	fault_type;
116 	boolean_t	change_wiring;
117 {
118 	vm_object_t		first_object;
119 	vm_offset_t		first_offset;
120 	vm_map_entry_t		entry;
121 	register vm_object_t	object;
122 	register vm_offset_t	offset;
123 	vm_page_t	m;
124 	vm_page_t		first_m;
125 	vm_prot_t		prot;
126 	int			result;
127 	boolean_t		wired;
128 	boolean_t		su;
129 	boolean_t		lookup_still_valid;
130 	boolean_t		page_exists;
131 	vm_page_t		old_m;
132 	vm_object_t		next_object;
133 	vm_page_t		marray[VM_FAULT_READ];
134 	int			reqpage;
135 	int			spl;
136 	int			hardfault=0;
137 
138 	cnt.v_faults++;		/* needs lock XXX */
139 /*
140  *	Recovery actions
141  */
142 #define	FREE_PAGE(m)	{				\
143 	PAGE_WAKEUP(m);					\
144 	vm_page_lock_queues();				\
145 	vm_page_free(m);				\
146 	vm_page_unlock_queues();			\
147 }
148 
149 #define	RELEASE_PAGE(m)	{				\
150 	PAGE_WAKEUP(m);					\
151 	vm_page_lock_queues();				\
152 	vm_page_activate(m);				\
153 	vm_page_unlock_queues();			\
154 }
155 
156 #define	UNLOCK_MAP	{				\
157 	if (lookup_still_valid) {			\
158 		vm_map_lookup_done(map, entry);		\
159 		lookup_still_valid = FALSE;		\
160 	}						\
161 }
162 
163 #define	UNLOCK_THINGS	{				\
164 	object->paging_in_progress--;			\
165 	if (object->paging_in_progress == 0)		\
166 		wakeup((caddr_t)object);		\
167 	vm_object_unlock(object);			\
168 	if (object != first_object) {			\
169 		vm_object_lock(first_object);		\
170 		FREE_PAGE(first_m);			\
171 		first_object->paging_in_progress--;	\
172 		if (first_object->paging_in_progress == 0) \
173 			wakeup((caddr_t)first_object);	\
174 		vm_object_unlock(first_object);		\
175 	}						\
176 	UNLOCK_MAP;					\
177 }
178 
179 #define	UNLOCK_AND_DEALLOCATE	{			\
180 	UNLOCK_THINGS;					\
181 	vm_object_deallocate(first_object);		\
182 }
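
/*
 * Note on the recovery macros above: FREE_PAGE discards a page we no
 * longer want, RELEASE_PAGE puts a page back on the active queue, and
 * UNLOCK_AND_DEALLOCATE expands to UNLOCK_THINGS (drop the
 * paging_in_progress counts, free first_m if we descended past the
 * first object, then UNLOCK_MAP) followed by vm_object_deallocate()
 * of first_object.
 */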
183 
184 
185     RetryFault: ;
186 
187 	/*
188 	 *	Find the backing store object and offset into
189 	 *	it to begin the search.
190 	 */
191 
192 	if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry,
193 	    &first_object, &first_offset,
194 	    &prot, &wired, &su)) != KERN_SUCCESS) {
195 		return(result);
196 	}
197 	lookup_still_valid = TRUE;
198 
199 	if (wired)
200 		fault_type = prot;
201 
202 	first_m = NULL;
203 
204    	/*
205 	 *	Make a reference to this object to
206 	 *	prevent its disposal while we are messing with
207 	 *	it.  Once we have the reference, the map is free
208 	 *	to be diddled.  Since objects reference their
209 	 *	shadows (and copies), they will stay around as well.
210 	 */
211 
212 	vm_object_lock(first_object);
213 
214 	first_object->ref_count++;
215 	first_object->paging_in_progress++;
216 
217 	/*
218 	 *	INVARIANTS (through entire routine):
219 	 *
220 	 *	1)	At all times, we must either have the object
221 	 *		lock or a busy page in some object to prevent
222 	 *		some other thread from trying to bring in
223 	 *		the same page.
224 	 *
225 	 *		Note that we cannot hold any locks during the
226 	 *		pager access or when waiting for memory, so
227 	 *		we use a busy page then.
228 	 *
229 	 *		Note also that we aren't as concerned about
230  *		more than one thread attempting to pager_data_unlock
231 	 *		the same page at once, so we don't hold the page
232 	 *		as busy then, but do record the highest unlock
233 	 *		value so far.  [Unlock requests may also be delivered
234 	 *		out of order.]
235 	 *
236 	 *	2)	Once we have a busy page, we must remove it from
237 	 *		the pageout queues, so that the pageout daemon
238 	 *		will not grab it away.
239 	 *
240 	 *	3)	To prevent another thread from racing us down the
241 	 *		shadow chain and entering a new page in the top
242 	 *		object before we do, we must keep a busy page in
243 	 *		the top object while following the shadow chain.
244 	 *
245 	 *	4)	We must increment paging_in_progress on any object
246 	 *		for which we have a busy page, to prevent
247 	 *		vm_object_collapse from removing the busy page
248 	 *		without our noticing.
249 	 */
250 
251 	/*
252 	 *	Search for the page at object/offset.
253 	 */
254 
255 	object = first_object;
256 	offset = first_offset;
257 
258 	/*
259 	 *	See whether this page is resident
260 	 */
261 
262 	while (TRUE) {
263 		m = vm_page_lookup(object, offset);
264 		if (m != NULL) {
265 			/*
266 			 *	If the page is being brought in,
267 			 *	wait for it and then retry.
268 			 */
269 			if (m->flags & PG_BUSY) {
270 				int s;
271 				UNLOCK_THINGS;
272 				s = splhigh();
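				/*
				 * Re-test PG_BUSY at splhigh: the page may
				 * have been unbusied between the test above
				 * and raising the spl, and we must not set
				 * PG_WANTED and sleep for a wakeup that has
				 * already been delivered.
				 */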
273 				if (m->flags & PG_BUSY) {
274 					m->flags |= PG_WANTED;
275 					tsleep((caddr_t)m,PSWP,"vmpfw",0);
276 				}
277 				splx(s);
278 				vm_object_deallocate(first_object);
279 				goto RetryFault;
280 			}
281 
282 			/*
283 			 *	Remove the page from the pageout daemon's
284 			 *	reach while we play with it.
285 			 */
286 
287 			vm_page_lock_queues();
288 			spl = splhigh();
289 			if (m->flags & PG_INACTIVE) {
290 				TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq);
291 				m->flags &= ~PG_INACTIVE;
292 				cnt.v_inactive_count--;
293 				cnt.v_reactivated++;
294 			}
295 
296 			if (m->flags & PG_ACTIVE) {
297 				TAILQ_REMOVE(&vm_page_queue_active, m, pageq);
298 				m->flags &= ~PG_ACTIVE;
299 				cnt.v_active_count--;
300 			}
301 			splx(spl);
302 			vm_page_unlock_queues();
303 
304 			/*
305 			 *	Mark page busy for other threads.
306 			 */
307 			m->flags |= PG_BUSY;
308 			break;
309 		}
310 
311 		if (((object->pager != NULL) &&
312 		    (!change_wiring || wired))
313 		    || (object == first_object)) {
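			/*
			 * We get here when either this object's pager may be
			 * able to supply the page (and we are not merely
			 * changing wiring), or this is the top-level object,
			 * which always gets a placeholder page allocated here
			 * so that a zero-fill or copy-on-write destination is
			 * available if we have to descend the shadow chain.
			 */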
314 
315 #if 0
316 			if (curproc && (vaddr < VM_MAXUSER_ADDRESS) &&
317 				(curproc->p_rlimit[RLIMIT_RSS].rlim_max <
318 			    curproc->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG)) {
319 				UNLOCK_AND_DEALLOCATE;
320 				vm_fault_free_pages(curproc);
321 				goto RetryFault;
322 			}
323 #endif
324 
325 			if (swap_pager_full && !object->shadow && (!object->pager ||
326 				(object->pager && object->pager->pg_type == PG_SWAP &&
327 				!vm_pager_has_page(object->pager, offset+object->paging_offset)))) {
328 				if (vaddr < VM_MAXUSER_ADDRESS && curproc && curproc->p_pid >= 48) /* XXX */ {
329 					printf("Process %d killed by vm_fault -- out of swap\n", curproc->p_pid);
330 					psignal(curproc, SIGKILL);
331 					curproc->p_estcpu = 0;
332 					curproc->p_nice = PRIO_MIN;
333 					setpriority(curproc);
334 				}
335 			}
336 
337 			/*
338 			 *	Allocate a new page for this object/offset
339 			 *	pair.
340 			 */
341 
342 			m = vm_page_alloc(object, offset);
343 
344 			if (m == NULL) {
345 				UNLOCK_AND_DEALLOCATE;
346 				VM_WAIT;
347 				goto RetryFault;
348 			}
349 		}
350 
351 		if (object->pager != NULL && (!change_wiring || wired)) {
352 			int rv;
353 			int faultcount;
354 			int reqpage;
355 
356 			/*
357 			 *	Now that we have a busy page, we can
358 			 *	release the object lock.
359 			 */
360 			vm_object_unlock(object);
361 			/*
362 			 * Now we find out if any other pages should
363 			 * be paged in at this time.
364 			 * This routine checks whether the pages surrounding this fault
365 			 * reside in the same object as the page for this fault.  If
366 			 * they do, then they are faulted into the
367 			 * object as well.  The returned array "marray" contains
368 			 * vm_page_t structs, one of which is the vm_page_t passed to
369 			 * the routine.  The reqpage return value is the index into
370 			 * marray of the vm_page_t passed to the routine.
371 			 */
372 			cnt.v_pageins++;
373 			faultcount = vm_fault_additional_pages(first_object, first_offset,
374 				m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage);
375 
376 			/*
377 			 *	Call the pager to retrieve the data, if any,
378 			 *	after releasing the lock on the map.
379 			 */
380 			UNLOCK_MAP;
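			/*
			 * If faultcount is zero, vm_fault_additional_pages
			 * found that this object's pager does not have the
			 * requested page; treat that as VM_PAGER_FAIL so we
			 * skip the pager and fall through to the shadow-chain
			 * / zero-fill handling below.
			 */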
381 
382 			rv = faultcount ?
383 			    vm_pager_get_pages(object->pager,
384 				marray, faultcount, reqpage, TRUE): VM_PAGER_FAIL;
385 			if (rv == VM_PAGER_OK) {
386 				/*
387 				 *	Found the page.
388 				 *	Leave it busy while we play with it.
389 				 */
390 				vm_object_lock(object);
391 
392 				/*
393 				 *	Relookup in case pager changed page.
394 				 *	Pager is responsible for disposition
395 				 *	of old page if moved.
396 				 */
397 				m = vm_page_lookup(object, offset);
398 
399 				cnt.v_pgpgin++;
400 				m->flags &= ~PG_FAKE;
401 				pmap_clear_modify(VM_PAGE_TO_PHYS(m));
402 				hardfault++;
403 				break;
404 			}
405 
406 			/*
407 			 *	Remove the bogus page (which does not
408 			 *	exist at this object/offset); before
409 			 *	doing so, we must get back our object
410 			 *	lock to preserve our invariant.
411 			 *
412 			 *	Also wake up any other thread that may want
413 			 *	to bring in this page.
414 			 *
415 			 *	If this is the top-level object, we must
416 			 *	leave the busy page to prevent another
417 			 *	thread from rushing past us, and inserting
418 			 *	the page in that object at the same time
419 			 *	that we are.
420 			 */
421 
422 			vm_object_lock(object);
423 			/*
424 			 * Data outside the range of the pager; an error
425 			 */
426 			if ((rv == VM_PAGER_ERROR) || (rv == VM_PAGER_BAD)) {
427 				FREE_PAGE(m);
428 				UNLOCK_AND_DEALLOCATE;
429 				return(KERN_PROTECTION_FAILURE); /* XXX */
430 			}
431 			if (object != first_object) {
432 				FREE_PAGE(m);
433 				/*
434 				 * XXX - we cannot just fall out at this
435 				 * point, m has been freed and is invalid!
436 				 */
437 			}
438 		}
439 
440 		/*
441 		 * We get here if the object has no pager (or unwiring)
442 		 * or the pager doesn't have the page.
443 		 */
444 		if (object == first_object)
445 			first_m = m;
446 
447 		/*
448 		 *	Move on to the next object.  Lock the next
449 		 *	object before unlocking the current one.
450 		 */
451 
452 		offset += object->shadow_offset;
453 		next_object = object->shadow;
454 		if (next_object == NULL) {
455 			/*
456 			 *	If there's no object left, fill the page
457 			 *	in the top object with zeros.
458 			 */
459 			if (object != first_object) {
460 				object->paging_in_progress--;
461 				if (object->paging_in_progress == 0)
462 					wakeup((caddr_t) object);
463 				vm_object_unlock(object);
464 
465 				object = first_object;
466 				offset = first_offset;
467 				m = first_m;
468 				vm_object_lock(object);
469 			}
470 			first_m = NULL;
471 
472 			vm_page_zero_fill(m);
473 			cnt.v_zfod++;
474 			m->flags &= ~PG_FAKE;
475 			break;
476 		}
477 		else {
478 			vm_object_lock(next_object);
479 			if (object != first_object) {
480 				object->paging_in_progress--;
481 				if (object->paging_in_progress == 0)
482 					wakeup((caddr_t) object);
483 			}
484 			vm_object_unlock(object);
485 			object = next_object;
486 			object->paging_in_progress++;
487 		}
488 	}
489 
490 	if (((m->flags & (PG_ACTIVE|PG_INACTIVE)) != 0) ||
491 		(m->flags & PG_BUSY) == 0)
492 		panic("vm_fault: absent or active or inactive or not busy after main loop");
493 
494 	/*
495 	 *	PAGE HAS BEEN FOUND.
496 	 *	[Loop invariant still holds -- the object lock
497 	 *	is held.]
498 	 */
499 
500 	old_m = m;	/* save page that would be copied */
501 
502 	/*
503 	 *	If the page is being written, but isn't
504 	 *	already owned by the top-level object,
505 	 *	we have to copy it into a new page owned
506 	 *	by the top-level object.
507 	 */
508 
509 	if (object != first_object) {
510 	    	/*
511 		 *	We only really need to copy if we
512 		 *	want to write it.
513 		 */
514 
515 	    	if (fault_type & VM_PROT_WRITE) {
516 
517 			/*
518 			 *	If we try to collapse first_object at this
519 			 *	point, we may deadlock when we try to get
520 			 *	the lock on an intermediate object (since we
521 			 *	have the bottom object locked).  We can't
522 			 *	unlock the bottom object, because the page
523 			 *	we found may move (by collapse) if we do.
524 			 *
525 			 *	Instead, we first copy the page.  Then, when
526 			 *	we have no more use for the bottom object,
527 			 *	we unlock it and try to collapse.
528 			 *
529 			 *	Note that we copy the page even if we didn't
530 			 *	need to... that's the breaks.
531 			 */
532 
533 		    	/*
534 			 *	We already have an empty page in
535 			 *	first_object - use it.
536 			 */
537 
538 			vm_page_copy(m, first_m);
539 			first_m->flags &= ~PG_FAKE;
540 
541 			/*
542 			 *	If another map is truly sharing this
543 			 *	page with us, we have to flush all
544 			 *	uses of the original page, since we
545 			 *	can't distinguish those which want the
546 			 *	original from those which need the
547 			 *	new copy.
548 			 *
549 			 *	XXX If we know that only one map has
550 			 *	access to this page, then we could
551 			 *	avoid the pmap_page_protect() call.
552 			 */
553 
554 			vm_page_lock_queues();
555 
556 			vm_page_activate(m);
557 			pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
558 			if ((m->flags & PG_CLEAN) == 0)
559 				m->flags |= PG_LAUNDRY;
560 			vm_page_unlock_queues();
561 
562 			/*
563 			 *	We no longer need the old page or object.
564 			 */
565 			PAGE_WAKEUP(m);
566 			object->paging_in_progress--;
567 			if (object->paging_in_progress == 0)
568 				wakeup((caddr_t) object);
569 			vm_object_unlock(object);
570 
571 			/*
572 			 *	Only use the new page below...
573 			 */
574 
575 			cnt.v_cow_faults++;
576 			m = first_m;
577 			object = first_object;
578 			offset = first_offset;
579 
580 			/*
581 			 *	Now that we've gotten the copy out of the
582 			 *	way, let's try to collapse the top object.
583 			 */
584 			vm_object_lock(object);
585 			/*
586 			 *	But we have to play ugly games with
587 			 *	paging_in_progress to do that...
588 			 */
589 			object->paging_in_progress--;
590 			if (object->paging_in_progress == 0)
591 				wakeup((caddr_t) object);
592 			vm_object_collapse(object);
593 			object->paging_in_progress++;
594 		}
595 		else {
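			/*
			 * Read fault on a page that lives in a backing
			 * object: map the existing page read-only and mark
			 * it copy-on-write, deferring the actual copy until
			 * a write fault occurs.
			 */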
596 		    	prot &= ~VM_PROT_WRITE;
597 			m->flags |= PG_COPYONWRITE;
598 		}
599 	}
600 
601 	if (m->flags & (PG_ACTIVE|PG_INACTIVE))
602 		panic("vm_fault: active or inactive before copy object handling");
603 
604 	/*
605 	 *	If the page is being written, but hasn't been
606 	 *	copied to the copy-object, we have to copy it there.
607 	 */
608     RetryCopy:
609 	if (first_object->copy != NULL) {
610 		vm_object_t copy_object = first_object->copy;
611 		vm_offset_t copy_offset;
612 		vm_page_t copy_m;
613 
614 		/*
615 		 *	We only need to copy if we want to write it.
616 		 */
617 		if ((fault_type & VM_PROT_WRITE) == 0) {
618 			prot &= ~VM_PROT_WRITE;
619 			m->flags |= PG_COPYONWRITE;
620 		}
621 		else {
622 			/*
623 			 *	Try to get the lock on the copy_object.
624 			 */
625 			if (!vm_object_lock_try(copy_object)) {
626 				vm_object_unlock(object);
627 				/* should spin a bit here... */
628 				vm_object_lock(object);
629 				goto RetryCopy;
630 			}
631 
632 			/*
633 			 *	Make another reference to the copy-object,
634 			 *	to keep it from disappearing during the
635 			 *	copy.
636 			 */
637 			copy_object->ref_count++;
638 
639 			/*
640 			 *	Does the page exist in the copy?
641 			 */
642 			copy_offset = first_offset
643 				- copy_object->shadow_offset;
644 			copy_m = vm_page_lookup(copy_object, copy_offset);
645 			if (page_exists = (copy_m != NULL)) {
646 				if (copy_m->flags & PG_BUSY) {
647 					/*
648 					 *	If the page is being brought
649 					 *	in, wait for it and then retry.
650 					 */
651 					PAGE_ASSERT_WAIT(copy_m, !change_wiring);
652 					RELEASE_PAGE(m);
653 					copy_object->ref_count--;
654 					vm_object_unlock(copy_object);
655 					UNLOCK_THINGS;
656 					thread_block("fltcpy");
657 					vm_object_deallocate(first_object);
658 					goto RetryFault;
659 				}
660 			}
661 
662 			/*
663 			 *	If the page is not in memory (in the object)
664 			 *	and the object has a pager, we have to check
665 			 *	if the pager has the data in secondary
666 			 *	storage.
667 			 */
668 			if (!page_exists) {
669 
670 				/*
671 				 *	If we don't allocate a (blank) page
672 				 *	here... another thread could try
673 				 *	to page it in, allocate a page, and
674 				 *	then block on the busy page in its
675 				 *	shadow (first_object).  Then we'd
676 				 *	trip over the busy page after we
677 				 *	found that the copy_object's pager
678 				 *	doesn't have the page...
679 				 */
680 				copy_m = vm_page_alloc(copy_object, copy_offset);
681 				if (copy_m == NULL) {
682 					/*
683 					 *	Wait for a page, then retry.
684 					 */
685 					RELEASE_PAGE(m);
686 					copy_object->ref_count--;
687 					vm_object_unlock(copy_object);
688 					UNLOCK_AND_DEALLOCATE;
689 					VM_WAIT;
690 					goto RetryFault;
691 				}
692 
693 			 	if (copy_object->pager != NULL) {
694 					vm_object_unlock(object);
695 					vm_object_unlock(copy_object);
696 					UNLOCK_MAP;
697 
698 					page_exists = vm_pager_has_page(
699 							copy_object->pager,
700 							(copy_offset + copy_object->paging_offset));
701 
702 					vm_object_lock(copy_object);
703 
704 					/*
705 					 * Since the map is unlocked, someone
706 					 * else could have copied this object
707 					 * and put a different copy_object
708 					 * between the two.  Or, the last
709 					 * reference to the copy-object (other
710 					 * than the one we have) may have
711 					 * disappeared - if that has happened,
712 					 * we don't need to make the copy.
713 					 */
714 					if (copy_object->shadow != object ||
715 					    copy_object->ref_count == 1) {
716 						/*
717 						 *	Gaah... start over!
718 						 */
719 						FREE_PAGE(copy_m);
720 						vm_object_unlock(copy_object);
721 						vm_object_deallocate(copy_object);
722 							/* may block */
723 						vm_object_lock(object);
724 						goto RetryCopy;
725 					}
726 					vm_object_lock(object);
727 
728 					if (page_exists) {
729 						/*
730 						 *	We didn't need the page
731 						 */
732 						FREE_PAGE(copy_m);
733 					}
734 				}
735 			}
736 			if (!page_exists) {
737 				/*
738 				 *	Must copy page into copy-object.
739 				 */
740 				vm_page_copy(m, copy_m);
741 				copy_m->flags &= ~PG_FAKE;
742 
743 				/*
744 				 * Things to remember:
745 				 * 1. The copied page must be marked 'dirty'
746 				 *    so it will be paged out to the copy
747 				 *    object.
748 				 * 2. If the old page was in use by any users
749 				 *    of the copy-object, it must be removed
750 				 *    from all pmaps.  (We can't know which
751 				 *    pmaps use it.)
752 				 */
753 				vm_page_lock_queues();
754 
755 				vm_page_activate(old_m);
756 
757 
758 				pmap_page_protect(VM_PAGE_TO_PHYS(old_m),
759 						  VM_PROT_NONE);
760 				if ((old_m->flags & PG_CLEAN) == 0)
761 					old_m->flags |= PG_LAUNDRY;
762 				copy_m->flags &= ~PG_CLEAN;
763 				vm_page_activate(copy_m);
764 				vm_page_unlock_queues();
765 
766 				PAGE_WAKEUP(copy_m);
767 			}
768 			/*
769 			 *	The reference count on copy_object must be
770 			 *	at least 2: one for our extra reference,
771 			 *	and at least one from the outside world
772 			 *	(we checked that when we last locked
773 			 *	copy_object).
774 			 */
775 			copy_object->ref_count--;
776 			vm_object_unlock(copy_object);
777 			m->flags &= ~PG_COPYONWRITE;
778 		}
779 	}
780 
781 	if (m->flags & (PG_ACTIVE | PG_INACTIVE))
782 		panic("vm_fault: active or inactive before retrying lookup");
783 
784 	/*
785 	 *	We must verify that the maps have not changed
786 	 *	since our last lookup.
787 	 */
788 
789 	if (!lookup_still_valid) {
790 		vm_object_t	retry_object;
791 		vm_offset_t	retry_offset;
792 		vm_prot_t	retry_prot;
793 
794 		/*
795 		 *	Since map entries may be pageable, make sure we can
796 		 *	take a page fault on them.
797 		 */
798 		vm_object_unlock(object);
799 
800 		/*
801 		 *	To avoid trying to write_lock the map while another
802 		 *	thread has it read_locked (in vm_map_pageable), we
803 		 *	do not try for write permission.  If the page is
804 		 *	still writable, we will get write permission.  If it
805 		 *	is not, or has been marked needs_copy, we enter the
806 		 *	mapping without write permission, and will merely
807 		 *	take another fault.
808 		 */
809 		result = vm_map_lookup(&map, vaddr,
810 				fault_type & ~VM_PROT_WRITE, &entry,
811 				&retry_object, &retry_offset, &retry_prot,
812 				&wired, &su);
813 
814 		vm_object_lock(object);
815 
816 		/*
817 		 *	If we don't need the page any longer, put it on the
818 		 *	active list (the easiest thing to do here).  If no
819 		 *	one needs it, pageout will grab it eventually.
820 		 */
821 
822 		if (result != KERN_SUCCESS) {
823 			RELEASE_PAGE(m);
824 			UNLOCK_AND_DEALLOCATE;
825 			return(result);
826 		}
827 
828 		lookup_still_valid = TRUE;
829 
830 		if ((retry_object != first_object) ||
831 				(retry_offset != first_offset)) {
832 			RELEASE_PAGE(m);
833 			UNLOCK_AND_DEALLOCATE;
834 			goto RetryFault;
835 		}
836 
837 		/*
838 		 *	Check whether the protection has changed or the object
839 		 *	has been copied while we left the map unlocked.
840 		 *	Changing from read to write permission is OK - we leave
841 		 *	the page write-protected, and catch the write fault.
842 		 *	Changing from write to read permission means that we
843 		 *	can't mark the page write-enabled after all.
844 		 */
845 		prot &= retry_prot;
846 		if (m->flags & PG_COPYONWRITE)
847 			prot &= ~VM_PROT_WRITE;
848 	}
849 
850 	/*
851 	 * (the various bits we're fiddling with here are locked by
852 	 * the object's lock)
853 	 */
854 
855 	/* XXX This distorts the meaning of the copy_on_write bit */
856 
857 	if (prot & VM_PROT_WRITE)
858 		m->flags &= ~PG_COPYONWRITE;
859 
860 	/*
861 	 *	It's critically important that a wired-down page be faulted
862 	 *	only once in each map for which it is wired.
863 	 */
864 
865 	if (m->flags & (PG_ACTIVE | PG_INACTIVE))
866 		panic("vm_fault: active or inactive before pmap_enter");
867 
868 	vm_object_unlock(object);
869 
870 	/*
871 	 *	Put this page into the physical map.
872 	 *	We had to do the unlock above because pmap_enter
873 	 *	may cause other faults.   We don't put the
874 	 *	page back on the active queue until later so
875 	 *	that the page-out daemon won't find us (yet).
876 	 */
877 
878 	pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired);
879 
880 	/*
881 	 *	If the page is not wired down, then put it where the
882 	 *	pageout daemon can find it.
883 	 */
884 	vm_object_lock(object);
885 	vm_page_lock_queues();
886 	if (change_wiring) {
887 		if (wired)
888 			vm_page_wire(m);
889 		else
890 			vm_page_unwire(m);
891 	}
892 	else {
893 		vm_page_activate(m);
894 	}
895 
896 	if( curproc && curproc->p_stats) {
897 		if (hardfault) {
898 			curproc->p_stats->p_ru.ru_majflt++;
899 		} else {
900 			curproc->p_stats->p_ru.ru_minflt++;
901 		}
902 	}
903 
904 	vm_page_unlock_queues();
905 
906 	/*
907 	 *	Unlock everything, and return
908 	 */
909 
910 	PAGE_WAKEUP(m);
911 	UNLOCK_AND_DEALLOCATE;
912 
913 	return(KERN_SUCCESS);
914 
915 }
916 
917 /*
918  *	vm_fault_wire:
919  *
920  *	Wire down a range of virtual addresses in a map.
921  */
922 int
923 vm_fault_wire(map, start, end)
924 	vm_map_t	map;
925 	vm_offset_t	start, end;
926 {
927 
928 	register vm_offset_t	va;
929 	register pmap_t		pmap;
930 	int rv;
931 
932 	pmap = vm_map_pmap(map);
933 
934 	/*
935 	 *	Inform the physical mapping system that the
936 	 *	range of addresses may not fault, so that
937 	 *	page tables and such can be locked down as well.
938 	 */
939 
940 	pmap_pageable(pmap, start, end, FALSE);
941 
942 	/*
943 	 *	We simulate a fault to get the page and enter it
944 	 *	in the physical map.
945 	 */
946 
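	/*
	 * The faults below use VM_PROT_NONE and change_wiring == TRUE;
	 * when the map lookup reports the entry as wired, vm_fault()
	 * substitutes the entry's protection for the fault type and
	 * finishes by calling vm_page_wire() on the page.
	 */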
947 	for (va = start; va < end; va += PAGE_SIZE) {
948 		rv = vm_fault(map, va, VM_PROT_NONE, TRUE);
949 		if (rv) {
950 			if (va != start)
951 				vm_fault_unwire(map, start, va);
952 			return(rv);
953 		}
954 	}
955 	return(KERN_SUCCESS);
956 }
957 
958 
959 /*
960  *	vm_fault_unwire:
961  *
962  *	Unwire a range of virtual addresses in a map.
963  */
964 void
965 vm_fault_unwire(map, start, end)
966 	vm_map_t	map;
967 	vm_offset_t	start, end;
968 {
969 
970 	register vm_offset_t	va, pa;
971 	register pmap_t		pmap;
972 
973 	pmap = vm_map_pmap(map);
974 
975 	/*
976 	 *	Since the pages are wired down, we must be able to
977 	 *	get their mappings from the physical map system.
978 	 */
979 
980 	vm_page_lock_queues();
981 
982 	for (va = start; va < end; va += PAGE_SIZE) {
983 		pa = pmap_extract(pmap, va);
984 		if (pa == (vm_offset_t) 0) {
985 			panic("unwire: page not in pmap");
986 		}
987 		pmap_change_wiring(pmap, va, FALSE);
988 		vm_page_unwire(PHYS_TO_VM_PAGE(pa));
989 	}
990 	vm_page_unlock_queues();
991 
992 	/*
993 	 *	Inform the physical mapping system that the range
994 	 *	of addresses may fault, so that page tables and
995 	 *	such may be unwired themselves.
996 	 */
997 
998 	pmap_pageable(pmap, start, end, TRUE);
999 
1000 }
1001 
1002 /*
1003  *	Routine:
1004  *		vm_fault_copy_entry
1005  *	Function:
1006  *		Copy all of the pages from a wired-down map entry to another.
1007  *
1008  *	In/out conditions:
1009  *		The source and destination maps must be locked for write.
1010  *		The source map entry must be wired down (or be a sharing map
1011  *		entry corresponding to a main map entry that is wired down).
1012  */
1013 
1014 void
1015 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry)
1016 	vm_map_t	dst_map;
1017 	vm_map_t	src_map;
1018 	vm_map_entry_t	dst_entry;
1019 	vm_map_entry_t	src_entry;
1020 {
1021 	vm_object_t	dst_object;
1022 	vm_object_t	src_object;
1023 	vm_offset_t	dst_offset;
1024 	vm_offset_t	src_offset;
1025 	vm_prot_t	prot;
1026 	vm_offset_t	vaddr;
1027 	vm_page_t	dst_m;
1028 	vm_page_t	src_m;
1029 
1030 #ifdef	lint
1031 	src_map++;
1032 #endif	/* lint */
1033 
1034 	src_object = src_entry->object.vm_object;
1035 	src_offset = src_entry->offset;
1036 
1037 	/*
1038 	 *	Create the top-level object for the destination entry.
1039 	 *	(Doesn't actually shadow anything - we copy the pages
1040 	 *	directly.)
1041 	 */
1042 	dst_object = vm_object_allocate(
1043 			(vm_size_t) (dst_entry->end - dst_entry->start));
1044 
1045 	dst_entry->object.vm_object = dst_object;
1046 	dst_entry->offset = 0;
1047 
1048 	prot  = dst_entry->max_protection;
1049 
1050 	/*
1051 	 *	Loop through all of the pages in the entry's range, copying
1052 	 *	each one from the source object (it should be there) to the
1053 	 *	destination object.
1054 	 */
1055 	for (vaddr = dst_entry->start, dst_offset = 0;
1056 	     vaddr < dst_entry->end;
1057 	     vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
1058 
1059 		/*
1060 		 *	Allocate a page in the destination object
1061 		 */
1062 		vm_object_lock(dst_object);
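		/*
		 * vm_page_alloc() may fail when free memory is short; if it
		 * does, drop the object lock, sleep in VM_WAIT for more free
		 * pages, and retry.
		 */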
1063 		do {
1064 			dst_m = vm_page_alloc(dst_object, dst_offset);
1065 			if (dst_m == NULL) {
1066 				vm_object_unlock(dst_object);
1067 				VM_WAIT;
1068 				vm_object_lock(dst_object);
1069 			}
1070 		} while (dst_m == NULL);
1071 
1072 		/*
1073 		 *	Find the page in the source object, and copy it in.
1074 		 *	(Because the source is wired down, the page will be
1075 		 *	in memory.)
1076 		 */
1077 		vm_object_lock(src_object);
1078 		src_m = vm_page_lookup(src_object, dst_offset + src_offset);
1079 		if (src_m == NULL)
1080 			panic("vm_fault_copy_wired: page missing");
1081 
1082 		vm_page_copy(src_m, dst_m);
1083 
1084 		/*
1085 		 *	Enter it in the pmap...
1086 		 */
1087 		vm_object_unlock(src_object);
1088 		vm_object_unlock(dst_object);
1089 
1090 		pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m),
1091 				prot, FALSE);
1092 
1093 		/*
1094 		 *	Mark it no longer busy, and put it on the active list.
1095 		 */
1096 		vm_object_lock(dst_object);
1097 		vm_page_lock_queues();
1098 		vm_page_activate(dst_m);
1099 		vm_page_unlock_queues();
1100 		PAGE_WAKEUP(dst_m);
1101 		vm_object_unlock(dst_object);
1102 	}
1103 }
1104 
1105 
1106 /*
1107  * Look a page up in a shadow chain.  Returns 1 with *rtm set if the page
 * is resident, 1 with *rtm left zero if a pager in the chain has it, and
 * 0 if the page is in neither memory nor any pager.
1108  */
1109 
1110 int
1111 vm_fault_page_lookup(object, offset, rtobject, rtoffset, rtm)
1112 	vm_object_t object;
1113 	vm_offset_t offset;
1114 	vm_object_t *rtobject;
1115 	vm_offset_t *rtoffset;
1116 	vm_page_t *rtm;
1117 {
1118 	vm_page_t m;
1119 	vm_object_t first_object = object;
1120 
1121 	*rtm = 0;
1122 	*rtobject = 0;
1123 	*rtoffset = 0;
1124 
1125 
1126 	while (!(m=vm_page_lookup(object, offset))) {
1127 		if (object->pager) {
1128 			if (vm_pager_has_page(object->pager, object->paging_offset+offset)) {
1129 				*rtobject = object;
1130 				*rtoffset = offset;
1131 				return 1;
1132 			}
1133 		}
1134 
1135 		if (!object->shadow)
1136 			return 0;
1137 		else {
1138 			offset += object->shadow_offset;
1139 			object = object->shadow;
1140 		}
1141 	}
1142 	*rtobject = object;
1143 	*rtoffset = offset;
1144 	*rtm = m;
1145 	return 1;
1146 }
1147 
1148 /*
1149  * This routine checks around the requested page for other pages that
1150  * might be able to be faulted in.
1151  *
1152  * Inputs:
1153  *	first_object, first_offset, m, rbehind, rahead
1154  *
1155  * Outputs:
1156  *  marray (array of vm_page_t), reqpage (index of requested page)
1157  *
1158  * Return value:
1159  *  number of pages in marray
1160  */
1161 int
1162 vm_fault_additional_pages(first_object, first_offset, m, rbehind, raheada, marray, reqpage)
1163 	vm_object_t first_object;
1164 	vm_offset_t first_offset;
1165 	vm_page_t m;
1166 	int rbehind;
1167 	int raheada;
1168 	vm_page_t *marray;
1169 	int *reqpage;
1170 {
1171 	int i;
1172 	vm_page_t tmpm;
1173 	vm_object_t object;
1174 	vm_offset_t offset, startoffset, endoffset, toffset, size;
1175 	vm_object_t rtobject;
1176 	vm_page_t rtm;
1177 	vm_offset_t rtoffset;
1178 	vm_offset_t offsetdiff;
1179 	int rahead;
1180 	int treqpage;
1181 
1182 	object = m->object;
1183 	offset = m->offset;
1184 
1185 	offsetdiff = offset - first_offset;
1186 
1187 	/*
1188 	 * if the requested page is not available, then give up now
1189 	 */
1190 
1191 	if (!vm_pager_has_page(object->pager, object->paging_offset+offset))
1192 		return 0;
1193 
1194 	/*
1195 	 * if there is no getmulti routine for this pager, then just allow
1196 	 * one page to be read.
1197 	 */
1198 /*
1199 	if (!object->pager->pg_ops->pgo_getpages) {
1200 		*reqpage = 0;
1201 		marray[0] = m;
1202 		return 1;
1203 	}
1204 */
1205 
1206 	/*
1207 	 * try to do any readahead that we might have free pages for.
1208 	 */
1209 	rahead = raheada;
1210 	if (rahead > (cnt.v_free_count - cnt.v_free_reserved)) {
1211 		rahead = cnt.v_free_count - cnt.v_free_reserved;
1212 		rbehind = 0;
1213 	}
1214 
1215 	if (cnt.v_free_count < cnt.v_free_min) {
1216 		if (rahead > VM_FAULT_READ_AHEAD_MIN)
1217 			rahead = VM_FAULT_READ_AHEAD_MIN;
1218 		rbehind = 0;
1219 	}
1220 
1221 	/*
1222 	 * if we don't have any free pages, then just read one page.
1223 	 */
1224 	if (rahead <= 0) {
1225 		*reqpage = 0;
1226 		marray[0] = m;
1227 		return 1;
1228 	}
1229 
1230 	/*
1231 	 * scan backward for the read behind pages --
1232 	 * stop at a page that is already resident, that is missing from
	 * every pager, or that belongs to a different object
1233 	 */
1234 	toffset = offset - NBPG;
1235 	if( rbehind*NBPG > offset)
1236 		rbehind = offset / NBPG;
1237 	startoffset = offset - rbehind*NBPG;
1238 	while (toffset >= startoffset) {
1239 		if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) ||
1240 		    rtm != 0 || rtobject != object) {
1241 			startoffset = toffset + NBPG;
1242 			break;
1243 		}
1244 		if( toffset == 0)
1245 			break;
1246 		toffset -= NBPG;
1247 	}
1248 
1249 	/*
1250 	 * scan forward for the read ahead pages --
1251 	 * stop at a page that is already resident, that is missing from
	 * every pager, or that belongs to a different object
1252 	 */
1253 	toffset = offset + NBPG;
1254 	endoffset = offset + (rahead+1)*NBPG;
1255 	while (toffset < object->size && toffset < endoffset) {
1256 		if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) ||
1257 		    rtm != 0 || rtobject != object) {
1258 			break;
1259 		}
1260 		toffset += NBPG;
1261 	}
1262 	endoffset = toffset;
1263 
1264 	/* calculate the number of pages in the run */
1265 	size = (endoffset - startoffset) / NBPG;
1266 
1267 	/* calculate the index of the required page within the run */
1268 	treqpage = (offset - startoffset) / NBPG;
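
	/*
	 * For example (purely illustrative): with offset == 5 * NBPG,
	 * rbehind == 3, rahead == 4, the object large enough, and every
	 * neighboring page backed by this object's pager but not resident,
	 * the backward scan leaves startoffset == 2 * NBPG, the forward
	 * scan leaves endoffset == 10 * NBPG, so size == 8 pages and
	 * treqpage == 3 (the faulting page's index within marray).
	 */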
1269 
1270 	/* see if we have space (again) */
1271 	if (cnt.v_free_count >= cnt.v_free_reserved + size) {
1272 		bzero(marray, (rahead + rbehind + 1) * sizeof(vm_page_t));
1273 		/*
1274 		 * get our pages and don't block for them
1275 		 */
1276 		for (i = 0; i < size; i++) {
1277 			if (i != treqpage)
1278 				rtm  = vm_page_alloc(object, startoffset + i * NBPG);
1279 			else
1280 				rtm = m;
1281 			marray[i] = rtm;
1282 		}
1283 
1284 		for (i = 0; i < size; i++) {
1285 			if (marray[i] == 0)
1286 				break;
1287 		}
1288 
1289 		/*
1290 		 * if we could not get our block of pages, then
1291 		 * free the readahead/readbehind pages.
1292 		 */
1293 		if (i < size) {
1294 			for (i = 0; i < size; i++) {
1295 				if (i != treqpage && marray[i])
1296 					FREE_PAGE(marray[i]);
1297 			}
1298 			*reqpage = 0;
1299 			marray[0] = m;
1300 			return 1;
1301 		}
1302 
1303 		*reqpage = treqpage;
1304 		return size;
1305 	}
1306 	*reqpage = 0;
1307 	marray[0] = m;
1308 	return 1;
1309 }
1310 
1311