xref: /freebsd/sys/vm/swap_pager.c (revision 35c10d223984037248bec0cd8aced0e73cbfcb89)
/*
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 * $Id: swap_pager.c,v 1.10 1994/10/09 01:52:04 phk Exp $
 */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>

#include <miscfs/specfs/specdev.h>
#include <sys/rlist.h>

#include <vm/vm.h>
#include <vm/vm_pager.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>

#ifndef NPENDINGIO
#define NPENDINGIO	16
#endif

int	swap_pager_input __P((sw_pager_t, vm_page_t *, int, int));
int	swap_pager_output __P((sw_pager_t, vm_page_t *, int, int, int *));

int nswiodone;
extern int vm_pageout_rate_limit;
static int cleandone;
extern int hz;
int swap_pager_full;
extern vm_map_t pager_map;
extern int vm_swap_size;
struct rlist *swaplist;
int nswaplist;

#define MAX_PAGEOUT_CLUSTER 8

TAILQ_HEAD(swpclean, swpagerclean);

typedef	struct swpagerclean	*swp_clean_t;

struct swpagerclean {
	TAILQ_ENTRY(swpagerclean)	spc_list;
	int				spc_flags;
	struct buf			*spc_bp;
	sw_pager_t			spc_swp;
	vm_offset_t			spc_kva;
	int				spc_count;
	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];


extern vm_map_t kernel_map;

/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

struct swpclean swap_pager_done;	/* list of completed page cleans */
struct swpclean swap_pager_inuse;	/* list of pending page cleans */
struct swpclean swap_pager_free;	/* list of free pager clean structs */
struct pagerlst swap_pager_list;	/* list of "named" anon regions */
struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */
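/*
 * Editor's note (a summary, not in the original source): a swpagerclean
 * ("spc") entry migrates between the three swpclean queues above.  It
 * starts on swap_pager_free, moves to swap_pager_inuse while an async
 * pageout is in flight, is shifted to swap_pager_done by
 * swap_pager_iodone() at I/O completion, and is finally recycled back
 * onto swap_pager_free by swap_pager_clean()/swap_pager_finish().
 */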

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
int swap_pager_needflags;
struct rlist *swapfrag;

struct pagerlst *swp_qs[]={
	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
};

int swap_pager_putmulti();

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpage,
	swap_pager_getmulti,
	swap_pager_putpage,
	swap_pager_putmulti,
	swap_pager_haspage
};

int npendingio = NPENDINGIO;
int pendingiowait;
int require_swap_init;
void swap_pager_finish();
int dmmin, dmmax;
extern int vm_page_count;

struct buf *getpbuf();
void relpbuf(struct buf *bp);

static inline void swapsizecheck() {
	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
		if( swap_pager_full)
			printf("swap_pager: out of space\n");
		swap_pager_full = 1;
	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
		swap_pager_full = 0;
}
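/*
 * Editor's note (an illustrative calculation, not in the original
 * source): vm_swap_size is kept in DEV_BSIZE (512-byte) blocks.
 * Assuming PAGE_SIZE is 4096, btodb(PAGE_SIZE) is 8, so the low-water
 * mark above is 128*8 = 1024 blocks (512KB of free swap) and the
 * high-water mark is 192*8 = 1536 blocks (768KB).  The gap between the
 * two thresholds provides hysteresis, so swap_pager_full does not flap
 * on every page allocated or freed near the limit.
 */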

void
swap_pager_init()
{
	dfltpagerops = &swappagerops;

	TAILQ_INIT(&swap_pager_list);
	TAILQ_INIT(&swap_pager_un_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);

	require_swap_init = 1;

	/*
	 * Calculate the swap allocation constants.
	 */

	dmmin = CLBYTES/DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;

}
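/*
 * Editor's note (an illustrative calculation, not in the original
 * source): assuming the common i386 values of the day -- CLBYTES = 4096,
 * DEV_BSIZE = 512, PAGE_SIZE = 4096 and SWB_NPAGES = 8 -- this yields
 * dmmin = 8 blocks (one page) and dmmax = btodb(32768)*2 = 128 blocks.
 * dmmax also serves below as the size of a swap "region": clustered
 * transfers are never allowed to cross a dmmax boundary.
 */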

/*
 * Allocate a pager structure and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory as that could result in deadlock.
 */
vm_pager_t
swap_pager_alloc(handle, size, prot, offset)
	caddr_t handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_offset_t offset;
{
	register vm_pager_t pager;
	register sw_pager_t swp;
	int waitok;
	int i,j;

	if (require_swap_init) {
		swp_clean_t spc;
		struct buf *bp;
		/*
		 * KVAs are allocated here so that we don't need to keep
		 * calling kmem_alloc_pageable at runtime.
		 */
		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE*MAX_PAGEOUT_CLUSTER);
			if (!spc->spc_kva) {
				break;
			}
			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
			if (!spc->spc_bp) {
				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
				break;
			}
			spc->spc_flags = 0;
			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		}
		require_swap_init = 0;
		if( size == 0)
			return(NULL);
	}

	/*
	 * If this is a "named" anonymous region, look it up and
	 * return the appropriate pager if it exists.
	 */
	if (handle) {
		pager = vm_pager_lookup(&swap_pager_list, handle);
		if (pager != NULL) {
			/*
			 * Use vm_object_lookup to gain a reference
			 * to the object and also to remove from the
			 * object cache.
			 */
			if (vm_object_lookup(pager) == NULL)
				panic("swap_pager_alloc: bad object");
			return(pager);
		}
	}

	if (swap_pager_full) {
		return(NULL);
	}

	/*
	 * Pager doesn't exist, allocate swap management resources
	 * and initialize.
	 */
	waitok = handle ? M_WAITOK : M_NOWAIT;
	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
	if (pager == NULL)
		return(NULL);
	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
	if (swp == NULL) {
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}
	size = round_page(size);
	swp->sw_osize = size;
	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
	swp->sw_blocks = (sw_blk_t)
		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
		       M_VMPGDATA, waitok);
	if (swp->sw_blocks == NULL) {
		free((caddr_t)swp, M_VMPGDATA);
		free((caddr_t)pager, M_VMPAGER);
		return(NULL);
	}

	for (i = 0; i < swp->sw_nblocks; i++) {
		swp->sw_blocks[i].swb_valid = 0;
		swp->sw_blocks[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
	}

	swp->sw_poip = 0;
	if (handle) {
		vm_object_t object;

		swp->sw_flags = SW_NAMED;
		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
		/*
		 * Consistent with other pagers: return with object
		 * referenced.  Can't do this with handle == NULL
		 * since it might be the pageout daemon calling.
		 */
		object = vm_object_allocate(size);
		vm_object_enter(object, pager);
		vm_object_setpager(object, pager, 0, FALSE);
	} else {
		swp->sw_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
	}
	pager->pg_handle = handle;
	pager->pg_ops = &swappagerops;
	pager->pg_type = PG_SWAP;
	pager->pg_data = (caddr_t)swp;

	return(pager);
}

/*
 * Returns the disk block associated with a pager and offset;
 * additionally, as a side effect, returns a flag indicating
 * whether the block has been written (is valid).
 */

static int *
swap_pager_diskaddr(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		*valid = swb->swb_valid & (1<<ix);
	return &swb->swb_block[ix];
}

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(swp, offset, valid)
	sw_pager_t swp;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
		return;

	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}
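
/*
 * Editor's note: a minimal userland sketch (not part of the original
 * source, and never compiled here) of the offset arithmetic shared by
 * swap_pager_diskaddr() and swap_pager_setvalid() above.  It assumes
 * PAGE_SIZE = 4096 and SWB_NPAGES = 8 for concreteness.
 */
#if 0
#include <stdio.h>

#define PAGE_SIZE	4096
#define SWB_NPAGES	8

int
main(void)
{
	unsigned offset = 5 * PAGE_SIZE + 123;	/* arbitrary object offset */

	/* which sw_blk_t covers this offset */
	unsigned blk = offset / (SWB_NPAGES * PAGE_SIZE);		/* -> 0 */
	/* which page slot within that block */
	unsigned pg = (offset % (SWB_NPAGES * PAGE_SIZE)) / PAGE_SIZE;	/* -> 5 */

	/* the per-slot valid bit, as tested against swb_valid */
	printf("block %u, slot %u, mask 0x%x\n", blk, pg, 1 << pg);
	return (0);
}
#endif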

/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 */
int
swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
#ifdef EXP
	unsigned tmpalloc;
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	if( amount < nblocksfrag) {
		if( rlist_alloc(&swapfrag, amount, rtval))
			return 1;
		if( !rlist_alloc(&swaplist, nblocksfrag, &tmpalloc))
			return 0;
		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
		*rtval = tmpalloc;
		return 1;
	}
#endif
	if( !rlist_alloc(&swaplist, amount, rtval))
		return 0;
	else
		return 1;
}

/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
#ifdef EXP
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	if( ((to + 1) - from) >= nblocksfrag) {
#endif
		rlist_free(&swaplist, from, to);
#ifdef EXP
		return;
	}
	rlist_free(&swapfrag, from, to);
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swaplist, tmpalloc, tmpalloc + nblocksfrag-1);
	}
#endif
}
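
/*
 * Editor's note: a hedged sketch (not in the original source) of how
 * the rlist primitives are being used above.  As called here,
 * rlist_alloc(&list, n, &start) carves n contiguous DEV_BSIZE blocks
 * out of the resource list and returns their starting block in start,
 * while rlist_free(&list, from, to) returns the inclusive block range
 * [from, to] to the list, coalescing with its neighbors.
 */
#if 0
	unsigned start;

	/* allocate one page worth of swap blocks... */
	if (swap_pager_getswapspace(btodb(PAGE_SIZE), &start)) {
		/* ...use blocks [start, start + btodb(PAGE_SIZE) - 1]... */

		/* ...and give them back when done */
		swap_pager_freeswapspace(start, start + btodb(PAGE_SIZE) - 1);
	}
#endif
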
/*
 * this routine frees swap blocks from a specified pager
 */
void
_swap_pager_freespace(swp, start, size)
	sw_pager_t swp;
	vm_offset_t start;
	vm_offset_t size;
{
	vm_offset_t i;
	int s;

	s = splbio();
	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(swp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid) {
				vm_swap_size += btodb(PAGE_SIZE);
				swap_pager_setvalid(swp, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	swapsizecheck();
	splx(s);
}

void
swap_pager_freespace(pager, start, size)
	vm_pager_t pager;
	vm_offset_t start;
	vm_offset_t size;
{
	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
}

/*
 * swap_pager_reclaim frees up over-allocated space from all pagers.
 * This eliminates internal fragmentation due to the allocation of
 * space for segments that are never swapped to.  It has been written
 * so that it does not block until the rlist_free operation occurs;
 * it keeps the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 256

void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];
	static int in_reclaim;

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}
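
/*
 * Editor's note: a small sketch (not in the original source) of the
 * "allocated but never written" predicate that swap_pager_reclaim()
 * applies to every page slot of every unlocked sw_blk_t above.
 */
#if 0
	/* slot j of block swb is reclaimable when space was assigned... */
	int allocated = (swb->swb_block[j] != SWB_EMPTY);
	/* ...but nothing was ever paged out to it */
	int written = (swb->swb_valid & (1 << j)) != 0;

	if (allocated && !written)
		/* safe to take the blocks back; no data lives there */
		reclaims[reclaimcount++] = swb->swb_block[j];
#endif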


/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */

void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}
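
/*
 * Editor's note (a summary, not in the original source): for each page
 * in the transfer loop above, the per-slot outcome is, in effect:
 *
 *	src empty                 -> nothing to do
 *	src valid, dst empty      -> move the block: hand the source's
 *	                             disk blocks to the destination
 *	src valid, dst valid      -> keep the dest copy; free the source
 *	src allocated, not valid  -> free the source (no data on disk)
 *
 * Moving a block rather than copying it avoids any disk I/O; only the
 * sw_blocks[] bookkeeping changes hands.
 */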


void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);


	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
		if (bp->swb_block[j] != SWB_EMPTY) {
			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
			if( bp->swb_valid & (1<<j))
				vm_swap_size += btodb(PAGE_SIZE);
			bp->swb_block[j] = SWB_EMPTY;
		}
	}
	splx(s);
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}

/*
 * swap_pager_getmulti can get multiple pages.
 */
int
swap_pager_getmulti(pager, m, count, reqpage, sync)
	vm_pager_t pager;
	vm_page_t *m;
	int count;
	int reqpage;
	boolean_t sync;
{
	if( reqpage >= count)
		panic("swap_pager_getmulti: reqpage >= count\n");
	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
}

/*
 * swap_pager_getpage gets individual pages
 */
int
swap_pager_getpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	vm_page_t marray[1];

	marray[0] = m;
	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
}

int
swap_pager_putmulti(pager, m, c, sync, rtvals)
	vm_pager_t pager;
	vm_page_t *m;
	int c;
	boolean_t sync;
	int *rtvals;
{
	int flags;

	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
}

/*
 * swap_pager_putpage writes individual pages
 */
int
swap_pager_putpage(pager, m, sync)
	vm_pager_t pager;
	vm_page_t m;
	boolean_t sync;
{
	int flags;
	vm_page_t marray[1];
	int rtvals[1];


	if (pager == NULL) {
		(void) swap_pager_clean();
		return VM_PAGER_OK;
	}

	marray[0] = m;
	flags = B_WRITE;
	if (!sync)
		flags |= B_ASYNC;

	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);

	return rtvals[0];
}

static inline int
const swap_pager_block_index(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return (offset / (SWB_NPAGES*PAGE_SIZE));
}

static inline int
const swap_pager_block_offset(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
}

/*
 * _swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
static boolean_t
_swap_pager_haspage(swp, offset)
	sw_pager_t swp;
	vm_offset_t offset;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / (SWB_NPAGES*PAGE_SIZE);
	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
		return(FALSE);
	}
	swb = &swp->sw_blocks[ix];
	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
	if (swb->swb_block[ix] != SWB_EMPTY) {
		if (swb->swb_valid & (1 << ix))
			return TRUE;
	}

	return(FALSE);
}

/*
 * swap_pager_haspage is the externally accessible version of
 * _swap_pager_haspage above.  this routine takes a vm_pager_t
 * as an argument instead of sw_pager_t.
 */
boolean_t
swap_pager_haspage(pager, offset)
	vm_pager_t pager;
	vm_offset_t offset;
{
	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
}

/*
 * swap_pager_freepage is a convenience routine that clears the busy
 * bit and deallocates a page.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;
	for (i = 0; i < count; i++)
		if (i != reqpage)
			swap_pager_freepage(m[i]);
}

int swapwritecount=0;

/*
 * swap_pager_iodone1 is the completion routine for both reads and
 * synchronous writes
 */
void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup((caddr_t)bp);
/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/
}


int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that our required input request exists */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}


	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}

	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	swp->sw_piip++;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).
					 * After some empirical results, it is best
					 * to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page, we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			if( swap_pager_full) {
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}
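
/*
 * Editor's note: a hedged sketch (not in the original source) of the
 * cluster test used by the backward/forward scans in swap_pager_input()
 * above.  Page i can join the cluster around reqpage only if all of
 * the following hold.
 */
#if 0
	int clusterable =
	    /* the slot has swap space assigned */
	    (reqaddr[i] != SWB_EMPTY) &&
	    /* the slot holds valid (previously written) data */
	    (swb[i]->swb_valid & (1 << off[i])) != 0 &&
	    /* its disk blocks line up exactly with the faulted page's */
	    (reqaddr[i] == reqaddr[reqpage] +
		(i - reqpage) * btodb(PAGE_SIZE)) &&
	    /* and it does not cross a dmmax swap-region boundary */
	    (reqaddr[i] / dmmax == reqaddr[reqpage] / dmmax);
#endif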

int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}


			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		return VM_PAGER_AGAIN;
	}

	count = i;
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}

	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtle -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
/*
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}
*/

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (swap_pager_free.tqh_first == NULL) {
		s = splbio();
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);

	kva = spc->spc_kva;

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * get the base I/O offset into the swap file
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if( bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization: if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}
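
/*
 * Editor's note (a summary, not in the original source): for an async
 * pageout, swap_pager_output() returns VM_PAGER_PEND for every page.
 * The pages stay busy until swap_pager_iodone() moves the spc to the
 * done queue and a later swap_pager_clean()/swap_pager_finish() pass
 * marks them clean (or re-dirties them on error) and wakes any waiters.
 */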

boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Lookup and removal from the done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = swap_pager_done.tqh_first) != 0) {
			pmap_qremove( spc->spc_kva, spc->spc_count);
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, that's all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * The desired page was found to be busy earlier in
		 * the scan but has since completed.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}

void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	vm_object_t object = spc->spc_m[0]->object;
	int i;

	if ((object->paging_in_progress -= spc->spc_count) == 0)
		thread_wakeup((int) object);

	/*
	 * If no error, mark as clean and inform the pmap system.
	 * If error, mark as dirty so we will try again.
	 * (XXX could get stuck doing this, should give up after a while)
	 */
	if (spc->spc_flags & SPC_ERROR) {
		for(i=0;i<spc->spc_count;i++) {
			printf("swap_pager_finish: clean of page %lx failed\n",
			       (u_long)VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_LAUNDRY;
		}
	} else {
		for(i=0;i<spc->spc_count;i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->flags |= PG_CLEAN;
		}
	}


	for(i=0;i<spc->spc_count;i++) {
		/*
		 * we wake up any processes that are waiting on
		 * these pages.
		 */
		PAGE_WAKEUP(spc->spc_m[i]);
	}
	nswiodone -= spc->spc_count;

	return;
}

/*
 * swap_pager_iodone
 */
void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	int s;

	s = splbio();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("error %d blkno %lu sz %ld ",
			bp->b_error, (u_long)bp->b_blkno, bp->b_bcount);
	}

/*
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
*/

	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
	if (bp->b_vp) {
		brelvp(bp);
	}
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	if (--spc->spc_swp->sw_poip == 0) {
		wakeup((caddr_t)spc->spc_swp);
	}

	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
	    swap_pager_inuse.tqh_first == 0) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
		wakeup((caddr_t)&vm_pages_needed);
	}

	if (vm_pageout_pages_needed) {
		wakeup((caddr_t)&vm_pageout_pages_needed);
	}

	if ((swap_pager_inuse.tqh_first == NULL) ||
	    (cnt.v_free_count < cnt.v_free_min &&
	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
		wakeup((caddr_t)&vm_pages_needed);
	}
	splx(s);
}

/*
 * return true if any swap control structures can be allocated
 */
int
swap_pager_ready() {
	if( swap_pager_free.tqh_first)
		return 1;
	else
		return 0;
}