xref: /freebsd/sys/vm/swap_pager.c (revision 2e1e24dd280b4fbdbac30c8dae0b4604ba3f00fd)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * Copyright (c) 1990 University of Utah.
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * the Systems Programming Group of the University of Utah Computer
9  * Science Department.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
40  *
41  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
42  * $Id: swap_pager.c,v 1.11 1994/10/09 07:35:16 davidg Exp $
43  */
44 
45 /*
46  * Quick hack to page to dedicated partition(s).
47  * TODO:
48  *	Add multiprocessor locks
49  *	Deal with async writes in a better fashion
50  */
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/proc.h>
55 #include <sys/buf.h>
56 #include <sys/vnode.h>
57 #include <sys/malloc.h>
58 
59 #include <miscfs/specfs/specdev.h>
60 #include <sys/rlist.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_pager.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_pageout.h>
66 #include <vm/swap_pager.h>
67 
68 #ifndef NPENDINGIO
69 #define NPENDINGIO	16
70 #endif
71 
72 int	swap_pager_input __P((sw_pager_t, vm_page_t *, int, int));
73 int	swap_pager_output __P((sw_pager_t, vm_page_t *, int, int, int *));
74 
75 int nswiodone;
76 extern int vm_pageout_rate_limit;
77 static int cleandone;
78 extern int hz;
79 int swap_pager_full;
80 extern vm_map_t pager_map;
81 extern int vm_swap_size;
82 struct rlist *swaplist;
83 int nswaplist;
84 
85 #define MAX_PAGEOUT_CLUSTER 8
86 
87 TAILQ_HEAD(swpclean, swpagerclean);
88 
89 typedef	struct swpagerclean	*swp_clean_t;
90 
91 struct swpagerclean {
92 	TAILQ_ENTRY(swpagerclean)	spc_list;
93 	int				spc_flags;
94 	struct buf			*spc_bp;
95 	sw_pager_t			spc_swp;
96 	vm_offset_t			spc_kva;
97 	int				spc_count;
98 	vm_page_t			spc_m[MAX_PAGEOUT_CLUSTER];
99 } swcleanlist [NPENDINGIO] ;
100 
101 
102 extern vm_map_t kernel_map;
103 
104 /* spc_flags values */
105 #define SPC_ERROR	0x01
106 
107 #define SWB_EMPTY (-1)
108 
109 struct swpclean swap_pager_done;	/* list of completed page cleans */
110 struct swpclean swap_pager_inuse;	/* list of pending page cleans */
111 struct swpclean swap_pager_free;	/* list of free pager clean structs */
112 struct pagerlst swap_pager_list;	/* list of "named" anon regions */
113 struct pagerlst swap_pager_un_list;	/* list of "unnamed" anon pagers */
114 
115 #define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
116 int swap_pager_needflags;
117 struct rlist *swapfrag;
118 
119 struct pagerlst *swp_qs[]={
120 	&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0
121 };
122 
123 int swap_pager_putmulti();
124 
125 struct pagerops swappagerops = {
126 	swap_pager_init,
127 	swap_pager_alloc,
128 	swap_pager_dealloc,
129 	swap_pager_getpage,
130 	swap_pager_getmulti,
131 	swap_pager_putpage,
132 	swap_pager_putmulti,
133 	swap_pager_haspage
134 };
135 
136 int npendingio = NPENDINGIO;
137 int pendingiowait;
138 int require_swap_init;
139 void swap_pager_finish();
140 int dmmin, dmmax;
141 extern int vm_page_count;
142 
143 struct buf * getpbuf() ;
144 void relpbuf(struct buf *bp) ;
145 
146 static inline void swapsizecheck() {
147 	if( vm_swap_size < 128*btodb(PAGE_SIZE)) {
148 		if( swap_pager_full)
149 			printf("swap_pager: out of space\n");
150 		swap_pager_full = 1;
151 	} else if( vm_swap_size > 192*btodb(PAGE_SIZE))
152 		swap_pager_full = 0;
153 }
154 
/*
 * One-time initialization of the swap pager: install it as the default
 * pager, initialize the pager queues and the page-cleaning lists, and
 * compute the swap allocation constants dmmin/dmmax.
 */
void
swap_pager_init()
{
	/* the swap pager is the default pager for anonymous memory */
	dfltpagerops = &swappagerops;

	TAILQ_INIT(&swap_pager_list);
	TAILQ_INIT(&swap_pager_un_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);

	/*
	 * Defer allocation of the per-slot KVAs and buf headers for
	 * swcleanlist to the first swap_pager_alloc() call.
	 */
	require_swap_init = 1;

	/*
	 * Calculate the swap allocation constants.
	 */

	dmmin = CLBYTES/DEV_BSIZE;	/* minimum transfer, in DEV_BSIZE blocks */
	dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2;	/* max contiguous region, in blocks */

}
180 
181 /*
182  * Allocate a pager structure and associated resources.
183  * Note that if we are called from the pageout daemon (handle == NULL)
184  * we should not wait for memory as it could result in deadlock.
185  */
186 vm_pager_t
187 swap_pager_alloc(handle, size, prot, offset)
188 	caddr_t handle;
189 	register vm_size_t size;
190 	vm_prot_t prot;
191 	vm_offset_t offset;
192 {
193 	register vm_pager_t pager;
194 	register sw_pager_t swp;
195 	int waitok;
196 	int i,j;
197 
198 	if (require_swap_init) {
199 		swp_clean_t spc;
200 		struct buf *bp;
201 		/*
202 		 * kva's are allocated here so that we dont need to keep
203 		 * doing kmem_alloc pageables at runtime
204 		 */
205 		for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) {
206 			spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE*MAX_PAGEOUT_CLUSTER);
207 			if (!spc->spc_kva) {
208 				break;
209 			}
210 			spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT);
211 			if (!spc->spc_bp) {
212 				kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE);
213 				break;
214 			}
215 			spc->spc_flags = 0;
216 			TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
217 		}
218 		require_swap_init = 0;
219 		if( size == 0)
220 			return(NULL);
221 	}
222 
223 	/*
224 	 * If this is a "named" anonymous region, look it up and
225 	 * return the appropriate pager if it exists.
226 	 */
227 	if (handle) {
228 		pager = vm_pager_lookup(&swap_pager_list, handle);
229 		if (pager != NULL) {
230 			/*
231 			 * Use vm_object_lookup to gain a reference
232 			 * to the object and also to remove from the
233 			 * object cache.
234 			 */
235 			if (vm_object_lookup(pager) == NULL)
236 				panic("swap_pager_alloc: bad object");
237 			return(pager);
238 		}
239 	}
240 
241 	if (swap_pager_full) {
242 		return(NULL);
243 	}
244 
245 	/*
246 	 * Pager doesn't exist, allocate swap management resources
247 	 * and initialize.
248 	 */
249 	waitok = handle ? M_WAITOK : M_NOWAIT;
250 	pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok);
251 	if (pager == NULL)
252 		return(NULL);
253 	swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok);
254 	if (swp == NULL) {
255 		free((caddr_t)pager, M_VMPAGER);
256 		return(NULL);
257 	}
258 	size = round_page(size);
259 	swp->sw_osize = size;
260 	swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE);
261 	swp->sw_blocks = (sw_blk_t)
262 		malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks),
263 		       M_VMPGDATA, waitok);
264 	if (swp->sw_blocks == NULL) {
265 		free((caddr_t)swp, M_VMPGDATA);
266 		free((caddr_t)pager, M_VMPAGER);
267 		return(NULL);
268 	}
269 
270 	for (i = 0; i < swp->sw_nblocks; i++) {
271 		swp->sw_blocks[i].swb_valid = 0;
272 		swp->sw_blocks[i].swb_locked = 0;
273 		for (j = 0; j < SWB_NPAGES; j++)
274 			swp->sw_blocks[i].swb_block[j] = SWB_EMPTY;
275 	}
276 
277 	swp->sw_poip = 0;
278 	if (handle) {
279 		vm_object_t object;
280 
281 		swp->sw_flags = SW_NAMED;
282 		TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list);
283 		/*
284 		 * Consistant with other pagers: return with object
285 		 * referenced.  Can't do this with handle == NULL
286 		 * since it might be the pageout daemon calling.
287 		 */
288 		object = vm_object_allocate(size);
289 		vm_object_enter(object, pager);
290 		vm_object_setpager(object, pager, 0, FALSE);
291 	} else {
292 		swp->sw_flags = 0;
293 		TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list);
294 	}
295 	pager->pg_handle = handle;
296 	pager->pg_ops = &swappagerops;
297 	pager->pg_type = PG_SWAP;
298 	pager->pg_data = (caddr_t)swp;
299 
300 	return(pager);
301 }
302 
303 /*
304  * returns disk block associated with pager and offset
305  * additionally, as a side effect returns a flag indicating
306  * if the block has been written
307  */
308 
309 static int *
310 swap_pager_diskaddr(swp, offset, valid)
311 	sw_pager_t swp;
312 	vm_offset_t offset;
313 	int *valid;
314 {
315 	register sw_blk_t swb;
316 	int ix;
317 
318 	if (valid)
319 		*valid = 0;
320 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
321 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
322 		return(FALSE);
323 	}
324 	swb = &swp->sw_blocks[ix];
325 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
326 	if (valid)
327 		*valid = swb->swb_valid & (1<<ix);
328 	return &swb->swb_block[ix];
329 }
330 
331 /*
332  * Utility routine to set the valid (written) bit for
333  * a block associated with a pager and offset
334  */
335 static void
336 swap_pager_setvalid(swp, offset, valid)
337 	sw_pager_t swp;
338 	vm_offset_t offset;
339 	int valid;
340 {
341 	register sw_blk_t swb;
342 	int ix;
343 
344 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
345 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks)
346 		return;
347 
348 	swb = &swp->sw_blocks[ix];
349 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
350 	if (valid)
351 		swb->swb_valid |= (1 << ix);
352 	else
353 		swb->swb_valid &= ~(1 << ix);
354 	return;
355 }
356 
357 /*
358  * this routine allocates swap space with a fragmentation
359  * minimization policy.
360  */
/*
 * Allocate "amount" contiguous DEV_BSIZE blocks of swap, returning the
 * starting block through *rtval.  Returns 1 on success, 0 on failure.
 *
 * The EXP variant (normally compiled out) minimizes fragmentation:
 * sub-cluster requests are served from a separate fragment list, which
 * is replenished one full cluster at a time from the main swaplist.
 */
int
swap_pager_getswapspace( unsigned amount, unsigned *rtval) {
#ifdef EXP
	unsigned tmpalloc;
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	if( amount < nblocksfrag) {
		if( rlist_alloc(&swapfrag, amount, rtval))
			return 1;
		if( !rlist_alloc(&swaplist, nblocksfrag, &tmpalloc))
			return 0;
		/* carve the request off the front; give the tail to the fragment list */
		rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1);
		*rtval = tmpalloc;
		return 1;
	}
#endif
	if( !rlist_alloc(&swaplist, amount, rtval))
		return 0;
	else
		return 1;
}
381 
382 /*
383  * this routine frees swap space with a fragmentation
384  * minimization policy.
385  */
/*
 * Free the swap blocks [from, to] (inclusive, DEV_BSIZE units).
 *
 * NOTE: the brace structure deliberately straddles the #ifdef EXP
 * regions.  With EXP defined, full clusters go back to the main
 * swaplist while partial clusters go to the fragment list, which is
 * then coalesced back into whole clusters where possible.  Without
 * EXP, everything goes straight to the main swaplist.
 */
void
swap_pager_freeswapspace( unsigned from, unsigned to) {
#ifdef EXP
	unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE);
	unsigned tmpalloc;
	if( ((to + 1) - from) >= nblocksfrag) {
#endif
		rlist_free(&swaplist, from, to);
#ifdef EXP
		return;
	}
	rlist_free(&swapfrag, from, to);
	while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) {
		rlist_free(&swaplist, tmpalloc, tmpalloc + nblocksfrag-1);
	}
#endif
}
403 /*
404  * this routine frees swap blocks from a specified pager
405  */
/*
 * Free the swap blocks backing the byte range [start, start+size) of
 * pager "swp".  Each assigned block in the range is returned to the
 * free swap pool; blocks that held valid (written) data additionally
 * get the swap-size accounting and valid bit updated.  Runs at splbio
 * to keep the block map stable against I/O completion.
 */
void
_swap_pager_freespace(swp, start, size)
	sw_pager_t swp;
	vm_offset_t start;
	vm_offset_t size;
{
	vm_offset_t i;
	int s;

	s = splbio();
	for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(swp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid) {
				/* only valid blocks are charged against vm_swap_size */
				vm_swap_size += btodb(PAGE_SIZE);
				swap_pager_setvalid(swp, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	/* re-evaluate the swap_pager_full hysteresis */
	swapsizecheck();
	splx(s);
}
431 
432 void
433 swap_pager_freespace(pager, start, size)
434 	vm_pager_t pager;
435 	vm_offset_t start;
436 	vm_offset_t size;
437 {
438 	_swap_pager_freespace((sw_pager_t) pager->pg_data, start, size);
439 }
440 
441 /*
442  * swap_pager_reclaim frees up over-allocated space from all pagers
443  * this eliminates internal fragmentation due to allocation of space
444  * for segments that are never swapped to. It has been written so that
445  * it does not block until the rlist_free operation occurs; it keeps
446  * the queues consistent.
447  */
448 
449 /*
450  * Maximum number of blocks (pages) to reclaim per pass
451  */
452 #define MAXRECLAIM 256
453 
/*
 * Reclaim swap blocks that were allocated but never written, scanning
 * every pager on both pager queues, up to MAXRECLAIM pages per pass.
 * Only one process runs a pass at a time; latecomers sleep until the
 * pass ends and then simply return.
 */
void
swap_pager_reclaim()
{
	vm_pager_t p;
	sw_pager_t swp;
	int i, j, k;
	int s;
	int reclaimcount;
	static int reclaims[MAXRECLAIM];	/* blocks gathered this pass */
	static int in_reclaim;			/* single-threading flag */

/*
 * allow only one process to be in the swap_pager_reclaim subroutine
 */
	s = splbio();
	if (in_reclaim) {
		tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		p = swp_qs[k]->tqh_first;
		while (p && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager has been
			 * allocated but not used (written)
			 */
			swp = (sw_pager_t) p->pg_data;
			for (i = 0; i < swp->sw_nblocks; i++) {
				sw_blk_t swb = &swp->sw_blocks[i];
				/* skip blocks with I/O in progress */
				if( swb->swb_locked)
					continue;
				for (j = 0; j < SWB_NPAGES; j++) {
					if (swb->swb_block[j] != SWB_EMPTY &&
						(swb->swb_valid & (1 << j)) == 0) {
						reclaims[reclaimcount++] = swb->swb_block[j];
						swb->swb_block[j] = SWB_EMPTY;
						if (reclaimcount >= MAXRECLAIM)
							goto rfinished;
					}
				}
			}
			p = p->pg_list.tqe_next;
		}
	}

rfinished:

/*
 * free the blocks that have been added to the reclaim list
 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1);
		swapsizecheck();
		/*
		 * NOTE(review): waiters woken here still see in_reclaim set
		 * and just return after their tsleep -- presumably intended
		 * as an early wakeup; confirm it is not a stray.
		 */
		wakeup((caddr_t) &in_reclaim);
	}

	splx(s);
	in_reclaim = 0;
	wakeup((caddr_t) &in_reclaim);
}
521 
522 
523 /*
524  * swap_pager_copy copies blocks from one pager to another and
525  * destroys the source pager
526  */
527 
/*
 * Move the swap-block assignments backing srcpager into dstpager and
 * destroy the source pager.  Source byte (offset + srcoffset + i)
 * corresponds to destination byte (dstoffset + i).  Source blocks that
 * cannot be transferred (destination slot already valid, or outside
 * the destination's range) are freed back to the swap pool.
 */
void
swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset)
	vm_pager_t srcpager;
	vm_offset_t srcoffset;
	vm_pager_t dstpager;
	vm_offset_t dstoffset;
	vm_offset_t offset;
{
	sw_pager_t srcswp, dstswp;
	vm_offset_t i;
	int s;

	srcswp = (sw_pager_t) srcpager->pg_data;
	dstswp = (sw_pager_t) dstpager->pg_data;

/*
 * remove the source pager from the swap_pager internal queue
 */
	s = splbio();
	if (srcswp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list);
		srcswp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list);
	}

	/* wait for the source's pageouts-in-progress to drain */
	while (srcswp->sw_poip) {
		tsleep((caddr_t)srcswp, PVM, "spgout", 0);
	}
	splx(s);

/*
 * clean all of the pages that are currently active and finished
 */
	(void) swap_pager_clean();

	s = splbio();
/*
 * clear source block before destination object
 * (release allocated space)
 */
	for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) {
		int valid;
		int *addr = swap_pager_diskaddr(srcswp, i, &valid);
		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			swapsizecheck();
			*addr = SWB_EMPTY;
		}
	}
/*
 * transfer source to destination
 */
	for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) {
		int srcvalid, dstvalid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset,
			&srcvalid);
		int *dstaddrp;
	/*
	 * see if the source has space allocated
	 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
		/*
		 * if the source is valid and the dest has no space, then
		 * copy the allocation from the source to the dest.
		 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid);
				/*
				 * if the dest already has a valid block, deallocate the
				 * source block without copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					/* move the block: no net change in used swap */
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					swap_pager_setvalid(dstswp, i + dstoffset, 1);
					vm_swap_size -= btodb(PAGE_SIZE);
				}
			}
		/*
		 * if the source is not empty at this point, then deallocate the space.
		 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
				if( srcvalid)
					vm_swap_size += btodb(PAGE_SIZE);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}

/*
 * deallocate the rest of the source object
 */
	for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) {
		int valid;
		int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid);
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1);
			if( valid)
				vm_swap_size += btodb(PAGE_SIZE);
			*srcaddrp = SWB_EMPTY;
		}
	}

	swapsizecheck();
	splx(s);

	/* release the source pager's management structures */
	free((caddr_t)srcswp->sw_blocks, M_VMPGDATA);
	srcswp->sw_blocks = 0;
	free((caddr_t)srcswp, M_VMPGDATA);
	srcpager->pg_data = 0;
	free((caddr_t)srcpager, M_VMPAGER);

	return;
}
650 
651 
/*
 * Tear down a swap pager: unlink it from the pager queues, wait for
 * in-flight pageouts to complete, return all swap blocks it still
 * holds to the free pool, and free its management structures.
 */
void
swap_pager_dealloc(pager)
	vm_pager_t pager;
{
	register int i,j;
	register sw_blk_t bp;
	register sw_pager_t swp;
	int s;

	/*
	 * Remove from list right away so lookups will fail if we
	 * block for pageout completion.
	 */
	s = splbio();
	swp = (sw_pager_t) pager->pg_data;
	if (swp->sw_flags & SW_NAMED) {
		TAILQ_REMOVE(&swap_pager_list, pager, pg_list);
		swp->sw_flags &= ~SW_NAMED;
	} else {
		TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list);
	}
	/*
	 * Wait for all pageouts to finish and remove
	 * all entries from cleaning list.
	 */

	while (swp->sw_poip) {
		tsleep((caddr_t)swp, PVM, "swpout", 0);
	}
	splx(s);


	(void) swap_pager_clean();

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) {
		for (j = 0; j < SWB_NPAGES; j++)
		if (bp->swb_block[j] != SWB_EMPTY) {
			swap_pager_freeswapspace((unsigned)bp->swb_block[j],
				(unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1);
			/* only valid blocks were charged against vm_swap_size */
			if( bp->swb_valid & (1<<j))
				vm_swap_size += btodb(PAGE_SIZE);
			bp->swb_block[j] = SWB_EMPTY;
		}
	}
	splx(s);
	/*
	 * NOTE(review): swapsizecheck() runs after splx here, unlike the
	 * other free paths which call it at splbio -- presumably benign;
	 * confirm.
	 */
	swapsizecheck();

	/*
	 * Free swap management resources
	 */
	free((caddr_t)swp->sw_blocks, M_VMPGDATA);
	swp->sw_blocks = 0;
	free((caddr_t)swp, M_VMPGDATA);
	pager->pg_data = 0;
	free((caddr_t)pager, M_VMPAGER);
}
712 
713 /*
714  * swap_pager_getmulti can get multiple pages.
715  */
716 int
717 swap_pager_getmulti(pager, m, count, reqpage, sync)
718 	vm_pager_t pager;
719 	vm_page_t *m;
720 	int count;
721 	int reqpage;
722 	boolean_t sync;
723 {
724 	if( reqpage >= count)
725 		panic("swap_pager_getmulti: reqpage >= count\n");
726 	return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage);
727 }
728 
729 /*
730  * swap_pager_getpage gets individual pages
731  */
732 int
733 swap_pager_getpage(pager, m, sync)
734 	vm_pager_t pager;
735 	vm_page_t m;
736 	boolean_t sync;
737 {
738 	vm_page_t marray[1];
739 
740 	marray[0] = m;
741 	return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0);
742 }
743 
744 int
745 swap_pager_putmulti(pager, m, c, sync, rtvals)
746 	vm_pager_t pager;
747 	vm_page_t *m;
748 	int c;
749 	boolean_t sync;
750 	int *rtvals;
751 {
752 	int flags;
753 
754 	if (pager == NULL) {
755 		(void) swap_pager_clean();
756 		return VM_PAGER_OK;
757 	}
758 
759 	flags = B_WRITE;
760 	if (!sync)
761 		flags |= B_ASYNC;
762 
763 	return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals);
764 }
765 
766 /*
767  * swap_pager_putpage writes individual pages
768  */
769 int
770 swap_pager_putpage(pager, m, sync)
771 	vm_pager_t pager;
772 	vm_page_t m;
773 	boolean_t sync;
774 {
775 	int flags;
776 	vm_page_t marray[1];
777 	int rtvals[1];
778 
779 
780 	if (pager == NULL) {
781 		(void) swap_pager_clean();
782 		return VM_PAGER_OK;
783 	}
784 
785 	marray[0] = m;
786 	flags = B_WRITE;
787 	if (!sync)
788 		flags |= B_ASYNC;
789 
790 	swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals);
791 
792 	return rtvals[0];
793 }
794 
795 static inline int
796 const swap_pager_block_index(swp, offset)
797 	sw_pager_t swp;
798 	vm_offset_t offset;
799 {
800 	return (offset / (SWB_NPAGES*PAGE_SIZE));
801 }
802 
803 static inline int
804 const swap_pager_block_offset(swp, offset)
805 	sw_pager_t swp;
806 	vm_offset_t offset;
807 {
808 	return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE);
809 }
810 
811 /*
812  * _swap_pager_haspage returns TRUE if the pager has data that has
813  * been written out.
814  */
815 static boolean_t
816 _swap_pager_haspage(swp, offset)
817 	sw_pager_t swp;
818 	vm_offset_t offset;
819 {
820 	register sw_blk_t swb;
821 	int ix;
822 
823 	ix = offset / (SWB_NPAGES*PAGE_SIZE);
824 	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
825 		return(FALSE);
826 	}
827 	swb = &swp->sw_blocks[ix];
828 	ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE;
829 	if (swb->swb_block[ix] != SWB_EMPTY) {
830 		if (swb->swb_valid & (1 << ix))
831 			return TRUE;
832 	}
833 
834 	return(FALSE);
835 }
836 
837 /*
838  * swap_pager_haspage is the externally accessible version of
839  * _swap_pager_haspage above.  this routine takes a vm_pager_t
840  * for an argument instead of sw_pager_t.
841  */
842 boolean_t
843 swap_pager_haspage(pager, offset)
844 	vm_pager_t pager;
845 	vm_offset_t offset;
846 {
847 	return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset);
848 }
849 
850 /*
851  * swap_pager_freepage is a convenience routine that clears the busy
852  * bit and deallocates a page.
853  */
/*
 * Convenience routine: wake anyone waiting on the page (clearing its
 * busy state via PAGE_WAKEUP) and then free it.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}
861 
862 /*
863  * swap_pager_ridpages is a convenience routine that deallocates all
864  * but the required page.  this is usually used in error returns that
865  * need to invalidate the "extra" readahead pages.
866  */
867 static void
868 swap_pager_ridpages(m, count, reqpage)
869 	vm_page_t *m;
870 	int count;
871 	int reqpage;
872 {
873 	int i;
874 	for (i = 0; i < count; i++)
875 		if (i != reqpage)
876 			swap_pager_freepage(m[i]);
877 }
878 
879 int swapwritecount=0;
880 
881 /*
882  * swap_pager_iodone1 is the completion routine for both reads and async writes
883  */
884 void
885 swap_pager_iodone1(bp)
886 	struct buf *bp;
887 {
888 	bp->b_flags |= B_DONE;
889 	bp->b_flags &= ~B_ASYNC;
890 	wakeup((caddr_t)bp);
891 /*
892 	if ((bp->b_flags & B_READ) == 0)
893 		vwakeup(bp);
894 */
895 }
896 
897 
/*
 * Read "count" pages described by m[] in from swap; m[reqpage] is the
 * page the fault actually needs, the others are readahead/readbehind
 * candidates.  Pages whose swap blocks are not disk-contiguous with
 * the required page (within one dmmax region) are freed and dropped
 * from the request.  Returns VM_PAGER_OK or VM_PAGER_FAIL.
 */
int
swap_pager_input(swp, m, count, reqpage)
	register sw_pager_t swp;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];		/* per-page block-map entries (VLA) */
	register int s;
	int i;
	boolean_t rv;			/* NOTE(review): holds VM_PAGER_* codes despite boolean_t */
	vm_offset_t kva, off[count];	/* off[] = page slot within each sw_blk_t */
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];		/* disk block for each page */

	int first, last;
	int failed;
	int reqdskregion;		/* dmmax region of the required page */

	object = m[reqpage]->object;
	paging_offset = object->paging_offset;
	/*
	 * First determine if the page exists in the pager if this is
	 * a sync read.  This quickly handles cases where we are
	 * following shadow chains looking for the top level object
	 * with the page.
	 */
	if (swp->sw_blocks == NULL) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}

	/* resolve every page to its block-map entry and disk address */
	for(i = 0; i < count; i++) {
		vm_offset_t foff = m[i]->offset + paging_offset;
		int ix = swap_pager_block_index(swp, foff);
		if (ix >= swp->sw_nblocks) {
			int j;
			/* required page out of range: fail the whole request */
			if( i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return(VM_PAGER_FAIL);
			}
			/* trailing readahead out of range: just truncate */
			for(j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}

		swb[i] = &swp->sw_blocks[ix];
		off[i] = swap_pager_block_offset(swp, foff);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that our required input request is existent */

	if (reqaddr[reqpage] == SWB_EMPTY ||
		(swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return(VM_PAGER_FAIL);
	}


	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		/* drop pages that are empty, unwritten, non-contiguous on
		 * disk, or in a different dmmax region */
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (first == 0)
					first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if ( failed || (reqaddr[i] == SWB_EMPTY) ||
			(swb[i]->swb_valid & (1 << off[i])) == 0 ||
			(reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
			((reqaddr[i] / dmmax) != reqdskregion)) {
				failed = 1;
				swap_pager_freepage(m[i]);
				if (last == count)
					last = i;
		}
	}

	/* compact the surviving run down to m[0..count-1] */
	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i-first] = m[i];
			reqaddr[i-first] = reqaddr[i];
			off[i-first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}

	/* keep the required page's block entry from being reclaimed */
	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 * "m" is a pointer to the array of vm_page_t for paging I/O
	 * "count" is the number of vm_page_t entries represented by "m"
	 * "object" is the vm_object_t for I/O
	 * "reqpage" is the index into "m" for the page actually faulted
	 */

	spc = NULL;	/* we might not use an spc data structure */

	if (count == 1) {
		/*
		 * if a kva has not been allocated, we can only do a one page transfer,
		 * so we free the other pages that might have been allocated by
		 * vm_fault.
		 */
		swap_pager_ridpages(m, count, reqpage);
		m[0] = m[reqpage];
		reqaddr[0] = reqaddr[reqpage];

		count = 1;
		reqpage = 0;
	/*
	 * get a swap pager clean data structure, block until we get it
	 */
		if (swap_pager_free.tqh_first == NULL) {
			s = splbio();
			/* the pageout daemon must clean for itself; others
			 * poke it and wait */
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
			while (swap_pager_free.tqh_first == NULL) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep((caddr_t)&swap_pager_free,
					PVM, "swpfre", 0);
				if( curproc == pageproc)
					(void) swap_pager_clean();
				else
					wakeup((caddr_t) &vm_pages_needed);
			}
			splx(s);
		}
		spc = swap_pager_free.tqh_first;
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
	} else {
	/*
	 * Get a swap buffer header to perform the IO
	 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter( kva, m, count);

	s = splbio();
	bp->b_flags = B_BUSY | B_READ | B_CALL;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;

	bgetvp( swapdev_vp, bp);

	/* account a pagein-in-progress on this pager */
	swp->sw_piip++;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swread", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_piip;
	if (swp->sw_piip == 0)
		wakeup((caddr_t) swp);

	/*
	 * relpbuf does this, but we maintain our own buffer
	 * list also...
	 */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);
	--swb[reqpage]->swb_locked;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	if (spc) {
		/*
		 * if we have used an spc, we need to free it.
		 *
		 * NOTE(review): this path skips the PG_CLEAN/deactivate
		 * bookkeeping done in the else branch below -- presumably
		 * the single faulted page is finished by the caller;
		 * confirm.
		 */
		if( bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if( bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if( rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page activated
					 * is up in the air, but we should put the page
					 * on a page queue somewhere. (it already is in
					 * the object).
					 * After some empirical results, it is best
					 * to deactivate the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for this
					 * page we now tell them that it is ok to use
					 */
					m[i]->flags &= ~PG_FAKE;
					PAGE_WAKEUP(m[i]);
				}
			}
			/*
			 * If we're out of swap space, then attempt to free
			 * some whenever pages are brought in. We must clear
			 * the clean flag so that the page contents will be
			 * preserved.
			 */
			if (swap_pager_full) {
				for (i = 0; i < count; i++) {
					m[i]->flags &= ~PG_CLEAN;
				}
				_swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return(rv);
}
1183 
/*
 * swap_pager_output:
 *	Write "count" pages (m[0..count-1], all belonging to the same object)
 *	out to the swap area managed by "swp", allocating swap space on
 *	demand.  If "flags" contains B_ASYNC the write is started and the
 *	routine returns VM_PAGER_PEND; otherwise it sleeps until the I/O
 *	completes.  A per-page status is stored in rtvals[] and the overall
 *	status (VM_PAGER_OK/PEND/FAIL/AGAIN) is returned.  Pages that cannot
 *	be written in this call get VM_PAGER_AGAIN so the caller can retry.
 */
int
swap_pager_output(swp, m, count, flags, rtvals)
	register sw_pager_t swp;
	vm_page_t *m;
	int count;
	int flags;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;
	vm_offset_t kva, off, foff;
	swp_clean_t spc;
	vm_offset_t paging_offset;
	vm_object_t object;
	int reqaddr[count];
	int failed;

/*
	if( count > 1)
		printf("off: 0x%x, count: %d\n", m[0]->offset, count);
*/
	spc = NULL;

	object = m[0]->object;
	paging_offset = object->paging_offset;

	/*
	 * First pass: look up (allocating on demand) the swap block backing
	 * each page.  Each swb entry we touch gets its swb_locked count
	 * bumped to hold it while we work; once any page fails, all later
	 * pages are failed too so the transfer stays a leading run.
	 */
	failed = 0;
	for(j=0;j<count;j++) {
		foff = m[j]->offset + paging_offset;
		ix = swap_pager_block_index(swp, foff);
		swb[j] = 0;
		if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &swp->sw_blocks[ix];
		++swb[j]->swb_locked;
		if( failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(swp, foff);
		reqaddr[j] = swb[j]->swb_block[off];
		if( reqaddr[j] == SWB_EMPTY) {
			/* no swap space assigned yet -- allocate some now */
			int blk;
			int tries;
			int ntoget;
			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this block, we
			 * only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}


			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is alittle conservative, but works
			 * (the intent of this code is to allocate small chunks
			 *  for small objects)
			 */
			if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) {
				ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE;
			}

retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
				swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) {

				/* got a multi-page chunk; carve it into the swb slots */
				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to reclaim space and
				 * retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1<<off);
			}
			splx(s);
		}
	}

	/*
	 * Trim the transfer to the leading run of pages whose swap blocks
	 * are physically contiguous and lie within the same dmmax chunk;
	 * everything after the first break is failed with VM_PAGER_AGAIN
	 * so the caller will retry those pages later.
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if( failed || (reqaddr[i] != reqaddr[0] + i*btodb(PAGE_SIZE)) ||
			(reqaddr[i] / dmmax) != (reqaddr[0] / dmmax) ||
			(rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if( rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	/* drop the swb hold on every page we are not going to write */
	for(i = 0; i < count; i++) {
		if( rtvals[i] != VM_PAGER_OK) {
			if( swb[i])
				--swb[i]->swb_locked;
		}
	}

	/* find the length of the leading run of writable pages */
	for(i = 0; i < count; i++)
		if( rtvals[i] != VM_PAGER_OK)
			break;

	if( i == 0) {
		return VM_PAGER_AGAIN;
	}

	/* only the leading i pages take part in this transfer */
	count = i;
	for(i=0;i<count;i++) {
		if( reqaddr[i] == SWB_EMPTY)
			printf("I/O to empty block????\n");
	}

	/*
	 * For synchronous writes, we clean up
	 * all completed async pageouts.
	 */
	if ((flags & B_ASYNC) == 0) {
		swap_pager_clean();
	}

	kva = 0;

	/*
	 * we allocate a new kva for transfers > 1 page
	 * but for transfers == 1 page, the swap_pager_free list contains
	 * entries that have pre-allocated kva's (for efficiency).
	 * NOTE -- we do not use the physical buffer pool or the
	 * preallocated associated kva's because of the potential for
	 * deadlock.  This is very subtile -- but deadlocks or resource
	 * contention must be avoided on pageouts -- or your system will
	 * sleep (forever) !!!
	 */
/*
	if ( count > 1) {
		kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE);
		if( !kva) {
			for (i = 0; i < count; i++) {
				if( swb[i])
					--swb[i]->swb_locked;
				rtvals[i] = VM_PAGER_AGAIN;
			}
			return VM_PAGER_AGAIN;
		}
	}
*/

	/*
	 * get a swap pager clean data structure, block until we get it.
	 * The wait is performed at splbio to avoid racing with
	 * swap_pager_iodone, which refills swap_pager_free.
	 */
	if (swap_pager_free.tqh_first == NULL) {
		s = splbio();
		if( curproc == pageproc)
			(void) swap_pager_clean();
		else
			wakeup((caddr_t) &vm_pages_needed);
		while (swap_pager_free.tqh_first == NULL) {
			swap_pager_needflags |= SWAP_FREE_NEEDED;
			tsleep((caddr_t)&swap_pager_free,
				PVM, "swpfre", 0);
			if( curproc == pageproc)
				(void) swap_pager_clean();
			else
				wakeup((caddr_t) &vm_pages_needed);
		}
		splx(s);
	}

	spc = swap_pager_free.tqh_first;
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);

	kva = spc->spc_kva;

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * get the base I/O offset into the swap file
	 */
	for(i=0;i<count;i++) {
		foff = m[i]->offset + paging_offset;
		off = swap_pager_block_offset(swp, foff);
		/*
		 * if we are setting the valid bit anew,
		 * then diminish the swap free space
		 */
		if( (swb[i]->swb_valid & (1 << off)) == 0)
			vm_swap_size -= btodb(PAGE_SIZE);

		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		--swb[i]->swb_locked;
	}

	s = splbio();
	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;

	bp->b_flags = B_BUSY;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if( bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bgetvp( swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE*count;
	bp->b_bufsize = PAGE_SIZE*count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields
	 * and place a "cleaning" entry on the inuse queue.
	 */
	if ( flags & B_ASYNC ) {
		spc->spc_flags = 0;
		spc->spc_swp = swp;
		for(i=0;i<count;i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		swp->sw_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		swp->sw_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) {
		/*
		 * async write: if it already finished, reap it now;
		 * either way report the pages as pending.
		 */
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_clean();
		}
		splx(s);
		for(i=0;i<count;i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep((caddr_t)bp, PVM, "swwrt", 0);
	}
	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK;
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE);

	--swp->sw_poip;
	if (swp->sw_poip == 0)
		wakeup((caddr_t) swp);

	if (bp->b_vp)
		brelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove( kva, count);

	/*
	 * if we have written the page, then indicate that the page
	 * is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for(i=0;i<count;i++) {
			if( rtvals[i] == VM_PAGER_OK) {
				m[i]->flags |= PG_CLEAN;
				m[i]->flags &= ~PG_LAUNDRY;
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				/*
				 * optimization, if a page has been read during the
				 * pageout process, we activate it.
				 */
				if ( (m[i]->flags & PG_ACTIVE) == 0 &&
					pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))
					vm_page_activate(m[i]);
			}
		}
	} else {
		/* write failed: mark all pages dirty so they are retried */
		for(i=0;i<count;i++) {
			rtvals[i] = rv;
			m[i]->flags |= PG_LAUNDRY;
		}
	}

	/* release credentials and return the clean structure to the free list */
	if( bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if( bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup((caddr_t)&swap_pager_free);
	}

	return(rv);
}
1539 
/*
 * swap_pager_clean:
 *	Reap all completed asynchronous pageouts from the done list:
 *	unmap each one's kva, finish the pages (swap_pager_finish), and
 *	return the clean structure to the free list, waking anyone who
 *	was blocked waiting for a free entry.
 *
 *	NOTE(review): tspc is initialized to NULL and never assigned a
 *	non-NULL value in this version, so the function currently always
 *	returns FALSE -- the TRUE path looks vestigial; confirm against
 *	callers before relying on the return value.
 */
boolean_t
swap_pager_clean()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	if (swap_pager_done.tqh_first == NULL)
		return FALSE;
	for (;;) {
		s = splbio();
		/*
		 * Look up and removal from done list must be done
		 * at splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = swap_pager_done.tqh_first) != 0) {
			pmap_qremove( spc->spc_kva, spc->spc_count);
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, thats all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * The desired page was found to be busy earlier in
		 * the scan but has since completed.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		/* recycle the clean structure and wake any waiters */
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			swap_pager_needflags &= ~SWAP_FREE_NEEDED;
			wakeup((caddr_t)&swap_pager_free);
		}
		++cleandone;
		splx(s);
	}

	return(tspc ? TRUE : FALSE);
}
1589 
1590 void
1591 swap_pager_finish(spc)
1592 	register swp_clean_t spc;
1593 {
1594 	vm_object_t object = spc->spc_m[0]->object;
1595 	int i;
1596 
1597 	if ((object->paging_in_progress -= spc->spc_count) == 0)
1598 		thread_wakeup((int) object);
1599 
1600 	/*
1601 	 * If no error mark as clean and inform the pmap system.
1602 	 * If error, mark as dirty so we will try again.
1603 	 * (XXX could get stuck doing this, should give up after awhile)
1604 	 */
1605 	if (spc->spc_flags & SPC_ERROR) {
1606 		for(i=0;i<spc->spc_count;i++) {
1607 			printf("swap_pager_finish: clean of page %lx failed\n",
1608 			       (u_long)VM_PAGE_TO_PHYS(spc->spc_m[i]));
1609 			spc->spc_m[i]->flags |= PG_LAUNDRY;
1610 		}
1611 	} else {
1612 		for(i=0;i<spc->spc_count;i++) {
1613 			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
1614 			spc->spc_m[i]->flags |= PG_CLEAN;
1615 		}
1616 	}
1617 
1618 
1619 	for(i=0;i<spc->spc_count;i++) {
1620 		/*
1621 		 * we wakeup any processes that are waiting on
1622 		 * these pages.
1623 		 */
1624 		PAGE_WAKEUP(spc->spc_m[i]);
1625 	}
1626 	nswiodone -= spc->spc_count;
1627 
1628 	return;
1629 }
1630 
1631 /*
1632  * swap_pager_iodone
1633  */
1634 void
1635 swap_pager_iodone(bp)
1636 	register struct buf *bp;
1637 {
1638 	register swp_clean_t spc;
1639 	int s;
1640 
1641 	s = splbio();
1642 	spc = (swp_clean_t) bp->b_spc;
1643 	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
1644 	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
1645 	if (bp->b_flags & B_ERROR) {
1646 		spc->spc_flags |= SPC_ERROR;
1647 		printf("error %d blkno %lu sz %ld ",
1648 			bp->b_error, (u_long)bp->b_blkno, bp->b_bcount);
1649 	}
1650 
1651 /*
1652 	if ((bp->b_flags & B_READ) == 0)
1653 		vwakeup(bp);
1654 */
1655 
1656 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC);
1657 	if (bp->b_vp) {
1658 		brelvp(bp);
1659 	}
1660 	if( bp->b_rcred != NOCRED)
1661 		crfree(bp->b_rcred);
1662 	if( bp->b_wcred != NOCRED)
1663 		crfree(bp->b_wcred);
1664 
1665 	nswiodone += spc->spc_count;
1666 	if (--spc->spc_swp->sw_poip == 0) {
1667 		wakeup((caddr_t)spc->spc_swp);
1668 	}
1669 
1670 	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
1671 	    swap_pager_inuse.tqh_first == 0) {
1672 		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
1673 		wakeup((caddr_t)&swap_pager_free);
1674 		wakeup((caddr_t)&vm_pages_needed);
1675 	}
1676 
1677 	if (vm_pageout_pages_needed) {
1678 		wakeup((caddr_t)&vm_pageout_pages_needed);
1679 	}
1680 
1681 	if ((swap_pager_inuse.tqh_first == NULL) ||
1682 	    (cnt.v_free_count < cnt.v_free_min &&
1683 	    nswiodone + cnt.v_free_count >= cnt.v_free_min) ) {
1684 		wakeup((caddr_t)&vm_pages_needed);
1685 	}
1686 	splx(s);
1687 }
1688 
1689 /*
1690  * return true if any swap control structures can be allocated
1691  */
1692 int
1693 swap_pager_ready() {
1694 	if( swap_pager_free.tqh_first)
1695 		return 1;
1696 	else
1697 		return 0;
1698 }
1699