1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * Copyright (c) 1990 University of Utah.
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * the Systems Programming Group of the University of Utah Computer
9  * Science Department.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
40  *
41  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
42  * $Id: swap_pager.c,v 1.102 1998/10/13 08:24:42 dg Exp $
43  */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/rlist.h>

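/*
 * Pageout clustering tunables: MAX_PAGEOUT_CLUSTER bounds the number of
 * pages written by a single pageout I/O, and NPENDINGIO bounds the number
 * of async pageout buffers that may be outstanding at once.  Both are
 * scaled down in swap_pager_init() if the free page reserve is small.
 */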
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif

#ifndef NPENDINGIO
#define NPENDINGIO	16
#endif

#define SWB_NPAGES MAX_PAGEOUT_CLUSTER

#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

static int nswiodone;
int swap_pager_full;
extern int vm_swap_size;
static int suggest_more_swap = 0;
static int no_swap_space = 1;
static int max_pageout_cluster;
struct rlisthdr swaplist;

TAILQ_HEAD(swpclean, swpagerclean);

typedef struct swpagerclean *swp_clean_t;

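/*
 * An spc ("swap pager clean" record) tracks one in-flight async pageout:
 * the preallocated buf header and kva window used for the transfer, the
 * object being cleaned, and the run of pages involved (spc_m slots
 * spc_first through spc_first + spc_count - 1).
 */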
static struct swpagerclean {
	TAILQ_ENTRY(swpagerclean) spc_list;
	int spc_flags;
	struct buf *spc_bp;
	vm_object_t spc_object;
	vm_offset_t spc_kva;
	int spc_first;
	int spc_count;
	vm_page_t spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];

/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

/* list of completed page cleans */
static struct swpclean swap_pager_done;

/* list of pending page cleans */
static struct swpclean swap_pager_inuse;

/* list of free pager clean structs */
static struct swpclean swap_pager_free;
static int swap_pager_free_count;
static int swap_pager_free_pending;

/* list of "named" anon region objects */
static struct pagerlst swap_pager_object_list;

/* list of "unnamed" anon region objects */
struct pagerlst swap_pager_un_object_list;

#define	SWAP_FREE_NEEDED	0x1	/* need a swap block */
#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2
static int swap_pager_needflags;

static struct pagerlst *swp_qs[] = {
	&swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0
};

/*
 * pagerops for OBJT_SWAP - "swap pager".
 */
static vm_object_t
		swap_pager_alloc __P((void *handle, vm_ooffset_t size,
				      vm_prot_t prot, vm_ooffset_t offset));
static void	swap_pager_dealloc __P((vm_object_t object));
static boolean_t
		swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex,
					int *before, int *after));
static int	swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void	swap_pager_init __P((void));
static void	spc_free __P((swp_clean_t));

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpages,
	swap_pager_putpages,
	swap_pager_haspage,
	swap_pager_sync
};

static int npendingio;
static int dmmin;
int dmmax;

static int	swap_pager_block_index __P((vm_pindex_t pindex));
static int	swap_pager_block_offset __P((vm_pindex_t pindex));
static daddr_t *swap_pager_diskaddr __P((vm_object_t object,
					  vm_pindex_t pindex, int *valid));
static void	swap_pager_finish __P((swp_clean_t spc));
static void	swap_pager_free_swap __P((vm_object_t object));
static void	swap_pager_freeswapspace __P((vm_object_t object,
					      unsigned int from,
					      unsigned int to));
static int	swap_pager_getswapspace __P((vm_object_t object,
					     unsigned int amount,
					     daddr_t *rtval));
static void	swap_pager_iodone __P((struct buf *));
static void	swap_pager_iodone1 __P((struct buf *bp));
static void	swap_pager_reclaim __P((void));
static void	swap_pager_ridpages __P((vm_page_t *m, int count,
					 int reqpage));
static void	swap_pager_setvalid __P((vm_object_t object,
					 vm_offset_t offset, int valid));
static __inline void	swapsizecheck __P((void));

#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE)))

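/*
 * Maintain the global swap-shortage flag with hysteresis: set
 * swap_pager_full when fewer than 128 pages worth of swap remain, and
 * clear it only after more than 192 pages are free again, so the state
 * doesn't flap right at the boundary.
 */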
static __inline void
swapsizecheck()
{
	if (vm_swap_size < 128 * btodb(PAGE_SIZE)) {
		if (swap_pager_full == 0)
			printf("swap_pager: out of swap space\n");
		swap_pager_full = 1;
	} else if (vm_swap_size > 192 * btodb(PAGE_SIZE))
		swap_pager_full = 0;
}

static void
swap_pager_init()
{
	int maxsafepending;

	TAILQ_INIT(&swap_pager_object_list);
	TAILQ_INIT(&swap_pager_un_object_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);
	swap_pager_free_count = 0;

	/*
	 * Calculate the swap allocation constants.
	 */
	dmmin = PAGE_SIZE / DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2;

	maxsafepending = cnt.v_free_min - cnt.v_free_reserved;
	npendingio = NPENDINGIO;
	max_pageout_cluster = MAX_PAGEOUT_CLUSTER;

	if ((2 * NPENDINGIO * MAX_PAGEOUT_CLUSTER) > maxsafepending) {
		max_pageout_cluster = MAX_PAGEOUT_CLUSTER / 2;
		npendingio = maxsafepending / (2 * max_pageout_cluster);
		if (npendingio < 2)
			npendingio = 2;
	}
}

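/*
 * Preallocate the pool of async pageout buffers: each spc gets a
 * max_pageout_cluster-page kva window and a buf header up front, so no
 * allocations are needed at pageout time.  An allocation failure here
 * just leaves us with a smaller pool.
 */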
void
swap_pager_swap_init()
{
	swp_clean_t spc;
	struct buf *bp;
	int i;

	/*
	 * KVAs are allocated here so that we don't need to keep doing
	 * kmem_alloc_pageable calls at runtime
	 */
	for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) {
		spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * max_pageout_cluster);
		if (!spc->spc_kva) {
			break;
		}
		spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL);
		if (!spc->spc_bp) {
			kmem_free_wakeup(pager_map, spc->spc_kva,
			    PAGE_SIZE * max_pageout_cluster);
			break;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		swap_pager_free_count++;
	}
}

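/*
 * Allocate the per-object swap metadata: an array of sw_blk_t, each
 * covering SWB_NPAGES pages, with every block slot initialized to
 * SWB_EMPTY.  Returns 0 on success, 1 if the array cannot be allocated.
 */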
int
swap_pager_swp_alloc(object, wait)
	vm_object_t object;
	int wait;
{
	sw_blk_t swb;
	int nblocks;
	int i, j;

	nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES;
	swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait);
	if (swb == NULL)
		return 1;

	for (i = 0; i < nblocks; i++) {
		swb[i].swb_valid = 0;
		swb[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swb[i].swb_block[j] = SWB_EMPTY;
	}

	object->un_pager.swp.swp_nblocks = nblocks;
	object->un_pager.swp.swp_allocsize = 0;
	object->un_pager.swp.swp_blocks = swb;
	object->un_pager.swp.swp_poip = 0;

	if (object->handle != NULL) {
		TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list);
	} else {
		TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
	}

	return 0;
}

/*
 * Allocate an object and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory, as doing so could result in deadlock.
 */
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		 vm_ooffset_t offset)
{
	vm_object_t object;

	/*
	 * If this is a "named" anonymous region, look it up and use the
	 * object if it exists, otherwise allocate a new one.
	 */
	if (handle) {
		object = vm_pager_object_lookup(&swap_pager_object_list, handle);
		if (object != NULL) {
			vm_object_reference(object);
		} else {
			/*
			 * XXX - there is a race condition here. Two processes
			 * can request the same named object simultaneously,
			 * and if one blocks for memory, the result is a disaster.
			 * Probably quite rare, but it is yet another reason to
			 * just rip support of "named anonymous regions" out
			 * altogether.
			 */
			object = vm_object_allocate(OBJT_SWAP,
				OFF_TO_IDX(offset + PAGE_MASK + size));
			object->handle = handle;
			(void) swap_pager_swp_alloc(object, M_WAITOK);
		}
	} else {
		object = vm_object_allocate(OBJT_SWAP,
			OFF_TO_IDX(offset + PAGE_MASK + size));
		(void) swap_pager_swp_alloc(object, M_WAITOK);
	}

	return (object);
}

/*
 * Returns a pointer to the disk block slot associated with the object and
 * page index; as a side effect, *valid reports whether the block has
 * actually been written.
 */

static __inline daddr_t *
swap_pager_diskaddr(object, pindex, valid)
	vm_object_t object;
	vm_pindex_t pindex;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = pindex / SWB_NPAGES;
	if ((ix >= object->un_pager.swp.swp_nblocks) ||
	    (pindex >= object->size)) {
		return (NULL);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;
	if (valid)
		*valid = swb->swb_valid & (1 << ix);
	return &swb->swb_block[ix];
}

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(object, offset, valid)
	vm_object_t object;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks)
		return;

	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = offset % SWB_NPAGES;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}

/*
 * this routine allocates swap space with a fragmentation
 * minimization policy.
 */
static int
swap_pager_getswapspace(object, amount, rtval)
	vm_object_t object;
	unsigned int amount;
	daddr_t *rtval;
{
	unsigned location;

	vm_swap_size -= amount;
	if (!suggest_more_swap && (vm_swap_size < btodb(cnt.v_page_count * PAGE_SIZE))) {
		printf("swap_pager: suggest more swap space: %d MB\n",
			(2 * cnt.v_page_count * (PAGE_SIZE / 1024)) / 1000);
		suggest_more_swap = 1;
	}

	if (!rlist_alloc(&swaplist, amount, &location)) {
		vm_swap_size += amount;
		return 0;
	} else {
		swapsizecheck();
		object->un_pager.swp.swp_allocsize += amount;
		*rtval = location;
		return 1;
	}
}

/*
 * this routine frees swap space with a fragmentation
 * minimization policy.
 */
static void
swap_pager_freeswapspace(object, from, to)
	vm_object_t object;
	unsigned int from;
	unsigned int to;
{
	rlist_free(&swaplist, from, to);
	vm_swap_size += (to - from) + 1;
	object->un_pager.swp.swp_allocsize -= (to - from) + 1;
	swapsizecheck();
}

/*
 * this routine frees swap blocks from a specified pager
 */
void
swap_pager_freespace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splvm();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1);
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	splx(s);
}

/*
 * same as freespace, but doesn't release the swap space; it just clears
 * the valid bits, forcing the blocks to be rewritten before they can be
 * paged in again
 */
void
swap_pager_dmzspace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splvm();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
		}
	}
	splx(s);
}

static void
swap_pager_free_swap(object)
	vm_object_t object;
{
	register int i, j;
	register sw_blk_t swb;
	int first_block = 0, block_count = 0;
	int s;

	/*
	 * Free left over swap blocks
	 */
	swb = object->un_pager.swp.swp_blocks;
	if (swb == NULL) {
		return;
	}

	s = splvm();
	for (i = 0; i < object->un_pager.swp.swp_nblocks; i++, swb++) {
		for (j = 0; j < SWB_NPAGES; j++) {
			if (swb->swb_block[j] != SWB_EMPTY) {
				/*
				 * initially the length of the run is zero
				 */
				if (block_count == 0) {
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * if the new block can be included into the
				 * current run
				 */
				} else if (swb->swb_block[j] == first_block + block_count) {
					block_count += btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * terminate the previous run, and start a new
				 * one
				 */
				} else {
					swap_pager_freeswapspace(object, first_block,
					    (unsigned) first_block + block_count - 1);
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				}
			}
		}
	}

	if (block_count) {
		swap_pager_freeswapspace(object, first_block,
		    (unsigned) first_block + block_count - 1);
	}
	splx(s);
}

/*
 * swap_pager_reclaim frees up over-allocated space from all pagers.
 * This eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to. It has been written so that
 * it does not block until the rlist_free operation occurs; it keeps
 * the queues consistent.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 128

static void
swap_pager_reclaim()
{
	vm_object_t object;
	int i, j, k;
	int s;
	int reclaimcount;
	static struct {
		int address;
		vm_object_t object;
	} reclaims[MAXRECLAIM];
	static int in_reclaim;

	/*
	 * allow only one process to be in the swap_pager_reclaim subroutine
	 */
	s = splvm();
	if (in_reclaim) {
		tsleep(&in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		object = TAILQ_FIRST(swp_qs[k]);
		while (object && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			if ((object->flags & OBJ_DEAD) == 0 &&
				(object->paging_in_progress == 0)) {
				for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) {
					sw_blk_t swb = &object->un_pager.swp.swp_blocks[i];

					if (swb->swb_locked)
						continue;
					for (j = 0; j < SWB_NPAGES; j++) {
						if (swb->swb_block[j] != SWB_EMPTY &&
						    (swb->swb_valid & (1 << j)) == 0) {
							reclaims[reclaimcount].address = swb->swb_block[j];
							reclaims[reclaimcount++].object = object;
							swb->swb_block[j] = SWB_EMPTY;
							if (reclaimcount >= MAXRECLAIM)
								goto rfinished;
						}
					}
				}
			}
			object = TAILQ_NEXT(object, pager_object_list);
		}
	}

rfinished:

	/*
	 * free the blocks that have been added to the reclaim list
	 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i].object,
		    reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1);
	}
	splx(s);
	in_reclaim = 0;
	wakeup(&in_reclaim);
}

/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */

void
swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset,
	offset, destroysource)
	vm_object_t srcobject;
	vm_pindex_t srcoffset;
	vm_object_t dstobject;
	vm_pindex_t dstoffset;
	vm_pindex_t offset;
	int destroysource;
{
	vm_pindex_t i;
	int origsize;
	int s;

	if (vm_swap_size)
		no_swap_space = 0;

	origsize = srcobject->un_pager.swp.swp_allocsize;

	/*
	 * remove the source object from the swap_pager internal queue
	 */
	if (destroysource) {
		if (srcobject->handle == NULL) {
			TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list);
		} else {
			TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list);
		}
	}

	s = splvm();
	while (srcobject->un_pager.swp.swp_poip) {
		tsleep(srcobject, PVM, "spgout", 0);
	}

	/*
	 * clean all of the pages that are currently active and finished
	 */
	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * transfer source to destination
	 */
	for (i = 0; i < dstobject->size; i += 1) {
		int srcvalid, dstvalid;
		daddr_t *srcaddrp = swap_pager_diskaddr(srcobject,
				i + offset + srcoffset, &srcvalid);
		daddr_t *dstaddrp;

		/*
		 * see if the source has space allocated
		 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			/*
			 * if the source is valid and the dest has no space,
			 * then copy the allocation from the source to the
			 * dest.
			 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset,
							&dstvalid);
				/*
				 * if the dest already has a valid block,
				 * deallocate the source block without
				 * copying.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(dstobject, *dstaddrp,
						*dstaddrp + btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE);
					srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE);
					swap_pager_setvalid(dstobject, i + dstoffset, 1);
				}
			}
			/*
			 * if the source is not empty at this point, then
			 * deallocate the space.
			 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(srcobject, *srcaddrp,
					*srcaddrp + btodb(PAGE_SIZE) - 1);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}
	splx(s);

	/*
	 * Free left over swap blocks
	 */
	if (destroysource) {
		swap_pager_free_swap(srcobject);

		if (srcobject->un_pager.swp.swp_allocsize) {
			printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n",
			    srcobject->un_pager.swp.swp_allocsize, origsize);
		}

		free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA);
		srcobject->un_pager.swp.swp_blocks = NULL;
	}
	return;
}

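/*
 * Tear down a swap object: unlink it from the pager lists right away so
 * lookups fail, wait out any pageouts in progress, then release the
 * object's swap space and its block metadata.
 */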
static void
swap_pager_dealloc(object)
	vm_object_t object;
{
	int s;
	sw_blk_t swb;

	/*
	 * Remove from list right away so lookups will fail if we block for
	 * pageout completion.
	 */
	if (object->handle == NULL) {
		TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
	} else {
		TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list);
	}

	/*
	 * Wait for all pageouts to finish and remove all entries from
	 * cleaning list.
	 */
	s = splvm();
	while (object->un_pager.swp.swp_poip) {
		tsleep(object, PVM, "swpout", 0);
	}
	splx(s);

	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * Free left over swap blocks
	 */
	swap_pager_free_swap(object);

	if (object->un_pager.swp.swp_allocsize) {
		printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n",
		    object->un_pager.swp.swp_allocsize);
	}
	swb = object->un_pager.swp.swp_blocks;
	if (swb) {
		/*
		 * Free swap management resources
		 */
		free(swb, M_VMPGDATA);
		object->un_pager.swp.swp_blocks = NULL;
	}
}

static __inline int
swap_pager_block_index(pindex)
	vm_pindex_t pindex;
{
	return (pindex / SWB_NPAGES);
}

static __inline int
swap_pager_block_offset(pindex)
	vm_pindex_t pindex;
{
	return (pindex % SWB_NPAGES);
}

/*
 * swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
static boolean_t
swap_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	register sw_blk_t swb;
	int ix;

	if (before != NULL)
		*before = 0;
	if (after != NULL)
		*after = 0;
	ix = pindex / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks) {
		return (FALSE);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;

	if (swb->swb_block[ix] != SWB_EMPTY) {

		if (swb->swb_valid & (1 << ix)) {
			int tix;

			if (before) {
				for (tix = ix - 1; tix >= 0; --tix) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] +
						(ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*before)++;
				}
			}

			if (after) {
				for (tix = ix + 1; tix < SWB_NPAGES; tix++) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] -
						(tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*after)++;
				}
			}

			return TRUE;
		}
	}
	return (FALSE);
}

/*
 * Wakeup based upon spc state
 */
static void
spc_wakeup(void)
{
	if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT;
		wakeup(&swap_pager_needflags);
	} else if ((swap_pager_needflags & SWAP_FREE_NEEDED) &&
		swap_pager_free_count >= ((2 * npendingio) / 3)) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup(&swap_pager_free);
	}
}

/*
 * Free an spc structure
 */
static void
spc_free(spc)
	swp_clean_t spc;
{
	spc->spc_flags = 0;
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	swap_pager_free_count++;
	if (swap_pager_needflags) {
		spc_wakeup();
	}
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;

	for (i = 0; i < count; i++) {
		if (i != reqpage) {
			vm_page_free(m[i]);
		}
	}
}

/*
 * swap_pager_iodone1 is the completion routine for both reads and
 * synchronous writes
 */
static void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup(bp);
}

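/*
 * The pagein path.  Look up the swap block behind each page, clip the
 * request to the run of pages around the required page whose blocks are
 * valid, disk-contiguous and within one dmmax region, and bring the run
 * in with a single synchronous read.
 */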
static int
swap_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	vm_pindex_t paging_offset;
	int reqaddr[count];
	int sequential;

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = OFF_TO_IDX(object->paging_offset);
	sequential = (m[reqpage]->pindex == (object->last_read + 1));

	for (i = 0; i < count; i++) {
		vm_pindex_t fidx = m[i]->pindex + paging_offset;
		int ix = swap_pager_block_index(fidx);

		if (ix >= object->un_pager.swp.swp_nblocks) {
			int j;

			if (i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return (VM_PAGER_FAIL);
			}
			for (j = i; j < count; j++) {
				vm_page_free(m[j]);
			}
			count = i;
			break;
		}
		swb[i] = &object->un_pager.swp.swp_blocks[ix];
		off[i] = swap_pager_block_offset(fidx);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure that the block backing our required page actually exists */

	if (reqaddr[reqpage] == SWB_EMPTY ||
	    (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return (VM_PAGER_FAIL);
	}
	reqdskregion = reqaddr[reqpage] / dmmax;

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if (sequential || failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			vm_page_free(m[i]);
			if (first == 0)
				first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if (failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			vm_page_free(m[i]);
			if (last == count)
				last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
			reqaddr[i - first] = reqaddr[i];
			off[i - first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}
	++swb[reqpage]->swb_locked;

	/*
	 * at this point:
	 *	"m" is a pointer to the array of vm_page_t for paging I/O
	 *	"count" is the number of vm_page_t entries represented by "m"
	 *	"object" is the vm_object_t for I/O
	 *	"reqpage" is the index into "m" for the page actually faulted
	 */

	/*
	 * Get a swap buffer header to perform the IO
	 */
	bp = getpbuf();
	kva = (vm_offset_t) bp->b_data;

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter(kva, m, count);

	bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE * count;
	bp->b_bufsize = PAGE_SIZE * count;

	pbgetvp(swapdev_vp, bp);

	cnt.v_swapin++;
	cnt.v_swappgsin += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp->b_vp, bp);

	/*
	 * wait for the sync I/O to complete
	 */
	s = splvm();
	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep(bp, PVM, "swread", hz*20)) {
			printf(
"swap_pager: indefinite wait buffer: device: %#lx, blkno: %ld, size: %ld\n",
			    (u_long)bp->b_dev, (long)bp->b_blkno,
			    (long)bp->b_bcount);
		}
	}

	if (bp->b_flags & B_ERROR) {
		printf(
"swap_pager: I/O error - pagein failed; blkno %ld, size %ld, error %d\n",
		    (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	splx(s);
	swb[reqpage]->swb_locked--;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, count);

	/*
	 * release the physical I/O buffer
	 */
	relpbuf(bp);
	/*
	 * finish up input if everything is ok
	 */
	if (rv == VM_PAGER_OK) {
		for (i = 0; i < count; i++) {
			m[i]->dirty = 0;
			vm_page_flag_clear(m[i], PG_ZERO);
			if (i != reqpage) {
				/*
				 * whether or not to leave the page
				 * activated is up in the air, but we
				 * should put the page on a page queue
				 * somewhere. (it already is in the
				 * object). After some empirical
				 * results, it is best to deactivate
				 * the readahead pages.
				 */
				vm_page_deactivate(m[i]);

				/*
				 * just in case someone was asking for
				 * this page we now tell them that it
				 * is ok to use
				 */
				m[i]->valid = VM_PAGE_BITS_ALL;
				vm_page_wakeup(m[i]);
			}
		}

		m[reqpage]->object->last_read = m[count-1]->pindex;

		/*
		 * If we're out of swap space, then attempt to free
		 * some whenever multiple pages are brought in. We
		 * must set the dirty bits so that the page contents
		 * will be preserved.
		 */
		if (SWAPLOW ||
			(vm_swap_size < btodb((cnt.v_page_count - cnt.v_wire_count)) * PAGE_SIZE)) {
			for (i = 0; i < count; i++) {
				m[i]->dirty = VM_PAGE_BITS_ALL;
			}
			swap_pager_freespace(object,
				m[0]->pindex + paging_offset, count);
		}

	} else {
		swap_pager_ridpages(m, count, reqpage);
	}
	return (rv);
}

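/*
 * The pageout path.  Swap space is allocated on demand (retrying once via
 * swap_pager_reclaim), the request is clipped to a single run of
 * disk-contiguous pages, and the write is issued asynchronously through a
 * preallocated spc when the pageout daemon is the caller; everyone else is
 * forced synchronous and uses a pbuf.
 */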
int
swap_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix, firstidx, lastidx;
	boolean_t rv;
	vm_offset_t kva, off, fidx;
	swp_clean_t spc;
	vm_pindex_t paging_pindex;
	int reqaddr[count];
	int failed;

	if (vm_swap_size)
		no_swap_space = 0;

	if (no_swap_space) {
		for (i = 0; i < count; i++)
			rtvals[i] = VM_PAGER_FAIL;
		return VM_PAGER_FAIL;
	}

	if (curproc != pageproc)
		sync = TRUE;

	object = m[0]->object;
	paging_pindex = OFF_TO_IDX(object->paging_offset);

	failed = 0;
	for (j = 0; j < count; j++) {
		fidx = m[j]->pindex + paging_pindex;
		ix = swap_pager_block_index(fidx);
		swb[j] = 0;
		if (ix >= object->un_pager.swp.swp_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &object->un_pager.swp.swp_blocks[ix];
		swb[j]->swb_locked++;
		if (failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(fidx);
		reqaddr[j] = swb[j]->swb_block[off];
		if (reqaddr[j] == SWB_EMPTY) {
			daddr_t blk;
			int tries;
			int ntoget;

			tries = 0;
			s = splvm();

			/*
			 * if any other pages have been allocated in this
			 * block, we only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}

			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works (the
			 * intent of this code is to allocate small chunks for
			 * small objects)
			 */
			if ((off == 0) && ((fidx + ntoget) > object->size)) {
				ntoget = object->size - fidx;
			}
	retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
			    swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE),
				&blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to
				 * reclaim space and retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
				swap_pager_full = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1 << off);
			}
			splx(s);
		}
	}

	/*
	 * verify that the allocated blocks form a single contiguous run
	 * within one disk region; pages past the first discontinuity are
	 * deferred (VM_PAGER_AGAIN)
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if (failed ||
			(reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) ||
		    ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) ||
		    (rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

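	/*
	 * Find the first run of consecutive VM_PAGER_OK pages; only that
	 * run is written in this pass, and it is clipped to at most
	 * max_pageout_cluster pages below.
	 */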
	ix = 0;
	firstidx = -1;
	for (i = 0; i < count; i++) {
		if (rtvals[i] == VM_PAGER_OK) {
			ix++;
			if (firstidx == -1) {
				firstidx = i;
			}
		} else if (firstidx >= 0) {
			break;
		}
	}

	if (firstidx == -1) {
		for (i = 0; i < count; i++) {
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
		return VM_PAGER_AGAIN;
	}

	lastidx = firstidx + ix;

	if (ix > max_pageout_cluster) {
		for (i = firstidx + max_pageout_cluster; i < lastidx; i++) {
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
		ix = max_pageout_cluster;
		lastidx = firstidx + ix;
	}

	for (i = 0; i < firstidx; i++) {
		if (swb[i])
			swb[i]->swb_locked--;
	}

	for (i = lastidx; i < count; i++) {
		if (swb[i])
			swb[i]->swb_locked--;
	}

#if defined(DIAGNOSTIC)
	for (i = firstidx; i < lastidx; i++) {
		if (reqaddr[i] == SWB_EMPTY) {
			printf("I/O to empty block???? -- pindex: %d, i: %d\n",
				m[i]->pindex, i);
		}
	}
#endif

	/*
	 * Clean up all completed async pageouts.
	 */
	if (swap_pager_free_pending)
		swap_pager_sync();

	/*
	 * get a swap pager clean data structure, block until we get it
	 */
	if (curproc == pageproc) {
		if (swap_pager_free_count == 0) {
			s = splvm();
			while (swap_pager_free_count == 0) {
				swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT;
				/*
				 * If an spc does not become free within a
				 * short time, there is a potential deadlock,
				 * so we keep trying to free pages.  It is
				 * important to block here as opposed to
				 * returning, thereby allowing the pageout
				 * daemon to continue.  If we don't block, the
				 * pageout daemon is likely to start
				 * suboptimally reclaiming vnode backed pages;
				 * since the I/O subsystem is probably already
				 * fully utilized, we might as well wait.
				 */
				if (tsleep(&swap_pager_needflags, PVM-1, "swpfre", hz/2)) {
					if (swap_pager_free_pending)
						swap_pager_sync();
					if (swap_pager_free_count == 0) {
						for (i = firstidx; i < lastidx; i++) {
							rtvals[i] = VM_PAGER_AGAIN;
						}
						splx(s);
						return VM_PAGER_AGAIN;
					}
				} else {
					swap_pager_sync();
				}
			}
			splx(s);
		}

		spc = TAILQ_FIRST(&swap_pager_free);
#if defined(DIAGNOSTIC)
		if (spc == NULL)
			panic("swap_pager_putpages: free queue is empty, %d expected\n",
				swap_pager_free_count);
#endif
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		swap_pager_free_count--;

		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
		bp->b_xflags = 0;
		bp->b_data = (caddr_t) kva;
	} else {
		spc = NULL;
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
		bp->b_spc = NULL;
	}

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, &m[firstidx], ix);

	/*
	 * get the base I/O offset into the swap file
	 */
	for (i = firstidx; i < lastidx; i++) {
		fidx = m[i]->pindex + paging_pindex;
		off = swap_pager_block_offset(fidx);
		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		swb[i]->swb_locked--;
	}

	bp->b_flags = B_BUSY | B_PAGING;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_blkno = reqaddr[firstidx];
	pbgetvp(swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE * ix;
	bp->b_bufsize = PAGE_SIZE * ix;

	s = splvm();
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields and
	 * place a "cleaning" entry on the inuse queue.
	 */
	object->un_pager.swp.swp_poip++;

	if (spc) {
		spc->spc_flags = 0;
		spc->spc_object = object;
		bp->b_npages = ix;
		for (i = firstidx; i < lastidx; i++) {
			spc->spc_m[i] = m[i];
			bp->b_pages[i - firstidx] = m[i];
			vm_page_protect(m[i], VM_PROT_READ);
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
			m[i]->dirty = 0;
		}
		spc->spc_first = firstidx;
		spc->spc_count = ix;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
		bp->b_npages = ix;
		for (i = firstidx; i < lastidx; i++)
			bp->b_pages[i - firstidx] = m[i];
	}

	cnt.v_swapout++;
	cnt.v_swappgsout += ix;

	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp->b_vp, bp);
	if (sync == FALSE) {
		if (swap_pager_free_pending) {
			swap_pager_sync();
		}
		for (i = firstidx; i < lastidx; i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		splx(s);
		return VM_PAGER_PEND;
	}

	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep(bp, PVM, "swwrt", 0);
	}

	if (bp->b_flags & B_ERROR) {
		printf(
"swap_pager: I/O error - pageout failed; blkno %ld, size %ld, error %d\n",
		    (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	object->un_pager.swp.swp_poip--;
	if (object->un_pager.swp.swp_poip == 0)
		wakeup(object);

	if (bp->b_vp)
		pbrelvp(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, ix);

	/*
	 * if we have written the page, then indicate that the page is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for (i = firstidx; i < lastidx; i++) {
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->dirty = 0;
				/*
				 * optimization: if a page has been read
				 * during the pageout process, we activate it.
				 */
				if (((m[i]->flags & (PG_WANTED|PG_REFERENCED)) ||
				    pmap_ts_referenced(VM_PAGE_TO_PHYS(m[i])))) {
					vm_page_activate(m[i]);
				}
			}
		}
	} else {
		for (i = firstidx; i < lastidx; i++) {
			rtvals[i] = rv;
		}
	}

	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	if (spc != NULL)
		spc_free(spc);
	else
		relpbuf(bp);
	if (swap_pager_free_pending)
		swap_pager_sync();

	return (rv);
}

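/*
 * Drain the done queue, finishing every async pageout whose I/O has
 * completed.
 */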
void
swap_pager_sync()
{
	swp_clean_t spc;

	while ((spc = TAILQ_FIRST(&swap_pager_done)) != NULL) {
		swap_pager_finish(spc);
	}
	return;
}

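/*
 * Complete one finished async pageout: unmap its kva, redirty the pages if
 * the write failed (so they will be tried again) or reactivate any that
 * were referenced while the I/O was in flight, then recycle the spc.
 */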
static void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	int i, s, lastidx;
	vm_object_t object;
	vm_page_t *ma;

	ma = spc->spc_m;
	object = spc->spc_object;
	lastidx = spc->spc_first + spc->spc_count;

	s = splvm();
	TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
	splx(s);

	pmap_qremove(spc->spc_kva, spc->spc_count);

	/*
	 * If no error, mark as clean and inform the pmap system. If error,
	 * mark as dirty so we will try again. (XXX could get stuck doing
	 * this, should give up after a while)
	 */
	if (spc->spc_flags & SPC_ERROR) {

		for (i = spc->spc_first; i < lastidx; i++) {
			printf("swap_pager_finish: I/O error, clean of page %lx failed\n",
			    (u_long) VM_PAGE_TO_PHYS(ma[i]));
			ma[i]->dirty = VM_PAGE_BITS_ALL;
			vm_page_io_finish(ma[i]);
		}

		vm_object_pip_subtract(object, spc->spc_count);
		if ((object->paging_in_progress == 0) &&
			(object->flags & OBJ_PIPWNT)) {
			vm_object_clear_flag(object, OBJ_PIPWNT);
			wakeup(object);
		}

	} else {
		for (i = spc->spc_first; i < lastidx; i++) {
			if ((ma[i]->queue != PQ_ACTIVE) &&
			   ((ma[i]->flags & PG_WANTED) ||
				 pmap_ts_referenced(VM_PAGE_TO_PHYS(ma[i])))) {
				vm_page_activate(ma[i]);
			}
		}
	}

	nswiodone -= spc->spc_count;
	swap_pager_free_pending--;
	spc_free(spc);

	return;
}

/*
 * swap_pager_iodone is the interrupt-time completion routine for async
 * pageouts: it moves the spc from the inuse queue to the done queue and
 * wakes up anyone waiting on the pages, the object, or a free spc.
 */
static void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	int i, s, lastidx;
	register swp_clean_t spc;
	vm_object_t object;
	vm_page_t *ma;

	s = splvm();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);

	object = spc->spc_object;

#if defined(DIAGNOSTIC)
	if (object->paging_in_progress < spc->spc_count)
		printf("swap_pager_iodone: paging_in_progress(%d) < spc_count(%d)\n",
			object->paging_in_progress, spc->spc_count);
#endif

	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n",
		    (bp->b_flags & B_READ) ? "pagein" : "pageout",
		    (u_long) bp->b_blkno, bp->b_bcount, bp->b_error);
	} else {
		vm_object_pip_subtract(object, spc->spc_count);
		if ((object->paging_in_progress == 0) &&
			(object->flags & OBJ_PIPWNT)) {
			vm_object_clear_flag(object, OBJ_PIPWNT);
			wakeup(object);
		}
		ma = spc->spc_m;
		lastidx = spc->spc_first + spc->spc_count;
		for (i = spc->spc_first; i < lastidx; i++) {
			/*
			 * wake up any processes that are waiting on these
			 * pages.
			 */
			vm_page_io_finish(ma[i]);
		}
	}

	if (bp->b_vp)
		pbrelvp(bp);

	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	swap_pager_free_pending++;
	if (--spc->spc_object->un_pager.swp.swp_poip == 0) {
		wakeup(spc->spc_object);
	}

	if (swap_pager_needflags &&
	  ((swap_pager_free_count + swap_pager_free_pending) > (npendingio / 2))) {
		spc_wakeup();
	}

	if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) &&
		vm_pageout_pages_needed) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}

	splx(s);
}