/*
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 * $Id: swap_pager.c,v 1.70 1996/07/30 03:08:05 dyson Exp $
 */

/*
 * Quick hack to page to dedicated partition(s).
 * TODO:
 *	Add multiprocessor locks
 *	Deal with async writes in a better fashion
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>

#include <miscfs/specfs/specdev.h>
#include <sys/rlist.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#ifndef NPENDINGIO
#define NPENDINGIO	10
#endif

static int nswiodone;
int swap_pager_full;
extern int vm_swap_size;
static int no_swap_space = 1;
struct rlisthdr swaplist;

#define MAX_PAGEOUT_CLUSTER 16

TAILQ_HEAD(swpclean, swpagerclean);

typedef struct swpagerclean *swp_clean_t;

static struct swpagerclean {
	TAILQ_ENTRY(swpagerclean) spc_list;
	int spc_flags;
	struct buf *spc_bp;
	vm_object_t spc_object;
	vm_offset_t spc_kva;
	int spc_count;
	vm_page_t spc_m[MAX_PAGEOUT_CLUSTER];
} swcleanlist[NPENDINGIO];
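
/*
 * Each swpagerclean ("spc") entry is a preallocated paging context: a
 * buf header plus a KVA window large enough to map up to
 * MAX_PAGEOUT_CLUSTER pages for one clustered I/O.  The entries are set
 * up once in swap_pager_swap_init() and afterwards cycle between the
 * free, inuse, and done lists declared below.
 */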


/* spc_flags values */
#define SPC_ERROR	0x01

#define SWB_EMPTY (-1)

/* list of completed page cleans */
static struct swpclean swap_pager_done;

/* list of pending page cleans */
static struct swpclean swap_pager_inuse;

/* list of free pager clean structs */
static struct swpclean swap_pager_free;
int swap_pager_free_count;

/* list of "named" anon region objects */
static struct pagerlst swap_pager_object_list;

/* list of "unnamed" anon region objects */
struct pagerlst swap_pager_un_object_list;

#define	SWAP_FREE_NEEDED	0x1	/* a process is waiting for a free spc */
#define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2	/* ... and it is the pageout daemon */
static int swap_pager_needflags;

static struct pagerlst *swp_qs[] = {
	&swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0
};

/*
 * pagerops for OBJT_SWAP - "swap pager".
 */
static vm_object_t
		swap_pager_alloc __P((void *handle, vm_size_t size,
				      vm_prot_t prot, vm_ooffset_t offset));
static void	swap_pager_dealloc __P((vm_object_t object));
static boolean_t
		swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex,
					int *before, int *after));
static int	swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void	swap_pager_init __P((void));
static void	swap_pager_sync __P((void));

struct pagerops swappagerops = {
	swap_pager_init,
	swap_pager_alloc,
	swap_pager_dealloc,
	swap_pager_getpages,
	swap_pager_putpages,
	swap_pager_haspage,
	swap_pager_sync
};

static int npendingio = NPENDINGIO;
static int dmmin;
int dmmax;

static __pure int
		swap_pager_block_index __P((vm_pindex_t pindex)) __pure2;
static __pure int
		swap_pager_block_offset __P((vm_pindex_t pindex)) __pure2;
static daddr_t *swap_pager_diskaddr __P((vm_object_t object,
					  vm_pindex_t pindex, int *valid));
static void	swap_pager_finish __P((swp_clean_t spc));
static void	swap_pager_freepage __P((vm_page_t m));
static void	swap_pager_free_swap __P((vm_object_t object));
static void	swap_pager_freeswapspace __P((vm_object_t object,
					      unsigned int from,
					      unsigned int to));
static int	swap_pager_getswapspace __P((vm_object_t object,
					     unsigned int amount,
					     daddr_t *rtval));
static void	swap_pager_iodone __P((struct buf *));
static void	swap_pager_iodone1 __P((struct buf *bp));
static void	swap_pager_reclaim __P((void));
static void	swap_pager_ridpages __P((vm_page_t *m, int count,
					 int reqpage));
static void	swap_pager_setvalid __P((vm_object_t object,
					 vm_offset_t offset, int valid));
static void	swapsizecheck __P((void));

#define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE)))
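
/*
 * A rough sketch of the thresholds, assuming 4K pages and 512-byte disk
 * blocks (so btodb(PAGE_SIZE) == 8): SWAPLOW trips below 512 pages (2MB)
 * of remaining swap; swapsizecheck() below declares "out of swap space"
 * under 128 pages (512KB) and clears the condition again above 192 pages
 * (768KB).  The 64-page gap provides hysteresis so that the message and
 * the swap_pager_full flag do not flap around the boundary.
 */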

static inline void
swapsizecheck()
{
	if (vm_swap_size < 128 * btodb(PAGE_SIZE)) {
		if (swap_pager_full == 0)
			printf("swap_pager: out of swap space\n");
		swap_pager_full = 1;
	} else if (vm_swap_size > 192 * btodb(PAGE_SIZE))
		swap_pager_full = 0;
}

static void
swap_pager_init()
{
	TAILQ_INIT(&swap_pager_object_list);
	TAILQ_INIT(&swap_pager_un_object_list);

	/*
	 * Initialize clean lists
	 */
	TAILQ_INIT(&swap_pager_inuse);
	TAILQ_INIT(&swap_pager_done);
	TAILQ_INIT(&swap_pager_free);
	swap_pager_free_count = 0;

	/*
	 * Calculate the swap allocation constants.
	 */
	dmmin = PAGE_SIZE / DEV_BSIZE;
	dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2;
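	/*
	 * For example, with 4K pages, 512-byte disk blocks, and an
	 * SWB_NPAGES of 8, this yields dmmin = 8 and dmmax = 128 disk
	 * blocks (one page and sixteen pages worth, respectively).
	 */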
}

void
swap_pager_swap_init()
{
	swp_clean_t spc;
	struct buf *bp;
	int i;

	/*
	 * KVAs are allocated here so that we don't need to keep doing
	 * kmem_alloc_pageable calls at runtime.
	 */
	for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) {
		spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * MAX_PAGEOUT_CLUSTER);
		if (!spc->spc_kva) {
			break;
		}
		spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_NOWAIT);
		if (!spc->spc_bp) {
			kmem_free_wakeup(pager_map, spc->spc_kva,
			    PAGE_SIZE * MAX_PAGEOUT_CLUSTER);
			break;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		swap_pager_free_count++;
	}
}

int
swap_pager_swp_alloc(object, wait)
	vm_object_t object;
	int wait;
{
	sw_blk_t swb;
	int nblocks;
	int i, j;

	nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES;
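	/*
	 * (Ceiling division: one sw_blk covers SWB_NPAGES pages, so e.g. a
	 * 10-page object with SWB_NPAGES == 8 gets two sw_blk structures.)
	 */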
	swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait);
	if (swb == NULL)
		return 1;

	for (i = 0; i < nblocks; i++) {
		swb[i].swb_valid = 0;
		swb[i].swb_locked = 0;
		for (j = 0; j < SWB_NPAGES; j++)
			swb[i].swb_block[j] = SWB_EMPTY;
	}

	object->un_pager.swp.swp_nblocks = nblocks;
	object->un_pager.swp.swp_allocsize = 0;
	object->un_pager.swp.swp_blocks = swb;
	object->un_pager.swp.swp_poip = 0;

	if (object->handle != NULL) {
		TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list);
	} else {
		TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
	}

	return 0;
}

/*
 * Allocate an object and associated resources.
 * Note that if we are called from the pageout daemon (handle == NULL)
 * we should not wait for memory, as that could result in a deadlock.
 */
static vm_object_t
swap_pager_alloc(handle, size, prot, offset)
	void *handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_ooffset_t offset;
{
	vm_object_t object;

	/*
	 * If this is a "named" anonymous region, look it up and use the
	 * object if it exists, otherwise allocate a new one.
	 */
	if (handle) {
		object = vm_pager_object_lookup(&swap_pager_object_list, handle);
		if (object != NULL) {
			vm_object_reference(object);
		} else {
			/*
			 * XXX - there is a race condition here. Two processes
			 * can request the same named object simultaneously,
			 * and if one blocks for memory, the result is a
			 * disaster.  Probably quite rare, but it is yet
			 * another reason to just rip support of "named
			 * anonymous regions" out altogether.
			 */
			object = vm_object_allocate(OBJT_SWAP,
				OFF_TO_IDX(offset + PAGE_MASK) + size);
			object->handle = handle;
			(void) swap_pager_swp_alloc(object, M_WAITOK);
		}
	} else {
		object = vm_object_allocate(OBJT_SWAP,
			OFF_TO_IDX(offset + PAGE_MASK) + size);
		(void) swap_pager_swp_alloc(object, M_WAITOK);
	}

	return (object);
}

/*
 * Return a pointer to the disk block slot associated with the given
 * object and page index; as a side effect, report through *valid
 * whether the block has actually been written.
 */
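/*
 * The lookup is a two-level affair: pindex / SWB_NPAGES selects the
 * sw_blk structure and pindex % SWB_NPAGES selects the slot within it.
 * For example, with SWB_NPAGES == 8, page index 19 lands in slot 3 of
 * sw_blk 2.
 */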

inline static daddr_t *
swap_pager_diskaddr(object, pindex, valid)
	vm_object_t object;
	vm_pindex_t pindex;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	ix = pindex / SWB_NPAGES;
	if ((ix >= object->un_pager.swp.swp_nblocks) ||
	    (pindex >= object->size)) {
		return (NULL);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;
	if (valid)
		*valid = swb->swb_valid & (1 << ix);
	return &swb->swb_block[ix];
}

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(object, offset, valid)
	vm_object_t object;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks)
		return;

	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = offset % SWB_NPAGES;
	if (valid)
		swb->swb_valid |= (1 << ix);
	else
		swb->swb_valid &= ~(1 << ix);
	return;
}

/*
 * this routine allocates swap space (via the rlist allocator, which
 * implements the fragmentation-minimization policy)
 */
static int
swap_pager_getswapspace(object, amount, rtval)
	vm_object_t object;
	unsigned int amount;
	daddr_t *rtval;
{
	unsigned location;

	vm_swap_size -= amount;
	if (!rlist_alloc(&swaplist, amount, &location)) {
		vm_swap_size += amount;
		return 0;
	} else {
		swapsizecheck();
		object->un_pager.swp.swp_allocsize += amount;
		*rtval = location;
		return 1;
	}
}

/*
 * this routine returns swap space to the rlist allocator
 */
static void
swap_pager_freeswapspace(object, from, to)
	vm_object_t object;
	unsigned int from;
	unsigned int to;
{
	rlist_free(&swaplist, from, to);
	vm_swap_size += (to - from) + 1;
	object->un_pager.swp.swp_allocsize -= (to - from) + 1;
	swapsizecheck();
}

/*
 * this routine frees swap blocks from a specified pager
 */
void
swap_pager_freespace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splbio();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1);
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
			*addr = SWB_EMPTY;
		}
	}
	splx(s);
}

/*
 * same as freespace, but don't actually free the blocks -- just clear
 * their valid bits, so the pages read as unwritten (demand-zero) next time
 */
void
swap_pager_dmzspace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	vm_pindex_t i;
	int s;

	s = splbio();
	for (i = start; i < start + size; i += 1) {
		int valid;
		daddr_t *addr = swap_pager_diskaddr(object, i, &valid);

		if (addr && *addr != SWB_EMPTY) {
			if (valid) {
				swap_pager_setvalid(object, i, 0);
			}
		}
	}
	splx(s);
}

static void
swap_pager_free_swap(object)
	vm_object_t object;
{
	register int i, j;
	register sw_blk_t swb;
	int first_block = 0, block_count = 0;
	int s;

	/*
	 * Free left over swap blocks
	 */
	s = splbio();
	for (i = 0, swb = object->un_pager.swp.swp_blocks;
	    i < object->un_pager.swp.swp_nblocks; i++, swb++) {
		for (j = 0; j < SWB_NPAGES; j++) {
			if (swb->swb_block[j] != SWB_EMPTY) {
				/*
				 * initially the length of the run is zero
				 */
				if (block_count == 0) {
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * if the new block can be included into the current run
				 */
				} else if (swb->swb_block[j] == first_block + block_count) {
					block_count += btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				/*
				 * terminate the previous run, and start a new one
				 */
				} else {
					swap_pager_freeswapspace(object, first_block,
					    (unsigned) first_block + block_count - 1);
					first_block = swb->swb_block[j];
					block_count = btodb(PAGE_SIZE);
					swb->swb_block[j] = SWB_EMPTY;
				}
			}
		}
	}

	if (block_count) {
		swap_pager_freeswapspace(object, first_block,
		    (unsigned) first_block + block_count - 1);
	}
	splx(s);
}


/*
 * swap_pager_reclaim frees up over-allocated space from all pagers.
 * This eliminates internal fragmentation due to allocation of space
 * for segments that are never swapped to.  It works in two phases:
 * the blocks to release are first collected without blocking (keeping
 * the queues consistent) and only then handed back via rlist_free.
 */

/*
 * Maximum number of blocks (pages) to reclaim per pass
 */
#define MAXRECLAIM 128

static void
swap_pager_reclaim()
{
	vm_object_t object;
	int i, j, k;
	int s;
	int reclaimcount;
	static struct {
		int address;
		vm_object_t object;
	} reclaims[MAXRECLAIM];
	static int in_reclaim;

	/*
	 * allow only one process to be in the swap_pager_reclaim subroutine
	 */
	s = splbio();
	if (in_reclaim) {
		tsleep(&in_reclaim, PSWP, "swrclm", 0);
		splx(s);
		return;
	}
	in_reclaim = 1;
	reclaimcount = 0;

	/* for each pager queue */
	for (k = 0; swp_qs[k]; k++) {

		object = TAILQ_FIRST(swp_qs[k]);
		while (object && (reclaimcount < MAXRECLAIM)) {

			/*
			 * see if any blocks associated with a pager have been
			 * allocated but not used (written)
			 */
			if ((object->flags & OBJ_DEAD) == 0 &&
				(object->paging_in_progress == 0)) {
				for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) {
					sw_blk_t swb = &object->un_pager.swp.swp_blocks[i];

					if (swb->swb_locked)
						continue;
					for (j = 0; j < SWB_NPAGES; j++) {
						if (swb->swb_block[j] != SWB_EMPTY &&
						    (swb->swb_valid & (1 << j)) == 0) {
							reclaims[reclaimcount].address = swb->swb_block[j];
							reclaims[reclaimcount++].object = object;
							swb->swb_block[j] = SWB_EMPTY;
							if (reclaimcount >= MAXRECLAIM)
								goto rfinished;
						}
					}
				}
			}
			object = TAILQ_NEXT(object, pager_object_list);
		}
	}

rfinished:

	/*
	 * free the blocks that have been added to the reclaim list
	 */
	for (i = 0; i < reclaimcount; i++) {
		swap_pager_freeswapspace(reclaims[i].object,
		    reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1);
	}
	splx(s);
	in_reclaim = 0;
	wakeup(&in_reclaim);
}


/*
 * swap_pager_copy copies blocks from one pager to another and
 * destroys the source pager
 */
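/*
 * For each destination page index, the transfer below handles three
 * cases: a stale (invalid) destination block is freed first; if the
 * destination slot is then empty, the source block is moved over (no
 * data copy is needed, only the block number); and any source block
 * still left, whether invalid or shadowed by a valid destination
 * block, is simply released.
 */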

void
swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, offset)
	vm_object_t srcobject;
	vm_pindex_t srcoffset;
	vm_object_t dstobject;
	vm_pindex_t dstoffset;
	vm_pindex_t offset;
{
	vm_pindex_t i;
	int origsize;
	int s;

	if (vm_swap_size)
		no_swap_space = 0;

	origsize = srcobject->un_pager.swp.swp_allocsize;

	/*
	 * remove the source object from the swap_pager internal queue
	 */
	if (srcobject->handle == NULL) {
		TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list);
	} else {
		TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list);
	}

	s = splbio();
	while (srcobject->un_pager.swp.swp_poip) {
		tsleep(srcobject, PVM, "spgout", 0);
	}
	splx(s);

	/*
	 * clean all of the pages that are currently active and finished
	 */
	swap_pager_sync();

	s = splbio();
	/*
	 * transfer source to destination
	 */
	for (i = 0; i < dstobject->size; i += 1) {
		int srcvalid, dstvalid;
		daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, i + offset + srcoffset,
						    &srcvalid);
		daddr_t *dstaddrp;

		/*
		 * see if the source has space allocated
		 */
		if (srcaddrp && *srcaddrp != SWB_EMPTY) {
			/*
			 * if the source is valid and the dest has no space,
			 * then copy the allocation from the source to the
			 * dest.
			 */
			if (srcvalid) {
				dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset,
							&dstvalid);
				/*
				 * if the dest block exists but is not valid,
				 * free it so that the source block can take
				 * its place.
				 */
				if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) {
					swap_pager_freeswapspace(dstobject, *dstaddrp,
						*dstaddrp + btodb(PAGE_SIZE) - 1);
					*dstaddrp = SWB_EMPTY;
				}
				if (dstaddrp && *dstaddrp == SWB_EMPTY) {
					*dstaddrp = *srcaddrp;
					*srcaddrp = SWB_EMPTY;
					dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE);
					srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE);
					swap_pager_setvalid(dstobject, i + dstoffset, 1);
				}
			}
			/*
			 * if the source is not empty at this point, then
			 * deallocate the space.
			 */
			if (*srcaddrp != SWB_EMPTY) {
				swap_pager_freeswapspace(srcobject, *srcaddrp,
					*srcaddrp + btodb(PAGE_SIZE) - 1);
				*srcaddrp = SWB_EMPTY;
			}
		}
	}
	splx(s);

	/*
	 * Free left over swap blocks
	 */
	swap_pager_free_swap(srcobject);

	if (srcobject->un_pager.swp.swp_allocsize) {
		printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n",
		    srcobject->un_pager.swp.swp_allocsize, origsize);
	}

	free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA);
	srcobject->un_pager.swp.swp_blocks = NULL;

	return;
}

static void
swap_pager_dealloc(object)
	vm_object_t object;
{
	int s;

	/*
	 * Remove from list right away so lookups will fail if we block for
	 * pageout completion.
	 */
	if (object->handle == NULL) {
		TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
	} else {
		TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list);
	}

	/*
	 * Wait for all pageouts to finish and remove all entries from
	 * cleaning list.
	 */

	s = splbio();
	while (object->un_pager.swp.swp_poip) {
		tsleep(object, PVM, "swpout", 0);
	}
	splx(s);

	swap_pager_sync();

	/*
	 * Free left over swap blocks
	 */
	swap_pager_free_swap(object);

	if (object->un_pager.swp.swp_allocsize) {
		printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n",
		    object->un_pager.swp.swp_allocsize);
	}
	/*
	 * Free swap management resources
	 */
	free(object->un_pager.swp.swp_blocks, M_VMPGDATA);
	object->un_pager.swp.swp_blocks = NULL;
}

static inline __pure int
swap_pager_block_index(pindex)
	vm_pindex_t pindex;
{
	return (pindex / SWB_NPAGES);
}

static inline __pure int
swap_pager_block_offset(pindex)
	vm_pindex_t pindex;
{
	return (pindex % SWB_NPAGES);
}

/*
 * swap_pager_haspage returns TRUE if the pager has data that has
 * been written out.
 */
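/*
 * The before/after counts report how many immediately adjacent pages
 * are also resident on swap in one contiguous disk run: slot tix is
 * part of the run iff it is valid and swb_block[tix] equals
 * swb_block[ix] + (tix - ix) * (PAGE_SIZE/DEV_BSIZE).  The scan is
 * confined to the current sw_blk, so at most SWB_NPAGES - 1 neighbors
 * are reported.
 */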
static boolean_t
swap_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	register sw_blk_t swb;
	int ix;

	if (before != NULL)
		*before = 0;
	if (after != NULL)
		*after = 0;
	ix = pindex / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks) {
		return (FALSE);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;

	if (swb->swb_block[ix] != SWB_EMPTY) {

		if (swb->swb_valid & (1 << ix)) {
			int tix;

			if (before) {
				for (tix = ix - 1; tix >= 0; --tix) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] +
						(ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*before)++;
				}
			}

			if (after) {
				for (tix = ix + 1; tix < SWB_NPAGES; tix++) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] -
						(tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
						swb->swb_block[ix])
						break;
					(*after)++;
				}
			}

			return TRUE;
		}
	}
	return (FALSE);
}

/*
 * swap_pager_freepage is a convenience routine that clears the busy
 * bit and deallocates a page.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
 */
static void
swap_pager_ridpages(m, count, reqpage)
	vm_page_t *m;
	int count;
	int reqpage;
{
	int i;

	for (i = 0; i < count; i++)
		if (i != reqpage)
			swap_pager_freepage(m[i]);
}

/*
 * swap_pager_iodone1 is the completion routine for both reads and async writes
 */
static void
swap_pager_iodone1(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup(bp);
}

static int
swap_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count, reqpage;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i;
	boolean_t rv;
	vm_offset_t kva, off[count];
	swp_clean_t spc;
	vm_pindex_t paging_offset;
	int reqaddr[count];
	int sequential;

	int first, last;
	int failed;
	int reqdskregion;

	object = m[reqpage]->object;
	paging_offset = OFF_TO_IDX(object->paging_offset);
	sequential = (m[reqpage]->pindex == (object->last_read + 1));

	for (i = 0; i < count; i++) {
		vm_pindex_t fidx = m[i]->pindex + paging_offset;
		int ix = swap_pager_block_index(fidx);

		if (ix >= object->un_pager.swp.swp_nblocks) {
			int j;

			if (i <= reqpage) {
				swap_pager_ridpages(m, count, reqpage);
				return (VM_PAGER_FAIL);
			}
			for (j = i; j < count; j++) {
				swap_pager_freepage(m[j]);
			}
			count = i;
			break;
		}
		swb[i] = &object->un_pager.swp.swp_blocks[ix];
		off[i] = swap_pager_block_offset(fidx);
		reqaddr[i] = swb[i]->swb_block[off[i]];
	}

	/* make sure the block backing the required page actually exists */

	if (reqaddr[reqpage] == SWB_EMPTY ||
	    (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) {
		swap_pager_ridpages(m, count, reqpage);
		return (VM_PAGER_FAIL);
	}
	reqdskregion = reqaddr[reqpage] / dmmax;
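	/*
	 * (Swap space is carved into dmmax-sized regions, historically the
	 * unit by which swap is interleaved across devices, so the
	 * clustering below refuses to build a run that crosses a region
	 * boundary.)
	 */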

	/*
	 * search backwards for the first contiguous page to transfer
	 */
	failed = 0;
	first = 0;
	for (i = reqpage - 1; i >= 0; --i) {
		if (sequential || failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			swap_pager_freepage(m[i]);
			if (first == 0)
				first = i + 1;
		}
	}
	/*
	 * search forwards for the last contiguous page to transfer
	 */
	failed = 0;
	last = count;
	for (i = reqpage + 1; i < count; i++) {
		if (failed || (reqaddr[i] == SWB_EMPTY) ||
		    (swb[i]->swb_valid & (1 << off[i])) == 0 ||
		    (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) ||
		    ((reqaddr[i] / dmmax) != reqdskregion)) {
			failed = 1;
			swap_pager_freepage(m[i]);
			if (last == count)
				last = i;
		}
	}

	count = last;
	if (first != 0) {
		for (i = first; i < count; i++) {
			m[i - first] = m[i];
			reqaddr[i - first] = reqaddr[i];
			off[i - first] = off[i];
		}
		count -= first;
		reqpage -= first;
	}
	++swb[reqpage]->swb_locked;

	/*
	 * At this point:
	 *	"m" is a pointer to the array of vm_page_t's for paging I/O,
	 *	"count" is the number of vm_page_t entries represented by "m",
	 *	"object" is the vm_object_t for the I/O, and
	 *	"reqpage" is the index into "m" for the page actually faulted.
	 */

	spc = NULL;
	if ((count == 1) && ((spc = TAILQ_FIRST(&swap_pager_free)) != NULL)) {
		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
		swap_pager_free_count--;
		kva = spc->spc_kva;
		bp = spc->spc_bp;
		bzero(bp, sizeof *bp);
		bp->b_spc = spc;
		bp->b_vnbufs.le_next = NOLIST;
	} else {
		/*
		 * Get a swap buffer header to perform the IO
		 */
		bp = getpbuf();
		kva = (vm_offset_t) bp->b_data;
	}

	/*
	 * map our page(s) into kva for input
	 */
	pmap_qenter(kva, m, count);

	bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING;
	bp->b_iodone = swap_pager_iodone1;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_un.b_addr = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	bp->b_bcount = PAGE_SIZE * count;
	bp->b_bufsize = PAGE_SIZE * count;

	pbgetvp(swapdev_vp, bp);

	cnt.v_swapin++;
	cnt.v_swappgsin += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);

	/*
	 * wait for the sync I/O to complete
	 */
	s = splbio();
	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep(bp, PVM, "swread", hz*20)) {
			printf("swap_pager: indefinite wait buffer: device: %d, blkno: %d, size: %d\n",
				bp->b_dev, bp->b_blkno, bp->b_bcount);
		}
	}

	if (bp->b_flags & B_ERROR) {
		printf("swap_pager: I/O error - pagein failed; blkno %d, size %d, error %d\n",
		    bp->b_blkno, bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	/*
	 * relpbuf does this, but we maintain our own buffer list also...
	 */
	if (bp->b_vp)
		pbrelvp(bp);

	splx(s);
	swb[reqpage]->swb_locked--;

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, count);

	if (spc) {
		m[reqpage]->object->last_read = m[reqpage]->pindex;
		if (bp->b_flags & B_WANTED)
			wakeup(bp);
		/*
		 * if we have used an spc, we need to free it.
		 */
		if (bp->b_rcred != NOCRED)
			crfree(bp->b_rcred);
		if (bp->b_wcred != NOCRED)
			crfree(bp->b_wcred);
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		swap_pager_free_count++;
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			wakeup(&swap_pager_free);
		}
		if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT)
			pagedaemon_wakeup();
		swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT);
		if (rv == VM_PAGER_OK) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(m[reqpage]));
			m[reqpage]->valid = VM_PAGE_BITS_ALL;
			m[reqpage]->dirty = 0;
		}
	} else {
		/*
		 * release the physical I/O buffer
		 */
		relpbuf(bp);
		/*
		 * finish up input if everything is ok
		 */
		if (rv == VM_PAGER_OK) {
			for (i = 0; i < count; i++) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->dirty = 0;
				m[i]->flags &= ~PG_ZERO;
				if (i != reqpage) {
					/*
					 * whether or not to leave the page
					 * activated is up in the air, but we
					 * should put the page on a page queue
					 * somewhere. (it already is in the
					 * object). After some empirical
					 * results, it is best to deactivate
					 * the readahead pages.
					 */
					vm_page_deactivate(m[i]);

					/*
					 * just in case someone was asking for
					 * this page we now tell them that it
					 * is ok to use
					 */
					m[i]->valid = VM_PAGE_BITS_ALL;
					PAGE_WAKEUP(m[i]);
				}
			}

			m[reqpage]->object->last_read = m[count-1]->pindex;

			/*
			 * If we're out of swap space, then attempt to free
			 * some whenever multiple pages are brought in. We
			 * must set the dirty bits so that the page contents
			 * will be preserved.
			 */
			if (SWAPLOW) {
				for (i = 0; i < count; i++) {
					m[i]->dirty = VM_PAGE_BITS_ALL;
				}
				swap_pager_freespace(object, m[0]->pindex + paging_offset, count);
			}
		} else {
			swap_pager_ridpages(m, count, reqpage);
		}
	}
	return (rv);
}

int
swap_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	register struct buf *bp;
	sw_blk_t swb[count];
	register int s;
	int i, j, ix;
	boolean_t rv;
	vm_offset_t kva, off, fidx;
	swp_clean_t spc;
	vm_pindex_t paging_pindex;
	int reqaddr[count];
	int failed;

	if (vm_swap_size)
		no_swap_space = 0;
	if (no_swap_space) {
		for (i = 0; i < count; i++)
			rtvals[i] = VM_PAGER_FAIL;
		return VM_PAGER_FAIL;
	}
	spc = NULL;

	object = m[0]->object;
	paging_pindex = OFF_TO_IDX(object->paging_offset);

	failed = 0;
	for (j = 0; j < count; j++) {
		fidx = m[j]->pindex + paging_pindex;
		ix = swap_pager_block_index(fidx);
		swb[j] = 0;
		if (ix >= object->un_pager.swp.swp_nblocks) {
			rtvals[j] = VM_PAGER_FAIL;
			failed = 1;
			continue;
		} else {
			rtvals[j] = VM_PAGER_OK;
		}
		swb[j] = &object->un_pager.swp.swp_blocks[ix];
		swb[j]->swb_locked++;
		if (failed) {
			rtvals[j] = VM_PAGER_FAIL;
			continue;
		}
		off = swap_pager_block_offset(fidx);
		reqaddr[j] = swb[j]->swb_block[off];
		if (reqaddr[j] == SWB_EMPTY) {
			daddr_t blk;
			int tries;
			int ntoget;

			tries = 0;
			s = splbio();

			/*
			 * if any other pages have been allocated in this
			 * block, we only try to get one page.
			 */
			for (i = 0; i < SWB_NPAGES; i++) {
				if (swb[j]->swb_block[i] != SWB_EMPTY)
					break;
			}

			ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1;
			/*
			 * this code is a little conservative, but works (the
			 * intent of this code is to allocate small chunks
			 * for small objects)
			 */
			if ((off == 0) && ((fidx + ntoget) > object->size)) {
				ntoget = object->size - fidx;
			}
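			/*
			 * (For example, a 5-page object with SWB_NPAGES == 8
			 * would have its first allocation trimmed from 8 to
			 * 5 pages here, rather than tying up 3 pages of swap
			 * it can never use.)
			 */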
	retrygetspace:
			if (!swap_pager_full && ntoget > 1 &&
			    swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE),
				&blk)) {

				for (i = 0; i < ntoget; i++) {
					swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i;
					swb[j]->swb_valid = 0;
				}

				reqaddr[j] = swb[j]->swb_block[off];
			} else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE),
				&swb[j]->swb_block[off])) {
				/*
				 * if the allocation has failed, we try to
				 * reclaim space and retry.
				 */
				if (++tries == 1) {
					swap_pager_reclaim();
					goto retrygetspace;
				}
				rtvals[j] = VM_PAGER_AGAIN;
				failed = 1;
				swap_pager_full = 1;
			} else {
				reqaddr[j] = swb[j]->swb_block[off];
				swb[j]->swb_valid &= ~(1 << off);
			}
			splx(s);
		}
	}

	/*
	 * verify that the blocks to be written form one contiguous run;
	 * every page after the first discontinuity is failed with
	 * VM_PAGER_AGAIN
	 */
	failed = 0;
	for (i = 0; i < count; i++) {
		if (failed ||
			(reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) ||
		    ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) ||
		    (rtvals[i] != VM_PAGER_OK)) {
			failed = 1;
			if (rtvals[i] == VM_PAGER_OK)
				rtvals[i] = VM_PAGER_AGAIN;
		}
	}

	for (i = 0; i < count; i++) {
		if (rtvals[i] != VM_PAGER_OK) {
			if (swb[i])
				--swb[i]->swb_locked;
		}
	}

	for (i = 0; i < count; i++)
		if (rtvals[i] != VM_PAGER_OK)
			break;

	if (i == 0) {
		return VM_PAGER_AGAIN;
	}
	count = i;
	for (i = 0; i < count; i++) {
		if (reqaddr[i] == SWB_EMPTY) {
			printf("I/O to empty block???? -- pindex: %d, i: %d\n",
				m[i]->pindex, i);
		}
	}

	/*
	 * For synchronous writes, we clean up all completed async pageouts.
	 */
	if (sync == TRUE) {
		swap_pager_sync();
	}
	kva = 0;

	/*
	 * get a swap pager clean data structure, blocking until we get it
	 */
	if (swap_pager_free_count <= 3) {
		s = splbio();
		if (curproc == pageproc) {
retryfree:
			/*
			 * pageout daemon needs a swap control block
			 */
			swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT|SWAP_FREE_NEEDED;
			/*
			 * if it does not get one within a short time, then
			 * there is a potential deadlock, so we go on trying
			 * to free pages.  It is important to block here as
			 * opposed to returning, thereby allowing the pageout
			 * daemon to continue.  It is likely that the pageout
			 * daemon will start suboptimally reclaiming vnode
			 * backed pages if we don't block.  Since the I/O
			 * subsystem is probably already fully utilized, we
			 * might as well wait.
			 */
			if (tsleep(&swap_pager_free, PVM, "swpfre", hz/5)) {
				swap_pager_sync();
				if (swap_pager_free_count <= 3) {
					splx(s);
					return VM_PAGER_AGAIN;
				}
			} else {
				/*
				 * we make sure that pageouts aren't taking up
				 * all of the free swap control blocks.
				 */
				swap_pager_sync();
				if (swap_pager_free_count <= 3) {
					goto retryfree;
				}
			}
		} else {
			pagedaemon_wakeup();
			while (swap_pager_free_count <= 3) {
				swap_pager_needflags |= SWAP_FREE_NEEDED;
				tsleep(&swap_pager_free, PVM, "swpfre", 0);
				pagedaemon_wakeup();
			}
		}
		splx(s);
	}
	spc = TAILQ_FIRST(&swap_pager_free);
	if (spc == NULL)
		panic("swap_pager_putpages: free queue is empty, %d expected\n", swap_pager_free_count);
	TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
	swap_pager_free_count--;

	kva = spc->spc_kva;

	/*
	 * map our page(s) into kva for I/O
	 */
	pmap_qenter(kva, m, count);

	/*
	 * mark each page's block valid and unlock the swb entries
	 */
	for (i = 0; i < count; i++) {
		fidx = m[i]->pindex + paging_pindex;
		off = swap_pager_block_offset(fidx);
		/*
		 * set the valid bit
		 */
		swb[i]->swb_valid |= (1 << off);
		/*
		 * and unlock the data structure
		 */
		swb[i]->swb_locked--;
	}

	/*
	 * Get a swap buffer header and perform the IO
	 */
	bp = spc->spc_bp;
	bzero(bp, sizeof *bp);
	bp->b_spc = spc;
	bp->b_vnbufs.le_next = NOLIST;

	bp->b_flags = B_BUSY | B_PAGING;
	bp->b_proc = &proc0;	/* XXX (but without B_PHYS set this is ok) */
	bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
	if (bp->b_rcred != NOCRED)
		crhold(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crhold(bp->b_wcred);
	bp->b_data = (caddr_t) kva;
	bp->b_blkno = reqaddr[0];
	pbgetvp(swapdev_vp, bp);

	bp->b_bcount = PAGE_SIZE * count;
	bp->b_bufsize = PAGE_SIZE * count;
	swapdev_vp->v_numoutput++;

	/*
	 * If this is an async write we set up additional buffer fields and
	 * place a "cleaning" entry on the inuse queue.
	 */
	s = splbio();
	if (sync == FALSE) {
		spc->spc_flags = 0;
		spc->spc_object = object;
		for (i = 0; i < count; i++)
			spc->spc_m[i] = m[i];
		spc->spc_count = count;
		/*
		 * the completion routine for async writes
		 */
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;
		object->un_pager.swp.swp_poip++;
		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
	} else {
		object->un_pager.swp.swp_poip++;
		bp->b_flags |= B_CALL;
		bp->b_iodone = swap_pager_iodone1;
	}

	cnt.v_swapout++;
	cnt.v_swappgsout += count;
	/*
	 * perform the I/O
	 */
	VOP_STRATEGY(bp);
	if (sync == FALSE) {
		if ((bp->b_flags & B_DONE) == B_DONE) {
			swap_pager_sync();
		}
		splx(s);
		for (i = 0; i < count; i++) {
			rtvals[i] = VM_PAGER_PEND;
		}
		return VM_PAGER_PEND;
	}
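	/*
	 * (In the async case the pages stay busy; swap_pager_iodone() moves
	 * the spc to the done list when the write finishes, and a later
	 * swap_pager_sync() performs the cleanup that the synchronous path
	 * does inline below.)
	 */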
	/*
	 * wait for the sync I/O to complete
	 */
	while ((bp->b_flags & B_DONE) == 0) {
		tsleep(bp, PVM, "swwrt", 0);
	}
	if (bp->b_flags & B_ERROR) {
		printf("swap_pager: I/O error - pageout failed; blkno %d, size %d, error %d\n",
		    bp->b_blkno, bp->b_bcount, bp->b_error);
		rv = VM_PAGER_ERROR;
	} else {
		rv = VM_PAGER_OK;
	}

	object->un_pager.swp.swp_poip--;
	if (object->un_pager.swp.swp_poip == 0)
		wakeup(object);

	if (bp->b_vp)
		pbrelvp(bp);
	if (bp->b_flags & B_WANTED)
		wakeup(bp);

	splx(s);

	/*
	 * remove the mapping for kernel virtual
	 */
	pmap_qremove(kva, count);

	/*
	 * if we have written the page, then indicate that the page is clean.
	 */
	if (rv == VM_PAGER_OK) {
		for (i = 0; i < count; i++) {
			if (rtvals[i] == VM_PAGER_OK) {
				pmap_clear_modify(VM_PAGE_TO_PHYS(m[i]));
				m[i]->dirty = 0;
				/*
				 * optimization, if a page has been read
				 * during the pageout process, we activate it.
				 */
				if ((m[i]->queue != PQ_ACTIVE) &&
				    ((m[i]->flags & (PG_WANTED|PG_REFERENCED)) ||
				    pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))) {
					vm_page_activate(m[i]);
				}
			}
		}
	} else {
		for (i = 0; i < count; i++) {
			rtvals[i] = rv;
		}
	}

	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
	TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
	swap_pager_free_count++;
	if (swap_pager_needflags & SWAP_FREE_NEEDED) {
		wakeup(&swap_pager_free);
	}
	if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT)
		pagedaemon_wakeup();
	swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT);
	return (rv);
}

static void
swap_pager_sync()
{
	register swp_clean_t spc, tspc;
	register int s;

	tspc = NULL;
	if (TAILQ_FIRST(&swap_pager_done) == NULL)
		return;
	for (;;) {
		s = splbio();
		/*
		 * Lookup and removal from the done list must be done at
		 * splbio() to avoid conflicts with swap_pager_iodone.
		 */
		while ((spc = TAILQ_FIRST(&swap_pager_done)) != 0) {
			pmap_qremove(spc->spc_kva, spc->spc_count);
			swap_pager_finish(spc);
			TAILQ_REMOVE(&swap_pager_done, spc, spc_list);
			goto doclean;
		}

		/*
		 * No operations done, that's all we can do for now.
		 */

		splx(s);
		break;

		/*
		 * Return the cleaning entry to the free list and wake up
		 * anyone waiting for one.
		 */
doclean:
		if (tspc && tspc == spc) {
			tspc = NULL;
		}
		spc->spc_flags = 0;
		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
		swap_pager_free_count++;
		if (swap_pager_needflags & SWAP_FREE_NEEDED) {
			wakeup(&swap_pager_free);
		}
		if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT)
			pagedaemon_wakeup();
		swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT);
		splx(s);
	}

	return;
}

void
swap_pager_finish(spc)
	register swp_clean_t spc;
{
	vm_object_t object = spc->spc_m[0]->object;
	int i;

	object->paging_in_progress -= spc->spc_count;
	if ((object->paging_in_progress == 0) &&
	    (object->flags & OBJ_PIPWNT)) {
		object->flags &= ~OBJ_PIPWNT;
		wakeup(object);
	}

	/*
	 * If no error, mark as clean and inform the pmap system. If error,
	 * mark as dirty so we will try again. (XXX could get stuck doing
	 * this, should give up after a while)
	 */
	if (spc->spc_flags & SPC_ERROR) {
		for (i = 0; i < spc->spc_count; i++) {
			printf("swap_pager_finish: I/O error, clean of page %lx failed\n",
			    (u_long) VM_PAGE_TO_PHYS(spc->spc_m[i]));
		}
	} else {
		for (i = 0; i < spc->spc_count; i++) {
			pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i]));
			spc->spc_m[i]->dirty = 0;
			if ((spc->spc_m[i]->queue != PQ_ACTIVE) &&
			    ((spc->spc_m[i]->flags & PG_WANTED) || pmap_is_referenced(VM_PAGE_TO_PHYS(spc->spc_m[i]))))
				vm_page_activate(spc->spc_m[i]);
		}
	}

	for (i = 0; i < spc->spc_count; i++) {
		/*
		 * we wakeup any processes that are waiting on these pages.
		 */
		PAGE_WAKEUP(spc->spc_m[i]);
	}
	nswiodone -= spc->spc_count;

	return;
}

/*
 * swap_pager_iodone
 */
static void
swap_pager_iodone(bp)
	register struct buf *bp;
{
	register swp_clean_t spc;
	int s;

	s = splbio();
	spc = (swp_clean_t) bp->b_spc;
	TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
	TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list);
	if (bp->b_flags & B_ERROR) {
		spc->spc_flags |= SPC_ERROR;
		printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n",
		    (bp->b_flags & B_READ) ? "pagein" : "pageout",
		    (u_long) bp->b_blkno, bp->b_bcount, bp->b_error);
	}

	if (bp->b_vp)
		pbrelvp(bp);

/*
	if (bp->b_flags & B_WANTED)
*/
		wakeup(bp);

	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	nswiodone += spc->spc_count;
	if (--spc->spc_object->un_pager.swp.swp_poip == 0) {
		wakeup(spc->spc_object);
	}
	if ((swap_pager_needflags & SWAP_FREE_NEEDED) ||
	    TAILQ_FIRST(&swap_pager_inuse) == 0) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED;
		wakeup(&swap_pager_free);
	}

	if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) {
		swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT;
		pagedaemon_wakeup();
	}

	if (vm_pageout_pages_needed) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}
	if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) ||
	    ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min &&
	    nswiodone + cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min)) {
		pagedaemon_wakeup();
	}
	splx(s);
}
1665