/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.8 1994/08/08 15:40:59 wollman Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers, calculated elsewhere */
struct swqueue bswlist;
struct	buf *bclnlist;		/* Head of cleaned page list. */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}
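
/*
 * Note: kmem_alloc_pageable() above reserves only kernel virtual address
 * space (MAXBSIZE per buffer header) in buffer_map; no physical pages are
 * attached yet.  Physical memory is wired in and released on demand by
 * allocbuf() through vm_hold_load_pages()/vm_hold_free_pages() below.
 */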

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}
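
/*
 * Usage sketch (hypothetical caller, not part of this file): a typical
 * filesystem read path pairs bread() with brelse(), releasing the buffer
 * on the error path as well as after a successful read:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... consume bp->b_data ...
 *	brelse(bp);
 */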

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}
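
/*
 * Usage sketch (hypothetical values): breadn() takes parallel arrays of
 * read-ahead block numbers and sizes; only the primary block is waited
 * on, the read-ahead blocks are started B_ASYNC and released by biodone():
 *
 *	daddr_t rablks[2] = { lblkno + 1, lblkno + 2 };
 *	int rasizes[2] = { bsize, bsize };
 *
 *	error = breadn(vp, lblkno, bsize, rablks, rasizes, 2, NOCRED, &bp);
 */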

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
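
/*
 * Summary of the three write interfaces above: bwrite() starts the I/O
 * and, unless B_ASYNC was set, sleeps in biowait() and releases the
 * buffer itself; bdwrite() merely marks the buffer B_DELWRI and releases
 * it, deferring the physical write; bawrite() sets B_ASYNC so that
 * biodone() releases the buffer via brelse() when the write completes.
 */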

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}
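
/*
 * Note for callers: when no buffer is available, getnewbuf() sleeps on
 * needsbuffer and then returns NULL instead of retrying internally;
 * callers such as getblk() and geteblk() below loop until they obtain a
 * buffer.
 */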

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}
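
/*
 * Sizing example (assuming a 4096-byte PAGE_SIZE): growing a fresh
 * buffer with allocbuf(bp, 6144) rounds the request up to newbsize =
 * 8192, wires two pages behind bp->b_data, and leaves b_bufsize = 8192
 * while b_bcount = 6144 records the caller's logical size.
 */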

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}
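
/*
 * Sketch of the B_CALL hook (hypothetical caller, not from this file):
 * code wanting completion notification sets b_iodone and B_CALL before
 * starting the I/O, and biodone() then invokes the function, possibly at
 * interrupt time:
 *
 *	bp->b_iodone = mydone;		(mydone is a hypothetical callback)
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	VOP_STRATEGY(bp);
 */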

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

int vfs_update_interval = 30;

void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}
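
/*
 * Sketch (an assumption, not shown in this file): another subsystem can
 * force an early sync pass by waking the channel vfs_update() sleeps on:
 *
 *	vfs_update_wakeup = 1;
 *	wakeup((caddr_t)&vfs_update_wakeup);
 */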

/*
 * These routines are not in the correct place (yet).
 * Note that they work *only* for the kernel_pmap.
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

	tryagain:
		if (cnt.v_free_count <= cnt.v_free_reserved) {
			VM_WAIT;
			goto tryagain;
		}

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}

void
bufstats()
{
}