xref: /freebsd/sys/kern/vfs_bio.c (revision 16f62314cdee3347833cdcbe2f2a8fbacea1e5b5)
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.5 1994/08/04 19:43:13 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

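	/*
	 * Fire off an asynchronous read for each requested read-ahead
	 * block that is not already in the cache.
	 */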
	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

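	/*
	 * Account for the write.  A delayed write being pushed out was
	 * already charged to the process by bdwrite(), so just move the
	 * buffer back onto its vnode's clean list via reassignbuf();
	 * otherwise charge the write to the current process now.
	 */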
	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(struct vop_bwrite_args *ap)
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

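	/*
	 * No empty headers available; reclaim a buffer that already has
	 * storage attached, taking from the AGE queue before the LRU queue.
	 */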
tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
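
	/*
	 * Reinitialize the header as a fresh, busy buffer with no device,
	 * vnode, or credentials associated with it.
	 */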
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				(u_long)bp, (int)(bh - bufhashtbl));
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
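		/* Cache hit: mark the buffer busy (and cached) and pull it off its free list. */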
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

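	/*
	 * Buffer storage is managed in whole pages.  If the rounded size is
	 * unchanged there is nothing to remap; if it shrinks, the trailing
	 * pages are freed; if it grows, new pages are wired and mapped in.
	 */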
	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
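
	/*
	 * On error, invalidate the buffer and move it onto the invalid hash
	 * chain so the bad data cannot be found by a later cache lookup.
	 */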
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  brelse()
	 * checks for B_WANTED and does the wakeup there if necessary, so
	 * there is no need for a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

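/*
 * Count the buffers sitting on the locked queue.
 */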
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

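/*
 * The vfs update daemon: flush dirty buffers with sync() every
 * vfs_update_interval seconds, or sooner when vfs_update_wakeup
 * is signalled.
 */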
void
vfs_update()
{
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * These routines are not in the correct place (yet);
 * note that they work only for kernel_pmap.
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

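		/*
		 * Do not dip into the page reserve; wait for free memory
		 * to be replenished and then retry the allocation.
		 */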
	tryagain:
		if (cnt.v_free_count <= cnt.v_free_reserved) {
			VM_WAIT;
			goto tryagain;
		}

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
	pmap_update();
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
	pmap_update();
}

void
bufstats()
{
}