/* /freebsd/sys/kern/vfs_bio.c (revision afe61c15161c324a7af299a9b8457aba5afc92db) */
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
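		/*
		 * Each buffer header gets a MAXBSIZE window of pageable
		 * kernel VA from buffer_map; physical pages are attached
		 * and released later by allocbuf() through the
		 * vm_hold_load_pages()/vm_hold_free_pages() helpers.
		 */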
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}
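
/*
 * A sketch of typical bread() use (not code from this file): a file
 * system read path does roughly
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... copy data out of bp->b_data ...
 *	brelse(bp);
 */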

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

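	/*
	 * Fire off an asynchronous read for each requested read-ahead
	 * block that is not already resident; those buffers are released
	 * by biodone() when their I/O completes.
	 */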
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by biodone()
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

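	/*
	 * Account for the write: a delayed-write buffer was already charged
	 * to the process that dirtied it, so just move it back to the
	 * vnode's clean list; otherwise charge the output to the current
	 * process.
	 */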
	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(struct vop_bwrite_args *ap)
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

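	/*
	 * A buffer destined for a tape-like device (B_TAPE) cannot usefully
	 * be delayed, so start the write immediately instead of marking the
	 * buffer dirty.
	 */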
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}
	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory (just an empty header) */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
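	/* buffers with junk contents */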
	} else if (bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	x = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if ((bp = bufqueues[QUEUE_AGE].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

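	/*
	 * The chosen buffer holds stale but clean data.  Cut it loose from
	 * its old vnode and drop any credentials it still references before
	 * giving it a new identity below.
	 */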
	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
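		/*
		 * Sanity check: every header on a hash chain must lie
		 * within the static buf[] array.
		 */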
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				(u_long)bp, (int)(bh - bufhashtbl));
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if ((bp = incore(vp, blkno)) != NULL) {
		if (bp->b_flags & B_BUSY) {
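			/*
			 * Someone else owns the buffer; note that we want
			 * it and sleep until brelse() or biodone() wakes
			 * us, then start the lookup over.
			 */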
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

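	/*
	 * The underlying storage is always a whole number of pages, so the
	 * comparison is done in page-rounded terms; b_bcount continues to
	 * carry the caller's logical size.
	 */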
	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
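	/*
	 * On error, invalidate the buffer and move it to the invalid hash
	 * chain so that a later incore()/getblk() cannot find its stale
	 * contents.
	 */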
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
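		/*
		 * The completion routine is expected to dispose of the
		 * buffer, so bp is not touched again here.
		 */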
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

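/*
 * The update daemon: every vfs_update_interval seconds, or sooner if
 * vfs_update_wakeup is signalled, sync the file systems.
 */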
void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * These routines are not in the correct place (yet).
 * Also, they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

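	/*
	 * Allocate and wire a physical page for each page of the kernel
	 * VA range and enter it into the kernel pmap, retrying via VM_WAIT
	 * when no page is immediately available.
	 */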
	for (pg = from; pg < to; pg += PAGE_SIZE) {
	tryagain:
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p),
			VM_PROT_READ|VM_PROT_WRITE, 1);
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

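	/*
	 * Unmap and free the physical page behind each page of the kernel
	 * VA range; a page with no physical backing is simply reported.
	 */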
	for (pg = from; pg < to; pg += PAGE_SIZE) {
		vm_offset_t pa;
		pa = pmap_kextract(pg);
		if (!pa) {
			printf("No pa for va: %x\n", pg);
		} else {
			p = PHYS_TO_VM_PAGE(pa);
			pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE);
			vm_page_free(p);
		}
	}
}

void
bufstats()
{
}