vfs_bio.c (becbad1f6e18fec7c3bf286778a766ffca4457be) → vfs_bio.c (21fae96123f71665f4325f1f69b5b99a24af6c4b)
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.

--- 49 unchanged lines hidden ---

58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.

--- 49 unchanged lines hidden ---

58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/smp.h>
66#include <sys/sysctl.h>
67#include <sys/sysproto.h>
68#include <sys/vmem.h>
69#include <sys/vmmeter.h>
70#include <sys/vnode.h>
71#include <sys/watchdog.h>
72#include <geom/geom.h>
73#include <vm/vm.h>

--- 21 unchanged lines hidden ---

95};
96
97static struct buf *buf; /* buffer header pool */
98extern struct buf *swbuf; /* Swap buffer header pool. */
99caddr_t unmapped_buf;
100
101/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
102struct proc *bufdaemonproc;
67#include <sys/sysctl.h>
68#include <sys/sysproto.h>
69#include <sys/vmem.h>
70#include <sys/vmmeter.h>
71#include <sys/vnode.h>
72#include <sys/watchdog.h>
73#include <geom/geom.h>
74#include <vm/vm.h>

--- 21 unchanged lines hidden ---

96};
97
98static struct buf *buf; /* buffer header pool */
99extern struct buf *swbuf; /* Swap buffer header pool. */
100caddr_t unmapped_buf;
101
102/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
103struct proc *bufdaemonproc;
104struct proc *bufspacedaemonproc;
103
104static int inmem(struct vnode *vp, daddr_t blkno);
105static void vm_hold_free_pages(struct buf *bp, int newbsize);
106static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
107 vm_offset_t to);
108static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
109static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
110 vm_page_t m);
111static void vfs_clean_pages_dirty_buf(struct buf *bp);
112static void vfs_setdirty_locked_object(struct buf *bp);
113static void vfs_vmio_invalidate(struct buf *bp);
114static void vfs_vmio_truncate(struct buf *bp, int npages);
115static void vfs_vmio_extend(struct buf *bp, int npages, int size);
116static int vfs_bio_clcheck(struct vnode *vp, int size,
117 daddr_t lblkno, daddr_t blkno);
118static int buf_flush(struct vnode *vp, int);
105
106static int inmem(struct vnode *vp, daddr_t blkno);
107static void vm_hold_free_pages(struct buf *bp, int newbsize);
108static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
109 vm_offset_t to);
110static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
111static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
112 vm_page_t m);
113static void vfs_clean_pages_dirty_buf(struct buf *bp);
114static void vfs_setdirty_locked_object(struct buf *bp);
115static void vfs_vmio_invalidate(struct buf *bp);
116static void vfs_vmio_truncate(struct buf *bp, int npages);
117static void vfs_vmio_extend(struct buf *bp, int npages, int size);
118static int vfs_bio_clcheck(struct vnode *vp, int size,
119 daddr_t lblkno, daddr_t blkno);
120static int buf_flush(struct vnode *vp, int);
121static int buf_recycle(bool);
122static int buf_scan(bool);
119static int flushbufqueues(struct vnode *, int, int);
120static void buf_daemon(void);
121static void bremfreel(struct buf *bp);
122static __inline void bd_wakeup(void);
123static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
123static int flushbufqueues(struct vnode *, int, int);
124static void buf_daemon(void);
125static void bremfreel(struct buf *bp);
126static __inline void bd_wakeup(void);
127static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
128static void bufkva_reclaim(vmem_t *, int);
129static void bufkva_free(struct buf *);
130static int buf_import(void *, void **, int, int);
131static void buf_release(void *, void **, int);
132
124#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
125 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
126static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
127#endif
128
129int vmiodirenable = TRUE;
130SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
131 "Use the VM system for directory writes");

--- 8 unchanged lines hidden ---

140#else
141SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
142 "Physical memory used for buffers");
143#endif
144static long bufkvaspace;
145SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
146 "Kernel virtual memory used for buffers");
147static long maxbufspace;
133#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
134 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
135static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
136#endif
137
138int vmiodirenable = TRUE;
139SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
140 "Use the VM system for directory writes");

--- 8 unchanged lines hidden ---

149#else
150SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
151 "Physical memory used for buffers");
152#endif
153static long bufkvaspace;
154SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
155 "Kernel virtual memory used for buffers");
156static long maxbufspace;
148SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
149 "Maximum allowed value of bufspace (including buf_daemon)");
157SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
158 "Maximum allowed value of bufspace (including metadata)");
150static long bufmallocspace;
151SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
152 "Amount of malloced memory for buffers");
153static long maxbufmallocspace;
159static long bufmallocspace;
160SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
161 "Amount of malloced memory for buffers");
162static long maxbufmallocspace;
154SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
155 "Maximum amount of malloced memory for buffers");
163SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
164 0, "Maximum amount of malloced memory for buffers");
156static long lobufspace;
165static long lobufspace;
157SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
166SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
158 "Minimum amount of buffers we want to have");
159long hibufspace;
167 "Minimum amount of buffers we want to have");
168long hibufspace;
160SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
161 "Maximum allowed value of bufspace (excluding buf_daemon)");
162static int bufreusecnt;
163SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
164 "Number of times we have reused a buffer");
169SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
170 "Maximum allowed value of bufspace (excluding metadata)");
171long bufspacethresh;
172SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
173 0, "Bufspace consumed before waking the daemon to free some");
165static int buffreekvacnt;
166SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
167 "Number of times we have freed the KVA space from some buffer");
168static int bufdefragcnt;
169SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
170 "Number of times we have had to repeat buffer allocation to defragment");
171static long lorunningspace;
172SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |

--- 27 unchanged lines hidden ---

200int dirtybufthresh;
201SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
202 0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
203static int numfreebuffers;
204SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
205 "Number of free buffers");
206static int lofreebuffers;
207SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
174static int buffreekvacnt;
175SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
176 "Number of times we have freed the KVA space from some buffer");
177static int bufdefragcnt;
178SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
179 "Number of times we have had to repeat buffer allocation to defragment");
180static long lorunningspace;
181SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |

--- 27 unchanged lines hidden ---

209int dirtybufthresh;
210SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
211 0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
212static int numfreebuffers;
213SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
214 "Number of free buffers");
215static int lofreebuffers;
216SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
208 "XXX Unused");
217 "Target number of free buffers");
209static int hifreebuffers;
210SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
218static int hifreebuffers;
219SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
211 "XXX Complicatedly unused");
220 "Threshold for clean buffer recycling");
212static int getnewbufcalls;
213SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
214 "Number of calls to getnewbuf");
215static int getnewbufrestarts;
216SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
217 "Number of times getnewbuf has had to restart a buffer aquisition");
218static int mappingrestarts;
219SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
220 "Number of times getblk has had to restart a buffer mapping for "
221 "unmapped buffer");
221static int getnewbufcalls;
222SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
223 "Number of calls to getnewbuf");
224static int getnewbufrestarts;
225SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
226 "Number of times getnewbuf has had to restart a buffer aquisition");
227static int mappingrestarts;
228SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
229 "Number of times getblk has had to restart a buffer mapping for "
230 "unmapped buffer");
231static int numbufallocfails;
232SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
233 "Number of times buffer allocations failed");
222static int flushbufqtarget = 100;
223SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
224 "Amount of work to do in flushbufqueues when helping bufdaemon");
225static long notbufdflushes;
226SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
227 "Number of dirty buffer flushes done by the bufdaemon helpers");
228static long barrierwrites;
229SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
230 "Number of barrier writes");
231SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
232 &unmapped_buf_allowed, 0,
233 "Permit the use of the unmapped i/o");
234
235/*
234static int flushbufqtarget = 100;
235SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
236 "Amount of work to do in flushbufqueues when helping bufdaemon");
237static long notbufdflushes;
238SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
239 "Number of dirty buffer flushes done by the bufdaemon helpers");
240static long barrierwrites;
241SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
242 "Number of barrier writes");
243SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
244 &unmapped_buf_allowed, 0,
245 "Permit the use of the unmapped i/o");
246
247/*
236 * Lock for the non-dirty bufqueues
237 */
238static struct mtx_padalign bqclean;
239
240/*
241 * Lock for the dirty queue.
242 */
243static struct mtx_padalign bqdirty;
244
245/*
246 * This lock synchronizes access to bd_request.
247 */
248static struct mtx_padalign bdlock;
249
250/*
251 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
252 * waitrunningbufspace().
253 */

--- 12 unchanged lines hidden ---

266/*
267 * Wakeup point for bufdaemon, as well as indicator of whether it is already
268 * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
269 * is idling.
270 */
271static int bd_request;
272
273/*
248 * This lock synchronizes access to bd_request.
249 */
250static struct mtx_padalign bdlock;
251
252/*
253 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
254 * waitrunningbufspace().
255 */

--- 12 unchanged lines hidden ---

268/*
269 * Wakeup point for bufdaemon, as well as indicator of whether it is already
270 * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
271 * is idling.
272 */
273static int bd_request;
274
275/*
276 * Request/wakeup point for the bufspace daemon.
277 */
278static int bufspace_request;
279
280/*
274 * Request for the buf daemon to write more buffers than is indicated by
275 * lodirtybuf. This may be necessary to push out excess dependencies or
276 * defragment the address space where a simple count of the number of dirty
277 * buffers is insufficient to characterize the demand for flushing them.
278 */
279static int bd_speedupreq;
280
281/*

--- 11 unchanged lines hidden ---

293 * Used in runningbufwakeup() and waitrunningbufspace().
294 */
295static int runningbufreq;
296
297/*
298 * Synchronization (sleep/wakeup) variable for buffer requests.
299 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
300 * by and/or.
281 * Request for the buf daemon to write more buffers than is indicated by
282 * lodirtybuf. This may be necessary to push out excess dependencies or
283 * defragment the address space where a simple count of the number of dirty
284 * buffers is insufficient to characterize the demand for flushing them.
285 */
286static int bd_speedupreq;
287
288/*

--- 11 unchanged lines hidden ---

300 * Used in runningbufwakeup() and waitrunningbufspace().
301 */
302static int runningbufreq;
303
304/*
305 * Synchronization (sleep/wakeup) variable for buffer requests.
306 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
307 * by and/or.
301 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
308 * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
302 * getnewbuf(), and getblk().
303 */
304static volatile int needsbuffer;
305
306/*
307 * Synchronization for bwillwrite() waiters.
308 */
309static int bdirtywait;
310
311/*
312 * Definitions for the buffer free lists.
313 */
309 * getnewbuf(), and getblk().
310 */
311static volatile int needsbuffer;
312
313/*
314 * Synchronization for bwillwrite() waiters.
315 */
316static int bdirtywait;
317
318/*
319 * Definitions for the buffer free lists.
320 */
314#define BUFFER_QUEUES 4 /* number of free buffer queues */
315
316#define QUEUE_NONE 0 /* on no queue */
321#define QUEUE_NONE 0 /* on no queue */
317#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
322#define QUEUE_EMPTY 1 /* empty buffer headers */
318#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
323#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
319#define QUEUE_EMPTY 3 /* empty buffer headers */
324#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
320#define QUEUE_SENTINEL 1024 /* not a queue index, but mark for sentinel */
321
325#define QUEUE_SENTINEL 1024 /* not a queue index, but mark for sentinel */
326
327/* Maximum number of clean buffer queues. */
328#define CLEAN_QUEUES 16
329
330/* Configured number of clean queues. */
331static int clean_queues;
332
333/* Maximum number of buffer queues. */
334#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
335
322/* Queues for free buffers with various properties */
323static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
324#ifdef INVARIANTS
325static int bq_len[BUFFER_QUEUES];
326#endif
327
328/*
336/* Queues for free buffers with various properties */
337static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
338#ifdef INVARIANTS
339static int bq_len[BUFFER_QUEUES];
340#endif
341
342/*
343 * Lock for each bufqueue
344 */
345static struct mtx_padalign bqlocks[BUFFER_QUEUES];
346
347/*
348 * per-cpu empty buffer cache.
349 */
350uma_zone_t buf_zone;
351
352/*
329 * Single global constant for BUF_WMESG, to avoid getting multiple references.
330 * buf_wmesg is referred from macros.
331 */
332const char *buf_wmesg = BUF_WMESG;
333
353 * Single global constant for BUF_WMESG, to avoid getting multiple references.
354 * buf_wmesg is referred to from macros.
355 */
356const char *buf_wmesg = BUF_WMESG;
357
334#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
335#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
336#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
337
338static int
339sysctl_runningspace(SYSCTL_HANDLER_ARGS)
340{
341 long value;
342 int error;
343
344 value = *(long *)arg1;
345 error = sysctl_handle_long(oidp, &value, 0, req);

--- 31 unchanged lines hidden ---

377 if (lvalue > INT_MAX)
378 /* On overflow, still write out a long to trigger ENOMEM. */
379 return (sysctl_handle_long(oidp, &lvalue, 0, req));
380 ivalue = lvalue;
381 return (sysctl_handle_int(oidp, &ivalue, 0, req));
382}
383#endif
384
358static int
359sysctl_runningspace(SYSCTL_HANDLER_ARGS)
360{
361 long value;
362 int error;
363
364 value = *(long *)arg1;
365 error = sysctl_handle_long(oidp, &value, 0, req);

--- 31 unchanged lines hidden ---

397 if (lvalue > INT_MAX)
398 /* On overflow, still write out a long to trigger ENOMEM. */
399 return (sysctl_handle_long(oidp, &lvalue, 0, req));
400 ivalue = lvalue;
401 return (sysctl_handle_int(oidp, &ivalue, 0, req));
402}
403#endif
404
405static int
406bqcleanq(void)
407{
408 static int nextq;
409
410 return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
411}
412
413static int
414bqisclean(int qindex)
415{
416
417 return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
418}
419
385/*
386 * bqlock:
387 *
388 * Return the appropriate queue lock based on the index.
389 */
390static inline struct mtx *
391bqlock(int qindex)
392{
393
420/*
421 * bqlock:
422 *
423 * Return the appropriate queue lock based on the index.
424 */
425static inline struct mtx *
426bqlock(int qindex)
427{
428
394 if (qindex == QUEUE_DIRTY)
395 return (struct mtx *)(&bqdirty);
396 return (struct mtx *)(&bqclean);
429 return (struct mtx *)&bqlocks[qindex];
397}
398
399/*
400 * bdirtywakeup:
401 *
402 * Wakeup any bwillwrite() waiters.
403 */
404static void

--- 37 unchanged lines hidden ---

442 * buf daemon will keep running until the condition clears.
443 */
444 if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
445 (lodirtybuffers + hidirtybuffers) / 2)
446 bd_wakeup();
447}
448
449/*
430}
431
432/*
433 * bdirtywakeup:
434 *
435 * Wakeup any bwillwrite() waiters.
436 */
437static void

--- 37 unchanged lines hidden ---

475 * buf daemon will keep running until the condition clears.
476 */
477 if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
478 (lodirtybuffers + hidirtybuffers) / 2)
479 bd_wakeup();
480}
481
482/*
450 * bufspacewakeup:
483 * bufspace_wakeup:
451 *
452 * Called when buffer space is potentially available for recovery.
453 * getnewbuf() will block on this flag when it is unable to free
454 * sufficient buffer space. Buffer space becomes recoverable when
455 * bp's get placed back in the queues.
456 */
484 *
485 * Called when buffer space is potentially available for recovery.
486 * getnewbuf() will block on this flag when it is unable to free
487 * sufficient buffer space. Buffer space becomes recoverable when
488 * bp's get placed back in the queues.
489 */
457static __inline void
458bufspacewakeup(void)
490static void
491bufspace_wakeup(void)
459{
492{
460 int need_wakeup, on;
461
462 /*
493
494 /*
463 * If someone is waiting for bufspace, wake them up. Even
464 * though we may not have freed the kva space yet, the waiting
465 * process will be able to now.
495 * If someone is waiting for bufspace, wake them up.
496 *
497 * Since needsbuffer is set prior to doing an additional queue
498 * scan it is safe to check for the flag prior to acquiring the
499 * lock. The thread that is preparing to scan again before
500 * blocking would discover the buf we released.
466 */
501 */
502 if (needsbuffer) {
503 rw_rlock(&nblock);
504 if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
505 wakeup(__DEVOLATILE(void *, &needsbuffer));
506 rw_runlock(&nblock);
507 }
508}
509
510/*
511 * bufspace_daemonwakeup:
512 *
513 * Wakeup the daemon responsible for freeing clean bufs.
514 */
515static void
516bufspace_daemonwakeup(void)
517{
467 rw_rlock(&nblock);
518 rw_rlock(&nblock);
468 for (;;) {
469 need_wakeup = 0;
470 on = needsbuffer;
471 if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
472 break;
473 need_wakeup = 1;
474 if (atomic_cmpset_rel_int(&needsbuffer, on,
475 on & ~VFS_BIO_NEED_BUFSPACE))
476 break;
519 if (bufspace_request == 0) {
520 bufspace_request = 1;
521 wakeup(&bufspace_request);
477 }
522 }
478 if (need_wakeup)
479 wakeup(__DEVOLATILE(void *, &needsbuffer));
480 rw_runlock(&nblock);
481}
482
483/*
523 rw_runlock(&nblock);
524}
525
526/*
484 * bufspaceadjust:
527 * bufspace_adjust:
485 *
486 * Adjust the reported bufspace for a KVA managed buffer, possibly
487 * waking any waiters.
488 */
489static void
528 *
529 * Adjust the reported bufspace for a KVA managed buffer, possibly
530 * waking any waiters.
531 */
532static void
490bufspaceadjust(struct buf *bp, int bufsize)
533bufspace_adjust(struct buf *bp, int bufsize)
491{
534{
535 long space;
492 int diff;
493
494 KASSERT((bp->b_flags & B_MALLOC) == 0,
536 int diff;
537
538 KASSERT((bp->b_flags & B_MALLOC) == 0,
495 ("bufspaceadjust: malloc buf %p", bp));
539 ("bufspace_adjust: malloc buf %p", bp));
496 diff = bufsize - bp->b_bufsize;
497 if (diff < 0) {
498 atomic_subtract_long(&bufspace, -diff);
540 diff = bufsize - bp->b_bufsize;
541 if (diff < 0) {
542 atomic_subtract_long(&bufspace, -diff);
499 bufspacewakeup();
500 } else
501 atomic_add_long(&bufspace, diff);
543 bufspace_wakeup();
544 } else {
545 space = atomic_fetchadd_long(&bufspace, diff);
546 /* Wake up the daemon on the transition. */
547 if (space < bufspacethresh && space + diff >= bufspacethresh)
548 bufspace_daemonwakeup();
549 }
502 bp->b_bufsize = bufsize;
503}
504
505/*
550 bp->b_bufsize = bufsize;
551}
552
553/*
554 * bufspace_reserve:
555 *
556 * Reserve bufspace before calling allocbuf(). metadata has a
557 * different space limit than data.
558 */
559static int
560bufspace_reserve(int size, bool metadata)
561{
562 long limit;
563 long space;
564
565 if (metadata)
566 limit = maxbufspace;
567 else
568 limit = hibufspace;
569 do {
570 space = bufspace;
571 if (space + size > limit)
572 return (ENOSPC);
573 } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
574
575 /* Wake up the daemon on the transition. */
576 if (space < bufspacethresh && space + size >= bufspacethresh)
577 bufspace_daemonwakeup();
578
579 return (0);
580}
581
582/*
583 * bufspace_release:
584 *
585 * Release reserved bufspace after bufspace_adjust() has consumed it.
586 */
587static void
588bufspace_release(int size)
589{
590 atomic_subtract_long(&bufspace, size);
591 bufspace_wakeup();
592}
593
594/*
595 * bufspace_wait:
596 *
597 * Wait for bufspace, acting as the buf daemon if a locked vnode is
598 * supplied. needsbuffer must be set in a safe fashion prior to
599 * polling for space. The operation must be re-tried on return.
600 */
601static void
602bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
603{
604 struct thread *td;
605 int error, fl, norunbuf;
606
607 if ((gbflags & GB_NOWAIT_BD) != 0)
608 return;
609
610 td = curthread;
611 rw_wlock(&nblock);
612 while (needsbuffer != 0) {
613 if (vp != NULL && vp->v_type != VCHR &&
614 (td->td_pflags & TDP_BUFNEED) == 0) {
615 rw_wunlock(&nblock);
616 /*
617 * getblk() is called with a vnode locked, and
618 * some majority of the dirty buffers may as
619 * well belong to the vnode. Flushing the
620 * buffers there would make progress that
621 * cannot be achieved by the buf_daemon, which
622 * cannot lock the vnode.
623 */
624 norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
625 (td->td_pflags & TDP_NORUNNINGBUF);
626
627 /*
628 * Play bufdaemon. The getnewbuf() function
629 * may be called while the thread owns lock
630 * for another dirty buffer for the same
631 * vnode, which makes it impossible to use
632 * VOP_FSYNC() there, due to the buffer lock
633 * recursion.
634 */
635 td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
636 fl = buf_flush(vp, flushbufqtarget);
637 td->td_pflags &= norunbuf;
638 rw_wlock(&nblock);
639 if (fl != 0)
640 continue;
641 if (needsbuffer == 0)
642 break;
643 }
644 error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
645 (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
646 if (error != 0)
647 break;
648 }
649 rw_wunlock(&nblock);
650}
651
652
653/*
654 * bufspace_daemon:
655 *
656 * buffer space management daemon. Tries to maintain some marginal
657 * amount of free buffer space so that requesting processes neither
658 * block nor work to reclaim buffers.
659 */
660static void
661bufspace_daemon(void)
662{
663 for (;;) {
664 kproc_suspend_check(bufspacedaemonproc);
665
666 /*
667 * Free buffers from the clean queue until we meet our
668 * targets.
669 *
670 * Theory of operation: The buffer cache is most efficient
671 * when some free buffer headers and space are always
672 * available to getnewbuf(). This daemon attempts to prevent
673 * the excessive blocking and synchronization associated
674 * with shortfall. It goes through three phases according
675 * to demand:
676 *
677 * 1) The daemon wakes up voluntarily once per-second
678 * during idle periods when the counters are below
679 * the wakeup thresholds (bufspacethresh, lofreebuffers).
680 *
677 * 1) The daemon wakes up voluntarily once per second
682 * ahead of any potential blocking. This may bounce
683 * slightly according to the rate of consumption and
684 * release.
685 *
686 * 3) The daemon and consumers are starved for working
687 * clean buffers. This is the 'bufspace' sleep below
688 * which will inefficiently trade bufs with bqrelse
689 * until we return to condition 2.
690 */
691 while (bufspace > lobufspace ||
692 numfreebuffers < hifreebuffers) {
693 if (buf_recycle(false) != 0) {
694 atomic_set_int(&needsbuffer, 1);
695 if (buf_recycle(false) != 0) {
696 rw_wlock(&nblock);
697 if (needsbuffer)
698 rw_sleep(__DEVOLATILE(void *,
699 &needsbuffer), &nblock,
700 PRIBIO|PDROP, "bufspace",
701 hz/10);
702 else
703 rw_wunlock(&nblock);
704 }
705 }
706 maybe_yield();
707 }
708
709 /*
710 * Re-check our limits under the exclusive nblock.
711 */
712 rw_wlock(&nblock);
713 if (bufspace < bufspacethresh &&
714 numfreebuffers > lofreebuffers) {
715 bufspace_request = 0;
716 rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
717 "-", hz);
718 } else
719 rw_wunlock(&nblock);
720 }
721}
722
723static struct kproc_desc bufspace_kp = {
724 "bufspacedaemon",
725 bufspace_daemon,
726 &bufspacedaemonproc
727};
728SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
729 &bufspace_kp);
730
731/*
506 * bufmallocadjust:
507 *
508 * Adjust the reported bufspace for a malloc managed buffer, possibly
509 * waking any waiters.
510 */
511static void
512bufmallocadjust(struct buf *bp, int bufsize)
513{
514 int diff;
515
516 KASSERT((bp->b_flags & B_MALLOC) != 0,
517 ("bufmallocadjust: non-malloc buf %p", bp));
518 diff = bufsize - bp->b_bufsize;
732 * bufmallocadjust:
733 *
734 * Adjust the reported bufspace for a malloc managed buffer, possibly
735 * waking any waiters.
736 */
737static void
738bufmallocadjust(struct buf *bp, int bufsize)
739{
740 int diff;
741
742 KASSERT((bp->b_flags & B_MALLOC) != 0,
743 ("bufmallocadjust: non-malloc buf %p", bp));
744 diff = bufsize - bp->b_bufsize;
519 if (diff < 0) {
745 if (diff < 0)
520 atomic_subtract_long(&bufmallocspace, -diff);
746 atomic_subtract_long(&bufmallocspace, -diff);
521 bufspacewakeup();
522 } else
747 else
523 atomic_add_long(&bufmallocspace, diff);
524 bp->b_bufsize = bufsize;
525}
526
527/*
528 * runningwakeup:
529 *
530 * Wake up processes that are waiting on asynchronous writes to fall

--- 35 unchanged lines hidden (view full) ---

566 if (space < lorunningspace)
567 return;
568 if (space - bspace > lorunningspace)
569 return;
570 runningwakeup();
571}
572
573/*
748 atomic_add_long(&bufmallocspace, diff);
749 bp->b_bufsize = bufsize;
750}
751
752/*
753 * runningwakeup:
754 *
755 * Wake up processes that are waiting on asynchronous writes to fall

--- 35 unchanged lines hidden (view full) ---

791 if (space < lorunningspace)
792 return;
793 if (space - bspace > lorunningspace)
794 return;
795 runningwakeup();
796}
797
798/*
574 * bufcountadd:
575 *
576 * Called when a buffer has been added to one of the free queues to
577 * account for the buffer and to wakeup anyone waiting for free buffers.
578 * This typically occurs when large amounts of metadata are being handled
579 * by the buffer cache ( else buffer space runs out first, usually ).
580 */
581static __inline void
582bufcountadd(struct buf *bp)
583{
584 int mask, need_wakeup, old, on;
585
586 KASSERT((bp->b_flags & B_INFREECNT) == 0,
587 ("buf %p already counted as free", bp));
588 bp->b_flags |= B_INFREECNT;
589 old = atomic_fetchadd_int(&numfreebuffers, 1);
590 KASSERT(old >= 0 && old < nbuf,
591 ("numfreebuffers climbed to %d", old + 1));
592 mask = VFS_BIO_NEED_ANY;
593 if (numfreebuffers >= hifreebuffers)
594 mask |= VFS_BIO_NEED_FREE;
595 rw_rlock(&nblock);
596 for (;;) {
597 need_wakeup = 0;
598 on = needsbuffer;
599 if (on == 0)
600 break;
601 need_wakeup = 1;
602 if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
603 break;
604 }
605 if (need_wakeup)
606 wakeup(__DEVOLATILE(void *, &needsbuffer));
607 rw_runlock(&nblock);
608}
609
610/*
611 * bufcountsub:
612 *
613 * Decrement the numfreebuffers count as needed.
614 */
615static void
616bufcountsub(struct buf *bp)
617{
618 int old;
619
620 /*
621 * Fixup numfreebuffers count. If the buffer is invalid or not
622 * delayed-write, the buffer was free and we must decrement
623 * numfreebuffers.
624 */
625 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
626 KASSERT((bp->b_flags & B_INFREECNT) != 0,
627 ("buf %p not counted in numfreebuffers", bp));
628 bp->b_flags &= ~B_INFREECNT;
629 old = atomic_fetchadd_int(&numfreebuffers, -1);
630 KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
631 }
632}
633
634/*
635 * waitrunningbufspace()
636 *
637 * runningbufspace is a measure of the amount of I/O currently
638 * running. This routine is used in async-write situations to
639 * prevent creating huge backups of pending writes to a device.
640 * Only asynchronous writes are governed by this function.
641 *
642 * This does NOT turn an async write into a sync write. It waits

--- 199 unchanged lines hidden ---

842/* Initialize the buffer subsystem. Called before use of any buffers. */
843void
844bufinit(void)
845{
846 struct buf *bp;
847 int i;
848
849 CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
799 * waitrunningbufspace()
800 *
801 * runningbufspace is a measure of the amount of I/O currently
802 * running. This routine is used in async-write situations to
803 * prevent creating huge backups of pending writes to a device.
804 * Only asynchronous writes are governed by this function.
805 *
806 * This does NOT turn an async write into a sync write. It waits

--- 199 unchanged lines hidden ---

1006/* Initialize the buffer subsystem. Called before use of any buffers. */
1007void
1008bufinit(void)
1009{
1010 struct buf *bp;
1011 int i;
1012
1013 CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
850 mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
851 mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
1014 mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
1015 mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
1016 for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
1017 mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
852 mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
853 rw_init(&nblock, "needsbuffer lock");
854 mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
855 mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
856
857 /* next, make a null set of free lists */
858 for (i = 0; i < BUFFER_QUEUES; i++)
859 TAILQ_INIT(&bufqueues[i]);
860
861 unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
862
863 /* finally, initialize each buffer header and stick on empty q */
864 for (i = 0; i < nbuf; i++) {
865 bp = &buf[i];
866 bzero(bp, sizeof *bp);
1018 mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
1019 rw_init(&nblock, "needsbuffer lock");
1020 mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
1021 mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
1022
1023 /* next, make a null set of free lists */
1024 for (i = 0; i < BUFFER_QUEUES; i++)
1025 TAILQ_INIT(&bufqueues[i]);
1026
1027 unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
1028
1029 /* finally, initialize each buffer header and stick on empty q */
1030 for (i = 0; i < nbuf; i++) {
1031 bp = &buf[i];
1032 bzero(bp, sizeof *bp);
867 bp->b_flags = B_INVAL | B_INFREECNT;
1033 bp->b_flags = B_INVAL;
868 bp->b_rcred = NOCRED;
869 bp->b_wcred = NOCRED;
870 bp->b_qindex = QUEUE_EMPTY;
871 bp->b_xflags = 0;
872 bp->b_data = bp->b_kvabase = unmapped_buf;
873 LIST_INIT(&bp->b_dep);
874 BUF_LOCKINIT(bp);
875 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
876#ifdef INVARIANTS
877 bq_len[QUEUE_EMPTY]++;
878#endif
879 }
880
881 /*
882 * maxbufspace is the absolute maximum amount of buffer space we are
883 * allowed to reserve in KVM and in real terms. The absolute maximum
1034 bp->b_rcred = NOCRED;
1035 bp->b_wcred = NOCRED;
1036 bp->b_qindex = QUEUE_EMPTY;
1037 bp->b_xflags = 0;
1038 bp->b_data = bp->b_kvabase = unmapped_buf;
1039 LIST_INIT(&bp->b_dep);
1040 BUF_LOCKINIT(bp);
1041 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
1042#ifdef INVARIANTS
1043 bq_len[QUEUE_EMPTY]++;
1044#endif
1045 }
1046
1047 /*
1048 * maxbufspace is the absolute maximum amount of buffer space we are
1049 * allowed to reserve in KVM and in real terms. The absolute maximum
884 * is nominally used by buf_daemon. hibufspace is the nominal maximum
885 * used by most other processes. The differential is required to
886 * ensure that buf_daemon is able to run when other processes might
887 * be blocked waiting for buffer space.
1050 * is nominally used by metadata. hibufspace is the nominal maximum
1051 * used by most other requests. The differential is required to
1052 * ensure that metadata deadlocks don't occur.
888 *
889 * maxbufspace is based on BKVASIZE. Allocating buffers larger than
890 * this may result in KVM fragmentation which is not handled optimally
1053 *
1054 * maxbufspace is based on BKVASIZE. Allocating buffers larger than
1055 * this may result in KVM fragmentation which is not handled optimally
891 * by the system.
1056 * by the system. XXX This is less true with vmem. We could use
1057 * PAGE_SIZE.
892 */
893 maxbufspace = (long)nbuf * BKVASIZE;
894 hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
1058 */
1059 maxbufspace = (long)nbuf * BKVASIZE;
1060 hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
895 lobufspace = hibufspace - MAXBCACHEBUF;
1061 lobufspace = (hibufspace / 20) * 19; /* 95% */
1062 bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
896
897 /*
898 * Note: The 16 MiB upper limit for hirunningspace was chosen
899 * arbitrarily and may need further tuning. It corresponds to
900 * 128 outstanding write IO requests (if IO size is 128 KiB),
901 * which fits with many RAID controllers' tagged queuing limits.
902 * The lower 1 MiB limit is the historical upper limit for
903 * hirunningspace.
904 */
905 hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
906 16 * 1024 * 1024), 1024 * 1024);
907 lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
908
1063
1064 /*
1065 * Note: The 16 MiB upper limit for hirunningspace was chosen
1066 * arbitrarily and may need further tuning. It corresponds to
1067 * 128 outstanding write IO requests (if IO size is 128 KiB),
1068 * which fits with many RAID controllers' tagged queuing limits.
1069 * The lower 1 MiB limit is the historical upper limit for
1070 * hirunningspace.
1071 */
1072 hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
1073 16 * 1024 * 1024), 1024 * 1024);
1074 lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
1075
909/*
910 * Limit the amount of malloc memory since it is wired permanently into
911 * the kernel space. Even though this is accounted for in the buffer
912 * allocation, we don't want the malloced region to grow uncontrolled.
913 * The malloc scheme improves memory utilization significantly on average
914 * (small) directories.
915 */
1076 /*
1077 * Limit the amount of malloc memory since it is wired permanently into
1078 * the kernel space. Even though this is accounted for in the buffer
1079 * allocation, we don't want the malloced region to grow uncontrolled.
1080 * The malloc scheme improves memory utilization significantly on
1081 * average (small) directories.
1082 */
916 maxbufmallocspace = hibufspace / 20;
917
1083 maxbufmallocspace = hibufspace / 20;
1084
918/*
919 * Reduce the chance of a deadlock occurring by limiting the number
920 * of delayed-write dirty buffers we allow to stack up.
921 */
1085 /*
1086 * Reduce the chance of a deadlock occuring by limiting the number
1087 * of delayed-write dirty buffers we allow to stack up.
1088 */
922 hidirtybuffers = nbuf / 4 + 20;
923 dirtybufthresh = hidirtybuffers * 9 / 10;
924 numdirtybuffers = 0;
1089 hidirtybuffers = nbuf / 4 + 20;
1090 dirtybufthresh = hidirtybuffers * 9 / 10;
1091 numdirtybuffers = 0;
925/*
926 * To support extreme low-memory systems, make sure hidirtybuffers cannot
927 * eat up all available buffer space. This occurs when our minimum cannot
928 * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
929 * BKVASIZE'd buffers.
930 */
1092 /*
1093 * To support extreme low-memory systems, make sure hidirtybuffers
1094 * cannot eat up all available buffer space. This occurs when our
1095 * minimum cannot be met. We try to size hidirtybuffers to 3/4 our
1096 * buffer space assuming BKVASIZE'd buffers.
1097 */
931 while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
932 hidirtybuffers >>= 1;
933 }
934 lodirtybuffers = hidirtybuffers / 2;
935
1098 while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
1099 hidirtybuffers >>= 1;
1100 }
1101 lodirtybuffers = hidirtybuffers / 2;
1102
936/*
937 * Try to keep the number of free buffers in the specified range,
938 * and give special processes (e.g. like buf_daemon) access to an
939 * emergency reserve.
940 */
941 lofreebuffers = nbuf / 18 + 5;
942 hifreebuffers = 2 * lofreebuffers;
1103 /*
1104 * lofreebuffers should be sufficient to avoid stalling waiting on
1105 * buf headers under heavy utilization. The bufs in per-cpu caches
1106 * are counted as free but will be unavailable to threads executing
1107 * on other cpus.
1108 *
1109 * hifreebuffers is the free target for the bufspace daemon. This
1110 * should be set appropriately to limit work per-iteration.
1111 */
1112 lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
1113 hifreebuffers = (3 * lofreebuffers) / 2;
943 numfreebuffers = nbuf;
944
945 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
946 VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1114 numfreebuffers = nbuf;
1115
1116 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
1117 VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1118
1119 /* Setup the kva and free list allocators. */
1120 vmem_set_reclaim(buffer_arena, bufkva_reclaim);
1121 buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
1122 NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
1123
1124 /*
1125 * Size the clean queue according to the amount of buffer space.
1126 * One queue per-256mb up to the max. More queues gives better
1127 * concurrency but less accurate LRU.
1128 */
1129 clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
1130
947}
948
949#ifdef INVARIANTS
950static inline void
951vfs_buf_check_mapped(struct buf *bp)
952{
953
954 KASSERT(bp->b_kvabase != unmapped_buf,

--- 169 unchanged lines hidden ---

1124 *
1125 * Insert the buffer into the appropriate free list.
1126 */
1127static void
1128binsfree(struct buf *bp, int qindex)
1129{
1130 struct mtx *olock, *nlock;
1131
1131}
1132
1133#ifdef INVARIANTS
1134static inline void
1135vfs_buf_check_mapped(struct buf *bp)
1136{
1137
1138 KASSERT(bp->b_kvabase != unmapped_buf,

--- 169 unchanged lines hidden ---

1308 *
1309 * Insert the buffer into the appropriate free list.
1310 */
1311static void
1312binsfree(struct buf *bp, int qindex)
1313{
1314 struct mtx *olock, *nlock;
1315
1132 BUF_ASSERT_XLOCKED(bp);
1316 if (qindex != QUEUE_EMPTY) {
1317 BUF_ASSERT_XLOCKED(bp);
1318 }
1133
1319
1320 /*
1321 * Stick to the same clean queue for the lifetime of the buf to
1322 * limit locking below. Otherwise pick one sequentially.
1323 */
1324 if (qindex == QUEUE_CLEAN) {
1325 if (bqisclean(bp->b_qindex))
1326 qindex = bp->b_qindex;
1327 else
1328 qindex = bqcleanq();
1329 }
1330
1331 /*
1332 * Handle delayed bremfree() processing.
1333 */
1134 nlock = bqlock(qindex);
1334 nlock = bqlock(qindex);
1135 /* Handle delayed bremfree() processing. */
1136 if (bp->b_flags & B_REMFREE) {
1137 olock = bqlock(bp->b_qindex);
1138 mtx_lock(olock);
1139 bremfreel(bp);
1140 if (olock != nlock) {
1141 mtx_unlock(olock);
1142 mtx_lock(nlock);
1143 }

--- 7 unchanged lines hidden ---

1151 if (bp->b_flags & B_AGE)
1152 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1153 else
1154 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1155#ifdef INVARIANTS
1156 bq_len[bp->b_qindex]++;
1157#endif
1158 mtx_unlock(nlock);
1335 if (bp->b_flags & B_REMFREE) {
1336 olock = bqlock(bp->b_qindex);
1337 mtx_lock(olock);
1338 bremfreel(bp);
1339 if (olock != nlock) {
1340 mtx_unlock(olock);
1341 mtx_lock(nlock);
1342 }

--- 7 unchanged lines hidden ---

1350 if (bp->b_flags & B_AGE)
1351 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1352 else
1353 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1354#ifdef INVARIANTS
1355 bq_len[bp->b_qindex]++;
1356#endif
1357 mtx_unlock(nlock);
1358}
1159
1359
1360/*
1361 * buf_free:
1362 *
1363 * Free a buffer to the buf zone once it no longer has valid contents.
1364 */
1365static void
1366buf_free(struct buf *bp)
1367{
1368
1369 if (bp->b_flags & B_REMFREE)
1370 bremfreef(bp);
1371 if (bp->b_vflags & BV_BKGRDINPROG)
1372 panic("losing buffer 1");
1373 if (bp->b_rcred != NOCRED) {
1374 crfree(bp->b_rcred);
1375 bp->b_rcred = NOCRED;
1376 }
1377 if (bp->b_wcred != NOCRED) {
1378 crfree(bp->b_wcred);
1379 bp->b_wcred = NOCRED;
1380 }
1381 if (!LIST_EMPTY(&bp->b_dep))
1382 buf_deallocate(bp);
1383 bufkva_free(bp);
1384 BUF_UNLOCK(bp);
1385 uma_zfree(buf_zone, bp);
1386 atomic_add_int(&numfreebuffers, 1);
1387 bufspace_wakeup();
1388}
1389
1390/*
1391 * buf_import:
1392 *
1393 * Import bufs into the uma cache from the buf list. The system still
1394 * expects a static array of bufs and much of the synchronization
1395 * around bufs assumes type stable storage. As a result, UMA is used
1396 * only as a per-cpu cache of bufs still maintained on a global list.
1397 */
1398static int
1399buf_import(void *arg, void **store, int cnt, int flags)
1400{
1401 struct buf *bp;
1402 int i;
1403
1404 mtx_lock(&bqlocks[QUEUE_EMPTY]);
1405 for (i = 0; i < cnt; i++) {
1406 bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1407 if (bp == NULL)
1408 break;
1409 bremfreel(bp);
1410 store[i] = bp;
1411 }
1412 mtx_unlock(&bqlocks[QUEUE_EMPTY]);
1413
1414 return (i);
1415}
1416
1417/*
1418 * buf_release:
1419 *
1420 * Release bufs from the uma cache back to the buffer queues.
1421 */
1422static void
1423buf_release(void *arg, void **store, int cnt)
1424{
1425 int i;
1426
1427 for (i = 0; i < cnt; i++)
1428 binsfree(store[i], QUEUE_EMPTY);
1429}
1430
1431/*
1432 * buf_alloc:
1433 *
1434 * Allocate an empty buffer header.
1435 */
1436static struct buf *
1437buf_alloc(void)
1438{
1439 struct buf *bp;
1440
1441 bp = uma_zalloc(buf_zone, M_NOWAIT);
1442 if (bp == NULL) {
1443 bufspace_daemonwakeup();
1444 atomic_add_int(&numbufallocfails, 1);
1445 return (NULL);
1446 }
1447
1160 /*
1448 /*
1161 * Something we can maybe free or reuse.
1449 * Wake-up the bufspace daemon on transition.
1162 */
1450 */
1163 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1164 bufspacewakeup();
1451 if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
1452 bufspace_daemonwakeup();
1165
1453
1166 if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
1167 bufcountadd(bp);
1454 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1455 panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
1456
1457 KASSERT(bp->b_vp == NULL,
1458 ("bp: %p still has vnode %p.", bp, bp->b_vp));
1459 KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
1460 ("invalid buffer %p flags %#x", bp, bp->b_flags));
1461 KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
1462 ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
1463 KASSERT(bp->b_npages == 0,
1464 ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
1465 KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
1466 KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
1467
1468 bp->b_flags = 0;
1469 bp->b_ioflags = 0;
1470 bp->b_xflags = 0;
1471 bp->b_vflags = 0;
1472 bp->b_vp = NULL;
1473 bp->b_blkno = bp->b_lblkno = 0;
1474 bp->b_offset = NOOFFSET;
1475 bp->b_iodone = 0;
1476 bp->b_error = 0;
1477 bp->b_resid = 0;
1478 bp->b_bcount = 0;
1479 bp->b_npages = 0;
1480 bp->b_dirtyoff = bp->b_dirtyend = 0;
1481 bp->b_bufobj = NULL;
1482 bp->b_pin_count = 0;
1483 bp->b_data = bp->b_kvabase = unmapped_buf;
1484 bp->b_fsprivate1 = NULL;
1485 bp->b_fsprivate2 = NULL;
1486 bp->b_fsprivate3 = NULL;
1487 LIST_INIT(&bp->b_dep);
1488
1489 return (bp);
1168}
1169
1170/*
1490}
1491
1492/*
1493 * buf_qrecycle:
1494 *
1495 * Free a buffer from the given bufqueue. kva controls whether the
1496 * freed buf must own some kva resources. This is used for
1497 * defragmenting.
1498 */
1499static int
1500buf_qrecycle(int qindex, bool kva)
1501{
1502 struct buf *bp, *nbp;
1503
1504 if (kva)
1505 atomic_add_int(&bufdefragcnt, 1);
1506 nbp = NULL;
1507 mtx_lock(&bqlocks[qindex]);
1508 nbp = TAILQ_FIRST(&bufqueues[qindex]);
1509
1510 /*
1511 * Run scan, possibly freeing data and/or kva mappings on the fly
1512 * depending.
1513 */
1514 while ((bp = nbp) != NULL) {
1515 /*
1516 * Calculate next bp (we can only use it if we do not
1517 * release the bqlock).
1518 */
1519 nbp = TAILQ_NEXT(bp, b_freelist);
1520
1521 /*
1522 * If we are defragging then we need a buffer with
1523 * some kva to reclaim.
1524 */
1525 if (kva && bp->b_kvasize == 0)
1526 continue;
1527
1528 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1529 continue;
1530
1531 /*
1532 * Skip buffers with background writes in progress.
1533 */
1534 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
1535 BUF_UNLOCK(bp);
1536 continue;
1537 }
1538
1539 KASSERT(bp->b_qindex == qindex,
1540 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1541 /*
1542 * NOTE: nbp is now entirely invalid. We can only restart
1543 * the scan from this point on.
1544 */
1545 bremfreel(bp);
1546 mtx_unlock(&bqlocks[qindex]);
1547
1548 /*
1549 * Requeue the background write buffer with error and
1550 * restart the scan.
1551 */
1552 if ((bp->b_vflags & BV_BKGRDERR) != 0) {
1553 bqrelse(bp);
1554 mtx_lock(&bqlocks[qindex]);
1555 nbp = TAILQ_FIRST(&bufqueues[qindex]);
1556 continue;
1557 }
1558 bp->b_flags |= B_INVAL;
1559 brelse(bp);
1560 return (0);
1561 }
1562 mtx_unlock(&bqlocks[qindex]);
1563
1564 return (ENOBUFS);
1565}
1566
1567/*
1568 * buf_recycle:
1569 *
1570 * Iterate through all clean queues until we find a buf to recycle or
1571 * exhaust the search.
1572 */
1573static int
1574buf_recycle(bool kva)
1575{
1576 int qindex, first_qindex;
1577
1578 qindex = first_qindex = bqcleanq();
1579 do {
1580 if (buf_qrecycle(qindex, kva) == 0)
1581 return (0);
1582 if (++qindex == QUEUE_CLEAN + clean_queues)
1583 qindex = QUEUE_CLEAN;
1584 } while (qindex != first_qindex);
1585
1586 return (ENOBUFS);
1587}
1588
1589/*
1590 * buf_scan:
1591 *
1592 * Scan the clean queues looking for a buffer to recycle. needsbuffer
1593 * is set on failure so that the caller may optionally bufspace_wait()
1594 * in a race-free fashion.
1595 */
1596static int
1597buf_scan(bool defrag)
1598{
1599 int error;
1600
1601 /*
1602 * To avoid heavy synchronization and wakeup races we set
1603 * needsbuffer and re-poll before failing. This ensures that
1604 * no frees can be missed between an unsuccessful poll and
1605 * going to sleep in a synchronized fashion.
1606 */
1607 if ((error = buf_recycle(defrag)) != 0) {
1608 atomic_set_int(&needsbuffer, 1);
1609 bufspace_daemonwakeup();
1610 error = buf_recycle(defrag);
1611 }
1612 if (error == 0)
1613 atomic_add_int(&getnewbufrestarts, 1);
1614 return (error);
1615}
1616
1617/*
1171 * bremfree:
1172 *
1173 * Mark the buffer for removal from the appropriate free list.
1174 *
1175 */
1176void
1177bremfree(struct buf *bp)
1178{
1179
1180 CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1181 KASSERT((bp->b_flags & B_REMFREE) == 0,
1182 ("bremfree: buffer %p already marked for delayed removal.", bp));
1183 KASSERT(bp->b_qindex != QUEUE_NONE,
1184 ("bremfree: buffer %p not on a queue.", bp));
1185 BUF_ASSERT_XLOCKED(bp);
1186
1187 bp->b_flags |= B_REMFREE;
1618 * bremfree:
1619 *
1620 * Mark the buffer for removal from the appropriate free list.
1621 *
1622 */
1623void
1624bremfree(struct buf *bp)
1625{
1626
1627 CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1628 KASSERT((bp->b_flags & B_REMFREE) == 0,
1629 ("bremfree: buffer %p already marked for delayed removal.", bp));
1630 KASSERT(bp->b_qindex != QUEUE_NONE,
1631 ("bremfree: buffer %p not on a queue.", bp));
1632 BUF_ASSERT_XLOCKED(bp);
1633
1634 bp->b_flags |= B_REMFREE;
1188 bufcountsub(bp);
1189}
1190
1191/*
1192 * bremfreef:
1193 *
1194 * Force an immediate removal from a free list. Used only in nfs when
1195 * it abuses the b_freelist pointer.
1196 */

--- 17 unchanged lines hidden ---

1214static void
1215bremfreel(struct buf *bp)
1216{
1217
1218 CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1219 bp, bp->b_vp, bp->b_flags);
1220 KASSERT(bp->b_qindex != QUEUE_NONE,
1221 ("bremfreel: buffer %p not on a queue.", bp));
1635}
1636
1637/*
1638 * bremfreef:
1639 *
1640 * Force an immediate removal from a free list. Used only in nfs when
1641 * it abuses the b_freelist pointer.
1642 */

--- 17 unchanged lines hidden (view full) ---

1660static void
1661bremfreel(struct buf *bp)
1662{
1663
1664 CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1665 bp, bp->b_vp, bp->b_flags);
1666 KASSERT(bp->b_qindex != QUEUE_NONE,
1667 ("bremfreel: buffer %p not on a queue.", bp));
1222 BUF_ASSERT_XLOCKED(bp);
1668 if (bp->b_qindex != QUEUE_EMPTY) {
1669 BUF_ASSERT_XLOCKED(bp);
1670 }
1223 mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1224
1225 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1226#ifdef INVARIANTS
1227 KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1228 bp->b_qindex));
1229 bq_len[bp->b_qindex]--;
1230#endif
1231 bp->b_qindex = QUEUE_NONE;
1671 mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1672
1673 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1674#ifdef INVARIANTS
1675 KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1676 bp->b_qindex));
1677 bq_len[bp->b_qindex]--;
1678#endif
1679 bp->b_qindex = QUEUE_NONE;
1232 /*
1233 * If this was a delayed bremfree() we only need to remove the buffer
1234 * from the queue and return; the stats are already done.
1235 */
1236 if (bp->b_flags & B_REMFREE) {
1237 bp->b_flags &= ~B_REMFREE;
1238 return;
1239 }
1240 bufcountsub(bp);
1680 bp->b_flags &= ~B_REMFREE;
1241}
1242
1243/*
1681}
1682
1683/*
1244 * bufkvafree:
1684 * bufkva_free:
1245 *
1246 * Free the kva allocation for a buffer.
1247 *
1248 */
1249static void
1685 *
1686 * Free the kva allocation for a buffer.
1687 *
1688 */
1689static void
1250bufkvafree(struct buf *bp)
1690bufkva_free(struct buf *bp)
1251{
1252
1253#ifdef INVARIANTS
1254 if (bp->b_kvasize == 0) {
1255 KASSERT(bp->b_kvabase == unmapped_buf &&
1256 bp->b_data == unmapped_buf,
1257 ("Leaked KVA space on %p", bp));
1258 } else if (buf_mapped(bp))

--- 7 unchanged lines hidden ---

1266 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
1267 atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
1268 atomic_add_int(&buffreekvacnt, 1);
1269 bp->b_data = bp->b_kvabase = unmapped_buf;
1270 bp->b_kvasize = 0;
1271}
1272
1273/*
1691{
1692
1693#ifdef INVARIANTS
1694 if (bp->b_kvasize == 0) {
1695 KASSERT(bp->b_kvabase == unmapped_buf &&
1696 bp->b_data == unmapped_buf,
1697 ("Leaked KVA space on %p", bp));
1698 } else if (buf_mapped(bp))

--- 7 unchanged lines hidden (view full) ---

1706 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
1707 atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
1708 atomic_add_int(&buffreekvacnt, 1);
1709 bp->b_data = bp->b_kvabase = unmapped_buf;
1710 bp->b_kvasize = 0;
1711}
1712
1713/*
1274 * bufkvaalloc:
1714 * bufkva_alloc:
1275 *
1276 * Allocate the buffer KVA and set b_kvasize and b_kvabase.
1277 */
1278static int
1715 *
1716 * Allocate the buffer KVA and set b_kvasize and b_kvabase.
1717 */
1718static int
1279bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
1719bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
1280{
1281 vm_offset_t addr;
1282 int error;
1283
1284 KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
1285 ("Invalid gbflags 0x%x in %s", gbflags, __func__));
1286
1720{
1721 vm_offset_t addr;
1722 int error;
1723
1724 KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
1725 ("Invalid gbflags 0x%x in %s", gbflags, __func__));
1726
1287 bufkvafree(bp);
1727 bufkva_free(bp);
1288
1289 addr = 0;
1290 error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
1291 if (error != 0) {
1292 /*
1293 * Buffer map is too fragmented. Request the caller
1294 * to defragment the map.
1295 */
1728
1729 addr = 0;
1730 error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
1731 if (error != 0) {
1732 /*
1733 * Buffer map is too fragmented. Request the caller
1734 * to defragment the map.
1735 */
1296 atomic_add_int(&bufdefragcnt, 1);
1297 return (error);
1298 }
1299 bp->b_kvabase = (caddr_t)addr;
1300 bp->b_kvasize = maxsize;
1301 atomic_add_long(&bufkvaspace, bp->b_kvasize);
1302 if ((gbflags & GB_UNMAPPED) != 0) {
1303 bp->b_data = unmapped_buf;
1304 BUF_CHECK_UNMAPPED(bp);
1305 } else {
1306 bp->b_data = bp->b_kvabase;
1307 BUF_CHECK_MAPPED(bp);
1308 }
1309 return (0);
1310}
1311
1312/*
1736 return (error);
1737 }
1738 bp->b_kvabase = (caddr_t)addr;
1739 bp->b_kvasize = maxsize;
1740 atomic_add_long(&bufkvaspace, bp->b_kvasize);
1741 if ((gbflags & GB_UNMAPPED) != 0) {
1742 bp->b_data = unmapped_buf;
1743 BUF_CHECK_UNMAPPED(bp);
1744 } else {
1745 bp->b_data = bp->b_kvabase;
1746 BUF_CHECK_MAPPED(bp);
1747 }
1748 return (0);
1749}
1750
1751/*
1752 * bufkva_reclaim:
1753 *
1754 * Reclaim buffer kva by freeing buffers holding kva. This is a vmem
1755 * reclaim callback that fires before an allocation is allowed to fail.
1756 */
1757static void
1758bufkva_reclaim(vmem_t *vmem, int flags)
1759{
1760 int i;
1761
1762 for (i = 0; i < 5; i++)
1763 if (buf_scan(true) != 0)
1764 break;
1765 return;
1766}
1767
1768
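/*
 * Illustrative sketch (not part of vfs_bio.c): the reclaim-callback
 * pattern used by bufkva_reclaim() above, reduced to a standalone
 * userspace example.  The 'struct arena' type and the arena_*() and
 * buf_scan_example() names are hypothetical stand-ins, not the
 * kernel's vmem(9) interfaces; only the control flow is meant to
 * mirror the code above: the allocator invokes the registered callback
 * before giving up, and the callback makes a bounded number of scan
 * attempts to release space.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct arena {
	size_t	free_space;			/* space currently available */
	void	(*reclaim)(struct arena *);	/* invoked before failing */
};

static void
arena_set_reclaim(struct arena *a, void (*fn)(struct arena *))
{
	a->reclaim = fn;
}

static int
arena_alloc(struct arena *a, size_t size)
{
	if (a->free_space < size && a->reclaim != NULL)
		a->reclaim(a);			/* last chance to make room */
	if (a->free_space < size)
		return (-1);			/* still no space: fail */
	a->free_space -= size;
	return (0);
}

/* Pretend to recycle one buffer's worth of space; true on success. */
static bool
buf_scan_example(struct arena *a)
{
	a->free_space += 4096;
	return (true);
}

static void
example_reclaim(struct arena *a)
{
	int i;

	/* Bounded retry, stopping early if a scan fails. */
	for (i = 0; i < 5; i++)
		if (!buf_scan_example(a))
			break;
}

int
main(void)
{
	struct arena a = { .free_space = 0, .reclaim = NULL };

	arena_set_reclaim(&a, example_reclaim);
	assert(arena_alloc(&a, 4096) == 0);	/* satisfied via reclaim */
	return (0);
}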
1769/*
1313 * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
1314 * clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set,
1315 * the buffer is valid and we do not have to do anything.
1316 */
1317void
1318breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1319 int cnt, struct ucred * cred)
1320{

--- 574 unchanged lines hidden (view full) ---

1895 if (bp->b_flags & B_DELWRI)
1896 bundirty(bp);
1897 if (bp->b_vp)
1898 brelvp(bp);
1899 }
1900
1901 /* buffers with no memory */
1902 if (bp->b_bufsize == 0) {
1770 * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
1771 * clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set,
1772 * the buffer is valid and we do not have to do anything.
1773 */
1774void
1775breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1776 int cnt, struct ucred * cred)
1777{

--- 574 unchanged lines hidden (view full) ---

2352 if (bp->b_flags & B_DELWRI)
2353 bundirty(bp);
2354 if (bp->b_vp)
2355 brelvp(bp);
2356 }
2357
2358 /* buffers with no memory */
2359 if (bp->b_bufsize == 0) {
1903 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1904 if (bp->b_vflags & BV_BKGRDINPROG)
1905 panic("losing buffer 1");
1906 bufkvafree(bp);
1907 qindex = QUEUE_EMPTY;
1908 bp->b_flags |= B_AGE;
2360 buf_free(bp);
2361 return;
2362 }
1909 /* buffers with junk contents */
2363 /* buffers with junk contents */
1910 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
2364 if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1911 (bp->b_ioflags & BIO_ERROR)) {
1912 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1913 if (bp->b_vflags & BV_BKGRDINPROG)
1914 panic("losing buffer 2");
1915 qindex = QUEUE_CLEAN;
1916 bp->b_flags |= B_AGE;
1917 /* remaining buffers */
1918 } else if (bp->b_flags & B_DELWRI)
1919 qindex = QUEUE_DIRTY;
1920 else
1921 qindex = QUEUE_CLEAN;
1922
1923 binsfree(bp, qindex);
1924
1925 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1926 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1927 panic("brelse: not dirty");
1928 /* unlock */
1929 BUF_UNLOCK(bp);
2365 (bp->b_ioflags & BIO_ERROR)) {
2366 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
2367 if (bp->b_vflags & BV_BKGRDINPROG)
2368 panic("losing buffer 2");
2369 qindex = QUEUE_CLEAN;
2370 bp->b_flags |= B_AGE;
2371 /* remaining buffers */
2372 } else if (bp->b_flags & B_DELWRI)
2373 qindex = QUEUE_DIRTY;
2374 else
2375 qindex = QUEUE_CLEAN;
2376
2377 binsfree(bp, qindex);
2378
2379 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
2380 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
2381 panic("brelse: not dirty");
2382 /* unlock */
2383 BUF_UNLOCK(bp);
2384 if (qindex == QUEUE_CLEAN)
2385 bufspace_wakeup();
1930}
1931
1932/*
1933 * Release a buffer back to the appropriate queue but do not try to free
1934 * it. The buffer is expected to be used again soon.
1935 *
1936 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1937 * biodone() to requeue an async I/O on completion. It is also used when

--- 6 unchanged lines hidden (view full) ---

1944bqrelse(struct buf *bp)
1945{
1946 int qindex;
1947
1948 CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1949 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1950 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1951
2386}
2387
2388/*
2389 * Release a buffer back to the appropriate queue but do not try to free
2390 * it. The buffer is expected to be used again soon.
2391 *
2392 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
2393 * biodone() to requeue an async I/O on completion. It is also used when

--- 6 unchanged lines hidden (view full) ---

2400bqrelse(struct buf *bp)
2401{
2402 int qindex;
2403
2404 CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2405 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
2406 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
2407
2408 qindex = QUEUE_NONE;
1952 if (BUF_LOCKRECURSED(bp)) {
1953 /* do not release to free list */
1954 BUF_UNLOCK(bp);
1955 return;
1956 }
1957 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1958
1959 if (bp->b_flags & B_MANAGED) {

--- 19 unchanged lines hidden (view full) ---

1979 }
1980 qindex = QUEUE_CLEAN;
1981 }
1982 binsfree(bp, qindex);
1983
1984out:
1985 /* unlock */
1986 BUF_UNLOCK(bp);
2409 if (BUF_LOCKRECURSED(bp)) {
2410 /* do not release to free list */
2411 BUF_UNLOCK(bp);
2412 return;
2413 }
2414 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
2415
2416 if (bp->b_flags & B_MANAGED) {

--- 19 unchanged lines hidden (view full) ---

2436 }
2437 qindex = QUEUE_CLEAN;
2438 }
2439 binsfree(bp, qindex);
2440
2441out:
2442 /* unlock */
2443 BUF_UNLOCK(bp);
2444 if (qindex == QUEUE_CLEAN)
2445 bufspace_wakeup();
1987}
1988
1989/*
1990 * Complete I/O to a VMIO backed page. Validate the pages as appropriate,
1991 * restore bogus pages.
1992 */
1993static void
1994vfs_vmio_iodone(struct buf *bp)

--- 383 unchanged lines hidden (view full) ---

2378 */
2379 nwritten = bp->b_bufsize;
2380 (void) bwrite(bp);
2381
2382 return (nwritten);
2383}
2384
2385/*
2446}
2447
2448/*
2449 * Complete I/O to a VMIO backed page. Validate the pages as appropriate,
2450 * restore bogus pages.
2451 */
2452static void
2453vfs_vmio_iodone(struct buf *bp)

--- 383 unchanged lines hidden (view full) ---

2837 */
2838 nwritten = bp->b_bufsize;
2839 (void) bwrite(bp);
2840
2841 return (nwritten);
2842}
2843
2844/*
2386 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
2387 * locked vnode is supplied.
2845 * getnewbuf_kva:
2846 *
2847 * Allocate KVA for an empty buf header according to gbflags.
2388 */
2848 */
2389static void
2390getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
2391 int defrag)
2849static int
2850getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
2392{
2851{
2393 struct thread *td;
2394 char *waitmsg;
2395 int error, fl, flags, norunbuf;
2396
2852
2397 mtx_assert(&bqclean, MA_OWNED);
2398
2399 if (defrag) {
2400 flags = VFS_BIO_NEED_BUFSPACE;
2401 waitmsg = "nbufkv";
2402 } else if (bufspace >= hibufspace) {
2403 waitmsg = "nbufbs";
2404 flags = VFS_BIO_NEED_BUFSPACE;
2405 } else {
2406 waitmsg = "newbuf";
2407 flags = VFS_BIO_NEED_ANY;
2408 }
2409 atomic_set_int(&needsbuffer, flags);
2410 mtx_unlock(&bqclean);
2411
2412 bd_speedup(); /* heeeelp */
2413 if ((gbflags & GB_NOWAIT_BD) != 0)
2414 return;
2415
2416 td = curthread;
2417 rw_wlock(&nblock);
2418 while ((needsbuffer & flags) != 0) {
2419 if (vp != NULL && vp->v_type != VCHR &&
2420 (td->td_pflags & TDP_BUFNEED) == 0) {
2421 rw_wunlock(&nblock);
2422 /*
2423 * getblk() is called with a vnode locked, and
2424 * a majority of the dirty buffers may well
2425 * belong to that vnode. Flushing the
2426 * buffers here makes progress that
2427 * cannot be achieved by the buf_daemon,
2428 * which cannot lock the vnode.
2429 */
2430 norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2431 (td->td_pflags & TDP_NORUNNINGBUF);
2432
2433 /*
2434 * Play bufdaemon. The getnewbuf() function
2435 * may be called while the thread owns the lock
2436 * on another dirty buffer for the same
2437 * vnode, which makes it impossible to use
2438 * VOP_FSYNC() there, due to the buffer lock
2439 * recursion.
2440 */
2441 td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2442 fl = buf_flush(vp, flushbufqtarget);
2443 td->td_pflags &= norunbuf;
2444 rw_wlock(&nblock);
2445 if (fl != 0)
2446 continue;
2447 if ((needsbuffer & flags) == 0)
2448 break;
2449 }
2450 error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
2451 (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
2452 if (error != 0)
2453 break;
2454 }
2455 rw_wunlock(&nblock);
2456}
2457
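/*
 * Illustrative sketch (not part of vfs_bio.c): the flag save/restore
 * idiom used with 'norunbuf' above, as a standalone userspace example.
 * The restore mask is built before the flags are set, so a single AND
 * afterwards clears the flag known to be clear on entry while putting
 * the other flag back to whatever value it had.  The EX_* constants
 * are arbitrary example values, not the kernel's TDP_* flags.
 */
#include <assert.h>

#define	EX_BUFNEED	0x01u	/* stands in for TDP_BUFNEED */
#define	EX_NORUNNINGBUF	0x02u	/* stands in for TDP_NORUNNINGBUF */
#define	EX_OTHERFLAG	0x80u	/* unrelated flag that must survive */

static unsigned
flush_with_flags(unsigned pflags)
{
	unsigned norun;

	/* Remember the caller's EX_NORUNNINGBUF setting in the mask. */
	norun = ~(EX_BUFNEED | EX_NORUNNINGBUF) |
	    (pflags & EX_NORUNNINGBUF);
	pflags |= EX_BUFNEED | EX_NORUNNINGBUF;

	/* ... temporary work that requires both flags goes here ... */

	pflags &= norun;	/* drop what was added, keep the rest */
	return (pflags);
}

int
main(void)
{
	/* A flag clear on entry stays clear; unrelated bits survive. */
	assert(flush_with_flags(EX_OTHERFLAG) == EX_OTHERFLAG);
	/* A flag set on entry is restored. */
	assert(flush_with_flags(EX_NORUNNINGBUF | EX_OTHERFLAG) ==
	    (EX_NORUNNINGBUF | EX_OTHERFLAG));
	return (0);
}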
2458static void
2459getnewbuf_reuse_bp(struct buf *bp, int qindex)
2460{
2461
2462 CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2463 "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2464 bp->b_kvasize, bp->b_bufsize, qindex);
2465 mtx_assert(&bqclean, MA_NOTOWNED);
2466
2467 /*
2468 * Note: we no longer distinguish between VMIO and non-VMIO
2469 * buffers.
2470 */
2471 KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
2472 ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags,
2473 qindex));
2474
2475 /*
2476 * When recycling a clean buffer we have to truncate it and
2477 * release the vnode.
2478 */
2479 if (qindex == QUEUE_CLEAN) {
2480 allocbuf(bp, 0);
2481 if (bp->b_vp != NULL)
2482 brelvp(bp);
2483 }
2484
2485 /*
2486 * Get the rest of the buffer freed up. b_kva* is still valid
2487 * after this operation.
2488 */
2489 if (bp->b_rcred != NOCRED) {
2490 crfree(bp->b_rcred);
2491 bp->b_rcred = NOCRED;
2492 }
2493 if (bp->b_wcred != NOCRED) {
2494 crfree(bp->b_wcred);
2495 bp->b_wcred = NOCRED;
2496 }
2497 if (!LIST_EMPTY(&bp->b_dep))
2498 buf_deallocate(bp);
2499 if (bp->b_vflags & BV_BKGRDINPROG)
2500 panic("losing buffer 3");
2501 KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
2502 bp, bp->b_vp, qindex));
2503 KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2504 ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2505 KASSERT(bp->b_npages == 0,
2506 ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
2507
2508 bp->b_flags = 0;
2509 bp->b_ioflags = 0;
2510 bp->b_xflags = 0;
2511 KASSERT((bp->b_flags & B_INFREECNT) == 0,
2512 ("buf %p still counted as free?", bp));
2513 bp->b_vflags = 0;
2514 bp->b_vp = NULL;
2515 bp->b_blkno = bp->b_lblkno = 0;
2516 bp->b_offset = NOOFFSET;
2517 bp->b_iodone = 0;
2518 bp->b_error = 0;
2519 bp->b_resid = 0;
2520 bp->b_bcount = 0;
2521 bp->b_npages = 0;
2522 bp->b_dirtyoff = bp->b_dirtyend = 0;
2523 bp->b_bufobj = NULL;
2524 bp->b_pin_count = 0;
2525 bp->b_data = bp->b_kvabase;
2526 bp->b_fsprivate1 = NULL;
2527 bp->b_fsprivate2 = NULL;
2528 bp->b_fsprivate3 = NULL;
2529
2530 LIST_INIT(&bp->b_dep);
2531}
2532
2533static struct buf *
2534getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2535{
2536 struct buf *bp, *nbp;
2537 int nqindex, qindex, pass;
2538
2539 KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2540
2541 pass = 0;
2542restart:
2543 if (pass != 0)
2544 atomic_add_int(&getnewbufrestarts, 1);
2545
2546 nbp = NULL;
2547 mtx_lock(&bqclean);
2548 /*
2549 * If we're neither defragging nor low on bufspace, attempt to make
2550 * a new buf from a header.
2551 */
2552 if (defrag == 0 && bufspace + maxsize < hibufspace) {
2553 nqindex = QUEUE_EMPTY;
2554 nbp = TAILQ_FIRST(&bufqueues[nqindex]);
2555 }
2556 /*
2557 * All available buffers might be clean or we need to start recycling.
2558 */
2559 if (nbp == NULL) {
2560 nqindex = QUEUE_CLEAN;
2561 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2562 }
2563
2564 /*
2565 * Run the scan, possibly freeing data and/or kva mappings on the
2566 * fly, depending on the queue being scanned.
2567 */
2568 while ((bp = nbp) != NULL) {
2569 qindex = nqindex;
2570
2853 if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
2571 /*
2854 /*
2572 * Calculate next bp (we can only use it if we do not
2573 * release the bqlock)
2855 * In order to keep fragmentation sane we only allocate kva
2856 * in BKVASIZE chunks. XXX with vmem we can do page size.
2574 */
2857 */
2575 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2576 switch (qindex) {
2577 case QUEUE_EMPTY:
2578 nqindex = QUEUE_CLEAN;
2579 nbp = TAILQ_FIRST(&bufqueues[nqindex]);
2580 if (nbp != NULL)
2581 break;
2582 /* FALLTHROUGH */
2583 case QUEUE_CLEAN:
2584 if (metadata && pass == 0) {
2585 pass = 1;
2586 nqindex = QUEUE_EMPTY;
2587 nbp = TAILQ_FIRST(&bufqueues[nqindex]);
2588 }
2589 /*
2590 * nbp is NULL.
2591 */
2592 break;
2593 }
2594 }
2595 /*
2596 * If we are defragging then we need a buffer with
2597 * b_kvasize != 0. This situation occurs when we
2598 * have many unmapped bufs.
2599 */
2600 if (defrag && bp->b_kvasize == 0)
2601 continue;
2858 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2602
2859
2603 /*
2604 * Start freeing the bp. This is somewhat involved. nbp
2605 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2606 */
2607 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2608 continue;
2609 /*
2610 * BKGRDINPROG can only be set with the buf and bufobj
2611 * locks both held. We tolerate a race to clear it here.
2612 */
2613 if (bp->b_vflags & BV_BKGRDINPROG) {
2614 BUF_UNLOCK(bp);
2615 continue;
2616 }
2617
2618 /*
2619 * Requeue the background write buffer with error.
2620 */
2621 if ((bp->b_vflags & BV_BKGRDERR) != 0) {
2622 bremfreel(bp);
2623 mtx_unlock(&bqclean);
2624 bqrelse(bp);
2625 continue;
2626 }
2627
2628 KASSERT(bp->b_qindex == qindex,
2629 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2630
2631 bremfreel(bp);
2632 mtx_unlock(&bqclean);
2633
2634 /*
2635 * NOTE: nbp is now entirely invalid. We can only restart
2636 * the scan from this point on.
2637 */
2638 getnewbuf_reuse_bp(bp, qindex);
2639 mtx_assert(&bqclean, MA_NOTOWNED);
2640
2641 /*
2642 * If we are defragging then free the buffer.
2643 */
2644 if (defrag) {
2645 bp->b_flags |= B_INVAL;
2646 brelse(bp);
2647 defrag = 0;
2648 goto restart;
2649 }
2650
2651 /*
2652 * Notify any waiters for the buffer lock about
2653 * identity change by freeing the buffer.
2654 */
2655 if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2656 bp->b_flags |= B_INVAL;
2657 brelse(bp);
2658 goto restart;
2659 }
2660
2661 if (metadata)
2662 break;
2663
2664 /*
2665 * If we are overcomitted then recover the buffer and its
2666 * KVM space. This occurs in rare situations when multiple
2667 * processes are blocked in getnewbuf() or allocbuf().
2668 */
2669 if (bufspace >= hibufspace && bp->b_kvasize != 0) {
2670 bp->b_flags |= B_INVAL;
2671 brelse(bp);
2672 goto restart;
2673 }
2674 break;
2860 if (maxsize != bp->b_kvasize &&
2861 bufkva_alloc(bp, maxsize, gbflags))
2862 return (ENOSPC);
2675 }
2863 }
2676 return (bp);
2864 return (0);
2677}
2678
2679/*
2680 * getnewbuf:
2681 *
2682 * Find and initialize a new buffer header, freeing up existing buffers
2683 * in the bufqueues as necessary. The new buffer is returned locked.
2684 *
2865}
2866
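/*
 * Illustrative sketch (not part of vfs_bio.c): the rounding idiom
 * "(size + MASK) & ~MASK" used above to allocate KVA in BKVASIZE
 * chunks, shown as a standalone userspace example.  EX_CHUNK is an
 * arbitrary power-of-two example value, not the kernel's BKVASIZE.
 */
#include <assert.h>
#include <stddef.h>

#define	EX_CHUNK	16384UL			/* must be a power of two */
#define	EX_MASK		(EX_CHUNK - 1)

static size_t
roundup_chunk(size_t size)
{
	return ((size + EX_MASK) & ~EX_MASK);
}

int
main(void)
{
	assert(roundup_chunk(0) == 0);
	assert(roundup_chunk(1) == EX_CHUNK);
	assert(roundup_chunk(EX_CHUNK) == EX_CHUNK);
	assert(roundup_chunk(EX_CHUNK + 1) == 2 * EX_CHUNK);
	return (0);
}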
2867/*
2868 * getnewbuf:
2869 *
2870 * Find and initialize a new buffer header, freeing up existing buffers
2871 * in the bufqueues as necessary. The new buffer is returned locked.
2872 *
2685 * Important: B_INVAL is not set. If the caller wishes to throw the
2686 * buffer away, the caller must set B_INVAL prior to calling brelse().
2687 *
2688 * We block if:
2689 * We have insufficient buffer headers
2690 * We have insufficient buffer space
2691 * buffer_arena is too fragmented ( space reservation fails )
2692 * If we have to flush dirty buffers ( but we try to avoid this )
2873 * We block if:
2874 * We have insufficient buffer headers
2875 * We have insufficient buffer space
2876 * buffer_arena is too fragmented ( space reservation fails )
2877 * If we have to flush dirty buffers ( but we try to avoid this )
2878 *
2879 * The caller is responsible for releasing the reserved bufspace after
2880 * allocbuf() is called.
2693 */
2694static struct buf *
2881 */
2882static struct buf *
2695getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2696 int gbflags)
2883getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
2697{
2698 struct buf *bp;
2884{
2885 struct buf *bp;
2699 int defrag, metadata;
2886 bool metadata, reserved;
2700
2701 KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2702 ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2703 if (!unmapped_buf_allowed)
2704 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2705
2887
2888 KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2889 ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2890 if (!unmapped_buf_allowed)
2891 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2892
2706 defrag = 0;
2707 if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2708 vp->v_type == VCHR)
2893 if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2894 vp->v_type == VCHR)
2709 metadata = 1;
2895 metadata = true;
2710 else
2896 else
2711 metadata = 0;
2712 /*
2713 * We can't afford to block since we might be holding a vnode lock,
2714 * which may prevent system daemons from running. We deal with
2715 * low-memory situations by proactively returning memory and running
2716 * async I/O rather than sync I/O.
2717 */
2897 metadata = false;
2718 atomic_add_int(&getnewbufcalls, 1);
2898 atomic_add_int(&getnewbufcalls, 1);
2719restart:
2720 bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2721 GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2722 if (bp != NULL)
2723 defrag = 0;
2899 reserved = false;
2900 do {
2901 if (reserved == false &&
2902 bufspace_reserve(maxsize, metadata) != 0)
2903 continue;
2904 reserved = true;
2905 if ((bp = buf_alloc()) == NULL)
2906 continue;
2907 if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
2908 return (bp);
2909 break;
2910 } while (buf_scan(false) == 0);
2724
2911
2725 /*
2726 * If we exhausted our list, sleep as appropriate. We may have to
2727 * wakeup various daemons and write out some dirty buffers.
2728 *
2729 * Generally we are sleeping due to insufficient buffer space.
2730 */
2731 if (bp == NULL) {
2732 mtx_assert(&bqclean, MA_OWNED);
2733 getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2734 mtx_assert(&bqclean, MA_NOTOWNED);
2735 } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2736 mtx_assert(&bqclean, MA_NOTOWNED);
2737
2738 bufkvafree(bp);
2739 atomic_add_int(&bufreusecnt, 1);
2740 } else {
2741 mtx_assert(&bqclean, MA_NOTOWNED);
2742
2743 /*
2744 * We finally have a valid bp. We aren't quite out of the
2745 * woods, we still have to reserve kva space. In order to
2746 * keep fragmentation sane we only allocate kva in BKVASIZE
2747 * chunks.
2748 */
2749 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2750
2751 if (maxsize != bp->b_kvasize &&
2752 bufkvaalloc(bp, maxsize, gbflags)) {
2753 defrag = 1;
2754 bp->b_flags |= B_INVAL;
2755 brelse(bp);
2756 goto restart;
2757 } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) ==
2758 (GB_UNMAPPED | GB_KVAALLOC)) {
2759 bp->b_data = unmapped_buf;
2760 BUF_CHECK_UNMAPPED(bp);
2761 }
2762 atomic_add_int(&bufreusecnt, 1);
2912 if (reserved)
2913 bufspace_release(maxsize);
2914 if (bp != NULL) {
2915 bp->b_flags |= B_INVAL;
2916 brelse(bp);
2763 }
2917 }
2764 return (bp);
2918 bufspace_wait(vp, gbflags, slpflag, slptimeo);
2919
2920 return (NULL);
2765}
2766
2767/*
2768 * buf_daemon:
2769 *
2770 * buffer flushing daemon. Buffers are normally flushed by the
2771 * update daemon but if it cannot keep up this process starts to
2772 * take the load in an attempt to prevent getnewbuf() from blocking.
2773 */
2921}
2922
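/*
 * Illustrative sketch (not part of vfs_bio.c): the reserve/release
 * contract described above, where the space reserved before getnewbuf()
 * succeeds must be handed back by the caller once allocbuf() has sized
 * the buffer.  The counter, limit and ex_*() names are local to this
 * single-threaded example; the kernel uses atomics, daemons and
 * wakeups instead.
 */
#include <assert.h>
#include <stddef.h>

static size_t ex_reserved;			/* space currently reserved */
static const size_t ex_limit = 64 * 1024;	/* hard reservation limit */

static int
ex_reserve(size_t size)
{
	if (ex_reserved + size > ex_limit)
		return (-1);		/* caller must scan, wait or give up */
	ex_reserved += size;
	return (0);
}

static void
ex_release(size_t size)
{
	assert(ex_reserved >= size);
	ex_reserved -= size;
}

int
main(void)
{
	/* Reserve for the worst case, size the buffer, then release. */
	assert(ex_reserve(16 * 1024) == 0);
	/* ... allocbuf()-style resizing would happen here ... */
	ex_release(16 * 1024);
	assert(ex_reserved == 0);
	return (0);
}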
2923/*
2924 * buf_daemon:
2925 *
2926 * buffer flushing daemon. Buffers are normally flushed by the
2927 * update daemon but if it cannot keep up this process starts to
2928 * take the load in an attempt to prevent getnewbuf() from blocking.
2929 */
2774
2775static struct kproc_desc buf_kp = {
2776 "bufdaemon",
2777 buf_daemon,
2778 &bufdaemonproc
2779};
2780SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2781
2782static int

--- 114 unchanged lines hidden (view full) ---

2897 int error;
2898 bool unlock;
2899
2900 flushed = 0;
2901 queue = QUEUE_DIRTY;
2902 bp = NULL;
2903 sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2904 sentinel->b_qindex = QUEUE_SENTINEL;
2930static struct kproc_desc buf_kp = {
2931 "bufdaemon",
2932 buf_daemon,
2933 &bufdaemonproc
2934};
2935SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2936
2937static int

--- 114 unchanged lines hidden (view full) ---

3052 int error;
3053 bool unlock;
3054
3055 flushed = 0;
3056 queue = QUEUE_DIRTY;
3057 bp = NULL;
3058 sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
3059 sentinel->b_qindex = QUEUE_SENTINEL;
2905 mtx_lock(&bqdirty);
3060 mtx_lock(&bqlocks[queue]);
2906 TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
3061 TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2907 mtx_unlock(&bqdirty);
3062 mtx_unlock(&bqlocks[queue]);
2908 while (flushed != target) {
2909 maybe_yield();
3063 while (flushed != target) {
3064 maybe_yield();
2910 mtx_lock(&bqdirty);
3065 mtx_lock(&bqlocks[queue]);
2911 bp = TAILQ_NEXT(sentinel, b_freelist);
2912 if (bp != NULL) {
2913 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2914 TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2915 b_freelist);
2916 } else {
3066 bp = TAILQ_NEXT(sentinel, b_freelist);
3067 if (bp != NULL) {
3068 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3069 TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
3070 b_freelist);
3071 } else {
2917 mtx_unlock(&bqdirty);
3072 mtx_unlock(&bqlocks[queue]);
2918 break;
2919 }
2920 /*
2921 * Skip sentinels inserted by other invocations of
2922 * flushbufqueues(), taking care not to reorder them.
2923 *
2924 * Only flush the buffers that belong to the
2925 * vnode locked by the curthread.
2926 */
2927 if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
2928 bp->b_vp != lvp)) {
3073 break;
3074 }
3075 /*
3076 * Skip sentinels inserted by other invocations of
3077 * flushbufqueues(), taking care not to reorder them.
3078 *
3079 * Only flush the buffers that belong to the
3080 * vnode locked by the curthread.
3081 */
3082 if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
3083 bp->b_vp != lvp)) {
2929 mtx_unlock(&bqdirty);
3084 mtx_unlock(&bqlocks[queue]);
2930 continue;
2931 }
2932 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
3085 continue;
3086 }
3087 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
2933 mtx_unlock(&bqdirty);
3088 mtx_unlock(&bqlocks[queue]);
2934 if (error != 0)
2935 continue;
2936 if (bp->b_pin_count > 0) {
2937 BUF_UNLOCK(bp);
2938 continue;
2939 }
2940 /*
2941 * BKGRDINPROG can only be set with the buf and bufobj

--- 66 unchanged lines hidden (view full) ---

3008 if (curproc == bufdaemonproc &&
3009 runningbufspace > hirunningspace)
3010 waitrunningbufspace();
3011 continue;
3012 }
3013 vn_finished_write(mp);
3014 BUF_UNLOCK(bp);
3015 }
3089 if (error != 0)
3090 continue;
3091 if (bp->b_pin_count > 0) {
3092 BUF_UNLOCK(bp);
3093 continue;
3094 }
3095 /*
3096 * BKGRDINPROG can only be set with the buf and bufobj

--- 66 unchanged lines hidden (view full) ---

3163 if (curproc == bufdaemonproc &&
3164 runningbufspace > hirunningspace)
3165 waitrunningbufspace();
3166 continue;
3167 }
3168 vn_finished_write(mp);
3169 BUF_UNLOCK(bp);
3170 }
3016 mtx_lock(&bqdirty);
3171 mtx_lock(&bqlocks[queue]);
3017 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3172 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3018 mtx_unlock(&bqdirty);
3173 mtx_unlock(&bqlocks[queue]);
3019 free(sentinel, M_TEMP);
3020 return (flushed);
3021}
3022
3023/*
3024 * Check to see if a block is currently memory resident.
3025 */
3026struct buf *

--- 164 unchanged lines hidden (view full) ---

3191/*
3192 * Allocate the KVA mapping for an existing buffer.
3193 * If an unmapped buffer is provided but a mapped buffer is requested, also
3194 * take care to properly set up mappings between pages and KVA.
3195 */
3196static void
3197bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
3198{
3174 free(sentinel, M_TEMP);
3175 return (flushed);
3176}
3177
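/*
 * Illustrative sketch (not part of vfs_bio.c): the sentinel-node scan
 * used by flushbufqueues() above, reduced to a standalone userspace
 * example built on <sys/queue.h>.  The sentinel records the scan
 * position so the queue lock (elided here) can be dropped while each
 * element is processed; it is simply moved past the element about to
 * be visited, and other scans' sentinels are skipped.  The 'struct
 * item' type and its field names are local to the example.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
	int	value;
	bool	is_sentinel;
};

TAILQ_HEAD(itemq, item);

int
main(void)
{
	struct itemq q = TAILQ_HEAD_INITIALIZER(q);
	struct item elems[4], sentinel, *it;
	int i, visited;

	sentinel.is_sentinel = true;
	for (i = 0; i < 4; i++) {
		elems[i].value = i;
		elems[i].is_sentinel = false;
		TAILQ_INSERT_TAIL(&q, &elems[i], link);
	}

	/* Park the sentinel, then repeatedly look at its successor. */
	visited = 0;
	TAILQ_INSERT_HEAD(&q, &sentinel, link);
	while ((it = TAILQ_NEXT(&sentinel, link)) != NULL) {
		/* Move the sentinel past the element we are visiting. */
		TAILQ_REMOVE(&q, &sentinel, link);
		TAILQ_INSERT_AFTER(&q, it, &sentinel, link);
		if (it->is_sentinel)	/* skip other scans' sentinels */
			continue;
		/* The queue lock could be dropped here while 'it' is used. */
		visited++;
	}
	TAILQ_REMOVE(&q, &sentinel, link);
	assert(visited == 4);
	printf("visited %d items\n", visited);
	return (0);
}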
3178/*
3179 * Check to see if a block is currently memory resident.
3180 */
3181struct buf *

--- 164 unchanged lines hidden (view full) ---

3346/*
3347 * Allocate the KVA mapping for an existing buffer.
3348 * If an unmapped buffer is provided but a mapped buffer is requested, also
3349 * take care to properly set up mappings between pages and KVA.
3350 */
3351static void
3352bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
3353{
3199 struct buf *scratch_bp;
3200 int bsize, maxsize, need_mapping, need_kva;
3201 off_t offset;
3202
3203 need_mapping = bp->b_data == unmapped_buf &&
3204 (gbflags & GB_UNMAPPED) == 0;
3205 need_kva = bp->b_kvabase == unmapped_buf &&
3206 bp->b_data == unmapped_buf &&
3207 (gbflags & GB_KVAALLOC) != 0;

--- 16 unchanged lines hidden (view full) ---

3224 * if the buffer was mapped.
3225 */
3226 bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
3227 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3228 offset = blkno * bsize;
3229 maxsize = size + (offset & PAGE_MASK);
3230 maxsize = imax(maxsize, bsize);
3231
3354 int bsize, maxsize, need_mapping, need_kva;
3355 off_t offset;
3356
3357 need_mapping = bp->b_data == unmapped_buf &&
3358 (gbflags & GB_UNMAPPED) == 0;
3359 need_kva = bp->b_kvabase == unmapped_buf &&
3360 bp->b_data == unmapped_buf &&
3361 (gbflags & GB_KVAALLOC) != 0;

--- 16 unchanged lines hidden (view full) ---

3378 * if the buffer was mapped.
3379 */
3380 bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
3381 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3382 offset = blkno * bsize;
3383 maxsize = size + (offset & PAGE_MASK);
3384 maxsize = imax(maxsize, bsize);
3385
3232mapping_loop:
3233 if (bufkvaalloc(bp, maxsize, gbflags)) {
3234 /*
3235 * Request defragmentation. getnewbuf() returns us the
3236 * allocated space by the scratch buffer KVA.
3237 */
3238 scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
3239 (GB_UNMAPPED | GB_KVAALLOC));
3240 if (scratch_bp == NULL) {
3241 if ((gbflags & GB_NOWAIT_BD) != 0) {
3242 /*
3243 * XXXKIB: defragmentation cannot
3244 * succeed, not sure what else to do.
3245 */
3246 panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
3247 }
3248 atomic_add_int(&mappingrestarts, 1);
3249 goto mapping_loop;
3386 while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
3387 if ((gbflags & GB_NOWAIT_BD) != 0) {
3388 /*
3389 * XXXKIB: defragmentation cannot
3390 * succeed, not sure what else to do.
3391 */
3392 panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
3250 }
3393 }
3251 KASSERT(scratch_bp->b_kvabase != unmapped_buf,
3252 ("scratch bp has no KVA %p", scratch_bp));
3253 /* Grab pointers. */
3254 bp->b_kvabase = scratch_bp->b_kvabase;
3255 bp->b_kvasize = scratch_bp->b_kvasize;
3256 bp->b_data = scratch_bp->b_data;
3257
3258 /* Get rid of the scratch buffer. */
3259 scratch_bp->b_kvasize = 0;
3260 scratch_bp->b_flags |= B_INVAL;
3261 scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf;
3262 brelse(scratch_bp);
3394 atomic_add_int(&mappingrestarts, 1);
3395 bufspace_wait(bp->b_vp, gbflags, 0, 0);
3263 }
3264has_addr:
3265 if (need_mapping) {
3266 /* b_offset is handled by bpmap_qenter. */
3267 bp->b_data = bp->b_kvabase;
3268 BUF_CHECK_MAPPED(bp);
3269 bpmap_qenter(bp);
3270 }

--- 210 unchanged lines hidden (view full) ---

3481 maxsize = size + (offset & PAGE_MASK);
3482 } else {
3483 maxsize = size;
3485 /* Do not allow non-VMIO unmapped buffers. */
3485 flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3486 }
3487 maxsize = imax(maxsize, bsize);
3488
3396 }
3397has_addr:
3398 if (need_mapping) {
3399 /* b_offset is handled by bpmap_qenter. */
3400 bp->b_data = bp->b_kvabase;
3401 BUF_CHECK_MAPPED(bp);
3402 bpmap_qenter(bp);
3403 }

--- 210 unchanged lines hidden (view full) ---

3614 maxsize = size + (offset & PAGE_MASK);
3615 } else {
3616 maxsize = size;
3618 /* Do not allow non-VMIO unmapped buffers. */
3618 flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3619 }
3620 maxsize = imax(maxsize, bsize);
3621
3489 bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3622 bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
3490 if (bp == NULL) {
3491 if (slpflag || slptimeo)
3492 return NULL;
3493 goto loop;
3494 }
3495
3496 /*
3497 * This code is used to make sure that a buffer is not

--- 7 unchanged lines hidden (view full) ---

3505 * the splay tree implementation when dealing with duplicate
3506 * lblkno's.
3507 */
3508 BO_LOCK(bo);
3509 if (gbincore(bo, blkno)) {
3510 BO_UNLOCK(bo);
3511 bp->b_flags |= B_INVAL;
3512 brelse(bp);
3623 if (bp == NULL) {
3624 if (slpflag || slptimeo)
3625 return NULL;
3626 goto loop;
3627 }
3628
3629 /*
3630 * This code is used to make sure that a buffer is not

--- 7 unchanged lines hidden (view full) ---

3638 * the splay tree implementation when dealing with duplicate
3639 * lblkno's.
3640 */
3641 BO_LOCK(bo);
3642 if (gbincore(bo, blkno)) {
3643 BO_UNLOCK(bo);
3644 bp->b_flags |= B_INVAL;
3645 brelse(bp);
3646 bufspace_release(maxsize);
3513 goto loop;
3514 }
3515
3516 /*
3517 * Insert the buffer into the hash, so that it can
3518 * be found by incore.
3519 */
3520 bp->b_blkno = bp->b_lblkno = blkno;

--- 17 unchanged lines hidden (view full) ---

3538 bp->b_flags &= ~B_VMIO;
3539 KASSERT(bp->b_bufobj->bo_object == NULL,
3540 ("ARGH! has b_bufobj->bo_object %p %p\n",
3541 bp, bp->b_bufobj->bo_object));
3542 BUF_CHECK_MAPPED(bp);
3543 }
3544
3545 allocbuf(bp, size);
3647 goto loop;
3648 }
3649
3650 /*
3651 * Insert the buffer into the hash, so that it can
3652 * be found by incore.
3653 */
3654 bp->b_blkno = bp->b_lblkno = blkno;

--- 17 unchanged lines hidden (view full) ---

3672 bp->b_flags &= ~B_VMIO;
3673 KASSERT(bp->b_bufobj->bo_object == NULL,
3674 ("ARGH! has b_bufobj->bo_object %p %p\n",
3675 bp, bp->b_bufobj->bo_object));
3676 BUF_CHECK_MAPPED(bp);
3677 }
3678
3679 allocbuf(bp, size);
3680 bufspace_release(maxsize);
3546 bp->b_flags &= ~B_DONE;
3547 }
3548 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3549 BUF_ASSERT_HELD(bp);
3550end:
3551 KASSERT(bp->b_bufobj == bo,
3552 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3553 return (bp);

--- 5 unchanged lines hidden (view full) ---

3559 */
3560struct buf *
3561geteblk(int size, int flags)
3562{
3563 struct buf *bp;
3564 int maxsize;
3565
3566 maxsize = (size + BKVAMASK) & ~BKVAMASK;
3681 bp->b_flags &= ~B_DONE;
3682 }
3683 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3684 BUF_ASSERT_HELD(bp);
3685end:
3686 KASSERT(bp->b_bufobj == bo,
3687 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3688 return (bp);

--- 5 unchanged lines hidden (view full) ---

3694 */
3695struct buf *
3696geteblk(int size, int flags)
3697{
3698 struct buf *bp;
3699 int maxsize;
3700
3701 maxsize = (size + BKVAMASK) & ~BKVAMASK;
3567 while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3702 while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
3568 if ((flags & GB_NOWAIT_BD) &&
3569 (curthread->td_pflags & TDP_BUFNEED) != 0)
3570 return (NULL);
3571 }
3572 allocbuf(bp, size);
3703 if ((flags & GB_NOWAIT_BD) &&
3704 (curthread->td_pflags & TDP_BUFNEED) != 0)
3705 return (NULL);
3706 }
3707 allocbuf(bp, size);
3708 bufspace_release(maxsize);
3573 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
3574 BUF_ASSERT_HELD(bp);
3575 return (bp);
3576}
3577
3578/*
3579 * Truncate the backing store for a non-vmio buffer.
3580 */

--- 9 unchanged lines hidden (view full) ---

3590 bufmallocadjust(bp, 0);
3591 free(bp->b_data, M_BIOBUF);
3592 bp->b_data = bp->b_kvabase;
3593 bp->b_flags &= ~B_MALLOC;
3594 }
3595 return;
3596 }
3597 vm_hold_free_pages(bp, newbsize);
3709 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
3710 BUF_ASSERT_HELD(bp);
3711 return (bp);
3712}
3713
3714/*
3715 * Truncate the backing store for a non-vmio buffer.
3716 */

--- 9 unchanged lines hidden (view full) ---

3726 bufmallocadjust(bp, 0);
3727 free(bp->b_data, M_BIOBUF);
3728 bp->b_data = bp->b_kvabase;
3729 bp->b_flags &= ~B_MALLOC;
3730 }
3731 return;
3732 }
3733 vm_hold_free_pages(bp, newbsize);
3598 bufspaceadjust(bp, newbsize);
3734 bufspace_adjust(bp, newbsize);
3599}
3600
3601/*
3602 * Extend the backing for a non-VMIO buffer.
3603 */
3604static void
3605vfs_nonvmio_extend(struct buf *bp, int newbsize)
3606{

--- 34 unchanged lines hidden (view full) ---

3641 newbsize = round_page(newbsize);
3642 }
3643 vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
3644 (vm_offset_t) bp->b_data + newbsize);
3645 if (origbuf != NULL) {
3646 bcopy(origbuf, bp->b_data, origbufsize);
3647 free(origbuf, M_BIOBUF);
3648 }
3735}
3736
3737/*
3738 * Extend the backing for a non-VMIO buffer.
3739 */
3740static void
3741vfs_nonvmio_extend(struct buf *bp, int newbsize)
3742{

--- 34 unchanged lines hidden (view full) ---

3777 newbsize = round_page(newbsize);
3778 }
3779 vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
3780 (vm_offset_t) bp->b_data + newbsize);
3781 if (origbuf != NULL) {
3782 bcopy(origbuf, bp->b_data, origbufsize);
3783 free(origbuf, M_BIOBUF);
3784 }
3649 bufspaceadjust(bp, newbsize);
3785 bufspace_adjust(bp, newbsize);
3650}
3651
3652/*
3653 * This code constructs the buffer memory from either anonymous system
3654 * memory (in the case of non-VMIO operations) or from an associated
3655 * VM object (in the case of VMIO operations). This code is able to
3656 * resize a buffer up or down.
3657 *

--- 45 unchanged lines hidden (view full) ---

3703 if (size == 0 || bp->b_bufsize == 0)
3704 bp->b_flags |= B_CACHE;
3705
3706 if (newbsize < bp->b_bufsize)
3707 vfs_vmio_truncate(bp, desiredpages);
3708 /* XXX This looks as if it should be newbsize > b_bufsize */
3709 else if (size > bp->b_bcount)
3710 vfs_vmio_extend(bp, desiredpages, size);
3786}
3787
3788/*
3789 * This code constructs the buffer memory from either anonymous system
3790 * memory (in the case of non-VMIO operations) or from an associated
3791 * VM object (in the case of VMIO operations). This code is able to
3792 * resize a buffer up or down.
3793 *

--- 45 unchanged lines hidden (view full) ---

3839 if (size == 0 || bp->b_bufsize == 0)
3840 bp->b_flags |= B_CACHE;
3841
3842 if (newbsize < bp->b_bufsize)
3843 vfs_vmio_truncate(bp, desiredpages);
3844 /* XXX This looks as if it should be newbsize > b_bufsize */
3845 else if (size > bp->b_bcount)
3846 vfs_vmio_extend(bp, desiredpages, size);
3711 bufspaceadjust(bp, newbsize);
3847 bufspace_adjust(bp, newbsize);
3712 }
3713 bp->b_bcount = size; /* requested buffer size. */
3714 return (1);
3715}
3716
3717extern int inflight_transient_maps;
3718
3719void

--- 871 unchanged lines hidden (view full) ---

4591
4592 if (have_addr) {
4593 db_printf("usage: countfreebufs\n");
4594 return;
4595 }
4596
4597 for (i = 0; i < nbuf; i++) {
4598 bp = &buf[i];
3848 }
3849 bp->b_bcount = size; /* requested buffer size. */
3850 return (1);
3851}
3852
3853extern int inflight_transient_maps;
3854
3855void

--- 871 unchanged lines hidden (view full) ---

4727
4728 if (have_addr) {
4729 db_printf("usage: countfreebufs\n");
4730 return;
4731 }
4732
4733 for (i = 0; i < nbuf; i++) {
4734 bp = &buf[i];
4599 if ((bp->b_flags & B_INFREECNT) != 0)
4735 if (bp->b_qindex == QUEUE_EMPTY)
4600 nfree++;
4601 else
4602 used++;
4603 }
4604
4605 db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4606 nfree + used);
4607 db_printf("numfreebuffers is %d\n", numfreebuffers);
4608}
4609#endif /* DDB */
4736 nfree++;
4737 else
4738 used++;
4739 }
4740
4741 db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4742 nfree + used);
4743 db_printf("numfreebuffers is %d\n", numfreebuffers);
4744}
4745#endif /* DDB */