xref: /titanic_51/usr/src/uts/common/sys/buf.h (revision 89a7715a55deca73d03076f5c24463717f0aaa91)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #ifndef _SYS_BUF_H
40 #define	_SYS_BUF_H
41 
42 #pragma ident	"%Z%%M%	%I%	%E% SMI"
43 
44 #include <sys/types32.h>
45 #include <sys/t_lock.h>
46 #include <sys/kstat.h>
47 
48 #ifdef	__cplusplus
49 extern "C" {
50 #endif
51 
52 /*
53  *	Each buffer in the pool is usually doubly linked into 2 lists:
54  *	the device with which it is currently associated (always)
55  *	and also on a list of blocks available for allocation
56  *	for other use (usually).
57  *	The latter list is kept in last-used order, and the two
58  *	lists are doubly linked to make it easy to remove
59  *	a buffer from one list when it was found by
60  *	looking through the other.
61  *	A buffer is on the available list, and is liable
62  *	to be reassigned to another disk block, if and only
63  *	if it is not marked BUSY.  When a buffer is busy, the
64  *	available-list pointers can be used for other purposes.
65  *	Most drivers use the forward ptr as a link in their I/O active queue.
66  *	A buffer header contains all the information required to perform I/O.
67  *	Most of the routines which manipulate these things are in bio.c.
68  *
69  *	There are a number of locks associated with the buffer management
70  *	system.
71  *	hbuf.b_lock:	protects hash chains, buffer hdr freelists
72  *			and delayed write freelist
73  *	bfree_lock;	protects the bfreelist structure
74  *	bhdr_lock:	protects the free header list
75  *	blist_lock:	protects b_list fields
76  *	buf.b_sem:	protects all remaining members in the buf struct
77  *	buf.b_io:	I/O synchronization variable
78  *
79  *	A buffer header is never "locked" (b_sem) when it is on
80  *	a "freelist" (bhdrlist or bfreelist avail lists).
81  */
82 typedef struct	buf {
83 	int	b_flags;		/* see defines below */
84 	struct buf *b_forw;		/* headed by d_tab of conf.c */
85 	struct buf *b_back;		/*  "  */
86 	struct buf *av_forw;		/* position on free list, */
87 	struct buf *av_back;		/* if not BUSY */
88 	o_dev_t	b_dev;			/* OLD major+minor device name */
89 	size_t b_bcount;		/* transfer count */
90 	union {
91 		caddr_t b_addr;		/* low order core address */
92 		struct fs *b_fs;	/* superblocks */
93 		struct cg *b_cg;	/* UFS cylinder group block */
94 		struct dinode *b_dino;	/* UFS ilist */
95 		daddr32_t *b_daddr;	/* disk blocks */
96 	} b_un;
97 
98 	lldaddr_t	_b_blkno;	/* block # on device (union) */
99 #define	b_lblkno	_b_blkno._f
100 #ifdef _LP64
101 #define	b_blkno		_b_blkno._f
102 #else
103 #define	b_blkno		_b_blkno._p._l
104 #endif /* _LP64 */
105 
106 	char	b_obs1;			/* obsolete */
107 	size_t	b_resid;		/* words not transferred after error */
108 	clock_t	b_start;		/* request start time */
109 	struct  proc  *b_proc;		/* process doing physical or swap I/O */
110 	struct	page  *b_pages;		/* page list for PAGEIO */
111 	clock_t b_obs2;			/* obsolete */
112 	/* Begin new stuff */
113 #define	b_actf	av_forw
114 #define	b_actl	av_back
115 #define	b_active b_bcount
116 #define	b_errcnt b_resid
117 	size_t	b_bufsize;		/* size of allocated buffer */
118 	int	(*b_iodone)(struct buf *);	/* function called by iodone */
119 	struct	vnode *b_vp;		/* vnode associated with block */
120 	struct 	buf *b_chain;		/* chain together all buffers here */
121 	int	b_obs3;			/* obsolete */
122 	int	b_error;		/* expanded error field */
123 	void	*b_private;		/* "opaque" driver private area */
124 	dev_t	b_edev;			/* expanded dev field */
125 	ksema_t	b_sem;			/* Exclusive access to buf */
126 	ksema_t	b_io;			/* I/O Synchronization */
127 	struct buf *b_list;		/* List of potential B_DELWRI bufs */
128 	struct page **b_shadow;		/* shadow page list */
129 	void	*b_dip;			/* device info pointer */
130 	struct vnode *b_file;		/* file associated with this buffer */
131 	offset_t b_offset;		/* offset in file assoc. with buffer */
132 } buf_t;
133 
134 /*
135  * Bufhd structures used at the head of the hashed buffer queues.
136  * We only need seven words for this, so this abbreviated
137  * definition saves some space.
138  */
139 struct diskhd {
140 	int	b_flags;		/* not used, needed for consistency */
141 	struct buf *b_forw, *b_back;	/* queue of unit queues */
142 	struct buf *av_forw, *av_back;	/* queue of bufs for this unit */
143 	o_dev_t	b_dev;			/* OLD major+minor device name */
144 	size_t b_bcount;		/* transfer count */
145 };
146 
147 
148 /*
149  * Statistics on the buffer cache
150  */
151 struct biostats {
152 	kstat_named_t	bio_lookup;	/* requests to assign buffer */
153 	kstat_named_t	bio_hit;	/* buffer already associated with blk */
154 	kstat_named_t	bio_bufwant;	/* kmem_allocs NOSLEEP failed new buf */
155 	kstat_named_t	bio_bufwait;	/* kmem_allocs with KM_SLEEP for buf */
156 	kstat_named_t	bio_bufbusy;	/* buffer locked by someone else */
157 	kstat_named_t	bio_bufdup;	/* duplicate buffer found for block */
158 };
159 
/*
 * These flags are kept in b_flags.
 *
 * Each flag is a distinct single bit.  Bits 0x0000008, 0x0040000 and
 * 0x0800000 are not assigned in this list.
 *
 * The first group is part of the DDI.
 */
#define	B_BUSY		0x0000001	/* not on av_forw/back list */
#define	B_DONE		0x0000002	/* transaction finished */
#define	B_ERROR		0x0000004	/* transaction aborted */
#define	B_PAGEIO	0x0000010	/* do I/O to pages on bp->b_pages */
#define	B_PHYS		0x0000020	/* Physical IO potentially using */
					/* UNIBUS map */
#define	B_READ		0x0000040	/* read when I/O occurs */
#define	B_WRITE		0x0000100	/* non-read pseudo-flag */

/*
 * The remaining flags are not part of the DDI.
 */
#define	B_WANTED	0x0000080	/* issue wakeup when BUSY goes off */
#define	B_AGE		0x0000200	/* delayed write for correct aging */
#define	B_ASYNC		0x0000400	/* don't wait for I/O completion */
#define	B_DELWRI	0x0000800	/* delayed write-wait til buf needed */
#define	B_STALE		0x0001000	/* on av_* list; invalid contents */
#define	B_DONTNEED	0x0002000	/* after write, need not be cached */
#define	B_REMAPPED	0x0004000	/* buffer is kernel addressable */
#define	B_FREE		0x0008000	/* free page when done */
#define	B_INVAL		0x0010000	/* does not contain valid info  */
#define	B_FORCE		0x0020000	/* semi-permanent removal from cache */
#define	B_NOCACHE	0x0080000	/* don't cache block when released */
#define	B_TRUNC		0x0100000	/* truncate page without I/O */
#define	B_SHADOW	0x0200000	/* is b_shadow field valid? */
#define	B_RETRYWRI	0x0400000	/* retry write til works or bfinval */
#define	B_FAILFAST	0x1000000	/* Fail promptly if device goes away */
#define	B_STARTED	0x2000000	/* io:::start probe called for buf */
#define	B_ABRWRITE	0x4000000	/* Application based recovery active */
190 
/*
 * Insq/Remq for the buffer hash lists.
 *
 * bremhash(bp) unlinks bp from the doubly linked b_forw/b_back chain and
 * leaves both of its link pointers NULL.  Both links must be non-NULL on
 * entry (checked with ASSERT).
 *
 * The macro expands to a brace-enclosed block and evaluates bp several
 * times, so the argument must be free of side effects.
 */
#define	bremhash(bp)						\
{								\
	ASSERT((bp)->b_forw != NULL);				\
	ASSERT((bp)->b_back != NULL);				\
	(bp)->b_back->b_forw = (bp)->b_forw;			\
	(bp)->b_forw->b_back = (bp)->b_back;			\
	(bp)->b_back = NULL;					\
	(bp)->b_forw = NULL;					\
}
/*
 * binshash(bp, dp) links bp into the b_forw/b_back chain immediately
 * after dp.  bp must be unlinked (both links NULL) and dp must already be
 * linked (both links non-NULL); these are checked with ASSERT.
 *
 * The macro expands to a brace-enclosed block and evaluates its arguments
 * several times, so they must be free of side effects.
 */
#define	binshash(bp, dp)					\
{								\
	ASSERT((bp)->b_forw == NULL);				\
	ASSERT((bp)->b_back == NULL);				\
	ASSERT((dp)->b_forw != NULL);				\
	ASSERT((dp)->b_back != NULL);				\
	/* point the new entry at its neighbours first */	\
	(bp)->b_back = (dp);					\
	(bp)->b_forw = (dp)->b_forw;				\
	/* old successor must be fixed before dp->b_forw moves */ \
	(dp)->b_forw->b_back = (bp);				\
	(dp)->b_forw = (bp);					\
}
211 
212 
213 /*
214  * The hash structure maintains two lists:
215  *
216  * 	1) The hash list of buffers (b_forw & b_back)
217  *	2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
218  *
219  * The dwbuf structure keeps a list of delayed write buffers per hash bucket
220  * hence there are exactly the same number of dwbuf structures as there are
221  * the hash buckets (hbuf structures) in the system.
222  *
223  * The number of buffers on the freelist may not be equal to the number of
224  * buffers on the hash list. That is because when buffers are busy they are
225  * taken off the freelist but not off the hash list. "b_length" field keeps
226  * track of the number of free buffers (including delayed writes ones) on
227  * the hash bucket. The "b_lock" mutex protects the free list as well as
228  * the hash list. It also protects the counter "b_length".
229  *
230  * Enties b_forw, b_back, av_forw & av_back must be at the same offset
231  * as the ones in buf structure.
232  */
233 struct	hbuf {
234 	int	b_flags;
235 
236 	struct	buf	*b_forw;	/* hash list forw pointer */
237 	struct	buf	*b_back;	/* hash list back pointer */
238 
239 	struct	buf	*av_forw;	/* free list forw pointer */
240 	struct	buf	*av_back;	/* free list back pointer */
241 
242 	int		b_length;	/* # of entries on free list */
243 	kmutex_t	b_lock;		/* lock to protect this structure */
244 };
245 
246 
/*
 * Delayed write list header.  The delayed list pointer entries should
 * match with the buf structure, which is why the unused leading members
 * are still present.
 */
struct dwbuf {
	int		b_flags;	/* not used */

	struct buf	*b_forw;	/* not used */
	struct buf	*b_back;	/* not used */

	struct buf	*av_forw;	/* delayed write forw pointer */
	struct buf	*av_back;	/* delayed write back pointer */
};
259 
260 
/*
 * Unlink a buffer from the available (free or delayed write) list and mark
 * it busy (internal interface).
 *
 * On entry (checked with ASSERT) the caller must hold bp's b_sem, and bp
 * must be linked on an av_forw/av_back list that contains at least one
 * other element (neither link is NULL or points back at bp itself).
 * On exit both av_* links of bp are NULL and B_BUSY is set in b_flags.
 *
 * The macro expands to a brace-enclosed block and evaluates bp several
 * times, so the argument must be free of side effects.  Every use of the
 * argument is parenthesized so that an expression such as "base + 1"
 * expands correctly.
 */
#define	notavail(bp) \
{\
	ASSERT(SEMA_HELD(&(bp)->b_sem)); \
	ASSERT((bp)->av_forw != NULL); \
	ASSERT((bp)->av_back != NULL); \
	ASSERT((bp)->av_forw != (bp)); \
	ASSERT((bp)->av_back != (bp)); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	(bp)->av_forw = (bp)->av_back = NULL; \
}
277 
278 #if defined(_KERNEL)
279 /*
280  * Macros to avoid the extra function call needed for binary compat.
281  *
282  * B_RETRYWRI is not included in clear_flags for BWRITE(), BWRITE2(),
283  * or brwrite() so that the retry operation is persistent until the
284  * write either succeeds or the buffer is bfinval()'d.
285  *
286  */
/* bread_common() with a NULL ufsvfsp. */
#define	BREAD(dev, blkno, bsize)	\
	bread_common(NULL /* ufsvfsp */, (dev), (blkno), (bsize))
289 
/*
 * bwrite_common() with a NULL ufsvfsp, force_wait = 0 and do_relse = 1.
 * B_RETRYWRI is deliberately absent from clear_flags.
 */
#define	BWRITE(bp)	\
	bwrite_common(NULL /* ufsvfsp */, (bp),			\
	    0 /* force_wait */, 1 /* do_relse */,		\
	    (B_READ | B_DONE | B_ERROR | B_DELWRI) /* clear_flags */)
294 
/*
 * bwrite_common() with a NULL ufsvfsp, force_wait = 1 and do_relse = 0.
 * B_RETRYWRI is deliberately absent from clear_flags.
 */
#define	BWRITE2(bp)	\
	bwrite_common(NULL /* ufsvfsp */, (bp),			\
	    1 /* force_wait */, 0 /* do_relse */,		\
	    (B_READ | B_DONE | B_ERROR | B_DELWRI) /* clear_flags */)
299 
/* getblk_common() with a NULL ufsvfsp and errflg = 0. */
#define	GETBLK(dev, blkno, bsize)	\
	getblk_common(NULL /* ufsvfsp */, (dev), (blkno), (bsize),	\
	    0 /* errflg */)
302 
303 
304 /*
305  * Macros for new retry write interfaces.
306  */
307 
/*
 * Same as bdwrite() except write failures are retried: B_RETRYWRI is set
 * in the buffer's flags before the buffer is handed to bdwrite().
 *
 * Expands to a brace-enclosed block and evaluates bp twice.
 */
#define	bdrwrite(bp)					\
{							\
	(bp)->b_flags |= B_RETRYWRI;			\
	bdwrite(bp);					\
}
315 
/*
 * Same as bwrite() except write failures are retried: B_RETRYWRI is set
 * in the buffer's flags and the buffer is passed to bwrite_common() with
 * the same arguments BWRITE() uses (NULL ufsvfsp, force_wait = 0,
 * do_relse = 1).  B_RETRYWRI is not part of clear_flags, so the retry
 * request persists.
 *
 * bwrite_common() takes the ufsvfsp pointer as its first argument; it is
 * passed explicitly as NULL here so that the call matches the prototype.
 *
 * Expands to a brace-enclosed block and evaluates bp twice.
 */
#define	brwrite(bp) { \
	(bp)->b_flags |= B_RETRYWRI; \
	bwrite_common(/* ufsvfsp */ NULL, (bp), /* force_wait */ 0, \
		/* do_relse */ 1, \
		/* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI)); \
}
324 
325 extern struct hbuf	*hbuf;		/* Hash table */
326 extern struct dwbuf	*dwbuf;		/* delayed write hash table */
327 extern struct buf	*buf;		/* The buffer pool itself */
328 extern struct buf	bfreelist;	/* head of available list */
329 
330 extern void (*bio_lufs_strategy)(void *, buf_t *);	/* UFS Logging */
331 extern void (*bio_snapshot_strategy)(void *, buf_t *);	/* UFS snapshots */
332 
333 int	bcheck(dev_t, struct buf *);
334 int	iowait(struct buf *);
335 int	hash2ints(int x, int y);
336 int	bio_busy(int);
337 int	biowait(struct buf *);
338 int	biomodified(struct buf *);
339 int	geterror(struct buf *);
340 void	minphys(struct buf *);
341 /*
342  * ufsvfsp is declared as a void * to avoid having everyone that uses
343  * this header file include sys/fs/ufs_inode.h.
344  */
345 void	bwrite_common(void *ufsvfsp, struct buf *, int force_wait,
346 	int do_relse, int clear_flags);
347 void	bwrite(struct buf *);
348 void	bwrite2(struct buf *);
349 void	bdwrite(struct buf *);
350 void	bawrite(struct buf *);
351 void	brelse(struct buf *);
352 void	iodone(struct buf *);
353 void	clrbuf(struct buf *);
354 void	bflush(dev_t);
355 void	blkflush(dev_t, daddr_t);
356 void	binval(dev_t);
357 int	bfinval(dev_t, int);
358 void	binit(void);
359 void	biodone(struct buf *);
360 void	bioinit(struct buf *);
361 void	biofini(struct buf *);
362 void	bp_mapin(struct buf *);
363 void	*bp_mapin_common(struct buf *, int);
364 void	bp_mapout(struct buf *);
365 int	bp_copyin(struct buf *, void *, offset_t, size_t);
366 int	bp_copyout(void *, struct buf *, offset_t, size_t);
367 void	bp_init(size_t, uint_t);
368 int	bp_color(struct buf *);
369 void	pageio_done(struct buf *);
370 struct buf *bread(dev_t, daddr_t, long);
371 struct buf *bread_common(void *, dev_t, daddr_t, long);
372 struct buf *breada(dev_t, daddr_t, daddr_t, long);
373 struct buf *getblk(dev_t, daddr_t, long);
374 struct buf *getblk_common(void *, dev_t, daddr_t, long, int);
375 struct buf *ngeteblk(long);
376 struct buf *geteblk(void);
377 struct buf *pageio_setup(struct page *, size_t, struct vnode *, int);
378 void bioerror(struct buf *bp, int error);
379 void bioreset(struct buf *bp);
380 struct buf *bioclone(struct buf *, off_t, size_t, dev_t, daddr_t,
381 	int (*)(struct buf *), struct buf *, int);
382 size_t	biosize(void);
383 #endif	/* defined(_KERNEL) */
384 
385 #ifdef	__cplusplus
386 }
387 #endif
388 
389 #endif	/* _SYS_BUF_H */
390