xref: /freebsd/stand/common/bcache.c (revision ca987d4641cdcd7f27e153db17c5bf064934faf5)
/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
27*ca987d46SWarner Losh 
28*ca987d46SWarner Losh #include <sys/cdefs.h>
29*ca987d46SWarner Losh #include <sys/param.h>
30*ca987d46SWarner Losh __FBSDID("$FreeBSD$");
31*ca987d46SWarner Losh 
32*ca987d46SWarner Losh /*
33*ca987d46SWarner Losh  * Simple hashed block cache
34*ca987d46SWarner Losh  */
35*ca987d46SWarner Losh 
36*ca987d46SWarner Losh #include <sys/stdint.h>
37*ca987d46SWarner Losh 
38*ca987d46SWarner Losh #include <stand.h>
39*ca987d46SWarner Losh #include <string.h>
40*ca987d46SWarner Losh #include <strings.h>
41*ca987d46SWarner Losh 
42*ca987d46SWarner Losh #include "bootstrap.h"
43*ca987d46SWarner Losh 
/* Uncomment to get per-operation trace output. */
/* #define BCACHE_DEBUG */

/*
 * DEBUG(fmt, ...): printf-style trace helper.  With BCACHE_DEBUG defined the
 * message is printed prefixed by the calling function's name and followed by
 * a newline; without it the macro expands to nothing, so its arguments are
 * not evaluated.
 */
#ifndef BCACHE_DEBUG
# define DEBUG(fmt, ...)
#else
# define DEBUG(fmt, ...)	printf("%s: " fmt "\n", __func__, ## __VA_ARGS__)
#endif
51*ca987d46SWarner Losh 
52*ca987d46SWarner Losh struct bcachectl
53*ca987d46SWarner Losh {
54*ca987d46SWarner Losh     daddr_t	bc_blkno;
55*ca987d46SWarner Losh     int		bc_count;
56*ca987d46SWarner Losh };
57*ca987d46SWarner Losh 
58*ca987d46SWarner Losh /*
59*ca987d46SWarner Losh  * bcache per device node. cache is allocated on device first open and freed
60*ca987d46SWarner Losh  * on last close, to save memory. The issue there is the size; biosdisk
61*ca987d46SWarner Losh  * supports up to 31 (0x1f) devices. Classic setup would use single disk
62*ca987d46SWarner Losh  * to boot from, but this has changed with zfs.
63*ca987d46SWarner Losh  */
64*ca987d46SWarner Losh struct bcache {
65*ca987d46SWarner Losh     struct bcachectl	*bcache_ctl;
66*ca987d46SWarner Losh     caddr_t		bcache_data;
67*ca987d46SWarner Losh     size_t		bcache_nblks;
68*ca987d46SWarner Losh     size_t		ra;
69*ca987d46SWarner Losh };
70*ca987d46SWarner Losh 
71*ca987d46SWarner Losh static u_int bcache_total_nblks;	/* set by bcache_init */
72*ca987d46SWarner Losh static u_int bcache_blksize;		/* set by bcache_init */
73*ca987d46SWarner Losh static u_int bcache_numdev;		/* set by bcache_add_dev */
74*ca987d46SWarner Losh /* statistics */
75*ca987d46SWarner Losh static u_int bcache_units;	/* number of devices with cache */
76*ca987d46SWarner Losh static u_int bcache_unit_nblks;	/* nblocks per unit */
77*ca987d46SWarner Losh static u_int bcache_hits;
78*ca987d46SWarner Losh static u_int bcache_misses;
79*ca987d46SWarner Losh static u_int bcache_ops;
80*ca987d46SWarner Losh static u_int bcache_bypasses;
81*ca987d46SWarner Losh static u_int bcache_bcount;
82*ca987d46SWarner Losh static u_int bcache_rablks;
83*ca987d46SWarner Losh 
/*
 * BHASH maps a device block number to its cache slot; it relies on
 * bcache_nblks being a power of 2.
 */
#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))

/*
 * BCACHE_LOOKUP is non-zero when the block is NOT in the cache (a miss) and
 * zero when the slot holds exactly this block (a hit).
 */
#define	BCACHE_LOOKUP(bc, blkno)					\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))

#define	BCACHE_READAHEAD	256		/* upper bound for the ra hint */
#define	BCACHE_MINREADAHEAD	32		/* lower bound for the ra hint */
#define	BCACHE_MARKER		0xdeadbeef	/* guard word after the data */
90*ca987d46SWarner Losh 
91*ca987d46SWarner Losh static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
92*ca987d46SWarner Losh static void	bcache_insert(struct bcache *bc, daddr_t blkno);
93*ca987d46SWarner Losh static void	bcache_free_instance(struct bcache *bc);
94*ca987d46SWarner Losh 
95*ca987d46SWarner Losh /*
96*ca987d46SWarner Losh  * Initialise the cache for (nblks) of (bsize).
97*ca987d46SWarner Losh  */
98*ca987d46SWarner Losh void
99*ca987d46SWarner Losh bcache_init(size_t nblks, size_t bsize)
100*ca987d46SWarner Losh {
101*ca987d46SWarner Losh     /* set up control data */
102*ca987d46SWarner Losh     bcache_total_nblks = nblks;
103*ca987d46SWarner Losh     bcache_blksize = bsize;
104*ca987d46SWarner Losh }
105*ca987d46SWarner Losh 
106*ca987d46SWarner Losh /*
107*ca987d46SWarner Losh  * add number of devices to bcache. we have to divide cache space
108*ca987d46SWarner Losh  * between the devices, so bcache_add_dev() can be used to set up the
109*ca987d46SWarner Losh  * number. The issue is, we need to get the number before actual allocations.
110*ca987d46SWarner Losh  * bcache_add_dev() is supposed to be called from device init() call, so the
111*ca987d46SWarner Losh  * assumption is, devsw dv_init is called for plain devices first, and
112*ca987d46SWarner Losh  * for zfs, last.
113*ca987d46SWarner Losh  */
114*ca987d46SWarner Losh void
115*ca987d46SWarner Losh bcache_add_dev(int devices)
116*ca987d46SWarner Losh {
117*ca987d46SWarner Losh     bcache_numdev += devices;
118*ca987d46SWarner Losh }
119*ca987d46SWarner Losh 
120*ca987d46SWarner Losh void *
121*ca987d46SWarner Losh bcache_allocate(void)
122*ca987d46SWarner Losh {
123*ca987d46SWarner Losh     u_int i;
124*ca987d46SWarner Losh     struct bcache *bc = malloc(sizeof (struct bcache));
125*ca987d46SWarner Losh     int disks = bcache_numdev;
126*ca987d46SWarner Losh     uint32_t *marker;
127*ca987d46SWarner Losh 
128*ca987d46SWarner Losh     if (disks == 0)
129*ca987d46SWarner Losh 	disks = 1;	/* safe guard */
130*ca987d46SWarner Losh 
131*ca987d46SWarner Losh     if (bc == NULL) {
132*ca987d46SWarner Losh 	errno = ENOMEM;
133*ca987d46SWarner Losh 	return (bc);
134*ca987d46SWarner Losh     }
135*ca987d46SWarner Losh 
136*ca987d46SWarner Losh     /*
137*ca987d46SWarner Losh      * the bcache block count must be power of 2 for hash function
138*ca987d46SWarner Losh      */
139*ca987d46SWarner Losh     i = fls(disks) - 1;		/* highbit - 1 */
140*ca987d46SWarner Losh     if (disks > (1 << i))	/* next power of 2 */
141*ca987d46SWarner Losh 	i++;
142*ca987d46SWarner Losh 
143*ca987d46SWarner Losh     bc->bcache_nblks = bcache_total_nblks >> i;
144*ca987d46SWarner Losh     bcache_unit_nblks = bc->bcache_nblks;
145*ca987d46SWarner Losh     bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
146*ca987d46SWarner Losh 	sizeof(uint32_t));
147*ca987d46SWarner Losh     if (bc->bcache_data == NULL) {
148*ca987d46SWarner Losh 	/* dont error out yet. fall back to 32 blocks and try again */
149*ca987d46SWarner Losh 	bc->bcache_nblks = 32;
150*ca987d46SWarner Losh 	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
151*ca987d46SWarner Losh 	sizeof(uint32_t));
152*ca987d46SWarner Losh     }
153*ca987d46SWarner Losh 
154*ca987d46SWarner Losh     bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));
155*ca987d46SWarner Losh 
156*ca987d46SWarner Losh     if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
157*ca987d46SWarner Losh 	bcache_free_instance(bc);
158*ca987d46SWarner Losh 	errno = ENOMEM;
159*ca987d46SWarner Losh 	return (NULL);
160*ca987d46SWarner Losh     }
161*ca987d46SWarner Losh     /* Insert cache end marker. */
162*ca987d46SWarner Losh     marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize);
163*ca987d46SWarner Losh     *marker = BCACHE_MARKER;
164*ca987d46SWarner Losh 
165*ca987d46SWarner Losh     /* Flush the cache */
166*ca987d46SWarner Losh     for (i = 0; i < bc->bcache_nblks; i++) {
167*ca987d46SWarner Losh 	bc->bcache_ctl[i].bc_count = -1;
168*ca987d46SWarner Losh 	bc->bcache_ctl[i].bc_blkno = -1;
169*ca987d46SWarner Losh     }
170*ca987d46SWarner Losh     bcache_units++;
171*ca987d46SWarner Losh     bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
172*ca987d46SWarner Losh     return (bc);
173*ca987d46SWarner Losh }
174*ca987d46SWarner Losh 
175*ca987d46SWarner Losh void
176*ca987d46SWarner Losh bcache_free(void *cache)
177*ca987d46SWarner Losh {
178*ca987d46SWarner Losh     struct bcache *bc = cache;
179*ca987d46SWarner Losh 
180*ca987d46SWarner Losh     if (bc == NULL)
181*ca987d46SWarner Losh 	return;
182*ca987d46SWarner Losh 
183*ca987d46SWarner Losh     bcache_free_instance(bc);
184*ca987d46SWarner Losh     bcache_units--;
185*ca987d46SWarner Losh }
186*ca987d46SWarner Losh 
187*ca987d46SWarner Losh /*
188*ca987d46SWarner Losh  * Handle a write request; write directly to the disk, and populate the
189*ca987d46SWarner Losh  * cache with the new values.
190*ca987d46SWarner Losh  */
191*ca987d46SWarner Losh static int
192*ca987d46SWarner Losh write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
193*ca987d46SWarner Losh     char *buf, size_t *rsize)
194*ca987d46SWarner Losh {
195*ca987d46SWarner Losh     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
196*ca987d46SWarner Losh     struct bcache		*bc = dd->dv_cache;
197*ca987d46SWarner Losh     daddr_t			i, nblk;
198*ca987d46SWarner Losh 
199*ca987d46SWarner Losh     nblk = size / bcache_blksize;
200*ca987d46SWarner Losh 
201*ca987d46SWarner Losh     /* Invalidate the blocks being written */
202*ca987d46SWarner Losh     for (i = 0; i < nblk; i++) {
203*ca987d46SWarner Losh 	bcache_invalidate(bc, blk + i);
204*ca987d46SWarner Losh     }
205*ca987d46SWarner Losh 
206*ca987d46SWarner Losh     /* Write the blocks */
207*ca987d46SWarner Losh     return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
208*ca987d46SWarner Losh }
209*ca987d46SWarner Losh 
210*ca987d46SWarner Losh /*
211*ca987d46SWarner Losh  * Handle a read request; fill in parts of the request that can
212*ca987d46SWarner Losh  * be satisfied by the cache, use the supplied strategy routine to do
213*ca987d46SWarner Losh  * device I/O and then use the I/O results to populate the cache.
214*ca987d46SWarner Losh  */
215*ca987d46SWarner Losh static int
216*ca987d46SWarner Losh read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
217*ca987d46SWarner Losh     char *buf, size_t *rsize)
218*ca987d46SWarner Losh {
219*ca987d46SWarner Losh     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
220*ca987d46SWarner Losh     struct bcache		*bc = dd->dv_cache;
221*ca987d46SWarner Losh     size_t			i, nblk, p_size, r_size, complete, ra;
222*ca987d46SWarner Losh     int				result;
223*ca987d46SWarner Losh     daddr_t			p_blk;
224*ca987d46SWarner Losh     caddr_t			p_buf;
225*ca987d46SWarner Losh     uint32_t			*marker;
226*ca987d46SWarner Losh 
227*ca987d46SWarner Losh     if (bc == NULL) {
228*ca987d46SWarner Losh 	errno = ENODEV;
229*ca987d46SWarner Losh 	return (-1);
230*ca987d46SWarner Losh     }
231*ca987d46SWarner Losh 
232*ca987d46SWarner Losh     marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize);
233*ca987d46SWarner Losh 
234*ca987d46SWarner Losh     if (rsize != NULL)
235*ca987d46SWarner Losh 	*rsize = 0;
236*ca987d46SWarner Losh 
237*ca987d46SWarner Losh     nblk = size / bcache_blksize;
238*ca987d46SWarner Losh     if (nblk == 0 && size != 0)
239*ca987d46SWarner Losh 	nblk++;
240*ca987d46SWarner Losh     result = 0;
241*ca987d46SWarner Losh     complete = 1;
242*ca987d46SWarner Losh 
243*ca987d46SWarner Losh     /* Satisfy any cache hits up front, break on first miss */
244*ca987d46SWarner Losh     for (i = 0; i < nblk; i++) {
245*ca987d46SWarner Losh 	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
246*ca987d46SWarner Losh 	    bcache_misses += (nblk - i);
247*ca987d46SWarner Losh 	    complete = 0;
248*ca987d46SWarner Losh 	    if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD)
249*ca987d46SWarner Losh 		bc->ra >>= 1;	/* reduce read ahead */
250*ca987d46SWarner Losh 	    break;
251*ca987d46SWarner Losh 	} else {
252*ca987d46SWarner Losh 	    bcache_hits++;
253*ca987d46SWarner Losh 	}
254*ca987d46SWarner Losh     }
255*ca987d46SWarner Losh 
256*ca987d46SWarner Losh    if (complete) {	/* whole set was in cache, return it */
257*ca987d46SWarner Losh 	if (bc->ra < BCACHE_READAHEAD)
258*ca987d46SWarner Losh 		bc->ra <<= 1;	/* increase read ahead */
259*ca987d46SWarner Losh 	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
260*ca987d46SWarner Losh 	goto done;
261*ca987d46SWarner Losh    }
262*ca987d46SWarner Losh 
263*ca987d46SWarner Losh     /*
264*ca987d46SWarner Losh      * Fill in any misses. From check we have i pointing to first missing
265*ca987d46SWarner Losh      * block, read in all remaining blocks + readahead.
266*ca987d46SWarner Losh      * We have space at least for nblk - i before bcache wraps.
267*ca987d46SWarner Losh      */
268*ca987d46SWarner Losh     p_blk = blk + i;
269*ca987d46SWarner Losh     p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
270*ca987d46SWarner Losh     r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */
271*ca987d46SWarner Losh 
272*ca987d46SWarner Losh     p_size = MIN(r_size, nblk - i);	/* read at least those blocks */
273*ca987d46SWarner Losh 
274*ca987d46SWarner Losh     /*
275*ca987d46SWarner Losh      * The read ahead size setup.
276*ca987d46SWarner Losh      * While the read ahead can save us IO, it also can complicate things:
277*ca987d46SWarner Losh      * 1. We do not want to read ahead by wrapping around the
278*ca987d46SWarner Losh      * bcache end - this would complicate the cache management.
279*ca987d46SWarner Losh      * 2. We are using bc->ra as dynamic hint for read ahead size,
280*ca987d46SWarner Losh      * detected cache hits will increase the read-ahead block count, and
281*ca987d46SWarner Losh      * misses will decrease, see the code above.
282*ca987d46SWarner Losh      * 3. The bcache is sized by 512B blocks, however, the underlying device
283*ca987d46SWarner Losh      * may have a larger sector size, and we should perform the IO by
284*ca987d46SWarner Losh      * taking into account these larger sector sizes. We could solve this by
285*ca987d46SWarner Losh      * passing the sector size to bcache_allocate(), or by using ioctl(), but
286*ca987d46SWarner Losh      * in this version we are using the constant, 16 blocks, and are rounding
287*ca987d46SWarner Losh      * read ahead block count down to multiple of 16.
288*ca987d46SWarner Losh      * Using the constant has two reasons, we are not entirely sure if the
289*ca987d46SWarner Losh      * BIOS disk interface is providing the correct value for sector size.
290*ca987d46SWarner Losh      * And secondly, this way we get the most conservative setup for the ra.
291*ca987d46SWarner Losh      *
292*ca987d46SWarner Losh      * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however,
293*ca987d46SWarner Losh      * we want to cover CDs (2K) and 4K disks.
294*ca987d46SWarner Losh      * bcache_allocate() will always fall back to a minimum of 32 blocks.
295*ca987d46SWarner Losh      * Our choice of 16 read ahead blocks will always fit inside the bcache.
296*ca987d46SWarner Losh      */
297*ca987d46SWarner Losh 
298*ca987d46SWarner Losh     if ((rw & F_NORA) == F_NORA)
299*ca987d46SWarner Losh 	ra = 0;
300*ca987d46SWarner Losh     else
301*ca987d46SWarner Losh 	ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);
302*ca987d46SWarner Losh 
303*ca987d46SWarner Losh     if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
304*ca987d46SWarner Losh 	ra = MIN(bc->ra, ra - 1);
305*ca987d46SWarner Losh 	ra = rounddown(ra, 16);		/* multiple of 16 blocks */
306*ca987d46SWarner Losh 	p_size += ra;
307*ca987d46SWarner Losh     }
308*ca987d46SWarner Losh 
309*ca987d46SWarner Losh     /* invalidate bcache */
310*ca987d46SWarner Losh     for (i = 0; i < p_size; i++) {
311*ca987d46SWarner Losh 	bcache_invalidate(bc, p_blk + i);
312*ca987d46SWarner Losh     }
313*ca987d46SWarner Losh 
314*ca987d46SWarner Losh     r_size = 0;
315*ca987d46SWarner Losh     /*
316*ca987d46SWarner Losh      * with read-ahead, it may happen we are attempting to read past
317*ca987d46SWarner Losh      * disk end, as bcache has no information about disk size.
318*ca987d46SWarner Losh      * in such case we should get partial read if some blocks can be
319*ca987d46SWarner Losh      * read or error, if no blocks can be read.
320*ca987d46SWarner Losh      * in either case we should return the data in bcache and only
321*ca987d46SWarner Losh      * return error if there is no data.
322*ca987d46SWarner Losh      */
323*ca987d46SWarner Losh     rw &= F_MASK;
324*ca987d46SWarner Losh     result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
325*ca987d46SWarner Losh 	p_size * bcache_blksize, p_buf, &r_size);
326*ca987d46SWarner Losh 
327*ca987d46SWarner Losh     r_size /= bcache_blksize;
328*ca987d46SWarner Losh     for (i = 0; i < r_size; i++)
329*ca987d46SWarner Losh 	bcache_insert(bc, p_blk + i);
330*ca987d46SWarner Losh 
331*ca987d46SWarner Losh     /* update ra statistics */
332*ca987d46SWarner Losh     if (r_size != 0) {
333*ca987d46SWarner Losh 	if (r_size < p_size)
334*ca987d46SWarner Losh 	    bcache_rablks += (p_size - r_size);
335*ca987d46SWarner Losh 	else
336*ca987d46SWarner Losh 	    bcache_rablks += ra;
337*ca987d46SWarner Losh     }
338*ca987d46SWarner Losh 
339*ca987d46SWarner Losh     /* check how much data can we copy */
340*ca987d46SWarner Losh     for (i = 0; i < nblk; i++) {
341*ca987d46SWarner Losh 	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
342*ca987d46SWarner Losh 	    break;
343*ca987d46SWarner Losh     }
344*ca987d46SWarner Losh 
345*ca987d46SWarner Losh     if (size > i * bcache_blksize)
346*ca987d46SWarner Losh 	size = i * bcache_blksize;
347*ca987d46SWarner Losh 
348*ca987d46SWarner Losh     if (size != 0) {
349*ca987d46SWarner Losh 	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
350*ca987d46SWarner Losh 	result = 0;
351*ca987d46SWarner Losh     }
352*ca987d46SWarner Losh 
353*ca987d46SWarner Losh     if (*marker != BCACHE_MARKER) {
354*ca987d46SWarner Losh 	printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, "
355*ca987d46SWarner Losh 	    "p_size: %zu, ra: %zu\n", bc->bcache_nblks,
356*ca987d46SWarner Losh 	    (long unsigned)BHASH(bc, p_blk), p_size, ra);
357*ca987d46SWarner Losh     }
358*ca987d46SWarner Losh 
359*ca987d46SWarner Losh  done:
360*ca987d46SWarner Losh     if ((result == 0) && (rsize != NULL))
361*ca987d46SWarner Losh 	*rsize = size;
362*ca987d46SWarner Losh     return(result);
363*ca987d46SWarner Losh }
364*ca987d46SWarner Losh 
365*ca987d46SWarner Losh /*
366*ca987d46SWarner Losh  * Requests larger than 1/2 cache size will be bypassed and go
367*ca987d46SWarner Losh  * directly to the disk.  XXX tune this.
368*ca987d46SWarner Losh  */
369*ca987d46SWarner Losh int
370*ca987d46SWarner Losh bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
371*ca987d46SWarner Losh     char *buf, size_t *rsize)
372*ca987d46SWarner Losh {
373*ca987d46SWarner Losh     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
374*ca987d46SWarner Losh     struct bcache		*bc = dd->dv_cache;
375*ca987d46SWarner Losh     u_int bcache_nblks = 0;
376*ca987d46SWarner Losh     int nblk, cblk, ret;
377*ca987d46SWarner Losh     size_t csize, isize, total;
378*ca987d46SWarner Losh 
379*ca987d46SWarner Losh     bcache_ops++;
380*ca987d46SWarner Losh 
381*ca987d46SWarner Losh     if (bc != NULL)
382*ca987d46SWarner Losh 	bcache_nblks = bc->bcache_nblks;
383*ca987d46SWarner Losh 
384*ca987d46SWarner Losh     /* bypass large requests, or when the cache is inactive */
385*ca987d46SWarner Losh     if (bc == NULL ||
386*ca987d46SWarner Losh 	((size * 2 / bcache_blksize) > bcache_nblks)) {
387*ca987d46SWarner Losh 	DEBUG("bypass %zu from %qu", size / bcache_blksize, blk);
388*ca987d46SWarner Losh 	bcache_bypasses++;
389*ca987d46SWarner Losh 	rw &= F_MASK;
390*ca987d46SWarner Losh 	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
391*ca987d46SWarner Losh     }
392*ca987d46SWarner Losh 
393*ca987d46SWarner Losh     switch (rw & F_MASK) {
394*ca987d46SWarner Losh     case F_READ:
395*ca987d46SWarner Losh 	nblk = size / bcache_blksize;
396*ca987d46SWarner Losh 	if (size != 0 && nblk == 0)
397*ca987d46SWarner Losh 	    nblk++;	/* read at least one block */
398*ca987d46SWarner Losh 
399*ca987d46SWarner Losh 	ret = 0;
400*ca987d46SWarner Losh 	total = 0;
401*ca987d46SWarner Losh 	while(size) {
402*ca987d46SWarner Losh 	    cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */
403*ca987d46SWarner Losh 	    cblk = MIN(cblk, nblk);
404*ca987d46SWarner Losh 
405*ca987d46SWarner Losh 	    if (size <= bcache_blksize)
406*ca987d46SWarner Losh 		csize = size;
407*ca987d46SWarner Losh 	    else
408*ca987d46SWarner Losh 		csize = cblk * bcache_blksize;
409*ca987d46SWarner Losh 
410*ca987d46SWarner Losh 	    ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize);
411*ca987d46SWarner Losh 
412*ca987d46SWarner Losh 	    /*
413*ca987d46SWarner Losh 	     * we may have error from read ahead, if we have read some data
414*ca987d46SWarner Losh 	     * return partial read.
415*ca987d46SWarner Losh 	     */
416*ca987d46SWarner Losh 	    if (ret != 0 || isize == 0) {
417*ca987d46SWarner Losh 		if (total != 0)
418*ca987d46SWarner Losh 		    ret = 0;
419*ca987d46SWarner Losh 		break;
420*ca987d46SWarner Losh 	    }
421*ca987d46SWarner Losh 	    blk += isize / bcache_blksize;
422*ca987d46SWarner Losh 	    total += isize;
423*ca987d46SWarner Losh 	    size -= isize;
424*ca987d46SWarner Losh 	    nblk = size / bcache_blksize;
425*ca987d46SWarner Losh 	}
426*ca987d46SWarner Losh 
427*ca987d46SWarner Losh 	if (rsize)
428*ca987d46SWarner Losh 	    *rsize = total;
429*ca987d46SWarner Losh 
430*ca987d46SWarner Losh 	return (ret);
431*ca987d46SWarner Losh     case F_WRITE:
432*ca987d46SWarner Losh 	return write_strategy(devdata, F_WRITE, blk, size, buf, rsize);
433*ca987d46SWarner Losh     }
434*ca987d46SWarner Losh     return -1;
435*ca987d46SWarner Losh }
436*ca987d46SWarner Losh 
437*ca987d46SWarner Losh /*
438*ca987d46SWarner Losh  * Free allocated bcache instance
439*ca987d46SWarner Losh  */
440*ca987d46SWarner Losh static void
441*ca987d46SWarner Losh bcache_free_instance(struct bcache *bc)
442*ca987d46SWarner Losh {
443*ca987d46SWarner Losh     if (bc != NULL) {
444*ca987d46SWarner Losh 	if (bc->bcache_ctl)
445*ca987d46SWarner Losh 	    free(bc->bcache_ctl);
446*ca987d46SWarner Losh 	if (bc->bcache_data)
447*ca987d46SWarner Losh 	    free(bc->bcache_data);
448*ca987d46SWarner Losh 	free(bc);
449*ca987d46SWarner Losh     }
450*ca987d46SWarner Losh }
451*ca987d46SWarner Losh 
452*ca987d46SWarner Losh /*
453*ca987d46SWarner Losh  * Insert a block into the cache.
454*ca987d46SWarner Losh  */
455*ca987d46SWarner Losh static void
456*ca987d46SWarner Losh bcache_insert(struct bcache *bc, daddr_t blkno)
457*ca987d46SWarner Losh {
458*ca987d46SWarner Losh     u_int	cand;
459*ca987d46SWarner Losh 
460*ca987d46SWarner Losh     cand = BHASH(bc, blkno);
461*ca987d46SWarner Losh 
462*ca987d46SWarner Losh     DEBUG("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount);
463*ca987d46SWarner Losh     bc->bcache_ctl[cand].bc_blkno = blkno;
464*ca987d46SWarner Losh     bc->bcache_ctl[cand].bc_count = bcache_bcount++;
465*ca987d46SWarner Losh }
466*ca987d46SWarner Losh 
467*ca987d46SWarner Losh /*
468*ca987d46SWarner Losh  * Invalidate a block from the cache.
469*ca987d46SWarner Losh  */
470*ca987d46SWarner Losh static void
471*ca987d46SWarner Losh bcache_invalidate(struct bcache *bc, daddr_t blkno)
472*ca987d46SWarner Losh {
473*ca987d46SWarner Losh     u_int	i;
474*ca987d46SWarner Losh 
475*ca987d46SWarner Losh     i = BHASH(bc, blkno);
476*ca987d46SWarner Losh     if (bc->bcache_ctl[i].bc_blkno == blkno) {
477*ca987d46SWarner Losh 	bc->bcache_ctl[i].bc_count = -1;
478*ca987d46SWarner Losh 	bc->bcache_ctl[i].bc_blkno = -1;
479*ca987d46SWarner Losh 	DEBUG("invalidate blk %llu", blkno);
480*ca987d46SWarner Losh     }
481*ca987d46SWarner Losh }
482*ca987d46SWarner Losh 
483*ca987d46SWarner Losh #ifndef BOOT2
484*ca987d46SWarner Losh COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);
485*ca987d46SWarner Losh 
486*ca987d46SWarner Losh static int
487*ca987d46SWarner Losh command_bcache(int argc, char *argv[])
488*ca987d46SWarner Losh {
489*ca987d46SWarner Losh     if (argc != 1) {
490*ca987d46SWarner Losh 	command_errmsg = "wrong number of arguments";
491*ca987d46SWarner Losh 	return(CMD_ERROR);
492*ca987d46SWarner Losh     }
493*ca987d46SWarner Losh 
494*ca987d46SWarner Losh     printf("\ncache blocks: %d\n", bcache_total_nblks);
495*ca987d46SWarner Losh     printf("cache blocksz: %d\n", bcache_blksize);
496*ca987d46SWarner Losh     printf("cache readahead: %d\n", bcache_rablks);
497*ca987d46SWarner Losh     printf("unit cache blocks: %d\n", bcache_unit_nblks);
498*ca987d46SWarner Losh     printf("cached units: %d\n", bcache_units);
499*ca987d46SWarner Losh     printf("%d ops  %d bypasses  %d hits  %d misses\n", bcache_ops,
500*ca987d46SWarner Losh 	bcache_bypasses, bcache_hits, bcache_misses);
501*ca987d46SWarner Losh     return(CMD_OK);
502*ca987d46SWarner Losh }
503*ca987d46SWarner Losh #endif
504