xref: /freebsd/stand/common/bcache.c (revision 54c1a65736ec012b583ade1d53c477e182c574e4)
1 /*-
2  * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
3  * Copyright 2015 Toomas Soome <tsoome@me.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include <sys/param.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  * Simple hashed block cache
34  */
35 
36 #include <sys/stdint.h>
37 
38 #include <stand.h>
39 #include <string.h>
40 #include <strings.h>
41 
42 #include "bootstrap.h"
43 
44 /* #define BCACHE_DEBUG */
45 
46 #ifdef BCACHE_DEBUG
47 # define DPRINTF(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
48 #else
49 # define DPRINTF(fmt, args...)	((void)0)
50 #endif
51 
52 struct bcachectl
53 {
54     daddr_t	bc_blkno;
55     int		bc_count;
56 };
57 
58 /*
59  * bcache per device node. cache is allocated on device first open and freed
60  * on last close, to save memory. The issue there is the size; biosdisk
61  * supports up to 31 (0x1f) devices. Classic setup would use single disk
62  * to boot from, but this has changed with zfs.
63  */
64 struct bcache {
65     struct bcachectl	*bcache_ctl;
66     caddr_t		bcache_data;
67     size_t		bcache_nblks;
68     size_t		ra;
69     daddr_t		bcache_nextblkno;
70     size_t		ralen;
71 };
72 
73 static u_int bcache_total_nblks;	/* set by bcache_init */
74 static u_int bcache_blksize;		/* set by bcache_init */
75 static u_int bcache_numdev;		/* set by bcache_add_dev */
76 /* statistics */
77 static u_int bcache_units;	/* number of devices with cache */
78 static u_int bcache_unit_nblks;	/* nblocks per unit */
79 static u_int bcache_hits;
80 static u_int bcache_misses;
81 static u_int bcache_ops;
82 static u_int bcache_bypasses;
83 static u_int bcache_bcount;
84 static u_int bcache_rablks;
85 
86 #define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
87 #define	BCACHE_LOOKUP(bc, blkno)	\
88 	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
89 #define	BCACHE_READAHEAD	512
90 #define	BCACHE_MINREADAHEAD	32
91 #define	BCACHE_MAXIOWRA		512
92 
93 static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
94 static void	bcache_insert(struct bcache *bc, daddr_t blkno);
95 static void	bcache_free_instance(struct bcache *bc);
96 
97 /*
98  * Initialise the cache for (nblks) of (bsize).
99  */
100 void
101 bcache_init(size_t nblks, size_t bsize)
102 {
103     /* set up control data */
104     bcache_total_nblks = nblks;
105     bcache_blksize = bsize;
106 }
107 
108 /*
109  * add number of devices to bcache. we have to divide cache space
110  * between the devices, so bcache_add_dev() can be used to set up the
111  * number. The issue is, we need to get the number before actual allocations.
112  * bcache_add_dev() is supposed to be called from device init() call, so the
113  * assumption is, devsw dv_init is called for plain devices first, and
114  * for zfs, last.
115  */
116 void
117 bcache_add_dev(int devices)
118 {
119     bcache_numdev += devices;
120 }
121 
122 void *
123 bcache_allocate(void)
124 {
125     u_int i;
126     struct bcache *bc = malloc(sizeof (struct bcache));
127     int disks = bcache_numdev;
128 
129     if (disks == 0)
130 	disks = 1;	/* safe guard */
131 
132     if (bc == NULL) {
133 	errno = ENOMEM;
134 	return (bc);
135     }
136 
137     /*
138      * the bcache block count must be power of 2 for hash function
139      */
140     i = fls(disks) - 1;		/* highbit - 1 */
141     if (disks > (1 << i))	/* next power of 2 */
142 	i++;
143 
144     bc->bcache_nblks = bcache_total_nblks >> i;
145     bcache_unit_nblks = bc->bcache_nblks;
146     bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
147     if (bc->bcache_data == NULL) {
148 	/* dont error out yet. fall back to 32 blocks and try again */
149 	bc->bcache_nblks = 32;
150 	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
151 	sizeof(uint32_t));
152     }
153 
154     bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));
155 
156     if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
157 	bcache_free_instance(bc);
158 	errno = ENOMEM;
159 	return (NULL);
160     }
161 
162     /* Flush the cache */
163     for (i = 0; i < bc->bcache_nblks; i++) {
164 	bc->bcache_ctl[i].bc_count = -1;
165 	bc->bcache_ctl[i].bc_blkno = -1;
166     }
167     bcache_units++;
168     bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
169     bc->bcache_nextblkno = -1;
170     return (bc);
171 }
172 
173 void
174 bcache_free(void *cache)
175 {
176     struct bcache *bc = cache;
177 
178     if (bc == NULL)
179 	return;
180 
181     bcache_free_instance(bc);
182     bcache_units--;
183 }
184 
185 /*
186  * Handle a write request; write directly to the disk, and populate the
187  * cache with the new values.
188  */
189 static int
190 write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
191     char *buf, size_t *rsize)
192 {
193     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
194     struct bcache		*bc = dd->dv_cache;
195     daddr_t			i, nblk;
196 
197     nblk = size / bcache_blksize;
198 
199     /* Invalidate the blocks being written */
200     for (i = 0; i < nblk; i++) {
201 	bcache_invalidate(bc, blk + i);
202     }
203 
204     /* Write the blocks */
205     return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
206 }
207 
208 /*
209  * Handle a read request; fill in parts of the request that can
210  * be satisfied by the cache, use the supplied strategy routine to do
211  * device I/O and then use the I/O results to populate the cache.
212  */
213 static int
214 read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
215     char *buf, size_t *rsize)
216 {
217     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
218     struct bcache		*bc = dd->dv_cache;
219     size_t			i, nblk, p_size, r_size, complete, ra;
220     int				result;
221     daddr_t			p_blk;
222     caddr_t			p_buf;
223 
224     if (bc == NULL) {
225 	errno = ENODEV;
226 	return (-1);
227     }
228 
229     if (rsize != NULL)
230 	*rsize = 0;
231 
232     nblk = size / bcache_blksize;
233     if (nblk == 0 && size != 0)
234 	nblk++;
235     result = 0;
236     complete = 1;
237 
238     /* Satisfy any cache hits up front, break on first miss */
239     for (i = 0; i < nblk; i++) {
240 	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
241 	    bcache_misses += (nblk - i);
242 	    complete = 0;
243 	    break;
244 	} else {
245 	    bcache_hits++;
246 	}
247     }
248 
249     /*
250      * Adjust read-ahead size if appropriate.  Subject to the requirement
251      * that bc->ra must stay in between MINREADAHEAD and READAHEAD, we
252      * increase it when we notice that readahead was useful and decrease
253      * it when we notice that readahead was not useful.
254      */
255     if (complete || (i == bc->ralen && bc->ralen > 0)) {
256 	if (bc->ra < BCACHE_READAHEAD)
257 	    bc->ra <<= 1;	/* increase read ahead */
258     } else {
259 	if (nblk - i > BCACHE_MINREADAHEAD && bc->ralen > 0 &&
260 	  bc->ra > BCACHE_MINREADAHEAD)
261 	    bc->ra >>= 1;	/* reduce read ahead */
262     }
263 
264     /* Adjust our "unconsumed readahead" value. */
265     if (blk == bc->bcache_nextblkno) {
266 	if (nblk > bc->ralen)
267 	    bc->ralen = 0;
268 	else
269 	    bc->ralen -= nblk;
270     }
271 
272     if (complete) {	/* whole set was in cache, return it */
273 	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
274 	goto done;
275     }
276 
277     /*
278      * Fill in any misses. From check we have i pointing to first missing
279      * block, read in all remaining blocks + readahead.
280      * We have space at least for nblk - i before bcache wraps.
281      */
282     p_blk = blk + i;
283     p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
284     r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */
285 
286     p_size = MIN(r_size, nblk - i);	/* read at least those blocks */
287 
288     /*
289      * The read ahead size setup.
290      * While the read ahead can save us IO, it also can complicate things:
291      * 1. We do not want to read ahead by wrapping around the
292      * bcache end - this would complicate the cache management.
293      * 2. We are using bc->ra as dynamic hint for read ahead size,
294      * detected cache hits will increase the read-ahead block count, and
295      * misses will decrease, see the code above.
296      * 3. The bcache is sized by 512B blocks, however, the underlying device
297      * may have a larger sector size, and we should perform the IO by
298      * taking into account these larger sector sizes. We could solve this by
299      * passing the sector size to bcache_allocate(), or by using ioctl(), but
300      * in this version we are using the constant, 16 blocks, and are rounding
301      * read ahead block count down to multiple of 16.
302      * Using the constant has two reasons, we are not entirely sure if the
303      * BIOS disk interface is providing the correct value for sector size.
304      * And secondly, this way we get the most conservative setup for the ra.
305      *
306      * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however,
307      * we want to cover CDs (2K) and 4K disks.
308      * bcache_allocate() will always fall back to a minimum of 32 blocks.
309      * Our choice of 16 read ahead blocks will always fit inside the bcache.
310      */
311 
312     if ((rw & F_NORA) == F_NORA)
313 	ra = 0;
314     else
315 	ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);
316 
317     /*
318      * Only trigger read-ahead if we detect two blocks being read
319      * sequentially.
320      */
321     if ((bc->bcache_nextblkno != blk) && ra != 0) {
322         ra = 0;
323     }
324 
325     if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
326 	ra = MIN(bc->ra, ra - 1);
327 	ra = rounddown(ra, 16);		/* multiple of 16 blocks */
328 	if (ra + p_size > BCACHE_MAXIOWRA)
329 	    ra = BCACHE_MAXIOWRA - p_size;
330 	bc->ralen = ra;
331 	p_size += ra;
332     } else {
333 	bc->ralen = 0;
334     }
335 
336     /* invalidate bcache */
337     for (i = 0; i < p_size; i++) {
338 	bcache_invalidate(bc, p_blk + i);
339     }
340 
341     r_size = 0;
342     /*
343      * with read-ahead, it may happen we are attempting to read past
344      * disk end, as bcache has no information about disk size.
345      * in such case we should get partial read if some blocks can be
346      * read or error, if no blocks can be read.
347      * in either case we should return the data in bcache and only
348      * return error if there is no data.
349      */
350     rw &= F_MASK;
351     result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
352 	p_size * bcache_blksize, p_buf, &r_size);
353 
354     r_size /= bcache_blksize;
355     for (i = 0; i < r_size; i++)
356 	bcache_insert(bc, p_blk + i);
357 
358     /* update ra statistics */
359     if (r_size != 0) {
360 	if (r_size < p_size)
361 	    bcache_rablks += (p_size - r_size);
362 	else
363 	    bcache_rablks += ra;
364     }
365 
366     /* check how much data can we copy */
367     for (i = 0; i < nblk; i++) {
368 	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
369 	    break;
370     }
371 
372     if (size > i * bcache_blksize)
373 	size = i * bcache_blksize;
374 
375     if (size != 0) {
376 	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
377 	result = 0;
378     }
379 
380 done:
381     if (result == 0) {
382         if (rsize != NULL)
383 	    *rsize = size;
384         bc->bcache_nextblkno = blk + (size / DEV_BSIZE);
385     }
386     return(result);
387 }
388 
389 /*
390  * Requests larger than 1/2 cache size will be bypassed and go
391  * directly to the disk.  XXX tune this.
392  */
393 int
394 bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
395     char *buf, size_t *rsize)
396 {
397     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
398     struct bcache		*bc = dd->dv_cache;
399     u_int bcache_nblks = 0;
400     int nblk, cblk, ret;
401     size_t csize, isize, total;
402 
403     bcache_ops++;
404 
405     if (bc != NULL)
406 	bcache_nblks = bc->bcache_nblks;
407 
408     /* bypass large requests, or when the cache is inactive */
409     if (bc == NULL ||
410 	((size * 2 / bcache_blksize) > bcache_nblks)) {
411 	DPRINTF("bypass %zu from %qu", size / bcache_blksize, blk);
412 	bcache_bypasses++;
413 	rw &= F_MASK;
414 	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
415     }
416 
417     switch (rw & F_MASK) {
418     case F_READ:
419 	nblk = size / bcache_blksize;
420 	if (size != 0 && nblk == 0)
421 	    nblk++;	/* read at least one block */
422 
423 	ret = 0;
424 	total = 0;
425 	while(size) {
426 	    cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */
427 	    cblk = MIN(cblk, nblk);
428 
429 	    if (size <= bcache_blksize)
430 		csize = size;
431 	    else
432 		csize = cblk * bcache_blksize;
433 
434 	    ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize);
435 
436 	    /*
437 	     * we may have error from read ahead, if we have read some data
438 	     * return partial read.
439 	     */
440 	    if (ret != 0 || isize == 0) {
441 		if (total != 0)
442 		    ret = 0;
443 		break;
444 	    }
445 	    blk += isize / bcache_blksize;
446 	    total += isize;
447 	    size -= isize;
448 	    nblk = size / bcache_blksize;
449 	}
450 
451 	if (rsize)
452 	    *rsize = total;
453 
454 	return (ret);
455     case F_WRITE:
456 	return write_strategy(devdata, F_WRITE, blk, size, buf, rsize);
457     }
458     return -1;
459 }
460 
461 /*
462  * Free allocated bcache instance
463  */
464 static void
465 bcache_free_instance(struct bcache *bc)
466 {
467     if (bc != NULL) {
468 	free(bc->bcache_ctl);
469 	free(bc->bcache_data);
470 	free(bc);
471     }
472 }
473 
474 /*
475  * Insert a block into the cache.
476  */
477 static void
478 bcache_insert(struct bcache *bc, daddr_t blkno)
479 {
480     u_int	cand;
481 
482     cand = BHASH(bc, blkno);
483 
484     DPRINTF("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount);
485     bc->bcache_ctl[cand].bc_blkno = blkno;
486     bc->bcache_ctl[cand].bc_count = bcache_bcount++;
487 }
488 
489 /*
490  * Invalidate a block from the cache.
491  */
492 static void
493 bcache_invalidate(struct bcache *bc, daddr_t blkno)
494 {
495     u_int	i;
496 
497     i = BHASH(bc, blkno);
498     if (bc->bcache_ctl[i].bc_blkno == blkno) {
499 	bc->bcache_ctl[i].bc_count = -1;
500 	bc->bcache_ctl[i].bc_blkno = -1;
501 	DPRINTF("invalidate blk %llu", blkno);
502     }
503 }
504 
505 #ifndef BOOT2
506 COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);
507 
508 static int
509 command_bcache(int argc, char *argv[] __unused)
510 {
511     if (argc != 1) {
512 	command_errmsg = "wrong number of arguments";
513 	return(CMD_ERROR);
514     }
515 
516     printf("\ncache blocks: %u\n", bcache_total_nblks);
517     printf("cache blocksz: %u\n", bcache_blksize);
518     printf("cache readahead: %u\n", bcache_rablks);
519     printf("unit cache blocks: %u\n", bcache_unit_nblks);
520     printf("cached units: %u\n", bcache_units);
521     printf("%u ops %d bypasses %u hits %u misses\n", bcache_ops,
522 	bcache_bypasses, bcache_hits, bcache_misses);
523     return(CMD_OK);
524 }
525 #endif
526