/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>

/*
 * Simple hashed block cache
 */
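
/*
 * A rough usage sketch: a disk driver routes its I/O through the cache
 * by filling in a struct bcache_devdata (declared in bootstrap.h) and
 * calling bcache_strategy() instead of its raw strategy routine.  The
 * mydev_* names below are placeholders, not real drivers:
 *
 *	struct bcache_devdata dd;
 *
 *	dd.dv_strategy = mydev_strategy;	// raw device strategy
 *	dd.dv_devdata = mydev_softc;		// passed back to it
 *	dd.dv_cache = bcache_allocate();	// done on first open
 *	...
 *	bcache_strategy(&dd, F_READ, blkno, size, buf, &rsize);
 *	...
 *	bcache_free(dd.dv_cache);		// done on last close
 */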

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
# define DEBUG(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
#else
# define DEBUG(fmt, args...)
#endif

struct bcachectl
{
	daddr_t	bc_blkno;	/* block number held in this slot, -1 if empty */
	int	bc_count;	/* insertion sequence number, -1 if empty */
};

/*
 * Per-device bcache instance.  The cache is allocated on the device's
 * first open and freed on its last close to save memory.  The concern
 * is sizing: biosdisk supports up to 31 (0x1f) devices, and while a
 * classic setup boots from a single disk, zfs may open many.
 */
struct bcache {
	struct bcachectl	*bcache_ctl;	/* per-slot control data */
	caddr_t			bcache_data;	/* cache block storage */
	size_t			bcache_nblks;	/* number of cache blocks */
	size_t			ra;		/* current read-ahead block count */
};

static u_int bcache_total_nblks;	/* set by bcache_init */
static u_int bcache_blksize;		/* set by bcache_init */
static u_int bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;	/* number of devices with cache */
static u_int bcache_unit_nblks;	/* nblocks per unit */
static u_int bcache_hits;
static u_int bcache_misses;
static u_int bcache_ops;
static u_int bcache_bypasses;
static u_int bcache_bcount;
static u_int bcache_rablks;

#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
#define	BCACHE_LOOKUP(bc, blkno) \
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	256
#define	BCACHE_MINREADAHEAD	32
#define	BCACHE_MARKER		0xdeadbeef
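
/*
 * The cache is direct-mapped: bcache_nblks is a power of two, so
 * BHASH() is simply blkno % bcache_nblks (e.g. with 512 cache blocks,
 * block 1000 maps to slot 1000 & 511 == 488).  BCACHE_LOOKUP() is
 * true when the slot holds some other block, i.e. on a cache miss.
 */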

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);

/*
 * Initialise the cache for (nblks) of (bsize).
 */
void
bcache_init(size_t nblks, size_t bsize)
{
	/* set up control data */
	bcache_total_nblks = nblks;
	bcache_blksize = bsize;
}

/*
 * Add a number of devices to the bcache.  Cache space has to be divided
 * between the devices, so bcache_add_dev() is used to accumulate the
 * device count before any allocation happens.  It is expected to be
 * called from each device's init() routine; the assumption is that
 * devsw dv_init is called for plain devices first, and for zfs last.
 */
void
bcache_add_dev(int devices)
{
	bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
	u_int i;
	struct bcache *bc = malloc(sizeof (struct bcache));
	int disks = bcache_numdev;
	uint32_t *marker;

	if (disks == 0)
		disks = 1;	/* safeguard */

	if (bc == NULL) {
		errno = ENOMEM;
		return (bc);
	}

	/*
	 * the bcache block count must be a power of 2 for the hash function
	 */
	i = fls(disks) - 1;		/* highbit - 1 */
	if (disks > (1 << i))		/* next power of 2 */
		i++;

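	/*
	 * For example, with 4 disks fls(4) - 1 == 2 and 4 == (1 << 2), so
	 * the total block count is divided by 4; with 5 disks it is divided
	 * by the next power of two, 8.
	 */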
	bc->bcache_nblks = bcache_total_nblks >> i;
	bcache_unit_nblks = bc->bcache_nblks;
	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
	    sizeof (uint32_t));
	if (bc->bcache_data == NULL) {
		/* don't error out yet; fall back to 32 blocks and try again */
		bc->bcache_nblks = 32;
		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
		    sizeof (uint32_t));
	}

	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));

	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
		bcache_free_instance(bc);
		errno = ENOMEM;
		return (NULL);
	}
	/* Insert cache end marker. */
	marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize);
	*marker = BCACHE_MARKER;
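	/*
	 * The marker sits just past the last data block; read_strategy()
	 * checks it after each device read to catch read-ahead overruns.
	 */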

	/* Flush the cache */
	for (i = 0; i < bc->bcache_nblks; i++) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
	}
	bcache_units++;
	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
	return (bc);
}

void
bcache_free(void *cache)
{
	struct bcache *bc = cache;

	if (bc == NULL)
		return;

	bcache_free_instance(bc);
	bcache_units--;
}

/*
 * Handle a write request; invalidate any cached copies of the blocks
 * being written, then write directly through to the disk.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	daddr_t i, nblk;

	nblk = size / bcache_blksize;

	/* Invalidate the blocks being written */
	for (i = 0; i < nblk; i++) {
		bcache_invalidate(bc, blk + i);
	}

	/* Write the blocks */
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	size_t i, nblk, p_size, r_size, complete, ra;
	int result;
	daddr_t p_blk;
	caddr_t p_buf;
	uint32_t *marker;

	if (bc == NULL) {
		errno = ENODEV;
		return (-1);
	}
	marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize);

	if (rsize != NULL)
		*rsize = 0;

	nblk = size / bcache_blksize;
	if (nblk == 0 && size != 0)
		nblk++;
	result = 0;
	complete = 1;

	/* Satisfy any cache hits up front, break on first miss */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
			bcache_misses += (nblk - i);
			complete = 0;
			if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD)
				bc->ra >>= 1;	/* reduce read ahead */
			break;
		} else {
			bcache_hits++;
		}
	}

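	/*
	 * Note that with a direct-mapped cache, consecutive disk blocks
	 * occupy consecutive cache slots, and bcache_strategy() never
	 * issues a request that wraps past the end of the cache, so a
	 * fully cached request can be copied out with a single bcopy.
	 */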
	if (complete) {	/* whole set was in cache, return it */
		if (bc->ra < BCACHE_READAHEAD)
			bc->ra <<= 1;	/* increase read ahead */
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
		goto done;
	}

	/*
	 * Fill in any misses.  From the check above, i points to the first
	 * missing block; read in all remaining blocks plus the readahead.
	 * We have space for at least nblk - i blocks before the bcache wraps.
	 */
	p_blk = blk + i;
	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
	r_size = bc->bcache_nblks - BHASH(bc, p_blk);	/* remaining blocks */

	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */

	/*
	 * The read ahead size setup.
	 * While read ahead can save us IO, it can also complicate things:
	 * 1. We do not want to read ahead by wrapping around the
	 *    bcache end - this would complicate the cache management.
	 * 2. We are using bc->ra as a dynamic hint for the read ahead size:
	 *    detected cache hits will increase the read-ahead block count and
	 *    misses will decrease it, see the code above.
	 * 3. The bcache is sized in 512B blocks; however, the underlying device
	 *    may have a larger sector size, and we should perform the IO by
	 *    taking these larger sector sizes into account.  We could solve
	 *    this by passing the sector size to bcache_allocate(), or by using
	 *    ioctl(), but in this version we use a constant, 16 blocks, and
	 *    round the read ahead block count down to a multiple of 16.
	 *    There are two reasons to use the constant: we are not entirely
	 *    sure the BIOS disk interface provides the correct value for the
	 *    sector size, and this way we get the most conservative setup
	 *    for the ra.
	 *
	 * The choice of a multiple of 16 blocks (8KB) is fairly arbitrary;
	 * however, we want to cover CDs (2K) and 4K disks.
	 * bcache_allocate() will always fall back to a minimum of 32 blocks.
	 * Our choice of 16 read ahead blocks will always fit inside the bcache.
	 */
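	/*
	 * For example, with bc->ra == 256 and 100 cache blocks left before
	 * the wrap point, ra starts as 100, becomes MIN(256, 99) == 99, and
	 * is rounded down to 96 blocks of read ahead.
	 */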

	if ((rw & F_NORA) == F_NORA)
		ra = 0;
	else
		ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);

	if (ra != 0 && ra != bc->bcache_nblks) {	/* do we have RA space? */
		ra = MIN(bc->ra, ra - 1);
		ra = rounddown(ra, 16);		/* multiple of 16 blocks */
		p_size += ra;
	}

	/* invalidate bcache */
	for (i = 0; i < p_size; i++) {
		bcache_invalidate(bc, p_blk + i);
	}

	r_size = 0;
	/*
	 * With read-ahead, we may end up attempting to read past the end of
	 * the disk, since the bcache has no information about the disk size.
	 * In that case we should get a partial read if some blocks can be
	 * read, or an error if no blocks can be read.  In either case we
	 * should return the data in the bcache, and only return an error if
	 * there is no data.
	 */
	rw &= F_MASK;
	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
	    p_size * bcache_blksize, p_buf, &r_size);

	r_size /= bcache_blksize;
	for (i = 0; i < r_size; i++)
		bcache_insert(bc, p_blk + i);

	/* update ra statistics */
	if (r_size != 0) {
		if (r_size < p_size)
			bcache_rablks += (p_size - r_size);
		else
			bcache_rablks += ra;
	}

	/* check how much data we can copy */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
			break;
	}

	if (size > i * bcache_blksize)
		size = i * bcache_blksize;

	if (size != 0) {
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
		result = 0;
	}

	if (*marker != BCACHE_MARKER) {
		printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, "
		    "p_size: %zu, ra: %zu\n", bc->bcache_nblks,
		    (long unsigned)BHASH(bc, p_blk), p_size, ra);
	}

 done:
	if ((result == 0) && (rsize != NULL))
		*rsize = size;
	return (result);
}

/*
 * Requests larger than 1/2 the cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
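/*
 * For instance, if a unit's cache holds 512 512-byte blocks (256KB),
 * any single request larger than 128KB goes straight to the device.
 */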
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	u_int bcache_nblks = 0;
	int nblk, cblk, ret;
	size_t csize, isize, total;

	bcache_ops++;

	if (bc != NULL)
		bcache_nblks = bc->bcache_nblks;

	/* bypass large requests, or when the cache is inactive */
	if (bc == NULL ||
	    ((size * 2 / bcache_blksize) > bcache_nblks)) {
		DEBUG("bypass %zu from %qu", size / bcache_blksize, blk);
		bcache_bypasses++;
		rw &= F_MASK;
		return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
	}

	switch (rw & F_MASK) {
	case F_READ:
		nblk = size / bcache_blksize;
		if (size != 0 && nblk == 0)
			nblk++;	/* read at least one block */

		ret = 0;
		total = 0;
		while (size) {
			cblk = bcache_nblks - BHASH(bc, blk);	/* # of blocks left */
			cblk = MIN(cblk, nblk);

			if (size <= bcache_blksize)
				csize = size;
			else
				csize = cblk * bcache_blksize;

			ret = read_strategy(devdata, rw, blk, csize, buf + total, &isize);

			/*
			 * we may have an error from the read ahead; if we
			 * have read some data, return a partial read.
			 */
			if (ret != 0 || isize == 0) {
				if (total != 0)
					ret = 0;
				break;
			}
			blk += isize / bcache_blksize;
			total += isize;
			size -= isize;
			nblk = size / bcache_blksize;
		}

		if (rsize)
			*rsize = total;

		return (ret);
	case F_WRITE:
		return (write_strategy(devdata, F_WRITE, blk, size, buf, rsize));
	}
	return (-1);
}

/*
 * Free allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
	if (bc != NULL) {
		if (bc->bcache_ctl)
			free(bc->bcache_ctl);
		if (bc->bcache_data)
			free(bc->bcache_data);
		free(bc);
	}
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
	u_int cand;

	cand = BHASH(bc, blkno);

	DEBUG("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount);
	bc->bcache_ctl[cand].bc_blkno = blkno;
	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}

/*
 * Invalidate a block from the cache.
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
	u_int i;

	i = BHASH(bc, blkno);
	if (bc->bcache_ctl[i].bc_blkno == blkno) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
		DEBUG("invalidate blk %llu", blkno);
	}
}

#ifndef BOOT2
COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);

static int
command_bcache(int argc, char *argv[] __attribute__((unused)))
{
	if (argc != 1) {
		command_errmsg = "wrong number of arguments";
		return (CMD_ERROR);
	}

	printf("\ncache blocks: %d\n", bcache_total_nblks);
	printf("cache blocksz: %d\n", bcache_blksize);
	printf("cache readahead: %d\n", bcache_rablks);
	printf("unit cache blocks: %d\n", bcache_unit_nblks);
	printf("cached units: %d\n", bcache_units);
	printf("%d ops %d bypasses %d hits %d misses\n", bcache_ops,
	    bcache_bypasses, bcache_hits, bcache_misses);
	return (CMD_OK);
}
#endif