/*
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>

/*
 * Simple hashed block cache
 */

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
#define	DPRINTF(fmt, args...)	printf("%s: " fmt "\n", __func__, ## args)
#else
#define	DPRINTF(fmt, args...)	((void)0)
#endif

struct bcachectl
{
	daddr_t	bc_blkno;	/* block number held in this slot, -1 if empty */
	int	bc_count;	/* insertion sequence number, -1 if invalid */
};

/*
 * Per-device-node bcache.  The cache is allocated on a device's first
 * open and freed on its last close, to save memory.  The tricky part is
 * sizing: biosdisk supports up to 31 (0x1f) devices.  A classic setup
 * would boot from a single disk, but this has changed with zfs.
 */
struct bcache {
	struct bcachectl	*bcache_ctl;	/* per-slot control data */
	caddr_t			bcache_data;	/* the cached block data */
	size_t			bcache_nblks;	/* number of slots, power of 2 */
	size_t			ra;	/* dynamic read-ahead window, in blocks */
};

static uint_t bcache_total_nblks;	/* set by bcache_init */
static uint_t bcache_blksize;		/* set by bcache_init */
static uint_t bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static uint_t bcache_units;	/* number of devices with cache */
static uint_t bcache_unit_nblks;	/* nblocks per unit */
static uint_t bcache_hits;
static uint_t bcache_misses;
static uint_t bcache_ops;
static uint_t bcache_bypasses;
static uint_t bcache_bcount;
static uint_t bcache_rablks;

#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
#define	BCACHE_LOOKUP(bc, blkno)	\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	256
#define	BCACHE_MINREADAHEAD	32
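
/*
 * The cache is direct-mapped: BHASH() masks the low bits of the block
 * number (bcache_nblks must be a power of 2), so every block has exactly
 * one slot it may occupy.  BCACHE_LOOKUP() is therefore a miss test: it
 * evaluates to non-zero when the slot currently holds a different block.
 * With bcache_nblks == 32, for example, blocks 5 and 37 compete for
 * slot 5.
 */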

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);

/*
 * Initialise the cache for (nblks) of (bsize).
 */
void
bcache_init(size_t nblks, size_t bsize)
{
	/* set up control data */
	bcache_total_nblks = nblks;
	bcache_blksize = bsize;
}
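
/*
 * Note that bcache_init() only records the cache geometry; the actual
 * per-unit buffers are allocated later, by bcache_allocate().
 */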

/*
 * Add a number of devices to the bcache.  Cache space has to be divided
 * between the devices, so bcache_add_dev() is used to register them; the
 * catch is that the total count is needed before any actual allocation.
 * bcache_add_dev() is meant to be called from a device's init() routine,
 * so the assumption is that devsw dv_init is called for plain devices
 * first, and for zfs last.
 */
void
bcache_add_dev(int devices)
{
	bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
	uint_t i;
	struct bcache *bc = malloc(sizeof (struct bcache));
	int disks = bcache_numdev;

	if (disks == 0)
		disks = 1;	/* safeguard */

	if (bc == NULL) {
		errno = ENOMEM;
		return (bc);
	}

	/*
	 * The bcache block count must be a power of 2 for the hash
	 * function.
	 */
	i = fls(disks) - 1;	/* highbit - 1 */
	if (disks > (1 << i))	/* next power of 2 */
		i++;
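
	/*
	 * For example: with 4 devices, fls(4) - 1 == 2 and 4 == (1 << 2),
	 * so each unit gets a 1/4 share of the total blocks; with 3
	 * devices, i is likewise bumped to 2, so each share remains a
	 * power of 2 (as the hash function requires).
	 */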

	bc->bcache_nblks = bcache_total_nblks >> i;
	bcache_unit_nblks = bc->bcache_nblks;
	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
	if (bc->bcache_data == NULL) {
		/* don't error out yet; fall back to 32 blocks and try again */
		bc->bcache_nblks = 32;
		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
		    sizeof (uint32_t));
	}

	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof (struct bcachectl));

	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
		bcache_free_instance(bc);
		errno = ENOMEM;
		return (NULL);
	}

	/* Flush the cache */
	for (i = 0; i < bc->bcache_nblks; i++) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
	}
	bcache_units++;
	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
	return (bc);
}

void
bcache_free(void *cache)
{
	struct bcache *bc = cache;

	if (bc == NULL)
		return;

	bcache_free_instance(bc);
	bcache_units--;
}

/*
 * Handle a write request; invalidate any cached copies of the blocks
 * being written, then pass the write straight through to the disk.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	daddr_t i, nblk;

	nblk = size / bcache_blksize;

	/* Invalidate the blocks being written */
	for (i = 0; i < nblk; i++) {
		bcache_invalidate(bc, blk + i);
	}

	/* Write the blocks */
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = devdata;
	struct bcache *bc = dd->dv_cache;
	size_t i, nblk, p_size, r_size, complete, ra;
	int result;
	daddr_t p_blk;
	caddr_t p_buf;

	if (bc == NULL) {
		errno = ENODEV;
		return (-1);
	}

	if (rsize != NULL)
		*rsize = 0;

	nblk = size / bcache_blksize;
	if (nblk == 0 && size != 0)
		nblk++;
	result = 0;
	complete = 1;

	/* Satisfy any cache hits up front, break on first miss */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
			bcache_misses += (nblk - i);
			complete = 0;
			if (nblk - i > BCACHE_MINREADAHEAD &&
			    bc->ra > BCACHE_MINREADAHEAD)
				bc->ra >>= 1;	/* reduce read ahead */
			break;
		} else {
			bcache_hits++;
		}
	}

	if (complete) {	/* whole set was in cache, return it */
		if (bc->ra < BCACHE_READAHEAD)
			bc->ra <<= 1;	/* increase read ahead */
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		goto done;
	}
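
	/*
	 * Note that the single bcopy() above is safe: consecutive block
	 * numbers map to consecutive slots, and bcache_strategy() splits
	 * requests so that they never wrap past the end of the cache, so
	 * a fully-cached range is always contiguous in bcache_data.
	 */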

	/*
	 * Fill in any misses.  From the check above, i points at the first
	 * missing block; read in all remaining blocks plus readahead.
	 * We have space for at least nblk - i blocks before the bcache
	 * wraps.
	 */
	p_blk = blk + i;
	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
	r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */

	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */

	/*
	 * Set up the read-ahead size.
	 * While read-ahead can save us IO, it can also complicate things:
	 * 1. We do not want to read ahead by wrapping around the
	 *    bcache end - this would complicate the cache management.
	 * 2. We are using bc->ra as a dynamic hint for the read-ahead size:
	 *    detected cache hits will increase the read-ahead block count,
	 *    and misses will decrease it, see the code above.
	 * 3. The bcache is sized in 512B blocks, but the underlying device
	 *    may have a larger sector size, and we should perform the IO
	 *    taking these larger sector sizes into account.  We could solve
	 *    this by passing the sector size to bcache_allocate(), or by
	 *    using ioctl(), but in this version we use a constant of
	 *    16 blocks and round the read-ahead block count down to a
	 *    multiple of 16.  There are two reasons for using the constant:
	 *    we are not entirely sure the BIOS disk interface provides the
	 *    correct value for the sector size, and this way we get the
	 *    most conservative setup for the ra.
	 *
	 * The selection of a multiple of 16 blocks (8KB) is quite arbitrary;
	 * however, we want to cover CDs (2K) and 4K disks.
	 * bcache_allocate() will always fall back to a minimum of 32 blocks,
	 * so our choice of 16 read-ahead blocks will always fit inside the
	 * bcache.
	 */

	if ((rw & F_NORA) == F_NORA)
		ra = 0;
	else
		ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);
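
	/*
	 * ra now counts the slots between the end of the mandatory read
	 * and the end of the cache.  Below we clamp it to the dynamic
	 * window bc->ra, keep it one block short of the wrap point, and
	 * round it down to a multiple of 16 as explained above.
	 */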

	if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
		ra = MIN(bc->ra, ra - 1);
		ra = rounddown(ra, 16);	/* multiple of 16 blocks */
		p_size += ra;
	}

	/* invalidate bcache */
	for (i = 0; i < p_size; i++) {
		bcache_invalidate(bc, p_blk + i);
	}

	r_size = 0;
	/*
	 * With read-ahead, we may attempt to read past the end of the
	 * disk, since the bcache has no information about the disk size.
	 * In that case we should get a partial read if some blocks can be
	 * read, or an error if none can be.  In either case we should
	 * return the data that did land in the bcache, and only return an
	 * error if there is no data.
	 */
	rw &= F_MASK;
	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
	    p_size * bcache_blksize, p_buf, &r_size);

	r_size /= bcache_blksize;
	for (i = 0; i < r_size; i++)
		bcache_insert(bc, p_blk + i);

	/* update ra statistics */
	if (r_size != 0) {
		if (r_size < p_size)
			bcache_rablks += (p_size - r_size);
		else
			bcache_rablks += ra;
	}

	/* check how much data we can copy */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
			break;
	}

	if (size > i * bcache_blksize)
		size = i * bcache_blksize;

	if (size != 0) {
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		result = 0;
	}

done:
	if ((result == 0) && (rsize != NULL))
		*rsize = size;
	return (result);
}

/*
 * Requests larger than 1/2 cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	uint_t bcache_nblks = 0;
	int nblk, cblk, ret;
	size_t csize, isize, total;

	bcache_ops++;

	if (bc != NULL)
		bcache_nblks = bc->bcache_nblks;

	/* bypass large requests, or when the cache is inactive */
	if (bc == NULL ||
	    ((size * 2 / bcache_blksize) > bcache_nblks)) {
		DPRINTF("bypass %zu from %jd", size / bcache_blksize,
		    (intmax_t)blk);
		bcache_bypasses++;
		rw &= F_MASK;
		return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf,
		    rsize));
	}

	switch (rw & F_MASK) {
	case F_READ:
		nblk = size / bcache_blksize;
		if (size != 0 && nblk == 0)
			nblk++;	/* read at least one block */

		ret = 0;
		total = 0;
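
		/*
		 * Issue the read in chunks: each chunk ends where the
		 * cache would wrap, so read_strategy() always sees a
		 * range that is contiguous in the cache.
		 */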
		while (size) {
			/* # of blocks left before the cache wraps */
			cblk = bcache_nblks - BHASH(bc, blk);
			cblk = MIN(cblk, nblk);

			if (size <= bcache_blksize)
				csize = size;
			else
				csize = cblk * bcache_blksize;

			ret = read_strategy(devdata, rw, blk, csize,
			    buf + total, &isize);

			/*
			 * We may get an error from read-ahead; if we have
			 * already read some data, return a partial read.
			 */
			if (ret != 0 || isize == 0) {
				if (total != 0)
					ret = 0;
				break;
			}
			blk += isize / bcache_blksize;
			total += isize;
			size -= isize;
			nblk = size / bcache_blksize;
		}

		if (rsize)
			*rsize = total;

		return (ret);
	case F_WRITE:
		return (write_strategy(devdata, F_WRITE, blk, size, buf,
		    rsize));
	}
	return (-1);
}

/*
 * Free allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
	if (bc != NULL) {
		free(bc->bcache_ctl);
		free(bc->bcache_data);
		free(bc);
	}
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
	uint_t cand;

	cand = BHASH(bc, blkno);

	DPRINTF("insert blk %jd -> %u # %d", (intmax_t)blkno, cand,
	    bcache_bcount);
	bc->bcache_ctl[cand].bc_blkno = blkno;
	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}
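
/*
 * Since the cache is direct-mapped, insertion doubles as eviction:
 * whatever block previously occupied the slot is simply overwritten.
 */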

/*
 * Invalidate a block from the cache.
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
	uint_t i;

	i = BHASH(bc, blkno);
	if (bc->bcache_ctl[i].bc_blkno == blkno) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
		DPRINTF("invalidate blk %jd", (intmax_t)blkno);
	}
}

COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats",
    command_bcache);

static int
command_bcache(int argc, char *argv[] __unused)
{
	if (argc != 1) {
		command_errmsg = "wrong number of arguments";
		return (CMD_ERROR);
	}

	printf("\ncache blocks: %u\n", bcache_total_nblks);
	printf("cache blocksz: %u\n", bcache_blksize);
	printf("cache readahead: %u\n", bcache_rablks);
	printf("unit cache blocks: %u\n", bcache_unit_nblks);
	printf("cached units: %u\n", bcache_units);
	printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops,
	    bcache_bypasses, bcache_hits, bcache_misses);
	return (CMD_OK);
}