1 /*- 2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3 * Copyright 2015 Toomas Soome <tsoome@me.com> 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 #include <sys/param.h> 30 31 /* 32 * Simple hashed block cache 33 */ 34 35 #include <sys/stdint.h> 36 37 #include <stand.h> 38 #include <string.h> 39 #include <strings.h> 40 41 #include "bootstrap.h" 42 43 /* #define BCACHE_DEBUG */ 44 45 #ifdef BCACHE_DEBUG 46 # define DEBUG(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) 47 #else 48 # define DEBUG(fmt, args...) 49 #endif 50 51 struct bcachectl 52 { 53 daddr_t bc_blkno; 54 int bc_count; 55 }; 56 57 /* 58 * bcache per device node. cache is allocated on device first open and freed 59 * on last close, to save memory. The issue there is the size; biosdisk 60 * supports up to 31 (0x1f) devices. Classic setup would use single disk 61 * to boot from, but this has changed with zfs. 62 */ 63 struct bcache { 64 struct bcachectl *bcache_ctl; 65 caddr_t bcache_data; 66 size_t bcache_nblks; 67 size_t ra; 68 }; 69 70 static u_int bcache_total_nblks; /* set by bcache_init */ 71 static u_int bcache_blksize; /* set by bcache_init */ 72 static u_int bcache_numdev; /* set by bcache_add_dev */ 73 /* statistics */ 74 static u_int bcache_units; /* number of devices with cache */ 75 static u_int bcache_unit_nblks; /* nblocks per unit */ 76 static u_int bcache_hits; 77 static u_int bcache_misses; 78 static u_int bcache_ops; 79 static u_int bcache_bypasses; 80 static u_int bcache_bcount; 81 static u_int bcache_rablks; 82 83 #define BHASH(bc, blkno) ((blkno) & ((bc)->bcache_nblks - 1)) 84 #define BCACHE_LOOKUP(bc, blkno) \ 85 ((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno)) 86 #define BCACHE_READAHEAD 256 87 #define BCACHE_MINREADAHEAD 32 88 #define BCACHE_MARKER 0xdeadbeef 89 90 static void bcache_invalidate(struct bcache *bc, daddr_t blkno); 91 static void bcache_insert(struct bcache *bc, daddr_t blkno); 92 static void bcache_free_instance(struct bcache *bc); 93 94 /* 95 * Initialise the cache for (nblks) of (bsize). 96 */ 97 void 98 bcache_init(size_t nblks, size_t bsize) 99 { 100 /* set up control data */ 101 bcache_total_nblks = nblks; 102 bcache_blksize = bsize; 103 } 104 105 /* 106 * add number of devices to bcache. we have to divide cache space 107 * between the devices, so bcache_add_dev() can be used to set up the 108 * number. The issue is, we need to get the number before actual allocations. 109 * bcache_add_dev() is supposed to be called from device init() call, so the 110 * assumption is, devsw dv_init is called for plain devices first, and 111 * for zfs, last. 112 */ 113 void 114 bcache_add_dev(int devices) 115 { 116 bcache_numdev += devices; 117 } 118 119 void * 120 bcache_allocate(void) 121 { 122 u_int i; 123 struct bcache *bc = malloc(sizeof (struct bcache)); 124 int disks = bcache_numdev; 125 uint32_t *marker; 126 127 if (disks == 0) 128 disks = 1; /* safe guard */ 129 130 if (bc == NULL) { 131 errno = ENOMEM; 132 return (bc); 133 } 134 135 /* 136 * the bcache block count must be power of 2 for hash function 137 */ 138 i = fls(disks) - 1; /* highbit - 1 */ 139 if (disks > (1 << i)) /* next power of 2 */ 140 i++; 141 142 bc->bcache_nblks = bcache_total_nblks >> i; 143 bcache_unit_nblks = bc->bcache_nblks; 144 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 145 sizeof (uint32_t)); 146 if (bc->bcache_data == NULL) { 147 /* dont error out yet. fall back to 32 blocks and try again */ 148 bc->bcache_nblks = 32; 149 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 150 sizeof (uint32_t)); 151 } 152 153 bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl)); 154 155 if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) { 156 bcache_free_instance(bc); 157 errno = ENOMEM; 158 return (NULL); 159 } 160 /* Insert cache end marker. */ 161 marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); 162 *marker = BCACHE_MARKER; 163 164 /* Flush the cache */ 165 for (i = 0; i < bc->bcache_nblks; i++) { 166 bc->bcache_ctl[i].bc_count = -1; 167 bc->bcache_ctl[i].bc_blkno = -1; 168 } 169 bcache_units++; 170 bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */ 171 return (bc); 172 } 173 174 void 175 bcache_free(void *cache) 176 { 177 struct bcache *bc = cache; 178 179 if (bc == NULL) 180 return; 181 182 bcache_free_instance(bc); 183 bcache_units--; 184 } 185 186 /* 187 * Handle a write request; write directly to the disk, and populate the 188 * cache with the new values. 189 */ 190 static int 191 write_strategy(void *devdata, int rw, daddr_t blk, size_t size, 192 char *buf, size_t *rsize) 193 { 194 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 195 struct bcache *bc = dd->dv_cache; 196 daddr_t i, nblk; 197 198 nblk = size / bcache_blksize; 199 200 /* Invalidate the blocks being written */ 201 for (i = 0; i < nblk; i++) { 202 bcache_invalidate(bc, blk + i); 203 } 204 205 /* Write the blocks */ 206 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 207 } 208 209 /* 210 * Handle a read request; fill in parts of the request that can 211 * be satisfied by the cache, use the supplied strategy routine to do 212 * device I/O and then use the I/O results to populate the cache. 213 */ 214 static int 215 read_strategy(void *devdata, int rw, daddr_t blk, size_t size, 216 char *buf, size_t *rsize) 217 { 218 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 219 struct bcache *bc = dd->dv_cache; 220 size_t i, nblk, p_size, r_size, complete, ra; 221 int result; 222 daddr_t p_blk; 223 caddr_t p_buf; 224 uint32_t *marker; 225 226 if (bc == NULL) { 227 errno = ENODEV; 228 return (-1); 229 } 230 marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); 231 232 if (rsize != NULL) 233 *rsize = 0; 234 235 nblk = size / bcache_blksize; 236 if (nblk == 0 && size != 0) 237 nblk++; 238 result = 0; 239 complete = 1; 240 241 /* Satisfy any cache hits up front, break on first miss */ 242 for (i = 0; i < nblk; i++) { 243 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) { 244 bcache_misses += (nblk - i); 245 complete = 0; 246 if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD) 247 bc->ra >>= 1; /* reduce read ahead */ 248 break; 249 } else { 250 bcache_hits++; 251 } 252 } 253 254 if (complete) { /* whole set was in cache, return it */ 255 if (bc->ra < BCACHE_READAHEAD) 256 bc->ra <<= 1; /* increase read ahead */ 257 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 258 goto done; 259 } 260 261 /* 262 * Fill in any misses. From check we have i pointing to first missing 263 * block, read in all remaining blocks + readahead. 264 * We have space at least for nblk - i before bcache wraps. 265 */ 266 p_blk = blk + i; 267 p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk)); 268 r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */ 269 270 p_size = MIN(r_size, nblk - i); /* read at least those blocks */ 271 272 /* 273 * The read ahead size setup. 274 * While the read ahead can save us IO, it also can complicate things: 275 * 1. We do not want to read ahead by wrapping around the 276 * bcache end - this would complicate the cache management. 277 * 2. We are using bc->ra as dynamic hint for read ahead size, 278 * detected cache hits will increase the read-ahead block count, and 279 * misses will decrease, see the code above. 280 * 3. The bcache is sized by 512B blocks, however, the underlying device 281 * may have a larger sector size, and we should perform the IO by 282 * taking into account these larger sector sizes. We could solve this by 283 * passing the sector size to bcache_allocate(), or by using ioctl(), but 284 * in this version we are using the constant, 16 blocks, and are rounding 285 * read ahead block count down to multiple of 16. 286 * Using the constant has two reasons, we are not entirely sure if the 287 * BIOS disk interface is providing the correct value for sector size. 288 * And secondly, this way we get the most conservative setup for the ra. 289 * 290 * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however, 291 * we want to cover CDs (2K) and 4K disks. 292 * bcache_allocate() will always fall back to a minimum of 32 blocks. 293 * Our choice of 16 read ahead blocks will always fit inside the bcache. 294 */ 295 296 if ((rw & F_NORA) == F_NORA) 297 ra = 0; 298 else 299 ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size); 300 301 if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */ 302 ra = MIN(bc->ra, ra - 1); 303 ra = rounddown(ra, 16); /* multiple of 16 blocks */ 304 p_size += ra; 305 } 306 307 /* invalidate bcache */ 308 for (i = 0; i < p_size; i++) { 309 bcache_invalidate(bc, p_blk + i); 310 } 311 312 r_size = 0; 313 /* 314 * with read-ahead, it may happen we are attempting to read past 315 * disk end, as bcache has no information about disk size. 316 * in such case we should get partial read if some blocks can be 317 * read or error, if no blocks can be read. 318 * in either case we should return the data in bcache and only 319 * return error if there is no data. 320 */ 321 rw &= F_MASK; 322 result = dd->dv_strategy(dd->dv_devdata, rw, p_blk, 323 p_size * bcache_blksize, p_buf, &r_size); 324 325 r_size /= bcache_blksize; 326 for (i = 0; i < r_size; i++) 327 bcache_insert(bc, p_blk + i); 328 329 /* update ra statistics */ 330 if (r_size != 0) { 331 if (r_size < p_size) 332 bcache_rablks += (p_size - r_size); 333 else 334 bcache_rablks += ra; 335 } 336 337 /* check how much data can we copy */ 338 for (i = 0; i < nblk; i++) { 339 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) 340 break; 341 } 342 343 if (size > i * bcache_blksize) 344 size = i * bcache_blksize; 345 346 if (size != 0) { 347 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 348 result = 0; 349 } 350 351 if (*marker != BCACHE_MARKER) { 352 printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, " 353 "p_size: %zu, ra: %zu\n", bc->bcache_nblks, 354 (long unsigned)BHASH(bc, p_blk), p_size, ra); 355 } 356 357 done: 358 if ((result == 0) && (rsize != NULL)) 359 *rsize = size; 360 return(result); 361 } 362 363 /* 364 * Requests larger than 1/2 cache size will be bypassed and go 365 * directly to the disk. XXX tune this. 366 */ 367 int 368 bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size, 369 char *buf, size_t *rsize) 370 { 371 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 372 struct bcache *bc = dd->dv_cache; 373 u_int bcache_nblks = 0; 374 int nblk, cblk, ret; 375 size_t csize, isize, total; 376 377 bcache_ops++; 378 379 if (bc != NULL) 380 bcache_nblks = bc->bcache_nblks; 381 382 /* bypass large requests, or when the cache is inactive */ 383 if (bc == NULL || 384 ((size * 2 / bcache_blksize) > bcache_nblks)) { 385 DEBUG("bypass %zu from %qu", size / bcache_blksize, blk); 386 bcache_bypasses++; 387 rw &= F_MASK; 388 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 389 } 390 391 switch (rw & F_MASK) { 392 case F_READ: 393 nblk = size / bcache_blksize; 394 if (size != 0 && nblk == 0) 395 nblk++; /* read at least one block */ 396 397 ret = 0; 398 total = 0; 399 while(size) { 400 cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */ 401 cblk = MIN(cblk, nblk); 402 403 if (size <= bcache_blksize) 404 csize = size; 405 else 406 csize = cblk * bcache_blksize; 407 408 ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize); 409 410 /* 411 * we may have error from read ahead, if we have read some data 412 * return partial read. 413 */ 414 if (ret != 0 || isize == 0) { 415 if (total != 0) 416 ret = 0; 417 break; 418 } 419 blk += isize / bcache_blksize; 420 total += isize; 421 size -= isize; 422 nblk = size / bcache_blksize; 423 } 424 425 if (rsize) 426 *rsize = total; 427 428 return (ret); 429 case F_WRITE: 430 return write_strategy(devdata, F_WRITE, blk, size, buf, rsize); 431 } 432 return -1; 433 } 434 435 /* 436 * Free allocated bcache instance 437 */ 438 static void 439 bcache_free_instance(struct bcache *bc) 440 { 441 if (bc != NULL) { 442 if (bc->bcache_ctl) 443 free(bc->bcache_ctl); 444 if (bc->bcache_data) 445 free(bc->bcache_data); 446 free(bc); 447 } 448 } 449 450 /* 451 * Insert a block into the cache. 452 */ 453 static void 454 bcache_insert(struct bcache *bc, daddr_t blkno) 455 { 456 u_int cand; 457 458 cand = BHASH(bc, blkno); 459 460 DEBUG("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount); 461 bc->bcache_ctl[cand].bc_blkno = blkno; 462 bc->bcache_ctl[cand].bc_count = bcache_bcount++; 463 } 464 465 /* 466 * Invalidate a block from the cache. 467 */ 468 static void 469 bcache_invalidate(struct bcache *bc, daddr_t blkno) 470 { 471 u_int i; 472 473 i = BHASH(bc, blkno); 474 if (bc->bcache_ctl[i].bc_blkno == blkno) { 475 bc->bcache_ctl[i].bc_count = -1; 476 bc->bcache_ctl[i].bc_blkno = -1; 477 DEBUG("invalidate blk %llu", blkno); 478 } 479 } 480 481 #ifndef BOOT2 482 COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache); 483 484 static int 485 command_bcache(int argc, char *argv[] __attribute((unused))) 486 { 487 if (argc != 1) { 488 command_errmsg = "wrong number of arguments"; 489 return(CMD_ERROR); 490 } 491 492 printf("\ncache blocks: %d\n", bcache_total_nblks); 493 printf("cache blocksz: %d\n", bcache_blksize); 494 printf("cache readahead: %d\n", bcache_rablks); 495 printf("unit cache blocks: %d\n", bcache_unit_nblks); 496 printf("cached units: %d\n", bcache_units); 497 printf("%d ops %d bypasses %d hits %d misses\n", bcache_ops, 498 bcache_bypasses, bcache_hits, bcache_misses); 499 return(CMD_OK); 500 } 501 #endif 502