/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
__FBSDID("$FreeBSD$");

/*
 * Simple hashed block cache
 */

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
# define DPRINTF(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
#else
# define DPRINTF(fmt, args...)	((void)0)
#endif

struct bcachectl
{
	daddr_t	bc_blkno;
	int	bc_count;
};

/*
 * bcache per device node. The cache is allocated on the device's first open
 * and freed on its last close, to save memory. The issue there is the size;
 * biosdisk supports up to 31 (0x1f) devices. A classic setup would boot from
 * a single disk, but this has changed with zfs.
 */
struct bcache {
	struct bcachectl	*bcache_ctl;
	caddr_t			bcache_data;
	size_t			bcache_nblks;
	size_t			ra;
	daddr_t			bcache_nextblkno;
	size_t			ralen;
};

static u_int bcache_total_nblks;	/* set by bcache_init */
static u_int bcache_blksize;		/* set by bcache_init */
static u_int bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;		/* number of devices with cache */
static u_int bcache_unit_nblks;		/* nblocks per unit */
static u_int bcache_hits;
static u_int bcache_misses;
static u_int bcache_ops;
static u_int bcache_bypasses;
static u_int bcache_bcount;
static u_int bcache_rablks;

#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
#define	BCACHE_LOOKUP(bc, blkno)	\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	512
#define	BCACHE_MINREADAHEAD	32
#define	BCACHE_MAXIOWRA		512

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);
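/*
 * Worked example of the direct-mapped hash above (numbers illustrative):
 * with bcache_nblks = 32, BHASH reduces to (blkno & 31), so block 100
 * (0x64) maps to slot 4, and blocks 100, 132, 164, ... all contend for
 * that same slot. This is why bcache_allocate() rounds the per-unit block
 * count to a power of two. Note that BCACHE_LOOKUP() is a *miss* test:
 * it is nonzero when the slot does not currently hold blkno.
 */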
/*
 * Initialise the cache for (nblks) blocks of (bsize) bytes each.
 */
void
bcache_init(size_t nblks, size_t bsize)
{
	/* set up control data */
	bcache_total_nblks = nblks;
	bcache_blksize = bsize;
}

/*
 * Add a number of devices to the bcache. We have to divide the cache space
 * between the devices, so bcache_add_dev() can be used to set up the number.
 * The issue is, we need the number before the actual allocations.
 * bcache_add_dev() is supposed to be called from the device init() call, so
 * the assumption is that devsw dv_init is called for plain devices first,
 * and for zfs, last.
 */
void
bcache_add_dev(int devices)
{
	bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
	u_int i;
	struct bcache *bc = malloc(sizeof (struct bcache));
	int disks = bcache_numdev;

	if (disks == 0)
		disks = 1;	/* safeguard */

	if (bc == NULL) {
		errno = ENOMEM;
		return (bc);
	}

	/*
	 * the bcache block count must be a power of 2 for the hash function
	 */
	i = fls(disks) - 1;		/* highbit - 1 */
	if (disks > (1 << i))		/* next power of 2 */
		i++;

	bc->bcache_nblks = bcache_total_nblks >> i;
	bcache_unit_nblks = bc->bcache_nblks;
	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
	if (bc->bcache_data == NULL) {
		/* don't error out yet. fall back to 32 blocks and try again */
		bc->bcache_nblks = 32;
		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
		    sizeof(uint32_t));
	}

	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));

	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
		bcache_free_instance(bc);
		errno = ENOMEM;
		return (NULL);
	}

	/* Flush the cache */
	for (i = 0; i < bc->bcache_nblks; i++) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
	}
	bcache_units++;
	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
	bc->bcache_nextblkno = -1;
	return (bc);
}
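/*
 * Sizing example for the power-of-two split above (numbers illustrative):
 * with bcache_total_nblks = 8192 and bcache_numdev = 3, fls(3) - 1 = 1 and
 * 3 > (1 << 1), so i becomes 2 and each unit receives 8192 >> 2 = 2048
 * blocks; the total is divided by the next power of two at or above the
 * device count.
 */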
void
bcache_free(void *cache)
{
	struct bcache *bc = cache;

	if (bc == NULL)
		return;

	bcache_free_instance(bc);
	bcache_units--;
}

/*
 * Handle a write request; write directly to the disk, and populate the
 * cache with the new values.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	daddr_t			i, nblk;

	nblk = size / bcache_blksize;

	/* Invalidate the blocks being written */
	for (i = 0; i < nblk; i++) {
		bcache_invalidate(bc, blk + i);
	}

	/* Write the blocks */
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	size_t			i, nblk, p_size, r_size, complete, ra;
	int			result;
	daddr_t			p_blk;
	caddr_t			p_buf;

	if (bc == NULL) {
		errno = ENODEV;
		return (-1);
	}

	if (rsize != NULL)
		*rsize = 0;

	nblk = size / bcache_blksize;
	if (nblk == 0 && size != 0)
		nblk++;
	result = 0;
	complete = 1;

	/* Satisfy any cache hits up front, break on first miss */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
			bcache_misses += (nblk - i);
			complete = 0;
			break;
		} else {
			bcache_hits++;
		}
	}

	/*
	 * Adjust read-ahead size if appropriate. Subject to the requirement
	 * that bc->ra must stay between MINREADAHEAD and READAHEAD, we
	 * increase it when we notice that readahead was useful and decrease
	 * it when we notice that readahead was not useful.
	 */
	if (complete || (i == bc->ralen && bc->ralen > 0)) {
		if (bc->ra < BCACHE_READAHEAD)
			bc->ra <<= 1;	/* increase read ahead */
	} else {
		if (nblk - i > BCACHE_MINREADAHEAD && bc->ralen > 0 &&
		    bc->ra > BCACHE_MINREADAHEAD)
			bc->ra >>= 1;	/* reduce read ahead */
	}

	/* Adjust our "unconsumed readahead" value. */
	if (blk == bc->bcache_nextblkno) {
		if (nblk > bc->ralen)
			bc->ralen = 0;
		else
			bc->ralen -= nblk;
	}

	if (complete) {	/* whole set was in cache, return it */
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		goto done;
	}

	/*
	 * Fill in any misses. From the check above we have i pointing to the
	 * first missing block; read in all remaining blocks plus readahead.
	 * We have space for at least nblk - i blocks before the bcache wraps.
	 */
	p_blk = blk + i;
	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
	r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */

	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */

	/*
	 * The read ahead size setup.
	 * While read ahead can save us I/O, it can also complicate things:
	 * 1. We do not want to read ahead by wrapping around the
	 *    bcache end - this would complicate the cache management.
	 * 2. We are using bc->ra as a dynamic hint for the read ahead size;
	 *    detected cache hits will increase the read-ahead block count,
	 *    and misses will decrease it, see the code above.
	 * 3. The bcache is sized in 512B blocks, but the underlying device
	 *    may have a larger sector size, and we should perform the I/O
	 *    taking those larger sector sizes into account. We could solve
	 *    this by passing the sector size to bcache_allocate(), or by
	 *    using ioctl(), but in this version we use a constant, 16 blocks,
	 *    and round the read ahead block count down to a multiple of 16.
	 *    Using the constant has two reasons: we are not entirely sure the
	 *    BIOS disk interface provides the correct value for the sector
	 *    size, and this way we get the most conservative setup for the
	 *    ra.
	 *
	 * The selection of multiples of 16 blocks (8KB) is quite arbitrary;
	 * however, we want to cover CDs (2K) and 4K disks.
	 * bcache_allocate() will always fall back to a minimum of 32 blocks.
	 * Our choice of 16 read ahead blocks will always fit inside the
	 * bcache.
	 */

	if ((rw & F_NORA) == F_NORA)
		ra = 0;
	else
		ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);

	/*
	 * Only trigger read-ahead if we detect two blocks being read
	 * sequentially.
	 */
	if ((bc->bcache_nextblkno != blk) && ra != 0) {
		ra = 0;
	}

	if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
		ra = MIN(bc->ra, ra - 1);
		ra = rounddown(ra, 16);	/* multiple of 16 blocks */
		if (ra + p_size > BCACHE_MAXIOWRA)
			ra = BCACHE_MAXIOWRA - p_size;
		bc->ralen = ra;
		p_size += ra;
	} else {
		bc->ralen = 0;
	}

	/* invalidate bcache */
	for (i = 0; i < p_size; i++) {
		bcache_invalidate(bc, p_blk + i);
	}

	r_size = 0;
	/*
	 * With read-ahead, it may happen that we attempt to read past the
	 * disk end, as the bcache has no information about the disk size.
	 * In such a case we should get a partial read if some blocks can be
	 * read, or an error if no blocks can be read. In either case we
	 * should return the data in the bcache, and only return an error if
	 * there is no data.
	 */
	rw &= F_MASK;
	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
	    p_size * bcache_blksize, p_buf, &r_size);

	r_size /= bcache_blksize;
	for (i = 0; i < r_size; i++)
		bcache_insert(bc, p_blk + i);

	/* update ra statistics */
	if (r_size != 0) {
		if (r_size < p_size)
			bcache_rablks += (p_size - r_size);
		else
			bcache_rablks += ra;
	}

	/* check how much data we can copy */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
			break;
	}

	if (size > i * bcache_blksize)
		size = i * bcache_blksize;

	if (size != 0) {
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		result = 0;
	}

done:
	if (result == 0) {
		if (rsize != NULL)
			*rsize = size;
		bc->bcache_nextblkno = blk + (size / DEV_BSIZE);
	}
	return (result);
}
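/*
 * Read-ahead sizing example for read_strategy() above (numbers
 * illustrative): suppose a sequential reader just consumed its previous
 * read-ahead, so bc->ra has doubled toward BCACHE_READAHEAD (512). With
 * p_blk hashing to slot 2000 in a 2048-block unit and p_size = 8, there
 * are 2048 - 2008 = 40 slots left before the cache would wrap;
 * ra = MIN(bc->ra, 40 - 1) rounded down to a multiple of 16 yields at
 * most 32 read-ahead blocks, and ra + p_size is further capped at
 * BCACHE_MAXIOWRA (512) blocks per transfer.
 */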
/*
 * Requests larger than 1/2 cache size will be bypassed and go
 * directly to the disk. XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	u_int bcache_nblks = 0;
	int nblk, cblk, ret;
	size_t csize, isize, total;

	bcache_ops++;

	if (bc != NULL)
		bcache_nblks = bc->bcache_nblks;

	/* bypass large requests, or when the cache is inactive */
	if (bc == NULL ||
	    ((size * 2 / bcache_blksize) > bcache_nblks)) {
		DPRINTF("bypass %zu from %qu", size / bcache_blksize, blk);
		bcache_bypasses++;
		rw &= F_MASK;
		return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf,
		    rsize));
	}

	switch (rw & F_MASK) {
	case F_READ:
		nblk = size / bcache_blksize;
		if (size != 0 && nblk == 0)
			nblk++;	/* read at least one block */

		ret = 0;
		total = 0;
		while (size) {
			cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */
			cblk = MIN(cblk, nblk);

			if (size <= bcache_blksize)
				csize = size;
			else
				csize = cblk * bcache_blksize;

			ret = read_strategy(devdata, rw, blk, csize,
			    buf + total, &isize);

			/*
			 * We may have an error from read ahead; if we have
			 * read some data, return a partial read.
			 */
			if (ret != 0 || isize == 0) {
				if (total != 0)
					ret = 0;
				break;
			}
			blk += isize / bcache_blksize;
			total += isize;
			size -= isize;
			nblk = size / bcache_blksize;
		}

		if (rsize)
			*rsize = total;

		return (ret);
	case F_WRITE:
		return (write_strategy(devdata, F_WRITE, blk, size, buf,
		    rsize));
	}
	return (-1);
}
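/*
 * Bypass threshold example for the check in bcache_strategy() above
 * (numbers illustrative): with a per-unit cache of 2048 blocks of
 * 512 bytes (1 MiB), any request larger than half of that, i.e. over
 * 512 KiB, goes straight to the device and never touches the cache.
 */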
/*
 * Free allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
	if (bc != NULL) {
		free(bc->bcache_ctl);
		free(bc->bcache_data);
		free(bc);
	}
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
	u_int	cand;

	cand = BHASH(bc, blkno);

	DPRINTF("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount);
	bc->bcache_ctl[cand].bc_blkno = blkno;
	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}

/*
 * Invalidate a block from the cache.
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
	u_int	i;

	i = BHASH(bc, blkno);
	if (bc->bcache_ctl[i].bc_blkno == blkno) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
		DPRINTF("invalidate blk %llu", blkno);
	}
}

#ifndef BOOT2
COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);

static int
command_bcache(int argc, char *argv[] __unused)
{
	if (argc != 1) {
		command_errmsg = "wrong number of arguments";
		return (CMD_ERROR);
	}

	printf("\ncache blocks: %u\n", bcache_total_nblks);
	printf("cache blocksz: %u\n", bcache_blksize);
	printf("cache readahead: %u\n", bcache_rablks);
	printf("unit cache blocks: %u\n", bcache_unit_nblks);
	printf("cached units: %u\n", bcache_units);
	printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops,
	    bcache_bypasses, bcache_hits, bcache_misses);
	return (CMD_OK);
}
#endif
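/*
 * Usage sketch (illustrative; the mydisk_* names are hypothetical, only
 * struct bcache_devdata and the bcache_* calls come from this API):
 * a driver allocates a per-device cache on first open, frees it on last
 * close, and routes all I/O through bcache_strategy(), which consults
 * the cache for reads and invalidates it on writes:
 *
 *	struct bcache_devdata bcd;
 *
 *	bcd.dv_strategy = mydisk_raw_strategy;	// raw device I/O routine
 *	bcd.dv_devdata = mydisk_softc;		// driver's own state
 *	bcd.dv_cache = bcache_allocate();	// on first open
 *	...
 *	bcache_strategy(&bcd, F_READ, blkno, size, buf, &rsize);
 *	...
 *	bcache_free(bcd.dv_cache);		// on last close
 */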