/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
/*
 * Simple hashed block cache
 */

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
# define DPRINTF(fmt, args...)  printf("%s: " fmt "\n", __func__, ## args)
#else
# define DPRINTF(fmt, args...)  ((void)0)
#endif

struct bcachectl
{
    daddr_t bc_blkno;
    int     bc_count;
};

/*
 * Per-device-node bcache. The cache is allocated on a device's first open
 * and freed on its last close, to save memory. The issue there is the size:
 * biosdisk supports up to 31 (0x1f) devices. A classic setup would boot
 * from a single disk, but this has changed with zfs.
 */
struct bcache {
    struct bcachectl *bcache_ctl;
    caddr_t          bcache_data;
    size_t           bcache_nblks;
    size_t           ra;
    daddr_t          bcache_nextblkno;
    size_t           ralen;
};

static u_int bcache_total_nblks;    /* set by bcache_init */
static u_int bcache_blksize;        /* set by bcache_init */
static u_int bcache_numdev;         /* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;          /* number of devices with cache */
static u_int bcache_unit_nblks;     /* nblocks per unit */
static u_int bcache_hits;
static u_int bcache_misses;
static u_int bcache_ops;
static u_int bcache_bypasses;
static u_int bcache_bcount;
static u_int bcache_rablks;

#define BHASH(bc, blkno)        ((blkno) & ((bc)->bcache_nblks - 1))
#define BCACHE_LOOKUP(bc, blkno) \
    ((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define BCACHE_READAHEAD        512
#define BCACHE_MINREADAHEAD     32
#define BCACHE_MAXIOWRA         512

static void bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void bcache_insert(struct bcache *bc, daddr_t blkno);
static void bcache_free_instance(struct bcache *bc);
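
/*
 * The cache is direct-mapped: BHASH() masks a block number down to its
 * single possible slot (which is why a unit's bcache_nblks must be a power
 * of two), and BCACHE_LOOKUP() is true when that slot currently holds some
 * other block, i.e. on a miss.
 *
 * Illustrative use by a device driver (the mydisk_* names are
 * hypothetical): allocate a cache on first open and route I/O through
 * bcache_strategy() via a struct bcache_devdata, roughly:
 *
 *    struct bcache_devdata bcd;
 *
 *    bcd.dv_strategy = mydisk_realstrategy;  -- the driver's raw strategy
 *    bcd.dv_devdata  = mydisk_softc;         -- the driver's private data
 *    bcd.dv_cache    = bcache_allocate();
 *    ...
 *    bcache_strategy(&bcd, F_READ, blkno, size, buf, &rsize);
 */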

/*
 * Initialise the cache for (nblks) blocks of (bsize) bytes.
 */
void
bcache_init(size_t nblks, size_t bsize)
{
    /* set up control data */
    bcache_total_nblks = nblks;
    bcache_blksize = bsize;
}

/*
 * Register a number of devices with the bcache. The cache space has to be
 * divided between the devices, so bcache_add_dev() is used to build up the
 * count; we need the total before any actual allocation happens.
 * bcache_add_dev() is supposed to be called from a device's init() routine,
 * so the assumption is that devsw dv_init is called for plain devices first
 * and for zfs last.
 */
void
bcache_add_dev(int devices)
{
    bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
    u_int i;
    struct bcache *bc = malloc(sizeof (struct bcache));
    int disks = bcache_numdev;

    if (disks == 0)
        disks = 1;              /* safeguard */

    if (bc == NULL) {
        errno = ENOMEM;
        return (bc);
    }

    /*
     * The per-unit block count must be a power of 2 for the hash function.
     */
    i = fls(disks) - 1;         /* highbit - 1 */
    if (disks > (1 << i))       /* next power of 2 */
        i++;

    bc->bcache_nblks = bcache_total_nblks >> i;
    bcache_unit_nblks = bc->bcache_nblks;
    bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
    if (bc->bcache_data == NULL) {
        /* don't error out yet; fall back to 32 blocks and try again */
        bc->bcache_nblks = 32;
        bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
            sizeof(uint32_t));
    }

    bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));

    if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
        bcache_free_instance(bc);
        errno = ENOMEM;
        return (NULL);
    }

    /* Flush the cache */
    for (i = 0; i < bc->bcache_nblks; i++) {
        bc->bcache_ctl[i].bc_count = -1;
        bc->bcache_ctl[i].bc_blkno = -1;
    }
    bcache_units++;
    bc->ra = BCACHE_READAHEAD;  /* optimistic read ahead */
    bc->bcache_nextblkno = -1;
    return (bc);
}

void
bcache_free(void *cache)
{
    struct bcache *bc = cache;

    if (bc == NULL)
        return;

    bcache_free_instance(bc);
    bcache_units--;
}

/*
 * Handle a write request; invalidate any cached copies of the blocks being
 * written, then write straight through to the disk.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
    struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
    struct bcache *bc = dd->dv_cache;
    daddr_t i, nblk;

    nblk = size / bcache_blksize;

    /* Invalidate the blocks being written */
    for (i = 0; i < nblk; i++) {
        bcache_invalidate(bc, blk + i);
    }

    /* Write the blocks */
    return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
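 * Missing blocks are fetched with a single call to the underlying strategy
 * routine (plus optional read-ahead), and whatever that call actually
 * returns is inserted into the cache, so a short read still caches and
 * returns the data that did arrive.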
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
    struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
    struct bcache *bc = dd->dv_cache;
    size_t i, nblk, p_size, r_size, complete, ra;
    int result;
    daddr_t p_blk;
    caddr_t p_buf;

    if (bc == NULL) {
        errno = ENODEV;
        return (-1);
    }

    if (rsize != NULL)
        *rsize = 0;

    nblk = size / bcache_blksize;
    if (nblk == 0 && size != 0)
        nblk++;
    result = 0;
    complete = 1;

    /* Satisfy any cache hits up front, break on first miss */
    for (i = 0; i < nblk; i++) {
        if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
            bcache_misses += (nblk - i);
            complete = 0;
            break;
        } else {
            bcache_hits++;
        }
    }

    /*
     * Adjust the read-ahead size if appropriate. Subject to the requirement
     * that bc->ra must stay between BCACHE_MINREADAHEAD and BCACHE_READAHEAD,
     * we increase it when we notice that read-ahead was useful and decrease
     * it when we notice that read-ahead was not useful.
     */
    if (complete || (i == bc->ralen && bc->ralen > 0)) {
        if (bc->ra < BCACHE_READAHEAD)
            bc->ra <<= 1;       /* increase read ahead */
    } else {
        if (nblk - i > BCACHE_MINREADAHEAD && bc->ralen > 0 &&
            bc->ra > BCACHE_MINREADAHEAD)
            bc->ra >>= 1;       /* reduce read ahead */
    }

    /* Adjust our "unconsumed read-ahead" value. */
    if (blk == bc->bcache_nextblkno) {
        if (nblk > bc->ralen)
            bc->ralen = 0;
        else
            bc->ralen -= nblk;
    }

    if (complete) {             /* whole set was in cache, return it */
        bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
        goto done;
    }

    /*
     * Fill in any misses. From the check above, i points at the first
     * missing block; read in all remaining blocks plus read-ahead.
     * We have space for at least nblk - i blocks before the bcache wraps.
     */
    p_blk = blk + i;
    p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
    r_size = bc->bcache_nblks - BHASH(bc, p_blk);   /* remaining blocks */

    p_size = MIN(r_size, nblk - i);     /* read at least those blocks */

    /*
     * The read-ahead size setup.
     * While read-ahead can save us I/O, it can also complicate things:
     * 1. We do not want the read-ahead to wrap around the end of the
     *    bcache - that would complicate the cache management.
     * 2. We use bc->ra as a dynamic hint for the read-ahead size: detected
     *    cache hits increase the read-ahead block count and misses decrease
     *    it, see the code above.
     * 3. The bcache is sized in 512B blocks, but the underlying device may
     *    have a larger sector size, and we should perform the I/O with those
     *    larger sectors in mind. We could solve this by passing the sector
     *    size to bcache_allocate(), or by using an ioctl(), but in this
     *    version we use a constant, 16 blocks, and round the read-ahead
     *    block count down to a multiple of 16. There are two reasons for
     *    using the constant: we are not entirely sure the BIOS disk
     *    interface provides the correct sector size, and this way we get
     *    the most conservative setup for the read-ahead.
     *
     * The choice of a multiple of 16 blocks (8KB) is fairly arbitrary, but
     * it covers both CDs (2K sectors) and 4K disks.
     * bcache_allocate() always falls back to a minimum of 32 blocks, so a
     * 16-block read-ahead unit will always fit inside the bcache.
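     *
     * As a worked example of the logic below: with a 256-block unit cache,
     * a demand read of 8 blocks starting at slot 200 ends just before slot
     * 208, so ra = 256 - 208 = 48 free slots remain before the cache wraps.
     * With bc->ra == 128 that becomes MIN(128, 47) = 47, rounded down to 32,
     * and since 32 + 8 is well under BCACHE_MAXIOWRA we issue one 40-block
     * I/O: 8 demand blocks plus 32 read-ahead blocks.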
     */

    if ((rw & F_NORA) == F_NORA)
        ra = 0;
    else
        ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);

    /*
     * Only trigger read-ahead if we detect two blocks being read
     * sequentially.
     */
    if ((bc->bcache_nextblkno != blk) && ra != 0) {
        ra = 0;
    }

    if (ra != 0 && ra != bc->bcache_nblks) {    /* do we have RA space? */
        ra = MIN(bc->ra, ra - 1);
        ra = rounddown(ra, 16);                 /* multiple of 16 blocks */
        if (ra + p_size > BCACHE_MAXIOWRA)
            ra = BCACHE_MAXIOWRA - p_size;
        bc->ralen = ra;
        p_size += ra;
    } else {
        bc->ralen = 0;
    }

    /* Invalidate the blocks we are about to (re)read into the cache. */
    for (i = 0; i < p_size; i++) {
        bcache_invalidate(bc, p_blk + i);
    }

    r_size = 0;
    /*
     * With read-ahead, we may attempt to read past the end of the disk,
     * since the bcache has no information about the disk size. In that
     * case we should get a partial read if some blocks could be read, or
     * an error if none could. Either way we return the data that did make
     * it into the bcache and only report an error if there is no data.
     */
    rw &= F_MASK;
    result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
        p_size * bcache_blksize, p_buf, &r_size);

    r_size /= bcache_blksize;
    for (i = 0; i < r_size; i++)
        bcache_insert(bc, p_blk + i);

    /* update read-ahead statistics */
    if (r_size != 0) {
        if (r_size < p_size)
            bcache_rablks += (p_size - r_size);
        else
            bcache_rablks += ra;
    }

    /* check how much data we can copy */
    for (i = 0; i < nblk; i++) {
        if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
            break;
    }

    if (size > i * bcache_blksize)
        size = i * bcache_blksize;

    if (size != 0) {
        bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
        result = 0;
    }

done:
    if (result == 0) {
        if (rsize != NULL)
            *rsize = size;
        bc->bcache_nextblkno = blk + (size / DEV_BSIZE);
    }
    return (result);
}

/*
 * Requests larger than 1/2 the cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
    struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
    struct bcache *bc = dd->dv_cache;
    u_int bcache_nblks = 0;
    int nblk, cblk, ret;
    size_t csize, isize, total;

    bcache_ops++;

    if (bc != NULL)
        bcache_nblks = bc->bcache_nblks;

    /* bypass large requests, or when the cache is inactive */
    if (bc == NULL ||
        ((size * 2 / bcache_blksize) > bcache_nblks)) {
        DPRINTF("bypass %zu from %jd", size / bcache_blksize, blk);
        bcache_bypasses++;
        rw &= F_MASK;
        return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
    }

    switch (rw & F_MASK) {
    case F_READ:
        nblk = size / bcache_blksize;
        if (size != 0 && nblk == 0)
            nblk++;             /* read at least one block */

        ret = 0;
        total = 0;
        while (size) {
            cblk = bcache_nblks - BHASH(bc, blk);   /* # of blocks left */
            cblk = MIN(cblk, nblk);

            if (size <= bcache_blksize)
                csize = size;
            else
                csize = cblk * bcache_blksize;

            ret = read_strategy(devdata, rw, blk, csize, buf + total, &isize);

            /*
             * We may get an error back from the read-ahead; if we have
             * already read some data, return a partial read.
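             * In that case ret is reset to 0 below and the caller sees
             * rsize equal to the bytes already transferred.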
             */
            if (ret != 0 || isize == 0) {
                if (total != 0)
                    ret = 0;
                break;
            }
            blk += isize / bcache_blksize;
            total += isize;
            size -= isize;
            nblk = size / bcache_blksize;
        }

        if (rsize)
            *rsize = total;

        return (ret);
    case F_WRITE:
        return (write_strategy(devdata, F_WRITE, blk, size, buf, rsize));
    }
    return (-1);
}

/*
 * Free an allocated bcache instance.
 */
static void
bcache_free_instance(struct bcache *bc)
{
    if (bc != NULL) {
        free(bc->bcache_ctl);
        free(bc->bcache_data);
        free(bc);
    }
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
    u_int cand;

    cand = BHASH(bc, blkno);

    DPRINTF("insert blk %jd -> %u # %d", blkno, cand, bcache_bcount);
    bc->bcache_ctl[cand].bc_blkno = blkno;
    bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}

/*
 * Invalidate a block from the cache.
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
    u_int i;

    i = BHASH(bc, blkno);
    if (bc->bcache_ctl[i].bc_blkno == blkno) {
        bc->bcache_ctl[i].bc_count = -1;
        bc->bcache_ctl[i].bc_blkno = -1;
        DPRINTF("invalidate blk %ju", blkno);
    }
}

#ifndef BOOT2
COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);

static int
command_bcache(int argc, char *argv[] __unused)
{
    if (argc != 1) {
        command_errmsg = "wrong number of arguments";
        return (CMD_ERROR);
    }

    printf("\ncache blocks: %u\n", bcache_total_nblks);
    printf("cache blocksz: %u\n", bcache_blksize);
    printf("cache readahead: %u\n", bcache_rablks);
    printf("unit cache blocks: %u\n", bcache_unit_nblks);
    printf("cached units: %u\n", bcache_units);
    printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops,
        bcache_bypasses, bcache_hits, bcache_misses);
    return (CMD_OK);
}
#endif