1 /*- 2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3 * Copyright 2015 Toomas Soome <tsoome@me.com> 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/param.h> 29 /* 30 * Simple hashed block cache 31 */ 32 33 #include <sys/stdint.h> 34 35 #include <stand.h> 36 #include <string.h> 37 #include <strings.h> 38 39 #include "bootstrap.h" 40 41 /* #define BCACHE_DEBUG */ 42 43 #ifdef BCACHE_DEBUG 44 # define DPRINTF(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) 45 #else 46 # define DPRINTF(fmt, args...) ((void)0) 47 #endif 48 49 struct bcachectl 50 { 51 daddr_t bc_blkno; 52 int bc_count; 53 }; 54 55 /* 56 * bcache per device node. cache is allocated on device first open and freed 57 * on last close, to save memory. The issue there is the size; biosdisk 58 * supports up to 31 (0x1f) devices. Classic setup would use single disk 59 * to boot from, but this has changed with zfs. 60 */ 61 struct bcache { 62 struct bcachectl *bcache_ctl; 63 caddr_t bcache_data; 64 size_t bcache_nblks; 65 size_t ra; 66 daddr_t bcache_nextblkno; 67 size_t ralen; 68 }; 69 70 static u_int bcache_total_nblks; /* set by bcache_init */ 71 static u_int bcache_blksize; /* set by bcache_init */ 72 static u_int bcache_numdev; /* set by bcache_add_dev */ 73 /* statistics */ 74 static u_int bcache_units; /* number of devices with cache */ 75 static u_int bcache_unit_nblks; /* nblocks per unit */ 76 static u_int bcache_hits; 77 static u_int bcache_misses; 78 static u_int bcache_ops; 79 static u_int bcache_bypasses; 80 static u_int bcache_bcount; 81 static u_int bcache_rablks; 82 83 #define BHASH(bc, blkno) ((blkno) & ((bc)->bcache_nblks - 1)) 84 #define BCACHE_LOOKUP(bc, blkno) \ 85 ((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno)) 86 #define BCACHE_READAHEAD 512 87 #define BCACHE_MINREADAHEAD 32 88 #define BCACHE_MAXIOWRA 512 89 90 static void bcache_invalidate(struct bcache *bc, daddr_t blkno); 91 static void bcache_insert(struct bcache *bc, daddr_t blkno); 92 static void bcache_free_instance(struct bcache *bc); 93 94 /* 95 * Initialise the cache for (nblks) of (bsize). 96 */ 97 void 98 bcache_init(size_t nblks, size_t bsize) 99 { 100 /* set up control data */ 101 bcache_total_nblks = nblks; 102 bcache_blksize = bsize; 103 } 104 105 /* 106 * add number of devices to bcache. we have to divide cache space 107 * between the devices, so bcache_add_dev() can be used to set up the 108 * number. The issue is, we need to get the number before actual allocations. 109 * bcache_add_dev() is supposed to be called from device init() call, so the 110 * assumption is, devsw dv_init is called for plain devices first, and 111 * for zfs, last. 112 */ 113 void 114 bcache_add_dev(int devices) 115 { 116 bcache_numdev += devices; 117 } 118 119 void * 120 bcache_allocate(void) 121 { 122 u_int i; 123 struct bcache *bc = malloc(sizeof (struct bcache)); 124 int disks = bcache_numdev; 125 126 if (disks == 0) 127 disks = 1; /* safe guard */ 128 129 if (bc == NULL) { 130 errno = ENOMEM; 131 return (bc); 132 } 133 134 /* 135 * the bcache block count must be power of 2 for hash function 136 */ 137 i = fls(disks) - 1; /* highbit - 1 */ 138 if (disks > (1 << i)) /* next power of 2 */ 139 i++; 140 141 bc->bcache_nblks = bcache_total_nblks >> i; 142 bcache_unit_nblks = bc->bcache_nblks; 143 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize); 144 if (bc->bcache_data == NULL) { 145 /* dont error out yet. fall back to 32 blocks and try again */ 146 bc->bcache_nblks = 32; 147 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 148 sizeof(uint32_t)); 149 } 150 151 bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl)); 152 153 if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) { 154 bcache_free_instance(bc); 155 errno = ENOMEM; 156 return (NULL); 157 } 158 159 /* Flush the cache */ 160 for (i = 0; i < bc->bcache_nblks; i++) { 161 bc->bcache_ctl[i].bc_count = -1; 162 bc->bcache_ctl[i].bc_blkno = -1; 163 } 164 bcache_units++; 165 bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */ 166 bc->bcache_nextblkno = -1; 167 return (bc); 168 } 169 170 void 171 bcache_free(void *cache) 172 { 173 struct bcache *bc = cache; 174 175 if (bc == NULL) 176 return; 177 178 bcache_free_instance(bc); 179 bcache_units--; 180 } 181 182 /* 183 * Handle a write request; write directly to the disk, and populate the 184 * cache with the new values. 185 */ 186 static int 187 write_strategy(void *devdata, int rw, daddr_t blk, size_t size, 188 char *buf, size_t *rsize) 189 { 190 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 191 struct bcache *bc = dd->dv_cache; 192 daddr_t i, nblk; 193 194 nblk = size / bcache_blksize; 195 196 /* Invalidate the blocks being written */ 197 for (i = 0; i < nblk; i++) { 198 bcache_invalidate(bc, blk + i); 199 } 200 201 /* Write the blocks */ 202 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 203 } 204 205 /* 206 * Handle a read request; fill in parts of the request that can 207 * be satisfied by the cache, use the supplied strategy routine to do 208 * device I/O and then use the I/O results to populate the cache. 209 */ 210 static int 211 read_strategy(void *devdata, int rw, daddr_t blk, size_t size, 212 char *buf, size_t *rsize) 213 { 214 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 215 struct bcache *bc = dd->dv_cache; 216 size_t i, nblk, p_size, r_size, complete, ra; 217 int result; 218 daddr_t p_blk; 219 caddr_t p_buf; 220 221 if (bc == NULL) { 222 errno = ENODEV; 223 return (-1); 224 } 225 226 if (rsize != NULL) 227 *rsize = 0; 228 229 nblk = size / bcache_blksize; 230 if (nblk == 0 && size != 0) 231 nblk++; 232 result = 0; 233 complete = 1; 234 235 /* Satisfy any cache hits up front, break on first miss */ 236 for (i = 0; i < nblk; i++) { 237 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) { 238 bcache_misses += (nblk - i); 239 complete = 0; 240 break; 241 } else { 242 bcache_hits++; 243 } 244 } 245 246 /* 247 * Adjust read-ahead size if appropriate. Subject to the requirement 248 * that bc->ra must stay in between MINREADAHEAD and READAHEAD, we 249 * increase it when we notice that readahead was useful and decrease 250 * it when we notice that readahead was not useful. 251 */ 252 if (complete || (i == bc->ralen && bc->ralen > 0)) { 253 if (bc->ra < BCACHE_READAHEAD) 254 bc->ra <<= 1; /* increase read ahead */ 255 } else { 256 if (nblk - i > BCACHE_MINREADAHEAD && bc->ralen > 0 && 257 bc->ra > BCACHE_MINREADAHEAD) 258 bc->ra >>= 1; /* reduce read ahead */ 259 } 260 261 /* Adjust our "unconsumed readahead" value. */ 262 if (blk == bc->bcache_nextblkno) { 263 if (nblk > bc->ralen) 264 bc->ralen = 0; 265 else 266 bc->ralen -= nblk; 267 } 268 269 if (complete) { /* whole set was in cache, return it */ 270 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 271 goto done; 272 } 273 274 /* 275 * Fill in any misses. From check we have i pointing to first missing 276 * block, read in all remaining blocks + readahead. 277 * We have space at least for nblk - i before bcache wraps. 278 */ 279 p_blk = blk + i; 280 p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk)); 281 r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */ 282 283 p_size = MIN(r_size, nblk - i); /* read at least those blocks */ 284 285 /* 286 * The read ahead size setup. 287 * While the read ahead can save us IO, it also can complicate things: 288 * 1. We do not want to read ahead by wrapping around the 289 * bcache end - this would complicate the cache management. 290 * 2. We are using bc->ra as dynamic hint for read ahead size, 291 * detected cache hits will increase the read-ahead block count, and 292 * misses will decrease, see the code above. 293 * 3. The bcache is sized by 512B blocks, however, the underlying device 294 * may have a larger sector size, and we should perform the IO by 295 * taking into account these larger sector sizes. We could solve this by 296 * passing the sector size to bcache_allocate(), or by using ioctl(), but 297 * in this version we are using the constant, 16 blocks, and are rounding 298 * read ahead block count down to multiple of 16. 299 * Using the constant has two reasons, we are not entirely sure if the 300 * BIOS disk interface is providing the correct value for sector size. 301 * And secondly, this way we get the most conservative setup for the ra. 302 * 303 * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however, 304 * we want to cover CDs (2K) and 4K disks. 305 * bcache_allocate() will always fall back to a minimum of 32 blocks. 306 * Our choice of 16 read ahead blocks will always fit inside the bcache. 307 */ 308 309 if ((rw & F_NORA) == F_NORA) 310 ra = 0; 311 else 312 ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size); 313 314 /* 315 * Only trigger read-ahead if we detect two blocks being read 316 * sequentially. 317 */ 318 if ((bc->bcache_nextblkno != blk) && ra != 0) { 319 ra = 0; 320 } 321 322 if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */ 323 ra = MIN(bc->ra, ra - 1); 324 ra = rounddown(ra, 16); /* multiple of 16 blocks */ 325 if (ra + p_size > BCACHE_MAXIOWRA) 326 ra = BCACHE_MAXIOWRA - p_size; 327 bc->ralen = ra; 328 p_size += ra; 329 } else { 330 bc->ralen = 0; 331 } 332 333 /* invalidate bcache */ 334 for (i = 0; i < p_size; i++) { 335 bcache_invalidate(bc, p_blk + i); 336 } 337 338 r_size = 0; 339 /* 340 * with read-ahead, it may happen we are attempting to read past 341 * disk end, as bcache has no information about disk size. 342 * in such case we should get partial read if some blocks can be 343 * read or error, if no blocks can be read. 344 * in either case we should return the data in bcache and only 345 * return error if there is no data. 346 */ 347 rw &= F_MASK; 348 result = dd->dv_strategy(dd->dv_devdata, rw, p_blk, 349 p_size * bcache_blksize, p_buf, &r_size); 350 351 r_size /= bcache_blksize; 352 for (i = 0; i < r_size; i++) 353 bcache_insert(bc, p_blk + i); 354 355 /* update ra statistics */ 356 if (r_size != 0) { 357 if (r_size < p_size) 358 bcache_rablks += (p_size - r_size); 359 else 360 bcache_rablks += ra; 361 } 362 363 /* check how much data can we copy */ 364 for (i = 0; i < nblk; i++) { 365 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) 366 break; 367 } 368 369 if (size > i * bcache_blksize) 370 size = i * bcache_blksize; 371 372 if (size != 0) { 373 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 374 result = 0; 375 } 376 377 done: 378 if (result == 0) { 379 if (rsize != NULL) 380 *rsize = size; 381 bc->bcache_nextblkno = blk + (size / DEV_BSIZE); 382 } 383 return(result); 384 } 385 386 /* 387 * Requests larger than 1/2 cache size will be bypassed and go 388 * directly to the disk. XXX tune this. 389 */ 390 int 391 bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size, 392 char *buf, size_t *rsize) 393 { 394 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 395 struct bcache *bc = dd->dv_cache; 396 u_int bcache_nblks = 0; 397 int nblk, cblk, ret; 398 size_t csize, isize, total; 399 400 bcache_ops++; 401 402 if (bc != NULL) 403 bcache_nblks = bc->bcache_nblks; 404 405 /* bypass large requests, or when the cache is inactive */ 406 if (bc == NULL || 407 ((size * 2 / bcache_blksize) > bcache_nblks)) { 408 DPRINTF("bypass %zu from %jd", size / bcache_blksize, blk); 409 bcache_bypasses++; 410 rw &= F_MASK; 411 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 412 } 413 414 switch (rw & F_MASK) { 415 case F_READ: 416 nblk = size / bcache_blksize; 417 if (size != 0 && nblk == 0) 418 nblk++; /* read at least one block */ 419 420 ret = 0; 421 total = 0; 422 while(size) { 423 cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */ 424 cblk = MIN(cblk, nblk); 425 426 if (size <= bcache_blksize) 427 csize = size; 428 else 429 csize = cblk * bcache_blksize; 430 431 ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize); 432 433 /* 434 * we may have error from read ahead, if we have read some data 435 * return partial read. 436 */ 437 if (ret != 0 || isize == 0) { 438 if (total != 0) 439 ret = 0; 440 break; 441 } 442 blk += isize / bcache_blksize; 443 total += isize; 444 size -= isize; 445 nblk = size / bcache_blksize; 446 } 447 448 if (rsize) 449 *rsize = total; 450 451 return (ret); 452 case F_WRITE: 453 return write_strategy(devdata, F_WRITE, blk, size, buf, rsize); 454 } 455 return -1; 456 } 457 458 /* 459 * Free allocated bcache instance 460 */ 461 static void 462 bcache_free_instance(struct bcache *bc) 463 { 464 if (bc != NULL) { 465 free(bc->bcache_ctl); 466 free(bc->bcache_data); 467 free(bc); 468 } 469 } 470 471 /* 472 * Insert a block into the cache. 473 */ 474 static void 475 bcache_insert(struct bcache *bc, daddr_t blkno) 476 { 477 u_int cand; 478 479 cand = BHASH(bc, blkno); 480 481 DPRINTF("insert blk %jd -> %u # %d", blkno, cand, bcache_bcount); 482 bc->bcache_ctl[cand].bc_blkno = blkno; 483 bc->bcache_ctl[cand].bc_count = bcache_bcount++; 484 } 485 486 /* 487 * Invalidate a block from the cache. 488 */ 489 static void 490 bcache_invalidate(struct bcache *bc, daddr_t blkno) 491 { 492 u_int i; 493 494 i = BHASH(bc, blkno); 495 if (bc->bcache_ctl[i].bc_blkno == blkno) { 496 bc->bcache_ctl[i].bc_count = -1; 497 bc->bcache_ctl[i].bc_blkno = -1; 498 DPRINTF("invalidate blk %ju", blkno); 499 } 500 } 501 502 #ifndef BOOT2 503 COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache); 504 505 static int 506 command_bcache(int argc, char *argv[] __unused) 507 { 508 if (argc != 1) { 509 command_errmsg = "wrong number of arguments"; 510 return(CMD_ERROR); 511 } 512 513 printf("\ncache blocks: %u\n", bcache_total_nblks); 514 printf("cache blocksz: %u\n", bcache_blksize); 515 printf("cache readahead: %u\n", bcache_rablks); 516 printf("unit cache blocks: %u\n", bcache_unit_nblks); 517 printf("cached units: %u\n", bcache_units); 518 printf("%u ops %d bypasses %u hits %u misses\n", bcache_ops, 519 bcache_bypasses, bcache_hits, bcache_misses); 520 return(CMD_OK); 521 } 522 #endif 523