/*
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>

/*
 * Simple hashed block cache
 */

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
#define	DPRINTF(fmt, args...)	printf("%s: " fmt "\n", __func__, ## args)
#else
#define	DPRINTF(fmt, args...)	((void)0)
#endif

struct bcachectl
{
	daddr_t	bc_blkno;
	int	bc_count;
};

/*
 * bcache per device node. The cache is allocated on the first open of a
 * device and freed on its last close, to save memory. The concern there
 * is size: biosdisk supports up to 31 (0x1f) devices. The classic setup
 * would boot from a single disk, but this has changed with zfs.
 */
struct bcache {
	struct bcachectl	*bcache_ctl;
	caddr_t			bcache_data;
	size_t			bcache_nblks;
	size_t			ra;
};

static uint_t bcache_total_nblks;	/* set by bcache_init */
static uint_t bcache_blksize;		/* set by bcache_init */
static uint_t bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static uint_t bcache_units;		/* number of devices with cache */
static uint_t bcache_unit_nblks;	/* nblocks per unit */
static uint_t bcache_hits;
static uint_t bcache_misses;
static uint_t bcache_ops;
static uint_t bcache_bypasses;
static uint_t bcache_bcount;
static uint_t bcache_rablks;

/* the block count is a power of 2, so hashing reduces to masking */
#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
/* true when (blkno) is NOT resident in its hash slot */
#define	BCACHE_LOOKUP(bc, blkno)	\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	256
#define	BCACHE_MINREADAHEAD	32

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);

/*
 * Initialise the cache for (nblks) blocks of (bsize) bytes each.
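 *
 * Only the global sizing is recorded here; the per-unit data and control
 * arrays are allocated later by bcache_allocate(). Purely as an
 * illustration (the numbers are hypothetical, not mandated by this file),
 * a loader dedicating 64kB of 512-byte blocks to the cache would call:
 *
 *	bcache_init(128, 512);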
 */
void
bcache_init(size_t nblks, size_t bsize)
{
	/* set up control data */
	bcache_total_nblks = nblks;
	bcache_blksize = bsize;
}

/*
 * Add a number of devices to the bcache. We have to divide the cache
 * space between the devices, so bcache_add_dev() is used to establish
 * that number. The issue is that we need the number before the actual
 * allocations. bcache_add_dev() is supposed to be called from the device
 * init() routine, so the assumption is that devsw dv_init is called for
 * plain devices first, and for zfs last.
 */
void
bcache_add_dev(int devices)
{
	bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
	uint_t i;
	struct bcache *bc = malloc(sizeof (struct bcache));
	int disks = bcache_numdev;

	if (disks == 0)
		disks = 1;	/* safeguard */

	if (bc == NULL) {
		errno = ENOMEM;
		return (bc);
	}

	/*
	 * the bcache block count must be a power of 2 for the hash function
	 */
	i = fls(disks) - 1;	/* highbit - 1 */
	if (disks > (1 << i))	/* next power of 2 */
		i++;

	bc->bcache_nblks = bcache_total_nblks >> i;
	bcache_unit_nblks = bc->bcache_nblks;
	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
	if (bc->bcache_data == NULL) {
		/* don't error out yet; fall back to 32 blocks and try again */
		bc->bcache_nblks = 32;
		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
		    sizeof (uint32_t));
	}

	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof (struct bcachectl));

	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
		bcache_free_instance(bc);
		errno = ENOMEM;
		return (NULL);
	}

	/* Flush the cache */
	for (i = 0; i < bc->bcache_nblks; i++) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
	}
	bcache_units++;
	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
	return (bc);
}

void
bcache_free(void *cache)
{
	struct bcache *bc = cache;

	if (bc == NULL)
		return;

	bcache_free_instance(bc);
	bcache_units--;
}

/*
 * Handle a write request; invalidate any cached copies of the blocks
 * being written, then write directly through to the disk.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	daddr_t i, nblk;

	nblk = size / bcache_blksize;

	/* Invalidate the blocks being written */
	for (i = 0; i < nblk; i++) {
		bcache_invalidate(bc, blk + i);
	}

	/* Write the blocks */
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
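 *
 * The flow is: count leading cache hits and stop at the first miss; if
 * everything was resident, copy it out, otherwise issue one device read
 * covering the missing blocks plus a bounded read ahead, insert whatever
 * was actually read, and copy out the now-cached prefix of the request.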
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = devdata;
	struct bcache *bc = dd->dv_cache;
	size_t i, nblk, p_size, r_size, complete, ra;
	int result;
	daddr_t p_blk;
	caddr_t p_buf;

	if (bc == NULL) {
		errno = ENODEV;
		return (-1);
	}

	if (rsize != NULL)
		*rsize = 0;

	nblk = size / bcache_blksize;
	if (nblk == 0 && size != 0)
		nblk++;
	result = 0;
	complete = 1;

	/* Satisfy any cache hits up front, break on first miss */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
			bcache_misses += (nblk - i);
			complete = 0;
			if (nblk - i > BCACHE_MINREADAHEAD &&
			    bc->ra > BCACHE_MINREADAHEAD)
				bc->ra >>= 1;	/* reduce read ahead */
			break;
		} else {
			bcache_hits++;
		}
	}

	if (complete) {	/* whole set was in cache, return it */
		if (bc->ra < BCACHE_READAHEAD)
			bc->ra <<= 1;	/* increase read ahead */
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		goto done;
	}

	/*
	 * Fill in any misses. From the check above, i points to the first
	 * missing block; read in all remaining blocks plus the readahead.
	 * We have space for at least nblk - i blocks before the bcache
	 * wraps.
	 */
	p_blk = blk + i;
	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
	r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */

	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */

	/*
	 * Set up the read ahead size.
	 * While the read ahead can save us IO, it can also complicate
	 * things:
	 * 1. We do not want to read ahead by wrapping around the
	 *    bcache end - this would complicate the cache management.
	 * 2. We are using bc->ra as a dynamic hint for the read ahead size;
	 *    detected cache hits will increase the read-ahead block count
	 *    and misses will decrease it, see the code above.
	 * 3. The bcache is sized in 512B blocks, but the underlying device
	 *    may have a larger sector size, and we should perform the IO
	 *    with those larger sector sizes in mind. We could solve this by
	 *    passing the sector size to bcache_allocate(), or by using
	 *    ioctl(), but in this version we use a constant, 16 blocks, and
	 *    round the read ahead block count down to a multiple of 16.
	 *    There are two reasons for the constant: we are not entirely
	 *    sure the BIOS disk interface provides the correct value for
	 *    the sector size, and this way we get the most conservative
	 *    setup for the ra.
	 *
	 * The selection of a multiple of 16 blocks (8KB) is quite
	 * arbitrary; however, we want to cover CDs (2K) and 4K disks.
	 * bcache_allocate() will always fall back to a minimum of 32
	 * blocks, so our choice of 16 read ahead blocks will always fit
	 * inside the bcache.
	 */

	if ((rw & F_NORA) == F_NORA)
		ra = 0;
	else
		ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);

	if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
		ra = MIN(bc->ra, ra - 1);
		ra = rounddown(ra, 16);	/* multiple of 16 blocks */
		p_size += ra;
	}

	/* invalidate bcache */
	for (i = 0; i < p_size; i++) {
		bcache_invalidate(bc, p_blk + i);
	}

	r_size = 0;
	/*
	 * With read-ahead it may happen that we attempt to read past the
	 * disk end, as the bcache has no information about the disk size.
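	 * (The bcache cannot detect this condition itself; it only sees
	 * the short r_size or the error returned by the underlying
	 * strategy routine.)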
	 * In such a case we should get a partial read if some blocks can
	 * be read, or an error if none can be. In either case we should
	 * return the data that is in the bcache, and only return an error
	 * if there is no data.
	 */
	rw &= F_MASK;
	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
	    p_size * bcache_blksize, p_buf, &r_size);

	r_size /= bcache_blksize;
	for (i = 0; i < r_size; i++)
		bcache_insert(bc, p_blk + i);

	/* update ra statistics */
	if (r_size != 0) {
		if (r_size < p_size)
			bcache_rablks += (p_size - r_size);
		else
			bcache_rablks += ra;
	}

	/* check how much data we can copy */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
			break;
	}

	if (size > i * bcache_blksize)
		size = i * bcache_blksize;

	if (size != 0) {
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
		    buf, size);
		result = 0;
	}

done:
	if ((result == 0) && (rsize != NULL))
		*rsize = size;
	return (result);
}

/*
 * Requests larger than 1/2 the cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
	struct bcache_devdata *dd = (struct bcache_devdata *)devdata;
	struct bcache *bc = dd->dv_cache;
	uint_t bcache_nblks = 0;
	int nblk, cblk, ret;
	size_t csize, isize, total;

	bcache_ops++;

	if (bc != NULL)
		bcache_nblks = bc->bcache_nblks;

	/* bypass large requests, or when the cache is inactive */
	if (bc == NULL ||
	    ((size * 2 / bcache_blksize) > bcache_nblks)) {
		DPRINTF("bypass %zu from %jd", size / bcache_blksize,
		    (intmax_t)blk);
		bcache_bypasses++;
		rw &= F_MASK;
		return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf,
		    rsize));
	}

	switch (rw & F_MASK) {
	case F_READ:
		nblk = size / bcache_blksize;
		if (size != 0 && nblk == 0)
			nblk++;	/* read at least one block */

		ret = 0;
		total = 0;
		while (size) {
			/* # of blocks left before the cache wraps */
			cblk = bcache_nblks - BHASH(bc, blk);
			cblk = MIN(cblk, nblk);

			if (size <= bcache_blksize)
				csize = size;
			else
				csize = cblk * bcache_blksize;

			ret = read_strategy(devdata, rw, blk, csize,
			    buf + total, &isize);

			/*
			 * we may have an error from the read ahead; if we
			 * have read some data, return a partial read.
			 */
			if (ret != 0 || isize == 0) {
				if (total != 0)
					ret = 0;
				break;
			}
			blk += isize / bcache_blksize;
			total += isize;
			size -= isize;
			nblk = size / bcache_blksize;
		}

		if (rsize)
			*rsize = total;

		return (ret);
	case F_WRITE:
		return (write_strategy(devdata, F_WRITE, blk, size, buf,
		    rsize));
	}
	return (-1);
}

/*
 * Free an allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
	if (bc != NULL) {
		free(bc->bcache_ctl);
		free(bc->bcache_data);
		free(bc);
	}
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
	uint_t cand;

	cand = BHASH(bc, blkno);

	DPRINTF("insert blk %jd -> %u # %d", (intmax_t)blkno, cand,
	    bcache_bcount);
	bc->bcache_ctl[cand].bc_blkno = blkno;
	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}

/*
 * Invalidate a block from the cache.
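 *
 * The cache is direct-mapped, so a block can only live in the one slot
 * it hashes to; a resident entry for a different block that merely
 * collides in that slot is left untouched.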
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
	uint_t i;

	i = BHASH(bc, blkno);
	if (bc->bcache_ctl[i].bc_blkno == blkno) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
		DPRINTF("invalidate blk %jd", (intmax_t)blkno);
	}
}

COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats",
    command_bcache);

static int
command_bcache(int argc, char *argv[] __unused)
{
	if (argc != 1) {
		command_errmsg = "wrong number of arguments";
		return (CMD_ERROR);
	}

	printf("\ncache blocks: %u\n", bcache_total_nblks);
	printf("cache blocksz: %u\n", bcache_blksize);
	printf("cache readahead: %u\n", bcache_rablks);
	printf("unit cache blocks: %u\n", bcache_unit_nblks);
	printf("cached units: %u\n", bcache_units);
	printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops,
	    bcache_bypasses, bcache_hits, bcache_misses);
	return (CMD_OK);
}
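
/*
 * A minimal sketch of how a consumer is expected to wire the cache in,
 * kept as a comment for reference only. The mydev_* names and the softc
 * layout are hypothetical; the real consumers are the devsw drivers
 * (biosdisk and friends), which fill in a struct bcache_devdata and
 * route their I/O through bcache_strategy():
 *
 *	static int
 *	mydev_strategy(void *devdata, int rw, daddr_t blk, size_t size,
 *	    char *buf, size_t *rsize)
 *	{
 *		struct mydev_softc *sc = devdata;	(hypothetical)
 *		struct bcache_devdata bcd;
 *
 *		bcd.dv_strategy = mydev_realstrategy;	(the raw disk I/O)
 *		bcd.dv_devdata = devdata;
 *		bcd.dv_cache = sc->sc_bcache;	(from bcache_allocate() on
 *						first open, released with
 *						bcache_free() on last close)
 *		return (bcache_strategy(&bcd, rw, blk, size, buf, rsize));
 *	}
 */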