/*-
 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
 * Copyright 2015 Toomas Soome <tsoome@me.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Simple hashed block cache
 */

#include <sys/param.h>
#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
# define DEBUG(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
#else
# define DEBUG(fmt, args...)
#endif

struct bcachectl
{
	daddr_t	bc_blkno;
	int	bc_count;
};

/*
 * bcache per device node. The cache is allocated on the first open of a
 * device and freed on its last close, to save memory. The concern is
 * sizing: biosdisk supports up to 31 (0x1f) devices. A classic setup
 * would boot from a single disk, but that has changed with zfs.
 */
struct bcache {
	struct bcachectl	*bcache_ctl;
	caddr_t			bcache_data;
	u_int			bcache_nblks;
	size_t			ra;
};

static u_int bcache_total_nblks;	/* set by bcache_init */
static u_int bcache_blksize;		/* set by bcache_init */
static u_int bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;		/* number of devices with cache */
static u_int bcache_unit_nblks;		/* nblocks per unit */
static u_int bcache_hits;
static u_int bcache_misses;
static u_int bcache_ops;
static u_int bcache_bypasses;
static u_int bcache_bcount;
static u_int bcache_rablks;

#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
#define	BCACHE_LOOKUP(bc, blkno) \
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	256
#define	BCACHE_MINREADAHEAD	32

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);
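/*
 * Illustration (not part of the original code): the cache is direct-mapped,
 * so BHASH() simply masks the block number with (bcache_nblks - 1), which is
 * why bcache_nblks must be a power of two.  Assuming a hypothetical 8-block
 * cache:
 *
 *	BHASH(bc, 13)	// 13 & 7 == 5: block 13 lives in slot 5
 *	BHASH(bc, 21)	// 21 & 7 == 5: block 21 maps to the same slot
 *
 * BCACHE_LOOKUP() is a miss test: it is nonzero when the slot currently
 * holds some other block (or the initial -1), so colliding blocks simply
 * evict each other.
 */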
/*
 * Initialise the cache for (nblks) blocks of (bsize) bytes each.
 */
void
bcache_init(u_int nblks, size_t bsize)
{
	/* set up control data */
	bcache_total_nblks = nblks;
	bcache_blksize = bsize;
}

/*
 * Add a number of devices to the bcache. We have to divide the cache
 * space between the devices, so bcache_add_dev() is used to register the
 * count. The catch is that we need the total before any actual
 * allocations. bcache_add_dev() is supposed to be called from a device's
 * init() routine, so the assumption is that devsw dv_init is called for
 * plain devices first, and for zfs last.
 */
void
bcache_add_dev(int devices)
{
	bcache_numdev += devices;
}

void *
bcache_allocate(void)
{
	u_int i;
	struct bcache *bc = malloc(sizeof(struct bcache));
	int disks = bcache_numdev;

	if (disks == 0)
		disks = 1;	/* safe guard */

	if (bc == NULL) {
		errno = ENOMEM;
		return (bc);
	}

	/*
	 * the bcache block count must be a power of 2 for the hash function
	 */
	i = fls(disks) - 1;		/* highbit - 1 */
	if (disks > (1 << i))		/* next power of 2 */
		i++;

	bc->bcache_nblks = bcache_total_nblks >> i;
	bcache_unit_nblks = bc->bcache_nblks;
	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
	if (bc->bcache_data == NULL) {
		/* don't error out yet; fall back to 32 blocks and try again */
		bc->bcache_nblks = 32;
		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
	}

	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));

	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
		bcache_free_instance(bc);
		errno = ENOMEM;
		return (NULL);
	}

	/* Flush the cache */
	for (i = 0; i < bc->bcache_nblks; i++) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
	}
	bcache_units++;
	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
	return (bc);
}

void
bcache_free(void *cache)
{
	struct bcache *bc = cache;

	if (bc == NULL)
		return;

	bcache_free_instance(bc);
	bcache_units--;
}
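/*
 * Usage sketch (hypothetical caller, for illustration only): a device
 * attaches a cache on its first open and releases it on its last close,
 * with dd being the bcache_devdata handed to the strategy wrappers below:
 *
 *	dd->dv_cache = bcache_allocate();	// first open
 *	...
 *	bcache_free(dd->dv_cache);		// last close
 *	dd->dv_cache = NULL;
 */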
/*
 * Handle a write request; write directly to the disk, and invalidate any
 * cached copies of the blocks being written.
 */
static int
write_strategy(void *devdata, int rw, daddr_t blk, size_t offset,
    size_t size, char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	daddr_t			i, nblk;

	nblk = size / bcache_blksize;

	/* Invalidate the blocks being written */
	for (i = 0; i < nblk; i++) {
		bcache_invalidate(bc, blk + i);
	}

	/* Write the blocks */
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, offset, size, buf,
	    rsize));
}

/*
 * Handle a read request; fill in parts of the request that can
 * be satisfied by the cache, use the supplied strategy routine to do
 * device I/O and then use the I/O results to populate the cache.
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t offset,
    size_t size, char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	size_t			i, nblk, p_size, r_size, complete, ra;
	int			result;
	daddr_t			p_blk;
	caddr_t			p_buf;

	if (bc == NULL) {
		errno = ENODEV;
		return (-1);
	}

	if (rsize != NULL)
		*rsize = 0;

	nblk = size / bcache_blksize;
	if ((nblk == 0 && size != 0) || offset != 0)
		nblk++;
	result = 0;
	complete = 1;

	/* Satisfy any cache hits up front, break on first miss */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
			bcache_misses += (nblk - i);
			complete = 0;
			if (nblk - i > BCACHE_MINREADAHEAD &&
			    bc->ra > BCACHE_MINREADAHEAD)
				bc->ra >>= 1;	/* reduce read ahead */
			break;
		} else {
			bcache_hits++;
		}
	}

	if (complete) {	/* whole set was in cache, return it */
		if (bc->ra < BCACHE_READAHEAD)
			bc->ra <<= 1;	/* increase read ahead */
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)) +
		    offset, buf, size);
		goto done;
	}

	/*
	 * Fill in any misses. From the check above, i points to the first
	 * missing block; read in all remaining blocks plus readahead.
	 * We have space for at least nblk - i blocks before the bcache wraps.
	 */
	p_blk = blk + i;
	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
	r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */

	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */

	ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);
	if (ra != bc->bcache_nblks) {	/* do we have RA space? */
		ra = MIN(bc->ra, ra);
		p_size += ra;
	}

	/* invalidate bcache */
	for (i = 0; i < p_size; i++) {
		bcache_invalidate(bc, p_blk + i);
	}

	r_size = 0;
	/*
	 * With read-ahead we may attempt to read past the end of the disk,
	 * as the bcache has no information about the disk size. In that case
	 * we get a partial read if some blocks could be read, or an error if
	 * none could. Either way we should return whatever data landed in
	 * the bcache and only report an error if there is no data at all.
	 */
	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk, 0,
	    p_size * bcache_blksize, p_buf, &r_size);

	r_size /= bcache_blksize;
	for (i = 0; i < r_size; i++)
		bcache_insert(bc, p_blk + i);

	/* update ra statistics */
	if (r_size != 0) {
		if (r_size < p_size)
			bcache_rablks += (p_size - r_size);
		else
			bcache_rablks += ra;
	}

	/* check how much data we can copy */
	for (i = 0; i < nblk; i++) {
		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
			break;
	}

	if (size > i * bcache_blksize)
		size = i * bcache_blksize;

	if (size != 0) {
		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)) +
		    offset, buf, size);
		result = 0;
	}

done:
	if ((result == 0) && (rsize != NULL))
		*rsize = size;
	return (result);
}
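/*
 * Worked example (illustrative numbers, not from the original source):
 * read_strategy() adapts bc->ra between BCACHE_MINREADAHEAD (32) and
 * BCACHE_READAHEAD (256). Starting from the optimistic 256:
 *
 *	miss with > 32 blocks still wanted: ra 256 -> 128 -> 64 -> 32 (floor)
 *	fully cached request:               ra 32 -> 64 -> 128 -> 256 (ceiling)
 *
 * so sequential workloads keep a large readahead while scattered reads
 * decay toward the minimum.
 */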
/*
 * Requests larger than 1/2 the cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t offset,
    size_t size, char *buf, size_t *rsize)
{
	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
	struct bcache		*bc = dd->dv_cache;
	u_int bcache_nblks = 0;
	int nblk, cblk, ret;
	size_t csize, isize, total;

	bcache_ops++;

	if (bc != NULL)
		bcache_nblks = bc->bcache_nblks;

	/* bypass large requests, or when the cache is inactive */
	if (bc == NULL ||
	    (offset == 0 && ((size * 2 / bcache_blksize) > bcache_nblks))) {
		DEBUG("bypass %zu from %jd", size / bcache_blksize,
		    (intmax_t)blk);
		bcache_bypasses++;
		return (dd->dv_strategy(dd->dv_devdata, rw, blk, offset, size,
		    buf, rsize));
	}

	/* normalize offset */
	while (offset >= bcache_blksize) {
		blk++;
		offset -= bcache_blksize;
	}

	switch (rw) {
	case F_READ:
		nblk = size / bcache_blksize;
		if (offset || (size != 0 && nblk == 0))
			nblk++;	/* read at least one block */

		ret = 0;
		total = 0;
		while (size) {
			cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */
			cblk = MIN(cblk, nblk);

			if (size <= bcache_blksize)
				csize = size;
			else {
				csize = cblk * bcache_blksize;
				if (offset)
					csize -= (bcache_blksize - offset);
			}

			ret = read_strategy(devdata, rw, blk, offset,
			    csize, buf + total, &isize);

			/*
			 * the read ahead may have failed; if we did read
			 * some data, return a partial read.
			 */
			if (ret != 0 || isize == 0) {
				if (total != 0)
					ret = 0;
				break;
			}
			blk += (offset + isize) / bcache_blksize;
			offset = 0;
			total += isize;
			size -= isize;
			nblk = size / bcache_blksize;
		}

		if (rsize)
			*rsize = total;

		return (ret);
	case F_WRITE:
		return (write_strategy(devdata, rw, blk, offset, size, buf,
		    rsize));
	}
	return (-1);
}

/*
 * Free an allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
	if (bc != NULL) {
		if (bc->bcache_ctl)
			free(bc->bcache_ctl);
		if (bc->bcache_data)
			free(bc->bcache_data);
		free(bc);
	}
}

/*
 * Insert a block into the cache.
 */
static void
bcache_insert(struct bcache *bc, daddr_t blkno)
{
	u_int cand;

	cand = BHASH(bc, blkno);

	DEBUG("insert blk %jd -> %u # %u", (intmax_t)blkno, cand,
	    bcache_bcount);
	bc->bcache_ctl[cand].bc_blkno = blkno;
	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
}
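/*
 * Illustration (hypothetical numbers): insertion is unconditional, so in an
 * 8-block cache inserting block 21 overwrites slot BHASH(bc, 21) == 5,
 * silently evicting block 13 if it was cached there; there is no collision
 * chaining. bc_count merely records the insertion order.
 */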
/*
 * Invalidate a block in the cache.
 */
static void
bcache_invalidate(struct bcache *bc, daddr_t blkno)
{
	u_int i;

	i = BHASH(bc, blkno);
	if (bc->bcache_ctl[i].bc_blkno == blkno) {
		bc->bcache_ctl[i].bc_count = -1;
		bc->bcache_ctl[i].bc_blkno = -1;
		DEBUG("invalidate blk %jd", (intmax_t)blkno);
	}
}

#ifndef BOOT2
COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats",
    command_bcache);

static int
command_bcache(int argc, char *argv[] __attribute__((unused)))
{
	if (argc != 1) {
		command_errmsg = "wrong number of arguments";
		return (CMD_ERROR);
	}

	printf("\ncache blocks: %u\n", bcache_total_nblks);
	printf("cache blocksz: %u\n", bcache_blksize);
	printf("cache readahead: %u\n", bcache_rablks);
	printf("unit cache blocks: %u\n", bcache_unit_nblks);
	printf("cached units: %u\n", bcache_units);
	printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops,
	    bcache_bypasses, bcache_hits, bcache_misses);
	return (CMD_OK);
}
#endif
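/*
 * At the interactive loader prompt the command registered above can be
 * invoked as, e.g.
 *
 *	OK bcachestat
 *
 * which prints the counters; the figures shown depend on the I/O done so
 * far during the boot.
 */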