1*ca987d46SWarner Losh /*- 2*ca987d46SWarner Losh * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3*ca987d46SWarner Losh * Copyright 2015 Toomas Soome <tsoome@me.com> 4*ca987d46SWarner Losh * All rights reserved. 5*ca987d46SWarner Losh * 6*ca987d46SWarner Losh * Redistribution and use in source and binary forms, with or without 7*ca987d46SWarner Losh * modification, are permitted provided that the following conditions 8*ca987d46SWarner Losh * are met: 9*ca987d46SWarner Losh * 1. Redistributions of source code must retain the above copyright 10*ca987d46SWarner Losh * notice, this list of conditions and the following disclaimer. 11*ca987d46SWarner Losh * 2. Redistributions in binary form must reproduce the above copyright 12*ca987d46SWarner Losh * notice, this list of conditions and the following disclaimer in the 13*ca987d46SWarner Losh * documentation and/or other materials provided with the distribution. 14*ca987d46SWarner Losh * 15*ca987d46SWarner Losh * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16*ca987d46SWarner Losh * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17*ca987d46SWarner Losh * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18*ca987d46SWarner Losh * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19*ca987d46SWarner Losh * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20*ca987d46SWarner Losh * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21*ca987d46SWarner Losh * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22*ca987d46SWarner Losh * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23*ca987d46SWarner Losh * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24*ca987d46SWarner Losh * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25*ca987d46SWarner Losh * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
#include <sys/param.h>
__FBSDID("$FreeBSD$");

/*
 * Simple hashed block cache
 */

#include <sys/stdint.h>

#include <stand.h>
#include <string.h>
#include <strings.h>

#include "bootstrap.h"

/* #define BCACHE_DEBUG */

#ifdef BCACHE_DEBUG
# define DEBUG(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
#else
# define DEBUG(fmt, args...)
#endif

/* Per-slot control record: which disk block (if any) occupies a cache slot. */
struct bcachectl
{
    daddr_t	bc_blkno;	/* disk block cached in this slot, -1 if empty */
    int		bc_count;	/* insertion sequence number, -1 if invalid */
};

/*
 * bcache per device node. cache is allocated on device first open and freed
 * on last close, to save memory. The issue there is the size; biosdisk
 * supports up to 31 (0x1f) devices. Classic setup would use single disk
 * to boot from, but this has changed with zfs.
 */
struct bcache {
    struct bcachectl	*bcache_ctl;	/* one control record per slot */
    caddr_t		bcache_data;	/* bcache_nblks * bcache_blksize bytes */
    size_t		bcache_nblks;	/* slot count; power of 2 (see BHASH) */
    size_t		ra;		/* dynamic read-ahead block count */
};

static u_int bcache_total_nblks;	/* set by bcache_init */
static u_int bcache_blksize;		/* set by bcache_init */
static u_int bcache_numdev;		/* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;	/* number of devices with cache */
static u_int bcache_unit_nblks;	/* nblocks per unit */
static u_int bcache_hits;
static u_int bcache_misses;
static u_int bcache_ops;
static u_int bcache_bypasses;
static u_int bcache_bcount;
static u_int bcache_rablks;

/*
 * Direct-mapped placement: a block's slot is its number masked by the
 * (power-of-2) slot count.  BCACHE_LOOKUP is true on a cache MISS.
 */
#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
#define	BCACHE_LOOKUP(bc, blkno)	\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define	BCACHE_READAHEAD	256	/* upper bound for bc->ra */
#define	BCACHE_MINREADAHEAD	32	/* lower bound for bc->ra */
#define	BCACHE_MARKER		0xdeadbeef /* guard word placed past the data area */

static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void	bcache_insert(struct bcache *bc, daddr_t blkno);
static void	bcache_free_instance(struct bcache *bc);

/*
 * Initialise the cache for (nblks) of (bsize).
97*ca987d46SWarner Losh */ 98*ca987d46SWarner Losh void 99*ca987d46SWarner Losh bcache_init(size_t nblks, size_t bsize) 100*ca987d46SWarner Losh { 101*ca987d46SWarner Losh /* set up control data */ 102*ca987d46SWarner Losh bcache_total_nblks = nblks; 103*ca987d46SWarner Losh bcache_blksize = bsize; 104*ca987d46SWarner Losh } 105*ca987d46SWarner Losh 106*ca987d46SWarner Losh /* 107*ca987d46SWarner Losh * add number of devices to bcache. we have to divide cache space 108*ca987d46SWarner Losh * between the devices, so bcache_add_dev() can be used to set up the 109*ca987d46SWarner Losh * number. The issue is, we need to get the number before actual allocations. 110*ca987d46SWarner Losh * bcache_add_dev() is supposed to be called from device init() call, so the 111*ca987d46SWarner Losh * assumption is, devsw dv_init is called for plain devices first, and 112*ca987d46SWarner Losh * for zfs, last. 113*ca987d46SWarner Losh */ 114*ca987d46SWarner Losh void 115*ca987d46SWarner Losh bcache_add_dev(int devices) 116*ca987d46SWarner Losh { 117*ca987d46SWarner Losh bcache_numdev += devices; 118*ca987d46SWarner Losh } 119*ca987d46SWarner Losh 120*ca987d46SWarner Losh void * 121*ca987d46SWarner Losh bcache_allocate(void) 122*ca987d46SWarner Losh { 123*ca987d46SWarner Losh u_int i; 124*ca987d46SWarner Losh struct bcache *bc = malloc(sizeof (struct bcache)); 125*ca987d46SWarner Losh int disks = bcache_numdev; 126*ca987d46SWarner Losh uint32_t *marker; 127*ca987d46SWarner Losh 128*ca987d46SWarner Losh if (disks == 0) 129*ca987d46SWarner Losh disks = 1; /* safe guard */ 130*ca987d46SWarner Losh 131*ca987d46SWarner Losh if (bc == NULL) { 132*ca987d46SWarner Losh errno = ENOMEM; 133*ca987d46SWarner Losh return (bc); 134*ca987d46SWarner Losh } 135*ca987d46SWarner Losh 136*ca987d46SWarner Losh /* 137*ca987d46SWarner Losh * the bcache block count must be power of 2 for hash function 138*ca987d46SWarner Losh */ 139*ca987d46SWarner Losh i = fls(disks) - 1; /* highbit - 1 */ 
140*ca987d46SWarner Losh if (disks > (1 << i)) /* next power of 2 */ 141*ca987d46SWarner Losh i++; 142*ca987d46SWarner Losh 143*ca987d46SWarner Losh bc->bcache_nblks = bcache_total_nblks >> i; 144*ca987d46SWarner Losh bcache_unit_nblks = bc->bcache_nblks; 145*ca987d46SWarner Losh bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 146*ca987d46SWarner Losh sizeof(uint32_t)); 147*ca987d46SWarner Losh if (bc->bcache_data == NULL) { 148*ca987d46SWarner Losh /* dont error out yet. fall back to 32 blocks and try again */ 149*ca987d46SWarner Losh bc->bcache_nblks = 32; 150*ca987d46SWarner Losh bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 151*ca987d46SWarner Losh sizeof(uint32_t)); 152*ca987d46SWarner Losh } 153*ca987d46SWarner Losh 154*ca987d46SWarner Losh bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl)); 155*ca987d46SWarner Losh 156*ca987d46SWarner Losh if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) { 157*ca987d46SWarner Losh bcache_free_instance(bc); 158*ca987d46SWarner Losh errno = ENOMEM; 159*ca987d46SWarner Losh return (NULL); 160*ca987d46SWarner Losh } 161*ca987d46SWarner Losh /* Insert cache end marker. 
*/ 162*ca987d46SWarner Losh marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); 163*ca987d46SWarner Losh *marker = BCACHE_MARKER; 164*ca987d46SWarner Losh 165*ca987d46SWarner Losh /* Flush the cache */ 166*ca987d46SWarner Losh for (i = 0; i < bc->bcache_nblks; i++) { 167*ca987d46SWarner Losh bc->bcache_ctl[i].bc_count = -1; 168*ca987d46SWarner Losh bc->bcache_ctl[i].bc_blkno = -1; 169*ca987d46SWarner Losh } 170*ca987d46SWarner Losh bcache_units++; 171*ca987d46SWarner Losh bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */ 172*ca987d46SWarner Losh return (bc); 173*ca987d46SWarner Losh } 174*ca987d46SWarner Losh 175*ca987d46SWarner Losh void 176*ca987d46SWarner Losh bcache_free(void *cache) 177*ca987d46SWarner Losh { 178*ca987d46SWarner Losh struct bcache *bc = cache; 179*ca987d46SWarner Losh 180*ca987d46SWarner Losh if (bc == NULL) 181*ca987d46SWarner Losh return; 182*ca987d46SWarner Losh 183*ca987d46SWarner Losh bcache_free_instance(bc); 184*ca987d46SWarner Losh bcache_units--; 185*ca987d46SWarner Losh } 186*ca987d46SWarner Losh 187*ca987d46SWarner Losh /* 188*ca987d46SWarner Losh * Handle a write request; write directly to the disk, and populate the 189*ca987d46SWarner Losh * cache with the new values. 
190*ca987d46SWarner Losh */ 191*ca987d46SWarner Losh static int 192*ca987d46SWarner Losh write_strategy(void *devdata, int rw, daddr_t blk, size_t size, 193*ca987d46SWarner Losh char *buf, size_t *rsize) 194*ca987d46SWarner Losh { 195*ca987d46SWarner Losh struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 196*ca987d46SWarner Losh struct bcache *bc = dd->dv_cache; 197*ca987d46SWarner Losh daddr_t i, nblk; 198*ca987d46SWarner Losh 199*ca987d46SWarner Losh nblk = size / bcache_blksize; 200*ca987d46SWarner Losh 201*ca987d46SWarner Losh /* Invalidate the blocks being written */ 202*ca987d46SWarner Losh for (i = 0; i < nblk; i++) { 203*ca987d46SWarner Losh bcache_invalidate(bc, blk + i); 204*ca987d46SWarner Losh } 205*ca987d46SWarner Losh 206*ca987d46SWarner Losh /* Write the blocks */ 207*ca987d46SWarner Losh return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 208*ca987d46SWarner Losh } 209*ca987d46SWarner Losh 210*ca987d46SWarner Losh /* 211*ca987d46SWarner Losh * Handle a read request; fill in parts of the request that can 212*ca987d46SWarner Losh * be satisfied by the cache, use the supplied strategy routine to do 213*ca987d46SWarner Losh * device I/O and then use the I/O results to populate the cache. 
 */
static int
read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
    struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
    struct bcache		*bc = dd->dv_cache;
    size_t			i, nblk, p_size, r_size, complete, ra;
    int				result;
    daddr_t			p_blk;
    caddr_t			p_buf;
    uint32_t			*marker;

    if (bc == NULL) {
	errno = ENODEV;
	return (-1);
    }

    /* Guard word written by bcache_allocate() just past the data area. */
    marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize);

    if (rsize != NULL)
	*rsize = 0;

    /* Round the block count up so a sub-block request reads one block. */
    nblk = size / bcache_blksize;
    if (nblk == 0 && size != 0)
	nblk++;
    result = 0;
    complete = 1;

    /* Satisfy any cache hits up front, break on first miss */
    for (i = 0; i < nblk; i++) {
	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
	    bcache_misses += (nblk - i);
	    complete = 0;
	    if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD)
		bc->ra >>= 1;	/* reduce read ahead */
	    break;
	} else {
	    bcache_hits++;
	}
    }

    if (complete) {	/* whole set was in cache, return it */
	if (bc->ra < BCACHE_READAHEAD)
	    bc->ra <<= 1;	/* increase read ahead */
	/*
	 * The caller (bcache_strategy) never issues a request that wraps
	 * around the cache end, so the hit run is contiguous in memory.
	 */
	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
	goto done;
    }

    /*
     * Fill in any misses. From check we have i pointing to first missing
     * block, read in all remaining blocks + readahead.
     * We have space at least for nblk - i before bcache wraps.
     */
    p_blk = blk + i;		/* first missing block */
    p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
    r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */

    p_size = MIN(r_size, nblk - i); /* read at least those blocks */

    /*
     * The read ahead size setup.
     * While the read ahead can save us IO, it also can complicate things:
     * 1. We do not want to read ahead by wrapping around the
     * bcache end - this would complicate the cache management.
     * 2. We are using bc->ra as dynamic hint for read ahead size,
     * detected cache hits will increase the read-ahead block count, and
     * misses will decrease, see the code above.
     * 3. The bcache is sized by 512B blocks, however, the underlying device
     * may have a larger sector size, and we should perform the IO by
     * taking into account these larger sector sizes. We could solve this by
     * passing the sector size to bcache_allocate(), or by using ioctl(), but
     * in this version we are using the constant, 16 blocks, and are rounding
     * read ahead block count down to multiple of 16.
     * Using the constant has two reasons, we are not entirely sure if the
     * BIOS disk interface is providing the correct value for sector size.
     * And secondly, this way we get the most conservative setup for the ra.
     *
     * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however,
     * we want to cover CDs (2K) and 4K disks.
     * bcache_allocate() will always fall back to a minimum of 32 blocks.
     * Our choice of 16 read ahead blocks will always fit inside the bcache.
     */

    if ((rw & F_NORA) == F_NORA)
	ra = 0;
    else
	ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);

    if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
	ra = MIN(bc->ra, ra - 1);
	ra = rounddown(ra, 16);	/* multiple of 16 blocks */
	p_size += ra;
    }

    /* invalidate bcache before the device IO overwrites the slots */
    for (i = 0; i < p_size; i++) {
	bcache_invalidate(bc, p_blk + i);
    }

    r_size = 0;
    /*
     * with read-ahead, it may happen we are attempting to read past
     * disk end, as bcache has no information about disk size.
     * in such case we should get partial read if some blocks can be
     * read or error, if no blocks can be read.
     * in either case we should return the data in bcache and only
     * return error if there is no data.
     */
    rw &= F_MASK;
    result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
	p_size * bcache_blksize, p_buf, &r_size);

    /* mark only the blocks the device actually delivered as cached */
    r_size /= bcache_blksize;
    for (i = 0; i < r_size; i++)
	bcache_insert(bc, p_blk + i);

    /* update ra statistics */
    if (r_size != 0) {
	if (r_size < p_size)
	    bcache_rablks += (p_size - r_size);
	else
	    bcache_rablks += ra;
    }

    /* check how much data can we copy (first miss ends the usable run) */
    for (i = 0; i < nblk; i++) {
	if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
	    break;
    }

    if (size > i * bcache_blksize)
	size = i * bcache_blksize;

    /* any data at all masks a read-ahead error: return a partial read */
    if (size != 0) {
	bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);
	result = 0;
    }

    if (*marker != BCACHE_MARKER) {
	printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, "
	    "p_size: %zu, ra: %zu\n", bc->bcache_nblks,
	    (long unsigned)BHASH(bc, p_blk), p_size, ra);
    }

done:
    if ((result == 0) && (rsize != NULL))
	*rsize = size;
    return(result);
}

/*
 * Requests larger than 1/2 cache size will be bypassed and go
 * directly to the disk.  XXX tune this.
 */
int
bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
    char *buf, size_t *rsize)
{
    struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
    struct bcache		*bc = dd->dv_cache;
    u_int bcache_nblks = 0;
    int nblk, cblk, ret;
    size_t csize, isize, total;

    bcache_ops++;

    if (bc != NULL)
	bcache_nblks = bc->bcache_nblks;

    /* bypass large requests, or when the cache is inactive */
    if (bc == NULL ||
	((size * 2 / bcache_blksize) > bcache_nblks)) {
	DEBUG("bypass %zu from %qu", size / bcache_blksize, blk);
	bcache_bypasses++;
	rw &= F_MASK;
	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
    }

    switch (rw & F_MASK) {
    case F_READ:
	nblk = size / bcache_blksize;
	if (size != 0 && nblk == 0)
	    nblk++;	/* read at least one block */

	/*
	 * Split the read into chunks that do not wrap around the cache
	 * end, so read_strategy() always works on a contiguous region.
	 */
	ret = 0;
	total = 0;
	while(size) {
	    cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */
	    cblk = MIN(cblk, nblk);

	    if (size <= bcache_blksize)
		csize = size;	/* sub-block tail: pass the byte count */
	    else
		csize = cblk * bcache_blksize;

	    ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize);

	    /*
	     * we may have error from read ahead, if we have read some data
	     * return partial read.
	     */
	    if (ret != 0 || isize == 0) {
		if (total != 0)
		    ret = 0;
		break;
	    }
	    blk += isize / bcache_blksize;
	    total += isize;
	    size -= isize;
	    nblk = size / bcache_blksize;
	}

	if (rsize)
	    *rsize = total;

	return (ret);
    case F_WRITE:
	return write_strategy(devdata, F_WRITE, blk, size, buf, rsize);
    }
    /* neither F_READ nor F_WRITE: reject the request */
    return -1;
}

/*
 * Free allocated bcache instance
 */
static void
bcache_free_instance(struct bcache *bc)
{
    if (bc != NULL) {
	if (bc->bcache_ctl)
	    free(bc->bcache_ctl);
	if (bc->bcache_data)
	    free(bc->bcache_data);
	free(bc);
    }
}

/*
 * Insert a block into the cache.
454*ca987d46SWarner Losh */ 455*ca987d46SWarner Losh static void 456*ca987d46SWarner Losh bcache_insert(struct bcache *bc, daddr_t blkno) 457*ca987d46SWarner Losh { 458*ca987d46SWarner Losh u_int cand; 459*ca987d46SWarner Losh 460*ca987d46SWarner Losh cand = BHASH(bc, blkno); 461*ca987d46SWarner Losh 462*ca987d46SWarner Losh DEBUG("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount); 463*ca987d46SWarner Losh bc->bcache_ctl[cand].bc_blkno = blkno; 464*ca987d46SWarner Losh bc->bcache_ctl[cand].bc_count = bcache_bcount++; 465*ca987d46SWarner Losh } 466*ca987d46SWarner Losh 467*ca987d46SWarner Losh /* 468*ca987d46SWarner Losh * Invalidate a block from the cache. 469*ca987d46SWarner Losh */ 470*ca987d46SWarner Losh static void 471*ca987d46SWarner Losh bcache_invalidate(struct bcache *bc, daddr_t blkno) 472*ca987d46SWarner Losh { 473*ca987d46SWarner Losh u_int i; 474*ca987d46SWarner Losh 475*ca987d46SWarner Losh i = BHASH(bc, blkno); 476*ca987d46SWarner Losh if (bc->bcache_ctl[i].bc_blkno == blkno) { 477*ca987d46SWarner Losh bc->bcache_ctl[i].bc_count = -1; 478*ca987d46SWarner Losh bc->bcache_ctl[i].bc_blkno = -1; 479*ca987d46SWarner Losh DEBUG("invalidate blk %llu", blkno); 480*ca987d46SWarner Losh } 481*ca987d46SWarner Losh } 482*ca987d46SWarner Losh 483*ca987d46SWarner Losh #ifndef BOOT2 484*ca987d46SWarner Losh COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache); 485*ca987d46SWarner Losh 486*ca987d46SWarner Losh static int 487*ca987d46SWarner Losh command_bcache(int argc, char *argv[]) 488*ca987d46SWarner Losh { 489*ca987d46SWarner Losh if (argc != 1) { 490*ca987d46SWarner Losh command_errmsg = "wrong number of arguments"; 491*ca987d46SWarner Losh return(CMD_ERROR); 492*ca987d46SWarner Losh } 493*ca987d46SWarner Losh 494*ca987d46SWarner Losh printf("\ncache blocks: %d\n", bcache_total_nblks); 495*ca987d46SWarner Losh printf("cache blocksz: %d\n", bcache_blksize); 496*ca987d46SWarner Losh printf("cache readahead: %d\n", 
bcache_rablks); 497*ca987d46SWarner Losh printf("unit cache blocks: %d\n", bcache_unit_nblks); 498*ca987d46SWarner Losh printf("cached units: %d\n", bcache_units); 499*ca987d46SWarner Losh printf("%d ops %d bypasses %d hits %d misses\n", bcache_ops, 500*ca987d46SWarner Losh bcache_bypasses, bcache_hits, bcache_misses); 501*ca987d46SWarner Losh return(CMD_OK); 502*ca987d46SWarner Losh } 503*ca987d46SWarner Losh #endif 504