xref: /illumos-gate/usr/src/boot/common/bcache.c (revision f17620a4f72a29025a22655ba8735ccd20ae174f)
1 /*
2  * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
3  * Copyright 2015 Toomas Soome <tsoome@me.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include <sys/param.h>
30 
31 /*
32  * Simple hashed block cache
33  */
34 
35 #include <sys/stdint.h>
36 
37 #include <stand.h>
38 #include <string.h>
39 #include <strings.h>
40 
41 #include "bootstrap.h"
42 
43 /* #define BCACHE_DEBUG */
44 
45 #ifdef BCACHE_DEBUG
46 #define	DPRINTF(fmt, args...)	printf("%s: " fmt "\n", __func__, ## args)
47 #else
48 #define	DPRINTF(fmt, args...)	((void)0)
49 #endif
50 
51 struct bcachectl
52 {
53 	daddr_t	bc_blkno;
54 	int	bc_count;
55 };
56 
57 /*
58  * bcache per device node. cache is allocated on device first open and freed
59  * on last close, to save memory. The issue there is the size; biosdisk
60  * supports up to 31 (0x1f) devices. Classic setup would use single disk
61  * to boot from, but this has changed with zfs.
62  */
63 struct bcache {
64 	struct bcachectl	*bcache_ctl;
65 	caddr_t			bcache_data;
66 	size_t			bcache_nblks;
67 	size_t			ra;
68 };
69 
70 static uint_t bcache_total_nblks;	/* set by bcache_init */
71 static uint_t bcache_blksize;		/* set by bcache_init */
72 static uint_t bcache_numdev;		/* set by bcache_add_dev */
73 /* statistics */
74 static uint_t bcache_units;	/* number of devices with cache */
75 static uint_t bcache_unit_nblks;	/* nblocks per unit */
76 static uint_t bcache_hits;
77 static uint_t bcache_misses;
78 static uint_t bcache_ops;
79 static uint_t bcache_bypasses;
80 static uint_t bcache_bcount;
81 static uint_t bcache_rablks;
82 
83 #define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))
84 #define	BCACHE_LOOKUP(bc, blkno)	\
85 	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
86 #define	BCACHE_READAHEAD	256
87 #define	BCACHE_MINREADAHEAD	32
88 
89 static void	bcache_invalidate(struct bcache *bc, daddr_t blkno);
90 static void	bcache_insert(struct bcache *bc, daddr_t blkno);
91 static void	bcache_free_instance(struct bcache *bc);
92 
93 /*
94  * Initialise the cache for (nblks) of (bsize).
95  */
96 void
97 bcache_init(size_t nblks, size_t bsize)
98 {
99 	/* set up control data */
100 	bcache_total_nblks = nblks;
101 	bcache_blksize = bsize;
102 }
103 
104 /*
105  * add number of devices to bcache. we have to divide cache space
106  * between the devices, so bcache_add_dev() can be used to set up the
107  * number. The issue is, we need to get the number before actual allocations.
108  * bcache_add_dev() is supposed to be called from device init() call, so the
109  * assumption is, devsw dv_init is called for plain devices first, and
110  * for zfs, last.
111  */
112 void
113 bcache_add_dev(int devices)
114 {
115 	bcache_numdev += devices;
116 }
117 
118 void *
119 bcache_allocate(void)
120 {
121 	uint_t i;
122 	struct bcache *bc = malloc(sizeof (struct bcache));
123 	int disks = bcache_numdev;
124 
125 	if (disks == 0)
126 		disks = 1;	/* safe guard */
127 
128 	if (bc == NULL) {
129 		errno = ENOMEM;
130 		return (bc);
131 	}
132 
133 	/*
134 	 * the bcache block count must be power of 2 for hash function
135 	 */
136 	i = fls(disks) - 1;		/* highbit - 1 */
137 	if (disks > (1 << i))	/* next power of 2 */
138 		i++;
139 
140 	bc->bcache_nblks = bcache_total_nblks >> i;
141 	bcache_unit_nblks = bc->bcache_nblks;
142 	bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);
143 	if (bc->bcache_data == NULL) {
144 		/* dont error out yet. fall back to 32 blocks and try again */
145 		bc->bcache_nblks = 32;
146 		bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +
147 		    sizeof (uint32_t));
148 	}
149 
150 	bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof (struct bcachectl));
151 
152 	if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {
153 		bcache_free_instance(bc);
154 		errno = ENOMEM;
155 		return (NULL);
156 	}
157 
158 	/* Flush the cache */
159 	for (i = 0; i < bc->bcache_nblks; i++) {
160 		bc->bcache_ctl[i].bc_count = -1;
161 		bc->bcache_ctl[i].bc_blkno = -1;
162 	}
163 	bcache_units++;
164 	bc->ra = BCACHE_READAHEAD;	/* optimistic read ahead */
165 	return (bc);
166 }
167 
168 void
169 bcache_free(void *cache)
170 {
171 	struct bcache *bc = cache;
172 
173 	if (bc == NULL)
174 		return;
175 
176 	bcache_free_instance(bc);
177 	bcache_units--;
178 }
179 
180 /*
181  * Handle a write request; write directly to the disk, and populate the
182  * cache with the new values.
183  */
184 static int
185 write_strategy(void *devdata, int rw, daddr_t blk, size_t size,
186     char *buf, size_t *rsize)
187 {
188 	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
189 	struct bcache		*bc = dd->dv_cache;
190 	daddr_t			i, nblk;
191 
192 	nblk = size / bcache_blksize;
193 
194 	/* Invalidate the blocks being written */
195 	for (i = 0; i < nblk; i++) {
196 		bcache_invalidate(bc, blk + i);
197 	}
198 
199 	/* Write the blocks */
200 	return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
201 }
202 
203 /*
204  * Handle a read request; fill in parts of the request that can
205  * be satisfied by the cache, use the supplied strategy routine to do
206  * device I/O and then use the I/O results to populate the cache.
207  */
208 static int
209 read_strategy(void *devdata, int rw, daddr_t blk, size_t size,
210     char *buf, size_t *rsize)
211 {
212 	struct bcache_devdata	*dd = devdata;
213 	struct bcache		*bc = dd->dv_cache;
214 	size_t			i, nblk, p_size, r_size, complete, ra;
215 	int			result;
216 	daddr_t			p_blk;
217 	caddr_t			p_buf;
218 
219 	if (bc == NULL) {
220 		errno = ENODEV;
221 		return (-1);
222 	}
223 
224 	if (rsize != NULL)
225 		*rsize = 0;
226 
227 	nblk = size / bcache_blksize;
228 	if (nblk == 0 && size != 0)
229 		nblk++;
230 	result = 0;
231 	complete = 1;
232 
233 	/* Satisfy any cache hits up front, break on first miss */
234 	for (i = 0; i < nblk; i++) {
235 		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {
236 			bcache_misses += (nblk - i);
237 			complete = 0;
238 			if (nblk - i > BCACHE_MINREADAHEAD &&
239 			    bc->ra > BCACHE_MINREADAHEAD)
240 				bc->ra >>= 1;	/* reduce read ahead */
241 			break;
242 		} else {
243 			bcache_hits++;
244 		}
245 	}
246 
247 	if (complete) {	/* whole set was in cache, return it */
248 		if (bc->ra < BCACHE_READAHEAD)
249 			bc->ra <<= 1;	/* increase read ahead */
250 		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
251 		    buf, size);
252 		goto done;
253 	}
254 
255 	/*
256 	 * Fill in any misses. From check we have i pointing to first missing
257 	 * block, read in all remaining blocks + readahead.
258 	 * We have space at least for nblk - i before bcache wraps.
259 	 */
260 	p_blk = blk + i;
261 	p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));
262 	r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */
263 
264 	p_size = MIN(r_size, nblk - i);	/* read at least those blocks */
265 
266 	/*
267 	 * The read ahead size setup.
268 	 * While the read ahead can save us IO, it also can complicate things:
269 	 * 1. We do not want to read ahead by wrapping around the
270 	 *	bcache end - this would complicate the cache management.
271 	 * 2. We are using bc->ra as dynamic hint for read ahead size,
272 	 *	detected cache hits will increase the read-ahead block count,
273 	 *	and misses will decrease, see the code above.
274 	 * 3. The bcache is sized by 512B blocks, however, the underlying device
275 	 *	may have a larger sector size, and we should perform the IO by
276 	 *	taking into account these larger sector sizes. We could solve
277 	 *	this by passing the sector size to bcache_allocate(), or by
278 	 *	using ioctl(), but in this version we are using the constant,
279 	 *	16 blocks, and are rounding read ahead block count down to
280 	 *	multiple of 16. Using the constant has two reasons, we are not
281 	 *	entirely sure if the BIOS disk interface is providing the
282 	 *	correct value for sector size. And secondly, this way we get
283 	 *	the most conservative setup for the ra.
284 	 *
285 	 * The selection of multiple of 16 blocks (8KB) is quite arbitrary,
286 	 * however, we want to cover CDs (2K) and 4K disks.
287 	 * bcache_allocate() will always fall back to a minimum of 32 blocks.
288 	 * Our choice of 16 read ahead blocks will always fit inside the bcache.
289 	 */
290 
291 	if ((rw & F_NORA) == F_NORA)
292 		ra = 0;
293 	else
294 		ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);
295 
296 	if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */
297 		ra = MIN(bc->ra, ra - 1);
298 		ra = rounddown(ra, 16);		/* multiple of 16 blocks */
299 		p_size += ra;
300 	}
301 
302 	/* invalidate bcache */
303 	for (i = 0; i < p_size; i++) {
304 		bcache_invalidate(bc, p_blk + i);
305 	}
306 
307 	r_size = 0;
308 	/*
309 	 * with read-ahead, it may happen we are attempting to read past
310 	 * disk end, as bcache has no information about disk size.
311 	 * in such case we should get partial read if some blocks can be
312 	 * read or error, if no blocks can be read.
313 	 * in either case we should return the data in bcache and only
314 	 * return error if there is no data.
315 	 */
316 	rw &= F_MASK;
317 	result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,
318 	    p_size * bcache_blksize, p_buf, &r_size);
319 
320 	r_size /= bcache_blksize;
321 	for (i = 0; i < r_size; i++)
322 		bcache_insert(bc, p_blk + i);
323 
324 	/* update ra statistics */
325 	if (r_size != 0) {
326 		if (r_size < p_size)
327 			bcache_rablks += (p_size - r_size);
328 		else
329 			bcache_rablks += ra;
330 	}
331 
332 	/* check how much data can we copy */
333 	for (i = 0; i < nblk; i++) {
334 		if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))
335 			break;
336 	}
337 
338 	if (size > i * bcache_blksize)
339 		size = i * bcache_blksize;
340 
341 	if (size != 0) {
342 		bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)),
343 		    buf, size);
344 		result = 0;
345 	}
346 
347 done:
348 	if ((result == 0) && (rsize != NULL))
349 		*rsize = size;
350 	return (result);
351 }
352 
353 /*
354  * Requests larger than 1/2 cache size will be bypassed and go
355  * directly to the disk.  XXX tune this.
356  */
357 int
358 bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,
359     char *buf, size_t *rsize)
360 {
361 	struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;
362 	struct bcache		*bc = dd->dv_cache;
363 	uint_t bcache_nblks = 0;
364 	int nblk, cblk, ret;
365 	size_t csize, isize, total;
366 
367 	bcache_ops++;
368 
369 	if (bc != NULL)
370 		bcache_nblks = bc->bcache_nblks;
371 
372 	/* bypass large requests, or when the cache is inactive */
373 	if (bc == NULL ||
374 	    ((size * 2 / bcache_blksize) > bcache_nblks)) {
375 		DPRINTF("bypass %zu from %jd", size / bcache_blksize,
376 		    (intmax_t)blk);
377 		bcache_bypasses++;
378 		rw &= F_MASK;
379 		return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf,
380 		    rsize));
381 	}
382 
383 	switch (rw & F_MASK) {
384 	case F_READ:
385 		nblk = size / bcache_blksize;
386 		if (size != 0 && nblk == 0)
387 			nblk++;	/* read at least one block */
388 
389 		ret = 0;
390 		total = 0;
391 		while (size) {
392 			/* # of blocks left */
393 			cblk = bcache_nblks - BHASH(bc, blk);
394 			cblk = MIN(cblk, nblk);
395 
396 			if (size <= bcache_blksize)
397 				csize = size;
398 			else
399 				csize = cblk * bcache_blksize;
400 
401 			ret = read_strategy(devdata, rw, blk, csize,
402 			    buf + total, &isize);
403 
404 			/*
405 			 * we may have error from read ahead, if we have read
406 			 * some data return partial read.
407 			 */
408 			if (ret != 0 || isize == 0) {
409 				if (total != 0)
410 					ret = 0;
411 				break;
412 			}
413 			blk += isize / bcache_blksize;
414 			total += isize;
415 			size -= isize;
416 			nblk = size / bcache_blksize;
417 		}
418 
419 		if (rsize)
420 			*rsize = total;
421 
422 		return (ret);
423 	case F_WRITE:
424 		return (write_strategy(devdata, F_WRITE, blk, size, buf,
425 		    rsize));
426 	}
427 	return (-1);
428 }
429 
430 /*
431  * Free allocated bcache instance
432  */
433 static void
434 bcache_free_instance(struct bcache *bc)
435 {
436 	if (bc != NULL) {
437 		free(bc->bcache_ctl);
438 		free(bc->bcache_data);
439 		free(bc);
440 	}
441 }
442 
443 /*
444  * Insert a block into the cache.
445  */
446 static void
447 bcache_insert(struct bcache *bc, daddr_t blkno)
448 {
449 	uint_t	cand;
450 
451 	cand = BHASH(bc, blkno);
452 
453 	DPRINTF("insert blk %jd -> %u # %d", (intmax_t)blkno, cand,
454 	    bcache_bcount);
455 	bc->bcache_ctl[cand].bc_blkno = blkno;
456 	bc->bcache_ctl[cand].bc_count = bcache_bcount++;
457 }
458 
459 /*
460  * Invalidate a block from the cache.
461  */
462 static void
463 bcache_invalidate(struct bcache *bc, daddr_t blkno)
464 {
465 	uint_t	i;
466 
467 	i = BHASH(bc, blkno);
468 	if (bc->bcache_ctl[i].bc_blkno == blkno) {
469 		bc->bcache_ctl[i].bc_count = -1;
470 		bc->bcache_ctl[i].bc_blkno = -1;
471 		DPRINTF("invalidate blk %jd", (intmax_t)blkno);
472 	}
473 }
474 
475 COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats",
476     command_bcache);
477 
478 static int
479 command_bcache(int argc, char *argv[] __unused)
480 {
481 	if (argc != 1) {
482 		command_errmsg = "wrong number of arguments";
483 		return (CMD_ERROR);
484 	}
485 
486 	printf("\ncache blocks: %u\n", bcache_total_nblks);
487 	printf("cache blocksz: %u\n", bcache_blksize);
488 	printf("cache readahead: %u\n", bcache_rablks);
489 	printf("unit cache blocks: %u\n", bcache_unit_nblks);
490 	printf("cached units: %u\n", bcache_units);
491 	printf("%u ops %u bypasses %u hits  %u misses\n", bcache_ops,
492 	    bcache_bypasses, bcache_hits, bcache_misses);
493 	return (CMD_OK);
494 }
495