xref: /titanic_41/usr/src/uts/common/fs/zfs/dmu_zfetch.c (revision ccbf80fa3b6bf6b986dca9037e5ad9d6c9f9fa65)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dnode.h>
30 #include <sys/dmu_objset.h>
31 #include <sys/dmu_zfetch.h>
32 #include <sys/dmu.h>
33 #include <sys/dbuf.h>
34 
35 /*
36  * I'm against tune-ables, but these should probably exist as tweakable globals
37  * until we can get this working the way we want it to.
38  */
39 
40 /* max # of streams per zfetch */
41 uint32_t	zfetch_max_streams = 8;
42 /* min time before stream reclaim */
43 uint32_t	zfetch_min_sec_reap = 2;
44 /* max number of blocks to fetch at a time */
45 uint32_t	zfetch_block_cap = 256;
46 /* number of bytes in a array_read at which we stop prefetching (1Mb) */
47 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
48 
49 /* forward decls for static routines */
50 static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
51 static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
52 static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
53 static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
54 static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
55 static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
56 static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
57 static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
58 static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
59 
60 /*
61  * Given a zfetch structure and a zstream structure, determine whether the
62  * blocks to be read are part of a co-linear pair of existing prefetch
63  * streams.  If a set is found, coalesce the streams, removing one, and
64  * configure the prefetch so it looks for a strided access pattern.
65  *
66  * In other words: if we find two sequential access streams that are
67  * the same length and distance N appart, and this read is N from the
68  * last stream, then we are probably in a strided access pattern.  So
69  * combine the two sequential streams into a single strided stream.
70  *
71  * If no co-linear streams are found, return NULL.
72  */
73 static int
74 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
75 {
76 	zstream_t	*z_walk;
77 	zstream_t	*z_comp;
78 
79 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
80 		return (0);
81 
82 	if (zh == NULL) {
83 		rw_exit(&zf->zf_rwlock);
84 		return (0);
85 	}
86 
87 	for (z_walk = list_head(&zf->zf_stream); z_walk;
88 	    z_walk = list_next(&zf->zf_stream, z_walk)) {
89 		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
90 		    z_comp = list_next(&zf->zf_stream, z_comp)) {
91 			int64_t		diff;
92 
93 			if (z_walk->zst_len != z_walk->zst_stride ||
94 			    z_comp->zst_len != z_comp->zst_stride) {
95 				continue;
96 			}
97 
98 			diff = z_comp->zst_offset - z_walk->zst_offset;
99 			if (z_comp->zst_offset + diff == zh->zst_offset) {
100 				z_walk->zst_offset = zh->zst_offset;
101 				z_walk->zst_direction = diff < 0 ? -1 : 1;
102 				z_walk->zst_stride =
103 				    diff * z_walk->zst_direction;
104 				z_walk->zst_ph_offset =
105 				    zh->zst_offset + z_walk->zst_stride;
106 				dmu_zfetch_stream_remove(zf, z_comp);
107 				mutex_destroy(&z_comp->zst_lock);
108 				kmem_free(z_comp, sizeof (zstream_t));
109 
110 				dmu_zfetch_dofetch(zf, z_walk);
111 
112 				rw_exit(&zf->zf_rwlock);
113 				return (1);
114 			}
115 
116 			diff = z_walk->zst_offset - z_comp->zst_offset;
117 			if (z_walk->zst_offset + diff == zh->zst_offset) {
118 				z_walk->zst_offset = zh->zst_offset;
119 				z_walk->zst_direction = diff < 0 ? -1 : 1;
120 				z_walk->zst_stride =
121 				    diff * z_walk->zst_direction;
122 				z_walk->zst_ph_offset =
123 				    zh->zst_offset + z_walk->zst_stride;
124 				dmu_zfetch_stream_remove(zf, z_comp);
125 				mutex_destroy(&z_comp->zst_lock);
126 				kmem_free(z_comp, sizeof (zstream_t));
127 
128 				dmu_zfetch_dofetch(zf, z_walk);
129 
130 				rw_exit(&zf->zf_rwlock);
131 				return (1);
132 			}
133 		}
134 	}
135 
136 	rw_exit(&zf->zf_rwlock);
137 	return (0);
138 }
139 
140 /*
141  * Given a zstream_t, determine the bounds of the prefetch.  Then call the
142  * routine that actually prefetches the individual blocks.
143  */
144 static void
145 dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
146 {
147 	uint64_t	prefetch_tail;
148 	uint64_t	prefetch_limit;
149 	uint64_t	prefetch_ofst;
150 	uint64_t	prefetch_len;
151 	uint64_t	blocks_fetched;
152 
153 	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
154 	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
155 
156 	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
157 	    (int64_t)(zs->zst_offset + zs->zst_stride));
158 	/*
159 	 * XXX: use a faster division method?
160 	 */
161 	prefetch_limit = zs->zst_offset + zs->zst_len +
162 	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
163 
164 	while (prefetch_tail < prefetch_limit) {
165 		prefetch_ofst = zs->zst_offset + zs->zst_direction *
166 		    (prefetch_tail - zs->zst_offset);
167 
168 		prefetch_len = zs->zst_len;
169 
170 		/*
171 		 * Don't prefetch beyond the end of the file, if working
172 		 * backwards.
173 		 */
174 		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
175 		    (prefetch_ofst > prefetch_tail)) {
176 			prefetch_len += prefetch_ofst;
177 			prefetch_ofst = 0;
178 		}
179 
180 		/* don't prefetch more than we're supposed to */
181 		if (prefetch_len > zs->zst_len)
182 			break;
183 
184 		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
185 		    prefetch_ofst, zs->zst_len);
186 
187 		prefetch_tail += zs->zst_stride;
188 		/* stop if we've run out of stuff to prefetch */
189 		if (blocks_fetched < zs->zst_len)
190 			break;
191 	}
192 	zs->zst_ph_offset = prefetch_tail;
193 	zs->zst_last = lbolt;
194 }
195 
196 /*
197  * This takes a pointer to a zfetch structure and a dnode.  It performs the
198  * necessary setup for the zfetch structure, grokking data from the
199  * associated dnode.
200  */
201 void
202 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
203 {
204 	if (zf == NULL) {
205 		return;
206 	}
207 
208 	zf->zf_dnode = dno;
209 	zf->zf_stream_cnt = 0;
210 	zf->zf_alloc_fail = 0;
211 
212 	list_create(&zf->zf_stream, sizeof (zstream_t),
213 	    offsetof(zstream_t, zst_node));
214 
215 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
216 }
217 
218 /*
219  * This function computes the actual size, in blocks, that can be prefetched,
220  * and fetches it.
221  */
222 static uint64_t
223 dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
224 {
225 	uint64_t	fetchsz;
226 	uint64_t	i;
227 
228 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
229 
230 	for (i = 0; i < fetchsz; i++) {
231 		dbuf_prefetch(dn, blkid + i);
232 	}
233 
234 	return (fetchsz);
235 }
236 
237 /*
238  * this function returns the number of blocks that would be prefetched, based
239  * upon the supplied dnode, blockid, and nblks.  This is used so that we can
240  * update streams in place, and then prefetch with their old value after the
241  * fact.  This way, we can delay the prefetch, but subsequent accesses to the
242  * stream won't result in the same data being prefetched multiple times.
243  */
244 static uint64_t
245 dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
246 {
247 	uint64_t	fetchsz;
248 
249 	if (blkid > dn->dn_maxblkid) {
250 		return (0);
251 	}
252 
253 	/* compute fetch size */
254 	if (blkid + nblks + 1 > dn->dn_maxblkid) {
255 		fetchsz = (dn->dn_maxblkid - blkid) + 1;
256 		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
257 	} else {
258 		fetchsz = nblks;
259 	}
260 
261 
262 	return (fetchsz);
263 }
264 
265 /*
266  * given a zfetch and a zsearch structure, see if there is an associated zstream
267  * for this block read.  If so, it starts a prefetch for the stream it
268  * located and returns true, otherwise it returns false
269  */
270 static int
271 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
272 {
273 	zstream_t	*zs;
274 	int64_t		diff;
275 	int		reset = !prefetched;
276 	int		rc = 0;
277 
278 	if (zh == NULL)
279 		return (0);
280 
281 	/*
282 	 * XXX: This locking strategy is a bit coarse; however, it's impact has
283 	 * yet to be tested.  If this turns out to be an issue, it can be
284 	 * modified in a number of different ways.
285 	 */
286 
287 	rw_enter(&zf->zf_rwlock, RW_READER);
288 top:
289 
290 	for (zs = list_head(&zf->zf_stream); zs;
291 	    zs = list_next(&zf->zf_stream, zs)) {
292 
293 		/*
294 		 * XXX - should this be an assert?
295 		 */
296 		if (zs->zst_len == 0) {
297 			/* bogus stream */
298 			continue;
299 		}
300 
301 		/*
302 		 * We hit this case when we are in a strided prefetch stream:
303 		 * we will read "len" blocks before "striding".
304 		 */
305 		if (zh->zst_offset >= zs->zst_offset &&
306 		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
307 			/* already fetched */
308 			rc = 1;
309 			goto out;
310 		}
311 
312 		/*
313 		 * This is the forward sequential read case: we increment
314 		 * len by one each time we hit here, so we will enter this
315 		 * case on every read.
316 		 */
317 		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
318 
319 			reset = !prefetched && zs->zst_len > 1;
320 
321 			mutex_enter(&zs->zst_lock);
322 
323 			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
324 				mutex_exit(&zs->zst_lock);
325 				goto top;
326 			}
327 			zs->zst_len += zh->zst_len;
328 			diff = zs->zst_len - zfetch_block_cap;
329 			if (diff > 0) {
330 				zs->zst_offset += diff;
331 				zs->zst_len = zs->zst_len > diff ?
332 				    zs->zst_len - diff : 0;
333 			}
334 			zs->zst_direction = ZFETCH_FORWARD;
335 
336 			break;
337 
338 		/*
339 		 * Same as above, but reading backwards through the file.
340 		 */
341 		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
342 			/* backwards sequential access */
343 
344 			reset = !prefetched && zs->zst_len > 1;
345 
346 			mutex_enter(&zs->zst_lock);
347 
348 			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
349 				mutex_exit(&zs->zst_lock);
350 				goto top;
351 			}
352 
353 			zs->zst_offset = zs->zst_offset > zh->zst_len ?
354 			    zs->zst_offset - zh->zst_len : 0;
355 			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
356 			    zs->zst_ph_offset - zh->zst_len : 0;
357 			zs->zst_len += zh->zst_len;
358 
359 			diff = zs->zst_len - zfetch_block_cap;
360 			if (diff > 0) {
361 				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
362 				    zs->zst_ph_offset - diff : 0;
363 				zs->zst_len = zs->zst_len > diff ?
364 				    zs->zst_len - diff : zs->zst_len;
365 			}
366 			zs->zst_direction = ZFETCH_BACKWARD;
367 
368 			break;
369 
370 		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
371 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
372 			/* strided forward access */
373 
374 			mutex_enter(&zs->zst_lock);
375 
376 			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
377 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
378 				mutex_exit(&zs->zst_lock);
379 				goto top;
380 			}
381 
382 			zs->zst_offset += zs->zst_stride;
383 			zs->zst_direction = ZFETCH_FORWARD;
384 
385 			break;
386 
387 		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
388 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
389 			/* strided reverse access */
390 
391 			mutex_enter(&zs->zst_lock);
392 
393 			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
394 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
395 				mutex_exit(&zs->zst_lock);
396 				goto top;
397 			}
398 
399 			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
400 			    zs->zst_offset - zs->zst_stride : 0;
401 			zs->zst_ph_offset = (zs->zst_ph_offset >
402 			    (2 * zs->zst_stride)) ?
403 			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
404 			zs->zst_direction = ZFETCH_BACKWARD;
405 
406 			break;
407 		}
408 	}
409 
410 	if (zs) {
411 		if (reset) {
412 			zstream_t *remove = zs;
413 
414 			rc = 0;
415 			mutex_exit(&zs->zst_lock);
416 			rw_exit(&zf->zf_rwlock);
417 			rw_enter(&zf->zf_rwlock, RW_WRITER);
418 			/*
419 			 * Relocate the stream, in case someone removes
420 			 * it while we were acquiring the WRITER lock.
421 			 */
422 			for (zs = list_head(&zf->zf_stream); zs;
423 			    zs = list_next(&zf->zf_stream, zs)) {
424 				if (zs == remove) {
425 					dmu_zfetch_stream_remove(zf, zs);
426 					mutex_destroy(&zs->zst_lock);
427 					kmem_free(zs, sizeof (zstream_t));
428 					break;
429 				}
430 			}
431 		} else {
432 			rc = 1;
433 			dmu_zfetch_dofetch(zf, zs);
434 			mutex_exit(&zs->zst_lock);
435 		}
436 	}
437 out:
438 	rw_exit(&zf->zf_rwlock);
439 	return (rc);
440 }
441 
442 /*
443  * Clean-up state associated with a zfetch structure.  This frees allocated
444  * structure members, empties the zf_stream tree, and generally makes things
445  * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
446  */
447 void
448 dmu_zfetch_rele(zfetch_t *zf)
449 {
450 	zstream_t	*zs;
451 	zstream_t	*zs_next;
452 
453 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
454 
455 	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
456 		zs_next = list_next(&zf->zf_stream, zs);
457 
458 		list_remove(&zf->zf_stream, zs);
459 		mutex_destroy(&zs->zst_lock);
460 		kmem_free(zs, sizeof (zstream_t));
461 	}
462 	list_destroy(&zf->zf_stream);
463 	rw_destroy(&zf->zf_rwlock);
464 
465 	zf->zf_dnode = NULL;
466 }
467 
468 /*
469  * Given a zfetch and zstream structure, insert the zstream structure into the
470  * AVL tree contained within the zfetch structure.  Peform the appropriate
471  * book-keeping.  It is possible that another thread has inserted a stream which
472  * matches one that we are about to insert, so we must be sure to check for this
473  * case.  If one is found, return failure, and let the caller cleanup the
474  * duplicates.
475  */
476 static int
477 dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
478 {
479 	zstream_t	*zs_walk;
480 	zstream_t	*zs_next;
481 
482 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
483 
484 	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
485 		zs_next = list_next(&zf->zf_stream, zs_walk);
486 
487 		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
488 		    return (0);
489 		}
490 	}
491 
492 	list_insert_head(&zf->zf_stream, zs);
493 	zf->zf_stream_cnt++;
494 
495 	return (1);
496 }
497 
498 
499 /*
500  * Walk the list of zstreams in the given zfetch, find an old one (by time), and
501  * reclaim it for use by the caller.
502  */
503 static zstream_t *
504 dmu_zfetch_stream_reclaim(zfetch_t *zf)
505 {
506 	zstream_t	*zs;
507 
508 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
509 		return (0);
510 
511 	for (zs = list_head(&zf->zf_stream); zs;
512 	    zs = list_next(&zf->zf_stream, zs)) {
513 
514 		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
515 			break;
516 	}
517 
518 	if (zs) {
519 		dmu_zfetch_stream_remove(zf, zs);
520 		mutex_destroy(&zs->zst_lock);
521 		bzero(zs, sizeof (zstream_t));
522 	} else {
523 		zf->zf_alloc_fail++;
524 	}
525 	rw_exit(&zf->zf_rwlock);
526 
527 	return (zs);
528 }
529 
530 /*
531  * Given a zfetch and zstream structure, remove the zstream structure from its
532  * container in the zfetch structure.  Perform the appropriate book-keeping.
533  */
534 static void
535 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
536 {
537 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
538 
539 	list_remove(&zf->zf_stream, zs);
540 	zf->zf_stream_cnt--;
541 }
542 
543 static int
544 dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
545 {
546 	if (zs1->zst_offset != zs2->zst_offset)
547 		return (0);
548 
549 	if (zs1->zst_len != zs2->zst_len)
550 		return (0);
551 
552 	if (zs1->zst_stride != zs2->zst_stride)
553 		return (0);
554 
555 	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
556 		return (0);
557 
558 	if (zs1->zst_cap != zs2->zst_cap)
559 		return (0);
560 
561 	if (zs1->zst_direction != zs2->zst_direction)
562 		return (0);
563 
564 	return (1);
565 }
566 
567 /*
568  * This is the prefetch entry point.  It calls all of the other dmu_zfetch
569  * routines to create, delete, find, or operate upon prefetch streams.
570  */
571 void
572 dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
573 {
574 	zstream_t	zst;
575 	zstream_t	*newstream;
576 	int		fetched;
577 	int		inserted;
578 	unsigned int	blkshft;
579 	uint64_t	blksz;
580 
581 	/* files that aren't ln2 blocksz are only one block -- nothing to do */
582 	if (!zf->zf_dnode->dn_datablkshift) {
583 		return;
584 	}
585 
586 	/* convert offset and size, into blockid and nblocks */
587 	blkshft = zf->zf_dnode->dn_datablkshift;
588 	blksz = (1 << blkshft);
589 
590 	bzero(&zst, sizeof (zstream_t));
591 	zst.zst_offset = offset >> blkshft;
592 	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
593 	    P2ALIGN(offset, blksz)) >> blkshft;
594 
595 	fetched = dmu_zfetch_find(zf, &zst, prefetched);
596 	if (!fetched) {
597 		fetched = dmu_zfetch_colinear(zf, &zst);
598 	}
599 
600 	if (!fetched) {
601 		newstream = dmu_zfetch_stream_reclaim(zf);
602 
603 		/*
604 		 * we still couldn't find a stream, drop the lock, and allocate
605 		 * one if possible.  Otherwise, give up and go home.
606 		 */
607 		if (newstream == NULL) {
608 			uint64_t	maxblocks;
609 			uint32_t	max_streams;
610 			uint32_t	cur_streams;
611 
612 			cur_streams = zf->zf_stream_cnt;
613 			maxblocks = zf->zf_dnode->dn_maxblkid;
614 
615 			max_streams = MIN(zfetch_max_streams,
616 			    (maxblocks / zfetch_block_cap));
617 			if (max_streams == 0) {
618 				max_streams++;
619 			}
620 
621 			if (cur_streams >= max_streams) {
622 				return;
623 			}
624 
625 			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
626 		}
627 
628 		newstream->zst_offset = zst.zst_offset;
629 		newstream->zst_len = zst.zst_len;
630 		newstream->zst_stride = zst.zst_len;
631 		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
632 		newstream->zst_cap = zst.zst_len;
633 		newstream->zst_direction = ZFETCH_FORWARD;
634 		newstream->zst_last = lbolt;
635 
636 		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
637 
638 		rw_enter(&zf->zf_rwlock, RW_WRITER);
639 		inserted = dmu_zfetch_stream_insert(zf, newstream);
640 		rw_exit(&zf->zf_rwlock);
641 
642 		if (!inserted) {
643 			mutex_destroy(&newstream->zst_lock);
644 			kmem_free(newstream, sizeof (zstream_t));
645 		}
646 	}
647 }
648