xref: /freebsd/sys/contrib/openzfs/module/zfs/multilist.c (revision d500a85e640d1cd270747c12e17c511b53864436)
1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15 /*
16  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
17  */
18 
19 #include <sys/zfs_context.h>
20 #include <sys/multilist.h>
21 #include <sys/trace_zfs.h>
22 
23 /* needed for spa_get_random() */
24 #include <sys/spa.h>
25 
26 /*
27  * This overrides the number of sublists in each multilist_t, which defaults
28  * to the number of CPUs in the system (see multilist_create()).
29  */
30 int zfs_multilist_num_sublists = 0;
31 
32 /*
33  * Given the object contained on the list, return a pointer to the
34  * object's multilist_node_t structure it contains.
35  */
36 #ifdef ZFS_DEBUG
37 static multilist_node_t *
38 multilist_d2l(multilist_t *ml, void *obj)
39 {
40 	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
41 }
42 #endif
43 
44 /*
45  * Initialize a new mutlilist using the parameters specified.
46  *
47  *  - 'size' denotes the size of the structure containing the
48  *     multilist_node_t.
49  *  - 'offset' denotes the byte offset of the mutlilist_node_t within
50  *     the structure that contains it.
51  *  - 'num' specifies the number of internal sublists to create.
52  *  - 'index_func' is used to determine which sublist to insert into
53  *     when the multilist_insert() function is called; as well as which
54  *     sublist to remove from when multilist_remove() is called. The
55  *     requirements this function must meet, are the following:
56  *
57  *      - It must always return the same value when called on the same
58  *        object (to ensure the object is removed from the list it was
59  *        inserted into).
60  *
61  *      - It must return a value in the range [0, number of sublists).
62  *        The multilist_get_num_sublists() function may be used to
63  *        determine the number of sublists in the multilist.
64  *
65  *     Also, in order to reduce internal contention between the sublists
66  *     during insertion and removal, this function should choose evenly
67  *     between all available sublists when inserting. This isn't a hard
68  *     requirement, but a general rule of thumb in order to garner the
69  *     best multi-threaded performance out of the data structure.
70  */
71 static multilist_t *
72 multilist_create_impl(size_t size, size_t offset,
73     unsigned int num, multilist_sublist_index_func_t *index_func)
74 {
75 	ASSERT3U(size, >, 0);
76 	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
77 	ASSERT3U(num, >, 0);
78 	ASSERT3P(index_func, !=, NULL);
79 
80 	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
81 	ml->ml_offset = offset;
82 	ml->ml_num_sublists = num;
83 	ml->ml_index_func = index_func;
84 
85 	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
86 	    ml->ml_num_sublists, KM_SLEEP);
87 
88 	ASSERT3P(ml->ml_sublists, !=, NULL);
89 
90 	for (int i = 0; i < ml->ml_num_sublists; i++) {
91 		multilist_sublist_t *mls = &ml->ml_sublists[i];
92 		mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL);
93 		list_create(&mls->mls_list, size, offset);
94 	}
95 	return (ml);
96 }
97 
98 /*
99  * Allocate a new multilist, using the default number of sublists (the number
100  * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
101  * that the multilists do not expand if more CPUs are hot-added. In that case,
102  * we will have less fanout than boot_ncpus, but we don't want to always
103  * reserve the RAM necessary to create the extra slots for additional CPUs up
104  * front, and dynamically adding them is a complex task.
105  */
106 multilist_t *
107 multilist_create(size_t size, size_t offset,
108     multilist_sublist_index_func_t *index_func)
109 {
110 	int num_sublists;
111 
112 	if (zfs_multilist_num_sublists > 0) {
113 		num_sublists = zfs_multilist_num_sublists;
114 	} else {
115 		num_sublists = MAX(boot_ncpus, 4);
116 	}
117 
118 	return (multilist_create_impl(size, offset, num_sublists, index_func));
119 }
120 
121 /*
122  * Destroy the given multilist object, and free up any memory it holds.
123  */
124 void
125 multilist_destroy(multilist_t *ml)
126 {
127 	ASSERT(multilist_is_empty(ml));
128 
129 	for (int i = 0; i < ml->ml_num_sublists; i++) {
130 		multilist_sublist_t *mls = &ml->ml_sublists[i];
131 
132 		ASSERT(list_is_empty(&mls->mls_list));
133 
134 		list_destroy(&mls->mls_list);
135 		mutex_destroy(&mls->mls_lock);
136 	}
137 
138 	ASSERT3P(ml->ml_sublists, !=, NULL);
139 	kmem_free(ml->ml_sublists,
140 	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
141 
142 	ml->ml_num_sublists = 0;
143 	ml->ml_offset = 0;
144 	kmem_free(ml, sizeof (multilist_t));
145 }
146 
147 /*
148  * Insert the given object into the multilist.
149  *
150  * This function will insert the object specified into the sublist
151  * determined using the function given at multilist creation time.
152  *
153  * The sublist locks are automatically acquired if not already held, to
154  * ensure consistency when inserting and removing from multiple threads.
155  */
156 void
157 multilist_insert(multilist_t *ml, void *obj)
158 {
159 	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
160 	multilist_sublist_t *mls;
161 	boolean_t need_lock;
162 
163 	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
164 	    unsigned int, sublist_idx, void *, obj);
165 
166 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
167 
168 	mls = &ml->ml_sublists[sublist_idx];
169 
170 	/*
171 	 * Note: Callers may already hold the sublist lock by calling
172 	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
173 	 * returning TRUE if and only if the current thread holds the
174 	 * lock.  While it's a little ugly to make the lock recursive in
175 	 * this way, it works and allows the calling code to be much
176 	 * simpler -- otherwise it would have to pass around a flag
177 	 * indicating that it already has the lock.
178 	 */
179 	need_lock = !MUTEX_HELD(&mls->mls_lock);
180 
181 	if (need_lock)
182 		mutex_enter(&mls->mls_lock);
183 
184 	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
185 
186 	multilist_sublist_insert_head(mls, obj);
187 
188 	if (need_lock)
189 		mutex_exit(&mls->mls_lock);
190 }
191 
192 /*
193  * Remove the given object from the multilist.
194  *
195  * This function will remove the object specified from the sublist
196  * determined using the function given at multilist creation time.
197  *
198  * The necessary sublist locks are automatically acquired, to ensure
199  * consistency when inserting and removing from multiple threads.
200  */
201 void
202 multilist_remove(multilist_t *ml, void *obj)
203 {
204 	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
205 	multilist_sublist_t *mls;
206 	boolean_t need_lock;
207 
208 	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
209 	    unsigned int, sublist_idx, void *, obj);
210 
211 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
212 
213 	mls = &ml->ml_sublists[sublist_idx];
214 	/* See comment in multilist_insert(). */
215 	need_lock = !MUTEX_HELD(&mls->mls_lock);
216 
217 	if (need_lock)
218 		mutex_enter(&mls->mls_lock);
219 
220 	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
221 
222 	multilist_sublist_remove(mls, obj);
223 
224 	if (need_lock)
225 		mutex_exit(&mls->mls_lock);
226 }
227 
228 /*
229  * Check to see if this multilist object is empty.
230  *
231  * This will return TRUE if it finds all of the sublists of this
232  * multilist to be empty, and FALSE otherwise. Each sublist lock will be
233  * automatically acquired as necessary.
234  *
235  * If concurrent insertions and removals are occurring, the semantics
236  * of this function become a little fuzzy. Instead of locking all
237  * sublists for the entire call time of the function, each sublist is
238  * only locked as it is individually checked for emptiness. Thus, it's
239  * possible for this function to return TRUE with non-empty sublists at
240  * the time the function returns. This would be due to another thread
241  * inserting into a given sublist, after that specific sublist was check
242  * and deemed empty, but before all sublists have been checked.
243  */
244 int
245 multilist_is_empty(multilist_t *ml)
246 {
247 	for (int i = 0; i < ml->ml_num_sublists; i++) {
248 		multilist_sublist_t *mls = &ml->ml_sublists[i];
249 		/* See comment in multilist_insert(). */
250 		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
251 
252 		if (need_lock)
253 			mutex_enter(&mls->mls_lock);
254 
255 		if (!list_is_empty(&mls->mls_list)) {
256 			if (need_lock)
257 				mutex_exit(&mls->mls_lock);
258 
259 			return (FALSE);
260 		}
261 
262 		if (need_lock)
263 			mutex_exit(&mls->mls_lock);
264 	}
265 
266 	return (TRUE);
267 }
268 
269 /* Return the number of sublists composing this multilist */
270 unsigned int
271 multilist_get_num_sublists(multilist_t *ml)
272 {
273 	return (ml->ml_num_sublists);
274 }
275 
276 /* Return a randomly selected, valid sublist index for this multilist */
277 unsigned int
278 multilist_get_random_index(multilist_t *ml)
279 {
280 	return (spa_get_random(ml->ml_num_sublists));
281 }
282 
283 /* Lock and return the sublist specified at the given index */
284 multilist_sublist_t *
285 multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
286 {
287 	multilist_sublist_t *mls;
288 
289 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
290 	mls = &ml->ml_sublists[sublist_idx];
291 	mutex_enter(&mls->mls_lock);
292 
293 	return (mls);
294 }
295 
296 /* Lock and return the sublist that would be used to store the specified obj */
297 multilist_sublist_t *
298 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
299 {
300 	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
301 }
302 
303 void
304 multilist_sublist_unlock(multilist_sublist_t *mls)
305 {
306 	mutex_exit(&mls->mls_lock);
307 }
308 
309 /*
310  * We're allowing any object to be inserted into this specific sublist,
311  * but this can lead to trouble if multilist_remove() is called to
312  * remove this object. Specifically, if calling ml_index_func on this
313  * object returns an index for sublist different than what is passed as
314  * a parameter here, any call to multilist_remove() with this newly
315  * inserted object is undefined! (the call to multilist_remove() will
316  * remove the object from a list that it isn't contained in)
317  */
318 void
319 multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
320 {
321 	ASSERT(MUTEX_HELD(&mls->mls_lock));
322 	list_insert_head(&mls->mls_list, obj);
323 }
324 
325 /* please see comment above multilist_sublist_insert_head */
326 void
327 multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
328 {
329 	ASSERT(MUTEX_HELD(&mls->mls_lock));
330 	list_insert_tail(&mls->mls_list, obj);
331 }
332 
333 /*
334  * Move the object one element forward in the list.
335  *
336  * This function will move the given object forward in the list (towards
337  * the head) by one object. So, in essence, it will swap its position in
338  * the list with its "prev" pointer. If the given object is already at the
339  * head of the list, it cannot be moved forward any more than it already
340  * is, so no action is taken.
341  *
342  * NOTE: This function **must not** remove any object from the list other
343  *       than the object given as the parameter. This is relied upon in
344  *       arc_evict_state_impl().
345  */
346 void
347 multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
348 {
349 	void *prev = list_prev(&mls->mls_list, obj);
350 
351 	ASSERT(MUTEX_HELD(&mls->mls_lock));
352 	ASSERT(!list_is_empty(&mls->mls_list));
353 
354 	/* 'obj' must be at the head of the list, nothing to do */
355 	if (prev == NULL)
356 		return;
357 
358 	list_remove(&mls->mls_list, obj);
359 	list_insert_before(&mls->mls_list, prev, obj);
360 }
361 
362 void
363 multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
364 {
365 	ASSERT(MUTEX_HELD(&mls->mls_lock));
366 	list_remove(&mls->mls_list, obj);
367 }
368 
369 int
370 multilist_sublist_is_empty(multilist_sublist_t *mls)
371 {
372 	ASSERT(MUTEX_HELD(&mls->mls_lock));
373 	return (list_is_empty(&mls->mls_list));
374 }
375 
376 int
377 multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
378 {
379 	multilist_sublist_t *mls;
380 	int empty;
381 
382 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
383 	mls = &ml->ml_sublists[sublist_idx];
384 	ASSERT(!MUTEX_HELD(&mls->mls_lock));
385 	mutex_enter(&mls->mls_lock);
386 	empty = list_is_empty(&mls->mls_list);
387 	mutex_exit(&mls->mls_lock);
388 	return (empty);
389 }
390 
391 void *
392 multilist_sublist_head(multilist_sublist_t *mls)
393 {
394 	ASSERT(MUTEX_HELD(&mls->mls_lock));
395 	return (list_head(&mls->mls_list));
396 }
397 
398 void *
399 multilist_sublist_tail(multilist_sublist_t *mls)
400 {
401 	ASSERT(MUTEX_HELD(&mls->mls_lock));
402 	return (list_tail(&mls->mls_list));
403 }
404 
405 void *
406 multilist_sublist_next(multilist_sublist_t *mls, void *obj)
407 {
408 	ASSERT(MUTEX_HELD(&mls->mls_lock));
409 	return (list_next(&mls->mls_list, obj));
410 }
411 
412 void *
413 multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
414 {
415 	ASSERT(MUTEX_HELD(&mls->mls_lock));
416 	return (list_prev(&mls->mls_list, obj));
417 }
418 
419 void
420 multilist_link_init(multilist_node_t *link)
421 {
422 	list_link_init(link);
423 }
424 
425 int
426 multilist_link_active(multilist_node_t *link)
427 {
428 	return (list_link_active(link));
429 }
430 
431 /* BEGIN CSTYLED */
432 ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW,
433 	"Number of sublists used in each multilist");
434 /* END CSTYLED */
435