xref: /illumos-gate/usr/src/uts/common/fs/zfs/multilist.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15 /*
16  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
17  */
18 
19 #include <sys/zfs_context.h>
20 #include <sys/multilist.h>
21 
22 /* needed for spa_get_random() */
23 #include <sys/spa.h>
24 
25 /*
26  * This overrides the number of sublists in each multilist_t, which defaults
27  * to the number of CPUs in the system (see multilist_create()).
28  */
29 int zfs_multilist_num_sublists = 0;
30 
31 /*
32  * Given the object contained on the list, return a pointer to the
33  * object's multilist_node_t structure it contains.
34  */
35 static multilist_node_t *
36 multilist_d2l(multilist_t *ml, void *obj)
37 {
38 	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
39 }
40 
41 /*
42  * Initialize a new mutlilist using the parameters specified.
43  *
44  *  - 'size' denotes the size of the structure containing the
45  *     multilist_node_t.
46  *  - 'offset' denotes the byte offset of the mutlilist_node_t within
47  *     the structure that contains it.
48  *  - 'num' specifies the number of internal sublists to create.
49  *  - 'index_func' is used to determine which sublist to insert into
50  *     when the multilist_insert() function is called; as well as which
51  *     sublist to remove from when multilist_remove() is called. The
52  *     requirements this function must meet, are the following:
53  *
54  *      - It must always return the same value when called on the same
55  *        object (to ensure the object is removed from the list it was
56  *        inserted into).
57  *
58  *      - It must return a value in the range [0, number of sublists).
59  *        The multilist_get_num_sublists() function may be used to
60  *        determine the number of sublists in the multilist.
61  *
62  *     Also, in order to reduce internal contention between the sublists
63  *     during insertion and removal, this function should choose evenly
64  *     between all available sublists when inserting. This isn't a hard
65  *     requirement, but a general rule of thumb in order to garner the
66  *     best multi-threaded performance out of the data structure.
67  */
68 static multilist_t *
69 multilist_create_impl(size_t size, size_t offset,
70     unsigned int num, multilist_sublist_index_func_t *index_func)
71 {
72 	ASSERT3U(size, >, 0);
73 	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
74 	ASSERT3U(num, >, 0);
75 	ASSERT3P(index_func, !=, NULL);
76 
77 	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
78 	ml->ml_offset = offset;
79 	ml->ml_num_sublists = num;
80 	ml->ml_index_func = index_func;
81 
82 	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
83 	    ml->ml_num_sublists, KM_SLEEP);
84 
85 	ASSERT3P(ml->ml_sublists, !=, NULL);
86 
87 	for (int i = 0; i < ml->ml_num_sublists; i++) {
88 		multilist_sublist_t *mls = &ml->ml_sublists[i];
89 		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
90 		list_create(&mls->mls_list, size, offset);
91 	}
92 	return (ml);
93 }
94 
95 /*
96  * Allocate a new multilist, using the default number of sublists
97  * (the number of CPUs, or at least 4, or the tunable
98  * zfs_multilist_num_sublists).
99  */
100 multilist_t *
101 multilist_create(size_t size, size_t offset,
102     multilist_sublist_index_func_t *index_func)
103 {
104 	int num_sublists;
105 
106 	if (zfs_multilist_num_sublists > 0) {
107 		num_sublists = zfs_multilist_num_sublists;
108 	} else {
109 		num_sublists = MAX(boot_ncpus, 4);
110 	}
111 
112 	return (multilist_create_impl(size, offset, num_sublists, index_func));
113 }
114 
115 /*
116  * Destroy the given multilist object, and free up any memory it holds.
117  */
118 void
119 multilist_destroy(multilist_t *ml)
120 {
121 	ASSERT(multilist_is_empty(ml));
122 
123 	for (int i = 0; i < ml->ml_num_sublists; i++) {
124 		multilist_sublist_t *mls = &ml->ml_sublists[i];
125 
126 		ASSERT(list_is_empty(&mls->mls_list));
127 
128 		list_destroy(&mls->mls_list);
129 		mutex_destroy(&mls->mls_lock);
130 	}
131 
132 	ASSERT3P(ml->ml_sublists, !=, NULL);
133 	kmem_free(ml->ml_sublists,
134 	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
135 
136 	ml->ml_num_sublists = 0;
137 	ml->ml_offset = 0;
138 	kmem_free(ml, sizeof (multilist_t));
139 }
140 
141 /*
142  * Insert the given object into the multilist.
143  *
144  * This function will insert the object specified into the sublist
145  * determined using the function given at multilist creation time.
146  *
147  * The sublist locks are automatically acquired if not already held, to
148  * ensure consistency when inserting and removing from multiple threads.
149  */
150 void
151 multilist_insert(multilist_t *ml, void *obj)
152 {
153 	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
154 	multilist_sublist_t *mls;
155 	boolean_t need_lock;
156 
157 	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
158 	    unsigned int, sublist_idx, void *, obj);
159 
160 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
161 
162 	mls = &ml->ml_sublists[sublist_idx];
163 
164 	/*
165 	 * Note: Callers may already hold the sublist lock by calling
166 	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
167 	 * returning TRUE if and only if the current thread holds the
168 	 * lock.  While it's a little ugly to make the lock recursive in
169 	 * this way, it works and allows the calling code to be much
170 	 * simpler -- otherwise it would have to pass around a flag
171 	 * indicating that it already has the lock.
172 	 */
173 	need_lock = !MUTEX_HELD(&mls->mls_lock);
174 
175 	if (need_lock)
176 		mutex_enter(&mls->mls_lock);
177 
178 	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
179 
180 	multilist_sublist_insert_head(mls, obj);
181 
182 	if (need_lock)
183 		mutex_exit(&mls->mls_lock);
184 }
185 
186 /*
187  * Remove the given object from the multilist.
188  *
189  * This function will remove the object specified from the sublist
190  * determined using the function given at multilist creation time.
191  *
192  * The necessary sublist locks are automatically acquired, to ensure
193  * consistency when inserting and removing from multiple threads.
194  */
195 void
196 multilist_remove(multilist_t *ml, void *obj)
197 {
198 	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
199 	multilist_sublist_t *mls;
200 	boolean_t need_lock;
201 
202 	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
203 	    unsigned int, sublist_idx, void *, obj);
204 
205 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
206 
207 	mls = &ml->ml_sublists[sublist_idx];
208 	/* See comment in multilist_insert(). */
209 	need_lock = !MUTEX_HELD(&mls->mls_lock);
210 
211 	if (need_lock)
212 		mutex_enter(&mls->mls_lock);
213 
214 	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
215 
216 	multilist_sublist_remove(mls, obj);
217 
218 	if (need_lock)
219 		mutex_exit(&mls->mls_lock);
220 }
221 
222 /*
223  * Check to see if this multilist object is empty.
224  *
225  * This will return TRUE if it finds all of the sublists of this
226  * multilist to be empty, and FALSE otherwise. Each sublist lock will be
227  * automatically acquired as necessary.
228  *
229  * If concurrent insertions and removals are occurring, the semantics
230  * of this function become a little fuzzy. Instead of locking all
231  * sublists for the entire call time of the function, each sublist is
232  * only locked as it is individually checked for emptiness. Thus, it's
233  * possible for this function to return TRUE with non-empty sublists at
234  * the time the function returns. This would be due to another thread
235  * inserting into a given sublist, after that specific sublist was check
236  * and deemed empty, but before all sublists have been checked.
237  */
238 int
239 multilist_is_empty(multilist_t *ml)
240 {
241 	for (int i = 0; i < ml->ml_num_sublists; i++) {
242 		multilist_sublist_t *mls = &ml->ml_sublists[i];
243 		/* See comment in multilist_insert(). */
244 		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
245 
246 		if (need_lock)
247 			mutex_enter(&mls->mls_lock);
248 
249 		if (!list_is_empty(&mls->mls_list)) {
250 			if (need_lock)
251 				mutex_exit(&mls->mls_lock);
252 
253 			return (FALSE);
254 		}
255 
256 		if (need_lock)
257 			mutex_exit(&mls->mls_lock);
258 	}
259 
260 	return (TRUE);
261 }
262 
263 /* Return the number of sublists composing this multilist */
264 unsigned int
265 multilist_get_num_sublists(multilist_t *ml)
266 {
267 	return (ml->ml_num_sublists);
268 }
269 
270 /* Return a randomly selected, valid sublist index for this multilist */
271 unsigned int
272 multilist_get_random_index(multilist_t *ml)
273 {
274 	return (spa_get_random(ml->ml_num_sublists));
275 }
276 
277 /* Lock and return the sublist specified at the given index */
278 multilist_sublist_t *
279 multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
280 {
281 	multilist_sublist_t *mls;
282 
283 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
284 	mls = &ml->ml_sublists[sublist_idx];
285 	mutex_enter(&mls->mls_lock);
286 
287 	return (mls);
288 }
289 
290 /* Lock and return the sublist that would be used to store the specified obj */
291 multilist_sublist_t *
292 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
293 {
294 	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
295 }
296 
297 void
298 multilist_sublist_unlock(multilist_sublist_t *mls)
299 {
300 	mutex_exit(&mls->mls_lock);
301 }
302 
303 /*
304  * We're allowing any object to be inserted into this specific sublist,
305  * but this can lead to trouble if multilist_remove() is called to
306  * remove this object. Specifically, if calling ml_index_func on this
307  * object returns an index for sublist different than what is passed as
308  * a parameter here, any call to multilist_remove() with this newly
309  * inserted object is undefined! (the call to multilist_remove() will
310  * remove the object from a list that it isn't contained in)
311  */
312 void
313 multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
314 {
315 	ASSERT(MUTEX_HELD(&mls->mls_lock));
316 	list_insert_head(&mls->mls_list, obj);
317 }
318 
319 /* please see comment above multilist_sublist_insert_head */
320 void
321 multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
322 {
323 	ASSERT(MUTEX_HELD(&mls->mls_lock));
324 	list_insert_tail(&mls->mls_list, obj);
325 }
326 
327 /*
328  * Move the object one element forward in the list.
329  *
330  * This function will move the given object forward in the list (towards
331  * the head) by one object. So, in essence, it will swap its position in
332  * the list with its "prev" pointer. If the given object is already at the
333  * head of the list, it cannot be moved forward any more than it already
334  * is, so no action is taken.
335  *
336  * NOTE: This function **must not** remove any object from the list other
337  *       than the object given as the parameter. This is relied upon in
338  *       arc_evict_state_impl().
339  */
340 void
341 multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
342 {
343 	void *prev = list_prev(&mls->mls_list, obj);
344 
345 	ASSERT(MUTEX_HELD(&mls->mls_lock));
346 	ASSERT(!list_is_empty(&mls->mls_list));
347 
348 	/* 'obj' must be at the head of the list, nothing to do */
349 	if (prev == NULL)
350 		return;
351 
352 	list_remove(&mls->mls_list, obj);
353 	list_insert_before(&mls->mls_list, prev, obj);
354 }
355 
356 void
357 multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
358 {
359 	ASSERT(MUTEX_HELD(&mls->mls_lock));
360 	list_remove(&mls->mls_list, obj);
361 }
362 
363 void *
364 multilist_sublist_head(multilist_sublist_t *mls)
365 {
366 	ASSERT(MUTEX_HELD(&mls->mls_lock));
367 	return (list_head(&mls->mls_list));
368 }
369 
370 void *
371 multilist_sublist_tail(multilist_sublist_t *mls)
372 {
373 	ASSERT(MUTEX_HELD(&mls->mls_lock));
374 	return (list_tail(&mls->mls_list));
375 }
376 
377 void *
378 multilist_sublist_next(multilist_sublist_t *mls, void *obj)
379 {
380 	ASSERT(MUTEX_HELD(&mls->mls_lock));
381 	return (list_next(&mls->mls_list, obj));
382 }
383 
384 void *
385 multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
386 {
387 	ASSERT(MUTEX_HELD(&mls->mls_lock));
388 	return (list_prev(&mls->mls_list, obj));
389 }
390 
391 void
392 multilist_link_init(multilist_node_t *link)
393 {
394 	list_link_init(link);
395 }
396 
397 int
398 multilist_link_active(multilist_node_t *link)
399 {
400 	return (list_link_active(link));
401 }
402