xref: /illumos-gate/usr/src/uts/common/fs/zfs/rrwlock.c (revision 9853d9e82e7a067a2b88dae2fd257207e6be5f94)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/refcount.h>
27 #include <sys/rrwlock.h>
28 
29 /*
30  * This file contains the implementation of a re-entrant read
31  * reader/writer lock (aka "rrwlock").
32  *
33  * This is a normal reader/writer lock with the additional feature
34  * of allowing threads who have already obtained a read lock to
35  * re-enter another read lock (re-entrant read) - even if there are
36  * waiting writers.
37  *
38  * Callers who have not obtained a read lock give waiting writers priority.
39  *
40  * The rrwlock_t lock does not allow re-entrant writers, nor does it
41  * allow a re-entrant mix of reads and writes (that is, it does not
42  * allow a caller who has already obtained a read lock to be able to
43  * then grab a write lock without first dropping all read locks, and
44  * vice versa).
45  *
46  * The rrwlock_t uses tsd (thread specific data) to keep a list of
47  * nodes (rrw_node_t), where each node keeps track of which specific
48  * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
49  * should be rare, a thread that grabs multiple reads on the same rrwlock_t
50  * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
51  * tsd list can represent a different rrwlock_t.  This allows a thread
52  * to enter multiple and unique rrwlock_ts for read locks at the same time.
53  *
54  * Since using tsd exposes some overhead, the rrwlock_t only needs to
55  * keep tsd data when writers are waiting.  If no writers are waiting, then
56  * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
57  * is needed.  Once a writer attempts to grab the lock, readers then
58  * keep tsd data and bump the linked readers count (rr_linked_rcount).
59  *
60  * If there are waiting writers and there are anonymous readers, then a
61  * reader doesn't know if it is a re-entrant lock. But since it may be one,
62  * we allow the read to proceed (otherwise it could deadlock).  Since once
63  * waiting writers are active, readers no longer bump the anonymous count,
64  * the anonymous readers will eventually flush themselves out.  At this point,
65  * readers will be able to tell if they are a re-entrant lock (have a
66  * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
67  * we must let the proceed.  If they are not, then the reader blocks for the
68  * waiting writers.  Hence, we do not starve writers.
69  */
70 
71 /* global key for TSD */
72 uint_t rrw_tsd_key;
73 
74 typedef struct rrw_node {
75 	struct rrw_node	*rn_next;
76 	rrwlock_t	*rn_rrl;
77 } rrw_node_t;
78 
79 static rrw_node_t *
80 rrn_find(rrwlock_t *rrl)
81 {
82 	rrw_node_t *rn;
83 
84 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
85 		return (NULL);
86 
87 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
88 		if (rn->rn_rrl == rrl)
89 			return (rn);
90 	}
91 	return (NULL);
92 }
93 
94 /*
95  * Add a node to the head of the singly linked list.
96  */
97 static void
98 rrn_add(rrwlock_t *rrl)
99 {
100 	rrw_node_t *rn;
101 
102 	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
103 	rn->rn_rrl = rrl;
104 	rn->rn_next = tsd_get(rrw_tsd_key);
105 	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
106 }
107 
108 /*
109  * If a node is found for 'rrl', then remove the node from this
110  * thread's list and return TRUE; otherwise return FALSE.
111  */
112 static boolean_t
113 rrn_find_and_remove(rrwlock_t *rrl)
114 {
115 	rrw_node_t *rn;
116 	rrw_node_t *prev = NULL;
117 
118 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
119 		return (B_FALSE);
120 
121 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
122 		if (rn->rn_rrl == rrl) {
123 			if (prev)
124 				prev->rn_next = rn->rn_next;
125 			else
126 				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
127 			kmem_free(rn, sizeof (*rn));
128 			return (B_TRUE);
129 		}
130 		prev = rn;
131 	}
132 	return (B_FALSE);
133 }
134 
135 void
136 rrw_init(rrwlock_t *rrl)
137 {
138 	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
139 	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
140 	rrl->rr_writer = NULL;
141 	refcount_create(&rrl->rr_anon_rcount);
142 	refcount_create(&rrl->rr_linked_rcount);
143 	rrl->rr_writer_wanted = B_FALSE;
144 }
145 
146 void
147 rrw_destroy(rrwlock_t *rrl)
148 {
149 	mutex_destroy(&rrl->rr_lock);
150 	cv_destroy(&rrl->rr_cv);
151 	ASSERT(rrl->rr_writer == NULL);
152 	refcount_destroy(&rrl->rr_anon_rcount);
153 	refcount_destroy(&rrl->rr_linked_rcount);
154 }
155 
156 static void
157 rrw_enter_read(rrwlock_t *rrl, void *tag)
158 {
159 	mutex_enter(&rrl->rr_lock);
160 #if !defined(DEBUG) && defined(_KERNEL)
161 	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
162 		rrl->rr_anon_rcount.rc_count++;
163 		mutex_exit(&rrl->rr_lock);
164 		return;
165 	}
166 	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
167 #endif
168 	ASSERT(rrl->rr_writer != curthread);
169 	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
170 
171 	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
172 	    refcount_is_zero(&rrl->rr_anon_rcount) &&
173 	    rrn_find(rrl) == NULL))
174 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
175 
176 	if (rrl->rr_writer_wanted) {
177 		/* may or may not be a re-entrant enter */
178 		rrn_add(rrl);
179 		(void) refcount_add(&rrl->rr_linked_rcount, tag);
180 	} else {
181 		(void) refcount_add(&rrl->rr_anon_rcount, tag);
182 	}
183 	ASSERT(rrl->rr_writer == NULL);
184 	mutex_exit(&rrl->rr_lock);
185 }
186 
187 static void
188 rrw_enter_write(rrwlock_t *rrl)
189 {
190 	mutex_enter(&rrl->rr_lock);
191 	ASSERT(rrl->rr_writer != curthread);
192 
193 	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
194 	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
195 	    rrl->rr_writer != NULL) {
196 		rrl->rr_writer_wanted = B_TRUE;
197 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
198 	}
199 	rrl->rr_writer_wanted = B_FALSE;
200 	rrl->rr_writer = curthread;
201 	mutex_exit(&rrl->rr_lock);
202 }
203 
204 void
205 rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
206 {
207 	if (rw == RW_READER)
208 		rrw_enter_read(rrl, tag);
209 	else
210 		rrw_enter_write(rrl);
211 }
212 
213 void
214 rrw_exit(rrwlock_t *rrl, void *tag)
215 {
216 	mutex_enter(&rrl->rr_lock);
217 #if !defined(DEBUG) && defined(_KERNEL)
218 	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
219 		rrl->rr_anon_rcount.rc_count--;
220 		if (rrl->rr_anon_rcount.rc_count == 0)
221 			cv_broadcast(&rrl->rr_cv);
222 		mutex_exit(&rrl->rr_lock);
223 		return;
224 	}
225 	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
226 #endif
227 	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
228 	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
229 	    rrl->rr_writer != NULL);
230 
231 	if (rrl->rr_writer == NULL) {
232 		int64_t count;
233 		if (rrn_find_and_remove(rrl))
234 			count = refcount_remove(&rrl->rr_linked_rcount, tag);
235 		else
236 			count = refcount_remove(&rrl->rr_anon_rcount, tag);
237 		if (count == 0)
238 			cv_broadcast(&rrl->rr_cv);
239 	} else {
240 		ASSERT(rrl->rr_writer == curthread);
241 		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
242 		    refcount_is_zero(&rrl->rr_linked_rcount));
243 		rrl->rr_writer = NULL;
244 		cv_broadcast(&rrl->rr_cv);
245 	}
246 	mutex_exit(&rrl->rr_lock);
247 }
248 
249 boolean_t
250 rrw_held(rrwlock_t *rrl, krw_t rw)
251 {
252 	boolean_t held;
253 
254 	mutex_enter(&rrl->rr_lock);
255 	if (rw == RW_WRITER) {
256 		held = (rrl->rr_writer == curthread);
257 	} else {
258 		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
259 		    !refcount_is_zero(&rrl->rr_linked_rcount));
260 	}
261 	mutex_exit(&rrl->rr_lock);
262 
263 	return (held);
264 }
265