/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/refcount.h>
#include <sys/rrwlock.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads that have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers that have not already obtained a read lock give waiting
 * writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to then grab a
 * write lock without first dropping all read locks, and vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same
 * rrwlock_t will store multiple rrw_node_ts with the same 'rn_rrl'.
 * Nodes on the tsd list can also represent different rrwlock_ts.  This
 * allows a thread to hold read locks on multiple distinct rrwlock_ts
 * at the same time.
 *
 * Since using tsd adds some overhead, the rrwlock_t only keeps tsd
 * data when writers are waiting.  If no writers are waiting, a reader
 * just bumps the anonymous read count (rr_anon_rcount) - no tsd is
 * needed.  Once a writer attempts to grab the lock, readers start
 * keeping tsd data and bump the linked readers count
 * (rr_linked_rcount) instead.
 *
 * If there are waiting writers and there are anonymous readers, a
 * reader cannot tell whether its read is re-entrant.  But since it may
 * be, we allow the read to proceed (otherwise it could deadlock).
 * Since readers no longer bump the anonymous count once a writer is
 * waiting, the anonymous readers will eventually flush themselves out.
 * At that point, readers can tell whether they are re-entrant (they
 * have an rrw_node_t entry for the lock) or not.  If the read is
 * re-entrant, we must let it proceed.  If it is not, the reader blocks
 * behind the waiting writers.  Hence, we do not starve writers.
 */
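
/*
 * A minimal usage sketch, assuming a hypothetical consumer: a thread
 * takes a read lock, re-enters it from a nested code path, and then
 * drops both holds.  The lock name below is made up for illustration;
 * the tag argument only feeds reference tracking in DEBUG builds, and
 * callers conventionally pass FTAG.
 *
 *	rrwlock_t my_lock;
 *
 *	rrw_init(&my_lock);
 *	rrw_enter(&my_lock, RW_READER, FTAG);	-- outer read hold
 *	rrw_enter(&my_lock, RW_READER, FTAG);	-- re-entrant read; allowed
 *						   even with writers waiting
 *	rrw_exit(&my_lock, FTAG);		-- drops the re-entrant hold
 *	rrw_exit(&my_lock, FTAG);		-- drops the outer hold
 *	rrw_destroy(&my_lock);
 */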

/* global key for TSD */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node	*rn_next;
	rrwlock_t	*rn_rrl;
} rrw_node_t;

/*
 * Return this thread's node for 'rrl', or NULL if the thread holds no
 * linked (re-entrant) read lock on it.
 */
static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
}

/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Non-debug kernel fast path: with no writer active or waiting,
	 * bump the anonymous reader count directly and return.
	 */
	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

static void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}
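
/*
 * Each rrw_exit() is expected to be paired with a prior rrw_enter() by
 * the same thread, using the same 'tag'.  The tag does not affect the
 * locking behavior itself; it is only passed through to the refcount
 * facility, which tracks individual holders in DEBUG builds.
 */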
void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Non-debug kernel fast path: with no writer and no linked
	 * (re-entrant) readers, drop the anonymous reader count directly
	 * and wake any waiters when it reaches zero.
	 */
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		if (rrn_find_and_remove(rrl))
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		else
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    !refcount_is_zero(&rrl->rr_linked_rcount));
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}

void
rrw_tsd_destroy(void *arg)
{
	rrw_node_t *rn = arg;
	if (rn != NULL) {
		panic("thread %p terminating with rrw lock %p held",
		    (void *)curthread, (void *)rn->rn_rrl);
	}
}
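
/*
 * rrw_tsd_destroy() is intended to be registered as the destructor for
 * rrw_tsd_key when the key is created, presumably via something like
 * tsd_create(&rrw_tsd_key, rrw_tsd_destroy) in the module init path
 * (an assumption; the call site is not in this file).  TSD destructors
 * run at thread exit for non-NULL values, and the value can only be
 * non-NULL here if the thread still holds a linked (re-entrant) read
 * lock, hence the panic.
 */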